xiangzai committed 7 days ago

Commit

7803bdf

verified ·

1 Parent(s): 3c1ccbd

Add files using upload-large-folder tool

Browse files

Files changed (50) hide show

__pycache__/pic_npz.cpython-311.pyc +0 -0
__pycache__/pipeline_stable_diffusion_3.cpython-310.pyc +0 -0
__pycache__/sample_sd3_lora_rn_pair_ddp.cpython-311.pyc +0 -0
__pycache__/train_rectified_noise.cpython-310.pyc +0 -0
__pycache__/visualize_lora_rn_4x8.cpython-311.pyc +0 -0
accelerate_config.yaml +16 -0
cc3m_render.log +0 -0
cc3m_render.py +155 -0
download.log +0 -0
download_sd3_models.py +71 -0
eval_baseline.log +24 -0
eval_rectified_noise_new_batch_2.log +24 -0
evaluate.sh +11 -0
evaluator_base copy.py +680 -0
evaluator_base.log +5 -0
evaluator_base.py +685 -0
evaluator_rf.py +685 -0
evaluator_rf_iter22.log +25 -0
pic_npz copy.py +259 -0
pic_npz.py +157 -0
pipeline_stable_diffusion_3.py +1378 -0
rectified-noise-batch-2/checkpoint-100000/sit_weights/sit_config.json +10 -0
rectified-noise-batch-2/checkpoint-120000/sit_weights/sit_config.json +10 -0
rectified-noise-batch-2/checkpoint-140000/sit_weights/sit_config.json +10 -0
rectified-noise-batch-2/checkpoint-160000/sit_weights/sit_config.json +10 -0
rectified-noise-batch-2/checkpoint-180000/sit_weights/sit_config.json +10 -0
rectified-noise-batch-2/checkpoint-200000/sit_weights/sit_config.json +10 -0
run_sd3_lora_rn_pair_sampling.sh +50 -0
run_sd3_lora_sampling.log +0 -0
run_sd3_lora_sampling.sh +94 -0
run_sd3_rectified_sampling.sh +55 -0
run_sd3_rectified_sampling_old.sh +72 -0
sample_sd3_lora_checkpoint_ddp.py +818 -0
sample_sd3_lora_ddp.py +675 -0
sample_sd3_lora_rn_pair_ddp.py +417 -0
sample_sd3_rectified_ddp.py +1316 -0
sample_sd3_rectified_ddp_old.py +1317 -0
sd3_rectified_samples_batch2_2200005011.01.01.0cfg_cond_true.txt +5 -0
train_lora_sd3.py +1597 -0
train_lora_sd3_new.py +1422 -0
train_rectified_noise.py +0 -0
train_rectified_noise.sh +104 -0
train_rectified_noise2.py +0 -0
train_sd3_lora.log +27 -0
train_sd3_lora.sh +109 -0
train_sd3_lora2.log +216 -0
train_sd3_lora2.sh +107 -0
visual.sh +78 -0
visualize_lora_rn_4x8.py +406 -0
visualize_sitf2_noise_evolution.py +169 -0

__pycache__/pic_npz.cpython-311.pyc ADDED Viewed

Binary file (7.6 kB). View file

__pycache__/pipeline_stable_diffusion_3.cpython-310.pyc ADDED Viewed

Binary file (39.5 kB). View file

__pycache__/sample_sd3_lora_rn_pair_ddp.cpython-311.pyc ADDED Viewed

Binary file (27.1 kB). View file

__pycache__/train_rectified_noise.cpython-310.pyc ADDED Viewed

Binary file (58.7 kB). View file

__pycache__/visualize_lora_rn_4x8.cpython-311.pyc ADDED Viewed

Binary file (23.9 kB). View file

accelerate_config.yaml ADDED Viewed

	@@ -0,0 +1,16 @@

+compute_environment: LOCAL_MACHINE
+debug: false
+distributed_type: MULTI_GPU
+downcast_bf16: 'no'
+enable_cpu_affinity: false
+machine_rank: 0
+main_training_function: main
+mixed_precision: 'no'
+num_machines: 1
+num_processes: 4
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false

cc3m_render.log ADDED Viewed

The diff for this file is too large to render. See raw diff

cc3m_render.py ADDED Viewed

	@@ -0,0 +1,155 @@

+#!/usr/bin/env python
+# coding: utf-8
+"""
+在 `data_root/` 下已经有 `train/` 和 `validation/` 两个文件夹时：
+分别在这两个文件夹内生成对应的 `metadata.jsonl`，不复制任何图片。
+`metadata.jsonl` 每行格式：
+  {"file_name": "subdir/000026831.jpg", "caption": "..."}
+其中 `file_name` 是相对当前 split 目录（train/ 或 validation/）的路径。
+"""
+import argparse
+import json
+import os
+from concurrent.futures import ThreadPoolExecutor
+from itertools import islice
+from pathlib import Path
+from typing import Optional, Tuple
+from tqdm import tqdm
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the metadata generator.

    Returns:
        A namespace with ``data_root``, ``jsonl_name``, ``use_txt_caption``,
        ``num_workers`` and ``max_images``.
    """
    parser = argparse.ArgumentParser(description="Generate per-split metadata.jsonl for imagefolder (no copy)")
    parser.add_argument(
        "--data_root",
        type=str,
        default="/gemini/space/hsd/project/dataset/cc3m-wds",
        help="数据根目录（必须包含 train/ 和 validation/）",
    )
    parser.add_argument(
        "--jsonl_name",
        type=str,
        default="metadata.jsonl",
        help="每个 split 下生成的 jsonl 文件名（默认 metadata.jsonl）",
    )
    parser.add_argument(
        "--use_txt_caption",
        # A plain ``store_true`` flag whose default is already True can never
        # be switched off. BooleanOptionalAction keeps ``--use_txt_caption``
        # working and additionally registers ``--no-use_txt_caption`` so the
        # JSON-first lookup order is actually reachable.
        action=argparse.BooleanOptionalAction,
        default=True,
        help="优先读取同名 .txt 作为 caption（默认开启），否则回落到 .json",
    )
    parser.add_argument(
        "--num_workers",
        type=int,
        default=32,
        help="线程数（I/O 密集型建议 8~64 之间按机器调整）",
    )
    parser.add_argument(
        "--max_images",
        type=int,
        default=None,
        help="每个 split 最多处理多少张图片（None 表示全部，调试可用）",
    )
    return parser.parse_args()
def read_caption_from_txt(txt_path: Path) -> Optional[str]:
    """Return the whitespace-stripped text of *txt_path* as a caption.

    Args:
        txt_path: Path of the sidecar ``.txt`` file.

    Returns:
        The stripped file contents, or ``None`` when the file is missing,
        cannot be read, is not valid UTF-8, or is empty after stripping.
    """
    # Attempt the read directly instead of checking ``exists()`` first: that
    # avoids a second filesystem call and the window in which the file could
    # disappear between the check and the open.
    try:
        caption = txt_path.read_text(encoding="utf-8").strip()
    except (OSError, ValueError):
        # OSError: missing file, directory, permission problem, ...
        # ValueError: covers UnicodeDecodeError and malformed path strings.
        return None
    return caption or None
def read_caption_from_json(json_path: Path) -> Optional[str]:
    """Extract a caption from a sidecar JSON file.

    The keys ``caption``, ``text`` and ``description`` are tried in that
    order; the first one holding a non-blank string wins.

    Args:
        json_path: Path of the sidecar ``.json`` file.

    Returns:
        The stripped caption, or ``None`` when the file is missing,
        unreadable, not valid JSON, not a JSON object, or has no usable key.
    """
    try:
        with json_path.open("r", encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError, RecursionError):
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError;
        # RecursionError guards against pathologically nested documents so a
        # single bad file cannot abort a long run.
        return None

    # Only JSON objects can carry the caption keys; lists, strings, numbers
    # and null are rejected explicitly rather than via a swallowed TypeError.
    if not isinstance(data, dict):
        return None

    for key in ("caption", "text", "description"):
        value = data.get(key)
        if isinstance(value, str) and value.strip():
            return value.strip()
    return None
def main() -> None:
    """Write a ``metadata.jsonl`` inside ``train/`` and ``validation/``.

    For every image (.jpg/.jpeg/.png) found below a split directory, the
    caption is looked up in the sidecar ``.txt`` / ``.json`` file with the
    same stem and one JSON line ``{"file_name": ..., "caption": ...}`` is
    written, where ``file_name`` is the POSIX-style path relative to the
    split directory. Images without a usable caption are skipped. No image
    is copied or modified.

    Raises:
        FileNotFoundError: If the data root or either split directory is missing.
    """
    args = parse_args()
    data_root = Path(args.data_root).resolve()
    if not data_root.exists():
        raise FileNotFoundError(f"数据根目录不存在：{data_root}")

    splits = [("train", data_root / "train"), ("validation", data_root / "validation")]
    # Validate both splits before writing anything, so a missing directory
    # does not leave a half-finished run behind.
    for _split_name, split_dir in splits:
        if not split_dir.exists():
            raise FileNotFoundError(f"缺少目录：{split_dir}（需要 train/ 和 validation/）")

    def iter_images(split_dir: Path):
        """Lazily yield every image file found below *split_dir*."""
        for root, _dirs, files in os.walk(split_dir):
            for name in files:
                if name.lower().endswith((".jpg", ".jpeg", ".png")):
                    yield Path(root) / name

    def process_one(img_path: Path, split_dir: Path) -> Optional[Tuple[str, str]]:
        """Return ``(relative_path, caption)`` for one image, or ``None`` if it has no caption."""
        txt_path = img_path.with_suffix(".txt")
        json_path = img_path.with_suffix(".json")
        if args.use_txt_caption:
            # Prefer the .txt sidecar and fall back to the .json one.
            caption = read_caption_from_txt(txt_path)
            if caption is None:
                caption = read_caption_from_json(json_path)
        else:
            # Prefer the .json sidecar and fall back to the .txt one.
            caption = read_caption_from_json(json_path)
            if caption is None:
                caption = read_caption_from_txt(txt_path)
        if caption is None:
            return None
        rel = img_path.relative_to(split_dir)
        # Normalise to forward slashes so the output is platform independent.
        return str(rel).replace(os.sep, "/"), caption

    # Executor.map() consumes its input iterable immediately and creates one
    # future per item, so handing it the raw generator would hold a future
    # for every image in memory at once. Feeding it bounded batches keeps
    # memory flat while leaving output order and content unchanged.
    batch_size = max(1, args.num_workers) * 64

    for split_name, split_dir in splits:
        jsonl_path = split_dir / args.jsonl_name
        img_iter = iter_images(split_dir)
        if args.max_images is not None:
            img_iter = islice(img_iter, args.max_images)

        def worker(img_path: Path, _split_dir: Path = split_dir) -> Optional[Tuple[str, str]]:
            # The split directory is bound as a default argument so the
            # worker never depends on the loop variable's later value.
            return process_one(img_path, _split_dir)

        written = 0
        with jsonl_path.open("w", encoding="utf-8") as f, ThreadPoolExecutor(
            max_workers=args.num_workers
        ) as ex, tqdm(desc=f"[{split_name}] Processing") as progress:
            # The total is unknown up front, so the bar only shows a running
            # count of processed images.
            while True:
                batch = list(islice(img_iter, batch_size))
                if not batch:
                    break
                # map() yields results in input order within the batch.
                for result in ex.map(worker, batch):
                    progress.update(1)
                    if result is None:
                        continue
                    file_name, caption = result
                    f.write(json.dumps({"file_name": file_name, "caption": caption}, ensure_ascii=False) + "\n")
                    written += 1
        print(f"{split_name}: 写入 {written} 条 -> {jsonl_path}")


if __name__ == "__main__":
    main()
+# nohup python cc3m_render.py > cc3m_render.log 2>&1 &

download.log ADDED Viewed

File without changes

download_sd3_models.py ADDED Viewed

	@@ -0,0 +1,71 @@

+#!/usr/bin/env python
+# coding: utf-8
+"""
+只负责把 SD3 及相关组件（与 train_lora_sd3.py 使用的一致）下载到 HF cache：
+通过一次 `StableDiffusion3Pipeline.from_pretrained(...)` 调用加载整条 pipeline 来触发下载。
+会下载的子目录（与 train_lora_sd3.py 一致）：
+  tokenizer, tokenizer_2, tokenizer_3,
+  text_encoder, text_encoder_2, text_encoder_3,
+  scheduler, vae, transformer
+用法：
+  python download_sd3_models.py --pretrained_model_name_or_path stabilityai/stable-diffusion-3-medium-diffusers
+下载完成后训练可离线：
+  export HF_HUB_OFFLINE=1 TRANSFORMERS_OFFLINE=1
+  python train_lora_sd3.py --pretrained_model_name_or_path stabilityai/stable-diffusion-3-medium-diffusers ...
+或直接指向你已经下载好的本地 repo 目录。
+"""
+import argparse
+import gc
+from typing import Optional
+import torch
+from diffusers import StableDiffusion3Pipeline
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the SD3 download helper.

    Returns:
        A namespace with ``pretrained_model_name_or_path``, ``revision``,
        ``variant`` and ``cache_dir``; the last three default to ``None``.
    """
    p = argparse.ArgumentParser(description="Download SD3 via a single from_pretrained call (cache warmup only).")
    p.add_argument(
        "--pretrained_model_name_or_path",
        type=str,
        default="stabilityai/stable-diffusion-3-medium-diffusers",
        # The original help text was missing its closing full-width parenthesis.
        help="模型 repo id 或本地路径（与 train_lora_sd3.py 参数一致）",
    )
    p.add_argument("--revision", type=str, default=None, help="可选：下载特定 revision/branch/tag")
    p.add_argument("--variant", type=str, default=None, help="可选：如 fp16 等 variant（若仓库提供）")
    p.add_argument("--cache_dir", type=str, default=None, help="可选：自定义 HF cache_dir；默认用系统/用户默认")
    return p.parse_args()
def main() -> None:
    """Warm the Hugging Face cache by loading the whole SD3 pipeline once.

    The pipeline object is only needed to trigger the download; it is
    dropped again right away and a hint about offline training is printed.
    """
    options = parse_args()

    # Loading the complete pipeline pulls every component it depends on into
    # the cache, which is the sole purpose of this script.
    load_kwargs = {
        "revision": options.revision,
        "variant": options.variant,
        "cache_dir": options.cache_dir,
        "low_cpu_mem_usage": True,
    }
    pipeline = StableDiffusion3Pipeline.from_pretrained(
        options.pretrained_model_name_or_path,
        **load_kwargs,
    )

    # The download is done at this point; release the memory held by the weights.
    del pipeline
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    closing_messages = (
        "下载/缓存预热完成。后续训练可设置离线环境变量避免联网：",
        "  export HF_HUB_OFFLINE=1 TRANSFORMERS_OFFLINE=1",
    )
    for message in closing_messages:
        print(message)


if __name__ == "__main__":
    main()

eval_baseline.log ADDED Viewed

@@ -0,0 +1,24 @@
  0%|          | 0/1 [00:00<?, ?it/s]2026-03-21 21:11:45.919794: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:354] MLIR V1 optimization pass is not enabled
  0%|          | 0/211 [00:00<?, ?it/s]
  0%|          | 1/211 [00:02<07:04,  2.02s/it]
  1%|          | 2/211 [00:03<06:56,  2.00s/it]
  1%|▏         | 3/211 [00:05<06:50,  1.98s/it]
  2%|▏         | 4/211 [00:07<06:42,  1.95s/it]
  2%|▏         | 5/211 [00:09<06:25,  1.87s/it]
  3%|▎         | 6/211 [00:11<06:20,  1.85s/it]
  3%|▎         | 7/211 [00:13<06:46,  1.99s/it]
  4%|▍         | 8/211 [00:15<06:56,  2.05s/it]
  4%|▍         | 9/211 [00:30<20:28,  6.08s/it]
  5%|▍         | 10/211 [00:32<15:51,  4.73s/it]
  5%|▌         | 11/211 [00:34<12:42,  3.81s/it]
  6%|▌         | 12/211 [00:35<10:26,  3.15s/it]
  6%|▌         | 13/211 [00:37<08:55,  2.71s/it]
  7%|▋         | 14/211 [00:39<08:09,  2.49s/it]
  7%|▋         | 15/211 [00:41<07:32,  2.31s/it]
  8%|▊         | 16/211 [00:43<06:52,  2.12s/it]
  8%|▊         | 17/211 [00:44<06:35,  2.04s/it]
  9%|▊         | 18/211 [00:46<06:18,  1.96s/it]
  9%|▉         | 19/211 [00:48<06:16,  1.96s/it]
  9%|▉         | 20/211 [00:50<06:20,  1.99s/it]
 10%|▉         | 21/211 [00:52<06:01,  1.91s/it]
 10%|█         | 22/211 [00:54<05:51,  1.86s/it]
 11%|█         | 23/211 [00:56<05:48,  1.85s/it]
 11%|█▏        | 24/211 [00:57<05:44,  1.84s/it]
 12%|█▏        | 25/211 [00:59<05:44,  1.85s/it]
 12%|█▏        | 26/211 [01:01<05:38,  1.83s/it]
 13%|█▎        | 27/211 [01:03<05:33,  1.81s/it]
 13%|█▎        | 28/211 [01:05<06:02,  1.98s/it]
 14%|█▎        | 29/211 [01:07<05:49,  1.92s/it]
 14%|█▍        | 30/211 [01:09<05:33,  1.84s/it]
 15%|█▍        | 31/211 [01:11<05:38,  1.88s/it]
 15%|█▌        | 32/211 [01:12<05:32,  1.86s/it]
 16%|█▌        | 33/211 [01:14<05:26,  1.83s/it]
 16%|█▌        | 34/211 [01:16<05:20,  1.81s/it]
 17%|█▋        | 35/211 [01:18<05:16,  1.80s/it]
 17%|█▋        | 36/211 [01:19<05:11,  1.78s/it]
 18%|█▊        | 37/211 [01:21<05:06,  1.76s/it]
 18%|█▊        | 38/211 [01:23<05:22,  1.86s/it]
 18%|█▊        | 39/211 [01:25<05:12,  1.82s/it]
 19%|█▉        | 40/211 [01:35<11:51,  4.16s/it]
 19%|█▉        | 41/211 [01:36<09:45,  3.44s/it]
 20%|█▉        | 42/211 [01:38<08:27,  3.00s/it]
 20%|██        | 43/211 [01:40<07:25,  2.65s/it]
 21%|██        | 44/211 [01:42<06:46,  2.43s/it]
 21%|██▏       | 45/211 [01:44<06:28,  2.34s/it]
 22%|██▏       | 46/211 [01:46<05:56,  2.16s/it]
 22%|██▏       | 47/211 [01:48<05:55,  2.17s/it]
 23%|██▎       | 48/211 [01:50<05:30,  2.03s/it]
 23%|██▎       | 49/211 [01:52<05:22,  1.99s/it]
 24%|██▎       | 50/211 [01:54<05:11,  1.94s/it]
 24%|██▍       | 51/211 [01:56<05:18,  1.99s/it]
 25%|██▍       | 52/211 [01:57<05:02,  1.90s/it]
 25%|██▌       | 53/211 [01:59<04:52,  1.85s/it]
 26%|██▌       | 54/211 [02:01<04:42,  1.80s/it]
 26%|██▌       | 55/211 [02:03<04:40,  1.80s/it]
 27%|██▋       | 56/211 [02:05<04:47,  1.85s/it]
 27%|██▋       | 57/211 [02:06<04:38,  1.81s/it]
 27%|██▋       | 58/211 [02:08<04:40,  1.83s/it]
 28%|██▊       | 59/211 [02:10<04:34,  1.81s/it]
 28%|██▊       | 60/211 [02:12<04:31,  1.80s/it]
 29%|██▉       | 61/211 [02:13<04:24,  1.76s/it]
 29%|██▉       | 62/211 [02:15<04:19,  1.74s/it]
 30%|██▉       | 63/211 [02:17<04:25,  1.79s/it]
 30%|███       | 64/211 [02:19<04:22,  1.78s/it]
 31%|███       | 65/211 [02:21<04:21,  1.79s/it]
 31%|███▏      | 66/211 [02:22<04:16,  1.77s/it]
 32%|███▏      | 67/211 [02:24<04:27,  1.85s/it]
 32%|███▏      | 68/211 [02:26<04:23,  1.84s/it]
 33%|███▎      | 69/211 [02:28<04:21,  1.84s/it]
 33%|███▎      | 70/211 [02:30<04:15,  1.81s/it]
 34%|███▎      | 71/211 [02:32<04:13,  1.81s/it]
 34%|███▍      | 72/211 [02:33<04:09,  1.79s/it]
 35%|███▍      | 73/211 [02:35<04:04,  1.77s/it]
 35%|███▌      | 74/211 [02:53<15:24,  6.75s/it]
 36%|███▌      | 75/211 [02:55<11:58,  5.28s/it]
 36%|███▌      | 76/211 [02:57<09:31,  4.24s/it]
 36%|███▋      | 77/211 [02:59<07:46,  3.48s/it]
 37%|███▋      | 78/211 [03:00<06:33,  2.96s/it]
 37%|███▋      | 79/211 [03:02<05:47,  2.64s/it]
 38%|███▊      | 80/211 [03:04<05:19,  2.44s/it]
 38%|███▊      | 81/211 [03:06<04:47,  2.22s/it]
 39%|███▉      | 82/211 [03:08<04:27,  2.07s/it]
 39%|███▉      | 83/211 [03:09<04:12,  1.97s/it]
 40%|███▉      | 84/211 [03:11<04:02,  1.91s/it]
 40%|████      | 85/211 [03:13<03:53,  1.85s/it]
 41%|████      | 86/211 [03:15<03:48,  1.83s/it]
 41%|████      | 87/211 [03:16<03:42,  1.79s/it]
 42%|████▏     | 88/211 [03:18<03:40,  1.79s/it]
 42%|████▏     | 89/211 [03:20<03:35,  1.77s/it]
 43%|████▎     | 90/211 [03:22<03:33,  1.77s/it]
 43%|████▎     | 91/211 [03:23<03:30,  1.76s/it]
 44%|████▎     | 92/211 [03:25<03:35,  1.81s/it]
 44%|████▍     | 93/211 [03:27<03:28,  1.76s/it]
 45%|████▍     | 94/211 [03:29<03:25,  1.75s/it]
 45%|████▌     | 95/211 [03:31<03:22,  1.74s/it]
 45%|████▌     | 96/211 [03:32<03:24,  1.77s/it]
 46%|████▌     | 97/211 [03:34<03:21,  1.77s/it]
 46%|████▋     | 98/211 [03:36<03:19,  1.77s/it]
 47%|████▋     | 99/211 [03:38<03:18,  1.77s/it]
 47%|████▋     | 100/211 [03:39<03:14,  1.75s/it]
 48%|████▊     | 101/211 [03:41<03:13,  1.75s/it]
 48%|████▊     | 102/211 [03:43<03:13,  1.77s/it]
 49%|████▉     | 103/211 [03:45<03:08,  1.75s/it]
 49%|████▉     | 104/211 [03:47<03:12,  1.79s/it]
 50%|████▉     | 105/211 [03:48<03:07,  1.77s/it]
 50%|█████     | 106/211 [04:10<13:22,  7.64s/it]
 51%|█████     | 107/211 [04:12<10:18,  5.94s/it]
 51%|█████     | 108/211 [04:13<08:02,  4.68s/it]
 52%|█████▏    | 109/211 [04:15<06:26,  3.79s/it]
 52%|█████▏    | 110/211 [04:17<05:18,  3.16s/it]
 53%|█████▎    | 111/211 [04:18<04:33,  2.74s/it]
 53%|█████▎    | 112/211 [04:20<04:09,  2.52s/it]
 54%|█████▎    | 113/211 [04:22<03:50,  2.35s/it]
 54%|█████▍    | 114/211 [04:24<03:30,  2.17s/it]
 55%|█████▍    | 115/211 [04:26<03:22,  2.11s/it]
 55%|█████▍    | 116/211 [04:46<11:49,  7.47s/it]
 55%|█████▌    | 117/211 [04:48<08:58,  5.73s/it]
 56%|█████▌    | 118/211 [04:50<07:08,  4.61s/it]
 56%|█████▋    | 119/211 [04:51<05:41,  3.71s/it]
 57%|█████▋    | 120/211 [04:53<04:42,  3.10s/it]
 57%|█████▋    | 121/211 [04:55<04:01,  2.69s/it]
 58%|█████▊    | 122/211 [04:57<03:33,  2.40s/it]
 58%|█████▊    | 123/211 [04:58<03:13,  2.20s/it]
 59%|█████▉    | 124/211 [05:00<03:01,  2.08s/it]
 59%|█████▉    | 125/211 [05:02<02:50,  1.98s/it]
 60%|█████▉    | 126/211 [05:03<02:38,  1.87s/it]
 60%|██████    | 127/211 [05:05<02:31,  1.80s/it]
 61%|██████    | 128/211 [05:07<02:25,  1.75s/it]
 61%|██████    | 129/211 [05:09<02:29,  1.82s/it]
 62%|██████▏   | 130/211 [05:10<02:24,  1.79s/it]
 62%|██████▏   | 131/211 [05:12<02:20,  1.76s/it]
 63%|██████▎   | 132/211 [05:14<02:17,  1.74s/it]
 63%|██████▎   | 133/211 [05:15<02:14,  1.73s/it]
 64%|██████▎   | 134/211 [05:18<02:23,  1.87s/it]
 64%|██████▍   | 135/211 [05:20<02:22,  1.87s/it]
 64%|██████▍   | 136/211 [05:21<02:15,  1.80s/it]
 65%|██████▍   | 137/211 [05:23<02:13,  1.81s/it]
 65%|██████▌   | 138/211 [05:25<02:10,  1.79s/it]
 66%|██████▌   | 139/211 [05:26<02:06,  1.75s/it]
 66%|██████▋   | 140/211 [05:28<02:03,  1.73s/it]
 67%|██████▋   | 141/211 [05:30<02:04,  1.78s/it]
 67%|██████▋   | 142/211 [05:32<02:05,  1.82s/it]
 68%|██████▊   | 143/211 [05:34<02:04,  1.83s/it]
 68%|██████▊   | 144/211 [05:35<01:59,  1.78s/it]
 69%|██████▊   | 145/211 [05:37<01:57,  1.78s/it]
 69%|██████▉   | 146/211 [05:55<07:10,  6.63s/it]
 70%|██████▉   | 147/211 [05:57<05:30,  5.17s/it]
 70%|███████   | 148/211 [05:59<04:23,  4.17s/it]
 71%|███████   | 149/211 [06:00<03:31,  3.42s/it]
 71%|███████   | 150/211 [06:02<02:58,  2.92s/it]
 72%|███████▏  | 151/211 [06:04<02:33,  2.56s/it]
 72%|███████▏  | 152/211 [06:06<02:15,  2.29s/it]
 73%|███████▎  | 153/211 [06:07<02:02,  2.12s/it]
 73%|███████▎  | 154/211 [06:10<02:15,  2.38s/it]
 73%|███████▎  | 155/211 [06:12<02:00,  2.15s/it]
 74%|███████▍  | 156/211 [06:14<01:50,  2.00s/it]
 74%|███████▍  | 157/211 [06:15<01:43,  1.91s/it]
 75%|███████▍  | 158/211 [06:17<01:38,  1.86s/it]
 75%|███████▌  | 159/211 [06:19<01:37,  1.88s/it]
 76%|███████▌  | 160/211 [06:21<01:32,  1.82s/it]
 76%|███████▋  | 161/211 [06:22<01:29,  1.79s/it]
 77%|███████▋  | 162/211 [06:24<01:31,  1.87s/it]
 77%|███████▋  | 163/211 [06:26<01:27,  1.83s/it]
 78%|███████▊  | 164/211 [06:29<01:36,  2.05s/it]
 78%|███████▊  | 165/211 [06:31<01:32,  2.01s/it]
 79%|███████▊  | 166/211 [06:32<01:27,  1.94s/it]
 79%|███████▉  | 167/211 [06:34<01:21,  1.85s/it]
 80%|███████▉  | 168/211 [06:36<01:17,  1.79s/it]
 80%|████████  | 169/211 [06:38<01:19,  1.89s/it]
 81%|████████  | 170/211 [06:40<01:17,  1.88s/it]
 81%|████████  | 171/211 [06:41<01:14,  1.85s/it]
 82%|████████▏ | 172/211 [06:43<01:11,  1.84s/it]
 82%|████████▏ | 173/211 [06:45<01:08,  1.81s/it]
 82%|████████▏ | 174/211 [06:47<01:07,  1.84s/it]
 83%|████████▎ | 175/211 [06:49<01:04,  1.78s/it]
 83%|████████▎ | 176/211 [06:50<01:02,  1.79s/it]
 84%|████████▍ | 177/211 [06:52<01:00,  1.77s/it]
 84%|████████▍ | 178/211 [06:54<01:03,  1.92s/it]
 85%|████████▍ | 179/211 [06:56<00:59,  1.87s/it]
 85%|████████▌ | 180/211 [06:58<00:56,  1.81s/it]
 86%|████████▌ | 181/211 [06:59<00:53,  1.77s/it]
 86%|████████▋ | 182/211 [07:01<00:53,  1.84s/it]
 87%|████████▋ | 183/211 [07:03<00:50,  1.81s/it]
 87%|████████▋ | 184/211 [07:05<00:49,  1.83s/it]
 88%|████████▊ | 185/211 [07:07<00:48,  1.86s/it]
 88%|████████▊ | 186/211 [07:09<00:45,  1.83s/it]
 89%|████████▊ | 187/211 [07:11<00:43,  1.82s/it]
 89%|████████▉ | 188/211 [07:12<00:42,  1.83s/it]
 90%|████████▉ | 189/211 [07:14<00:39,  1.77s/it]
 90%|█████████ | 190/211 [07:16<00:37,  1.77s/it]
 91%|█████████ | 191/211 [07:17<00:34,  1.73s/it]
 91%|█████████ | 192/211 [07:19<00:32,  1.71s/it]
 91%|█████████▏| 193/211 [07:21<00:30,  1.72s/it]
 92%|█████████▏| 194/211 [07:23<00:29,  1.72s/it]
 92%|█████████▏| 195/211 [07:24<00:27,  1.72s/it]
 93%|█████████▎| 196/211 [07:26<00:25,  1.70s/it]
 93%|█████████▎| 197/211 [07:28<00:24,  1.75s/it]
 94%|█████████▍| 198/211 [07:30<00:23,  1.78s/it]
 94%|█████████▍| 199/211 [07:31<00:21,  1.76s/it]
 95%|█████████▍| 200/211 [07:33<00:19,  1.78s/it]
 95%|█████████▌| 201/211 [07:35<00:17,  1.76s/it]
 96%|█████████▌| 202/211 [07:37<00:16,  1.80s/it]
 96%|█████████▌| 203/211 [07:39<00:14,  1.79s/it]
 97%|█████████▋| 204/211 [07:40<00:12,  1.76s/it]
 97%|█████████▋| 205/211 [07:42<00:10,  1.74s/it]
 98%|█████████▊| 206/211 [07:44<00:08,  1.75s/it]
 98%|█████████▊| 207/211 [07:45<00:06,  1.72s/it]
 99%|█████████▊| 208/211 [07:47<00:05,  1.70s/it]
 99%|█████████▉| 209/211 [07:49<00:03,  1.68s/it]
  0%|          | 0/469 [00:00<?, ?it/s]
  0%|          | 1/469 [00:02<18:09,  2.33s/it]
  0%|          | 2/469 [00:04<15:10,  1.95s/it]
  1%|          | 3/469 [00:05<14:14,  1.83s/it]
  1%|          | 4/469 [00:07<13:52,  1.79s/it]
  1%|          | 5/469 [00:09<13:23,  1.73s/it]
  1%|▏         | 6/469 [00:12<17:01,  2.21s/it]
  1%|▏         | 7/469 [00:14<16:26,  2.14s/it]
  2%|▏         | 8/469 [00:15<15:33,  2.02s/it]
  2%|▏         | 9/469 [00:17<15:18,  2.00s/it]
  2%|▏         | 10/469 [00:19<14:43,  1.92s/it]
  2%|▏         | 11/469 [00:21<14:04,  1.84s/it]
  3%|▎         | 12/469 [00:23<14:21,  1.89s/it]
  3%|▎         | 13/469 [00:25<14:16,  1.88s/it]
  3%|▎         | 14/469 [00:27<14:15,  1.88s/it]
  3%|▎         | 15/469 [00:28<14:05,  1.86s/it]
  3%|▎         | 16/469 [00:30<14:24,  1.91s/it]
  4%|▎         | 17/469 [00:32<14:05,  1.87s/it]
  4%|▍         | 18/469 [00:34<14:13,  1.89s/it]
  4%|▍         | 19/469 [00:36<13:44,  1.83s/it]
  4%|▍         | 20/469 [00:37<13:14,  1.77s/it]
  4%|▍         | 21/469 [00:39<13:07,  1.76s/it]
  5%|▍         | 22/469 [00:41<12:51,  1.72s/it]
  5%|▍         | 23/469 [00:43<12:56,  1.74s/it]
  5%|▌         | 24/469 [00:44<12:53,  1.74s/it]
  5%|▌         | 25/469 [00:46<12:43,  1.72s/it]
  6%|▌         | 26/469 [00:48<12:47,  1.73s/it]
  6%|▌         | 27/469 [00:49<12:34,  1.71s/it]
  6%|▌         | 28/469 [00:51<12:53,  1.75s/it]
  6%|▌         | 29/469 [00:53<12:38,  1.72s/it]
  6%|▋         | 30/469 [00:55<12:33,  1.72s/it]
  7%|▋         | 31/469 [00:56<12:17,  1.68s/it]
  7%|▋         | 32/469 [00:58<12:27,  1.71s/it]
  7%|▋         | 33/469 [01:00<12:22,  1.70s/it]
  7%|▋         | 34/469 [01:03<16:09,  2.23s/it]
  7%|▋         | 35/469 [01:05<15:00,  2.08s/it]
  8%|▊         | 36/469 [01:07<14:10,  1.96s/it]
  8%|▊         | 37/469 [01:08<13:48,  1.92s/it]
  8%|▊         | 38/469 [01:10<13:13,  1.84s/it]
  8%|▊         | 39/469 [01:12<13:28,  1.88s/it]
  9%|▊         | 40/469 [01:14<13:04,  1.83s/it]
  9%|▊         | 41/469 [01:16<13:04,  1.83s/it]
  9%|▉         | 42/469 [01:19<15:53,  2.23s/it]
  9%|▉         | 43/469 [01:21<15:36,  2.20s/it]
  9%|▉         | 44/469 [01:23<14:48,  2.09s/it]
 10%|▉         | 45/469 [01:24<13:55,  1.97s/it]
 10%|▉         | 46/469 [01:26<13:20,  1.89s/it]
 10%|█         | 47/469 [01:28<13:24,  1.91s/it]
 10%|█         | 48/469 [01:30<13:15,  1.89s/it]
 10%|█         | 49/469 [01:31<12:39,  1.81s/it]
 11%|█         | 50/469 [01:33<12:26,  1.78s/it]
 11%|█         | 51/469 [01:35<12:47,  1.84s/it]
 11%|█         | 52/469 [01:37<12:28,  1.79s/it]
 11%|█▏        | 53/469 [01:39<12:09,  1.75s/it]
 12%|█▏        | 54/469 [01:40<12:13,  1.77s/it]
 12%|█▏        | 55/469 [01:42<12:03,  1.75s/it]
 12%|█▏        | 56/469 [01:44<12:13,  1.78s/it]
 12%|█▏        | 57/469 [01:46<12:27,  1.81s/it]
 12%|█▏        | 58/469 [01:48<12:53,  1.88s/it]
 13%|█▎        | 59/469 [01:50<12:46,  1.87s/it]
 13%|█▎        | 60/469 [01:51<12:25,  1.82s/it]
 13%|█▎        | 61/469 [01:53<12:01,  1.77s/it]
 13%|█▎        | 62/469 [01:55<11:49,  1.74s/it]
 13%|█▎        | 63/469 [01:56<11:43,  1.73s/it]
 14%|█▎        | 64/469 [01:58<11:37,  1.72s/it]
 14%|█▍        | 65/469 [02:00<11:38,  1.73s/it]
 14%|█▍        | 66/469 [02:02<11:28,  1.71s/it]
 14%|█▍        | 67/469 [02:03<11:30,  1.72s/it]
 14%|█▍        | 68/469 [02:05<11:30,  1.72s/it]
 15%|█▍        | 69/469 [02:07<12:03,  1.81s/it]
 15%|█▍        | 70/469 [02:09<11:46,  1.77s/it]
 15%|█▌        | 71/469 [02:10<11:36,  1.75s/it]
 15%|█▌        | 72/469 [02:12<11:34,  1.75s/it]
 16%|█▌        | 73/469 [02:14<12:02,  1.82s/it]
 16%|█▌        | 74/469 [02:16<11:41,  1.78s/it]
 16%|█▌        | 75/469 [02:18<12:10,  1.85s/it]
 16%|█▌        | 76/469 [02:20<12:04,  1.84s/it]
 16%|█▋        | 77/469 [02:21<11:44,  1.80s/it]
 17%|█▋        | 78/469 [02:23<11:44,  1.80s/it]
 17%|█▋        | 79/469 [02:25<11:44,  1.81s/it]
 17%|█▋        | 80/469 [02:27<11:43,  1.81s/it]
 17%|█▋        | 81/469 [02:28<11:22,  1.76s/it]
 17%|█▋        | 82/469 [02:30<11:10,  1.73s/it]
 18%|█▊        | 83/469 [02:32<11:12,  1.74s/it]
 18%|█▊        | 84/469 [02:34<11:11,  1.75s/it]
 18%|█▊        | 85/469 [02:36<11:32,  1.80s/it]
 18%|█▊        | 86/469 [02:37<11:32,  1.81s/it]
 19%|█▊        | 87/469 [02:39<11:16,  1.77s/it]
 19%|█▉        | 88/469 [02:41<11:07,  1.75s/it]
 19%|█▉        | 89/469 [02:42<11:00,  1.74s/it]
 19%|█▉        | 90/469 [02:44<11:09,  1.77s/it]
 19%|█▉        | 91/469 [02:46<11:15,  1.79s/it]
 20%|█▉        | 92/469 [02:48<11:09,  1.78s/it]
 20%|█▉        | 93/469 [02:50<10:57,  1.75s/it]
 20%|██        | 94/469 [02:51<10:55,  1.75s/it]
 20%|██        | 95/469 [02:53<10:50,  1.74s/it]
 20%|██        | 96/469 [02:55<10:42,  1.72s/it]
 21%|██        | 97/469 [02:56<10:37,  1.71s/it]
 21%|██        | 98/469 [02:58<10:51,  1.76s/it]
 21%|██        | 99/469 [03:00<10:52,  1.76s/it]
 21%|██▏       | 100/469 [03:02<11:08,  1.81s/it]
 22%|██▏       | 101/469 [03:04<10:52,  1.77s/it]
 22%|██▏       | 102/469 [03:05<10:48,  1.77s/it]
 22%|██▏       | 103/469 [03:07<10:35,  1.74s/it]
 22%|██▏       | 104/469 [03:09<10:28,  1.72s/it]
 22%|██▏       | 105/469 [03:11<10:38,  1.75s/it]
 23%|██▎       | 106/469 [03:12<10:29,  1.73s/it]
 23%|██▎       | 107/469 [03:14<10:22,  1.72s/it]
 23%|██▎       | 108/469 [03:16<10:41,  1.78s/it]
 23%|██▎       | 109/469 [03:18<10:29,  1.75s/it]
 23%|██▎       | 110/469 [03:19<10:32,  1.76s/it]
 24%|██▎       | 111/469 [03:21<10:25,  1.75s/it]
 24%|██▍       | 112/469 [03:23<10:24,  1.75s/it]
 24%|██▍       | 113/469 [03:25<10:37,  1.79s/it]
 24%|██▍       | 114/469 [03:26<10:24,  1.76s/it]
 25%|██▍       | 115/469 [03:28<10:27,  1.77s/it]
 25%|██▍       | 116/469 [03:30<10:22,  1.76s/it]
 25%|██▍       | 117/469 [03:32<10:28,  1.79s/it]
 25%|██▌       | 118/469 [03:34<10:52,  1.86s/it]
 25%|██▌       | 119/469 [03:36<11:00,  1.89s/it]
 26%|██▌       | 120/469 [03:37<10:42,  1.84s/it]
 26%|██▌       | 121/469 [03:39<10:37,  1.83s/it]
 26%|██▌       | 122/469 [03:41<10:17,  1.78s/it]
 26%|██▌       | 123/469 [03:43<10:21,  1.80s/it]
 26%|██▋       | 124/469 [03:45<10:14,  1.78s/it]
 27%|██▋       | 125/469 [03:46<10:15,  1.79s/it]
 27%|██▋       | 126/469 [03:48<10:12,  1.79s/it]
 27%|██▋       | 127/469 [03:50<09:59,  1.75s/it]
 27%|██▋       | 128/469 [03:51<09:49,  1.73s/it]
 28%|██▊       | 129/469 [03:53<09:53,  1.75s/it]
 28%|██▊       | 130/469 [03:55<09:42,  1.72s/it]
 28%|██▊       | 131/469 [03:57<09:37,  1.71s/it]
 28%|██▊       | 132/469 [03:58<09:49,  1.75s/it]
 28%|██▊       | 133/469 [04:00<09:59,  1.78s/it]
 29%|██▊       | 134/469 [04:02<10:22,  1.86s/it]
 29%|██▉       | 135/469 [04:04<10:20,  1.86s/it]
 29%|██▉       | 136/469 [04:06<10:26,  1.88s/it]
 29%|██▉       | 137/469 [04:08<10:02,  1.81s/it]
 29%|██▉       | 138/469 [04:10<10:16,  1.86s/it]
 30%|██▉       | 139/469 [04:11<09:58,  1.81s/it]
 30%|██▉       | 140/469 [04:13<09:59,  1.82s/it]
 30%|███       | 141/469 [04:15<09:40,  1.77s/it]
 30%|███       | 142/469 [04:17<09:28,  1.74s/it]
 30%|███       | 143/469 [04:18<09:16,  1.71s/it]
 31%|███       | 144/469 [04:20<09:13,  1.70s/it]
 31%|███       | 145/469 [04:22<09:28,  1.75s/it]
 31%|███       | 146/469 [04:24<09:18,  1.73s/it]
 31%|███▏      | 147/469 [04:25<09:35,  1.79s/it]
 32%|███▏      | 148/469 [04:27<09:28,  1.77s/it]
 32%|███▏      | 149/469 [04:29<09:12,  1.73s/it]
 32%|███▏      | 150/469 [04:31<09:11,  1.73s/it]
 32%|███▏      | 151/469 [04:32<09:18,  1.76s/it]
 32%|███▏      | 152/469 [04:34<09:23,  1.78s/it]
 33%|███▎      | 153/469 [04:36<09:21,  1.78s/it]
 33%|███▎      | 154/469 [04:38<09:11,  1.75s/it]
 33%|███▎      | 155/469 [04:39<09:09,  1.75s/it]
 33%|███▎      | 156/469 [04:42<10:07,  1.94s/it]
 33%|███▎      | 157/469 [04:44<10:39,  2.05s/it]
 34%|███▎      | 158/469 [04:46<10:12,  1.97s/it]
 34%|███▍      | 159/469 [04:48<09:42,  1.88s/it]
 34%|███▍      | 160/469 [04:49<09:42,  1.88s/it]
 34%|███▍      | 161/469 [04:51<09:24,  1.83s/it]
 35%|███▍      | 162/469 [04:53<09:08,  1.79s/it]
 35%|███▍      | 163/469 [04:55<09:01,  1.77s/it]
 35%|███▍      | 164/469 [04:56<09:08,  1.80s/it]
 35%|███▌      | 165/469 [04:58<09:07,  1.80s/it]
 35%|███▌      | 166/469 [05:00<09:06,  1.80s/it]
 36%|███▌      | 167/469 [05:02<09:00,  1.79s/it]
 36%|███▌      | 168/469 [05:03<08:47,  1.75s/it]
 36%|███▌      | 169/469 [05:05<08:51,  1.77s/it]
 36%|███▌      | 170/469 [05:07<08:35,  1.73s/it]
 36%|███▋      | 171/469 [05:09<08:25,  1.70s/it]
 37%|███▋      | 172/469 [05:10<08:24,  1.70s/it]
 37%|███▋      | 173/469 [05:12<08:20,  1.69s/it]
 37%|███▋      | 174/469 [05:14<08:20,  1.70s/it]
 37%|███▋      | 175/469 [05:16<08:42,  1.78s/it]
 38%|███▊      | 176/469 [05:17<08:34,  1.76s/it]
 38%|███▊      | 177/469 [05:19<08:31,  1.75s/it]
 38%|███▊      | 178/469 [05:21<08:34,  1.77s/it]
 38%|███▊      | 179/469 [05:22<08:22,  1.73s/it]
 38%|███▊      | 180/469 [05:24<08:22,  1.74s/it]
 39%|███▊      | 181/469 [05:26<08:14,  1.72s/it]
 39%|███▉      | 182/469 [05:28<08:19,  1.74s/it]
 39%|███▉      | 183/469 [05:29<08:17,  1.74s/it]
 39%|███▉      | 184/469 [05:31<08:29,  1.79s/it]
 39%|███▉      | 185/469 [05:33<08:36,  1.82s/it]
 40%|███▉      | 186/469 [05:35<08:36,  1.82s/it]
 40%|███▉      | 187/469 [05:37<08:29,  1.81s/it]
 40%|████      | 188/469 [05:39<08:45,  1.87s/it]
 40%|████      | 189/469 [05:41<08:29,  1.82s/it]
 41%|████      | 190/469 [05:42<08:28,  1.82s/it]
 41%|████      | 191/469 [05:44<08:18,  1.79s/it]
 41%|████      | 192/469 [05:46<08:10,  1.77s/it]
 41%|████      | 193/469 [05:47<07:58,  1.73s/it]
 41%|████▏     | 194/469 [05:49<07:52,  1.72s/it]
 42%|████▏     | 195/469 [05:51<08:08,  1.78s/it]
 42%|████▏     | 196/469 [05:53<08:19,  1.83s/it]
 42%|████▏     | 197/469 [05:55<08:35,  1.90s/it]
 42%|████▏     | 198/469 [05:57<08:28,  1.88s/it]
 42%|████▏     | 199/469 [05:59<08:17,  1.84s/it]
 43%|████▎     | 200/469 [06:01<08:22,  1.87s/it]
 43%|████▎     | 201/469 [06:02<08:09,  1.83s/it]
 43%|████▎     | 202/469 [06:04<08:08,  1.83s/it]
 43%|████▎     | 203/469 [06:06<08:21,  1.88s/it]
 43%|████▎     | 204/469 [06:08<07:59,  1.81s/it]
 44%|████▎     | 205/469 [06:09<07:46,  1.77s/it]
 44%|████▍     | 206/469 [06:11<07:38,  1.74s/it]
 44%|████▍     | 207/469 [06:13<07:27,  1.71s/it]
 44%|████▍     | 208/469 [06:15<07:40,  1.76s/it]
 45%|████▍     | 209/469 [06:16<07:35,  1.75s/it]
 45%|████▍     | 210/469 [06:18<07:30,  1.74s/it]
 45%|████▍     | 211/469 [06:20<07:35,  1.77s/it]
 45%|████▌     | 212/469 [06:22<07:32,  1.76s/it]
 45%|████▌     | 213/469 [06:23<07:29,  1.76s/it]
 46%|████▌     | 214/469 [06:25<07:38,  1.80s/it]
 46%|████▌     | 215/469 [06:27<07:34,  1.79s/it]
 46%|████▌     | 216/469 [06:29<07:28,  1.77s/it]
 46%|████▋     | 217/469 [06:30<07:20,  1.75s/it]
 46%|████▋     | 218/469 [06:32<07:08,  1.71s/it]
 47%|████▋     | 219/469 [06:34<07:04,  1.70s/it]
 47%|████▋     | 220/469 [06:35<07:02,  1.70s/it]
 47%|████▋     | 221/469 [06:38<07:54,  1.91s/it]
 47%|████▋     | 222/469 [06:40<07:49,  1.90s/it]
 48%|████▊     | 223/469 [06:42<07:37,  1.86s/it]
 48%|████▊     | 224/469 [06:43<07:18,  1.79s/it]
 48%|████▊     | 225/469 [06:45<07:42,  1.90s/it]
 48%|████▊     | 226/469 [06:48<09:05,  2.25s/it]
 48%|████▊     | 227/469 [06:50<08:33,  2.12s/it]
 49%|████▊     | 228/469 [06:52<07:55,  1.97s/it]
 49%|████▉     | 229/469 [06:54<07:40,  1.92s/it]
 49%|████▉     | 230/469 [06:55<07:33,  1.90s/it]
 49%|████▉     | 231/469 [06:57<07:17,  1.84s/it]
 49%|████▉     | 232/469 [06:59<07:05,  1.79s/it]
 50%|████▉     | 233/469 [07:01<07:03,  1.80s/it]
 50%|████▉     | 234/469 [07:02<06:59,  1.78s/it]
 50%|█████     | 235/469 [07:05<08:13,  2.11s/it]
 50%|█████     | 236/469 [07:07<07:39,  1.97s/it]
 51%|█████     | 237/469 [07:09<07:15,  1.88s/it]
 51%|█████     | 238/469 [07:10<07:11,  1.87s/it]
 51%|█████     | 239/469 [07:12<06:55,  1.81s/it]
 51%|█████     | 240/469 [07:14<07:03,  1.85s/it]
 51%|█████▏    | 241/469 [07:16<06:54,  1.82s/it]
 52%|█████▏    | 242/469 [07:17<06:41,  1.77s/it]
 52%|█████▏    | 243/469 [07:19<06:33,  1.74s/it]
 52%|█████▏    | 244/469 [07:21<06:28,  1.73s/it]
 52%|█████▏    | 245/469 [07:24<07:39,  2.05s/it]
 52%|█████▏    | 246/469 [07:25<07:19,  1.97s/it]
 53%|█████▎    | 247/469 [07:27<07:08,  1.93s/it]
 53%|█████▎    | 248/469 [07:29<06:57,  1.89s/it]
 53%|█████▎    | 249/469 [07:31<06:40,  1.82s/it]
 53%|█████▎    | 250/469 [07:33<06:38,  1.82s/it]
 54%|█████▎    | 251/469 [07:34<06:25,  1.77s/it]
 54%|█████▎    | 252/469 [07:36<06:15,  1.73s/it]
 54%|█████▍    | 253/469 [07:37<06:09,  1.71s/it]
 54%|█████▍    | 254/469 [07:39<06:05,  1.70s/it]
 54%|█████▍    | 255/469 [07:41<06:02,  1.70s/it]
 55%|█████▍    | 256/469 [07:42<05:58,  1.68s/it]
 55%|█████▍    | 257/469 [07:44<06:01,  1.71s/it]
 55%|█████▌    | 258/469 [07:47<06:54,  1.96s/it]
 55%|█████▌    | 259/469 [07:49<06:35,  1.88s/it]
 55%|█████▌    | 260/469 [07:52<08:26,  2.42s/it]
 56%|█████▌    | 261/469 [07:54<07:35,  2.19s/it]
 56%|█████▌    | 262/469 [07:56<07:07,  2.07s/it]
 56%|█████▌    | 263/469 [07:57<06:47,  1.98s/it]
 56%|█████▋    | 264/469 [07:59<06:25,  1.88s/it]
 57%|█████▋    | 265/469 [08:01<06:21,  1.87s/it]
 57%|█████▋    | 266/469 [08:03<06:20,  1.87s/it]
 57%|█████▋    | 267/469 [08:04<06:08,  1.82s/it]
 57%|█████▋    | 268/469 [08:06<06:07,  1.83s/it]
 57%|█████▋    | 269/469 [08:08<05:54,  1.77s/it]
 58%|█████▊    | 270/469 [08:10<05:59,  1.80s/it]
 58%|█████▊    | 271/469 [08:12<05:51,  1.77s/it]
 58%|█████▊    | 272/469 [08:13<05:58,  1.82s/it]
 58%|█████▊    | 273/469 [08:15<06:06,  1.87s/it]
 58%|█████▊    | 274/469 [08:17<05:52,  1.81s/it]
 59%|█████▊    | 275/469 [08:19<05:50,  1.81s/it]
 59%|█████▉    | 276/469 [08:21<05:42,  1.77s/it]
 59%|█████▉    | 277/469 [08:22<05:37,  1.76s/it]
 59%|█████▉    | 278/469 [08:24<05:34,  1.75s/it]
 59%|█████▉    | 279/469 [08:26<05:38,  1.78s/it]
 60%|█████▉    | 280/469 [08:28<05:38,  1.79s/it]
 60%|█████▉    | 281/469 [08:30<05:39,  1.81s/it]
 60%|██████    | 282/469 [08:31<05:38,  1.81s/it]
 60%|██████    | 283/469 [08:33<05:30,  1.78s/it]
 61%|██████    | 284/469 [08:35<05:26,  1.77s/it]
 61%|██████    | 285/469 [08:36<05:17,  1.73s/it]
 61%|██████    | 286/469 [08:38<05:14,  1.72s/it]
 61%|██████    | 287/469 [08:40<05:10,  1.71s/it]
 61%|██████▏   | 288/469 [08:41<05:05,  1.69s/it]
 62%|██████▏   | 289/469 [08:43<05:04,  1.69s/it]
 62%|██████▏   | 290/469 [08:45<05:11,  1.74s/it]
 62%|██████▏   | 291/469 [08:47<05:11,  1.75s/it]
 62%|██████▏   | 292/469 [08:48<05:04,  1.72s/it]
 62%|██████▏   | 293/469 [08:50<05:04,  1.73s/it]
 63%|██████▎   | 294/469 [08:52<05:19,  1.83s/it]
 63%|██████▎   | 295/469 [08:54<05:14,  1.81s/it]
 63%|██████▎   | 296/469 [08:56<05:04,  1.76s/it]
 63%|██████▎   | 297/469 [08:57<04:58,  1.74s/it]
 64%|██████▎   | 298/469 [08:59<05:01,  1.76s/it]
 64%|██████▍   | 299/469 [09:01<04:56,  1.74s/it]
 64%|██████▍   | 300/469 [09:03<04:52,  1.73s/it]
 64%|██████▍   | 301/469 [09:04<04:48,  1.72s/it]
 64%|██████▍   | 302/469 [09:06<04:43,  1.70s/it]
 65%|██████▍   | 303/469 [09:08<04:50,  1.75s/it]
 65%|██████▍   | 304/469 [09:10<04:46,  1.74s/it]
 65%|██████▌   | 305/469 [09:11<04:47,  1.75s/it]
 65%|██████▌   | 306/469 [09:13<04:42,  1.73s/it]
 65%|██████▌   | 307/469 [09:15<04:58,  1.84s/it]
 66%|██████▌   | 308/469 [09:17<04:50,  1.81s/it]
 66%|██████▌   | 309/469 [09:19<04:46,  1.79s/it]
 66%|██████▌   | 310/469 [09:20<04:39,  1.76s/it]
 66%|██████▋   | 311/469 [09:22<04:34,  1.74s/it]
 67%|██████▋   | 312/469 [09:24<04:36,  1.76s/it]
 67%|██████▋   | 313/469 [09:26<04:52,  1.88s/it]
 67%|██████▋   | 314/469 [09:28<04:55,  1.91s/it]
 67%|██████▋   | 315/469 [09:30<04:43,  1.84s/it]
 67%|██████▋   | 316/469 [09:31<04:36,  1.81s/it]
 68%|██████▊   | 317/469 [09:33<04:42,  1.86s/it]
 68%|██████▊   | 318/469 [09:36<05:38,  2.24s/it]
 68%|██████▊   | 319/469 [09:38<05:18,  2.13s/it]
 68%|██████▊   | 320/469 [09:40<04:59,  2.01s/it]
 68%|██████▊   | 321/469 [09:42<04:49,  1.95s/it]
 69%|██████▊   | 322/469 [09:43<04:32,  1.86s/it]
 69%|██████▉   | 323/469 [09:47<05:51,  2.41s/it]
 69%|██████▉   | 324/469 [09:49<05:17,  2.19s/it]
 69%|██████▉   | 325/469 [09:51<04:58,  2.08s/it]
 70%|██████▉   | 326/469 [09:52<04:42,  1.97s/it]
 70%|██████▉   | 327/469 [09:54<04:29,  1.89s/it]
 70%|██████▉   | 328/469 [09:56<04:17,  1.82s/it]
 70%|███████   | 329/469 [09:58<04:12,  1.80s/it]
 70%|███████   | 330/469 [09:59<04:07,  1.78s/it]
 71%|███████   | 331/469 [10:01<04:12,  1.83s/it]
 71%|███████   | 332/469 [10:03<04:06,  1.80s/it]
 71%|███████   | 333/469 [10:05<03:57,  1.75s/it]
 71%|███████   | 334/469 [10:06<03:51,  1.71s/it]
 71%|███████▏  | 335/469 [10:08<03:47,  1.70s/it]
 72%|███████▏  | 336/469 [10:09<03:44,  1.69s/it]
 72%|███████▏  | 337/469 [10:11<03:42,  1.68s/it]
 72%|███████▏  | 338/469 [10:13<03:45,  1.72s/it]
 72%|███████▏  | 339/469 [10:15<03:41,  1.70s/it]
 72%|███████▏  | 340/469 [10:16<03:38,  1.70s/it]
 73%|███████▎  | 341/469 [10:18<03:37,  1.70s/it]
 73%|███████▎  | 342/469 [10:20<03:47,  1.79s/it]
 73%|███████▎  | 343/469 [10:22<03:42,  1.77s/it]
 73%|███████▎  | 344/469 [10:24<03:40,  1.77s/it]
 74%|███████▎  | 345/469 [10:25<03:39,  1.77s/it]
 74%|███████▍  | 346/469 [10:27<03:36,  1.76s/it]
 74%|███████▍  | 347/469 [10:29<03:31,  1.73s/it]
 74%|███████▍  | 348/469 [10:31<03:35,  1.78s/it]
 74%|███████▍  | 349/469 [10:32<03:34,  1.78s/it]
 75%|███████▍  | 350/469 [10:34<03:30,  1.76s/it]
 75%|███████▍  | 351/469 [10:36<03:23,  1.72s/it]
 75%|███████▌  | 352/469 [10:38<03:33,  1.82s/it]
 75%|███████▌  | 353/469 [10:40<03:29,  1.81s/it]
 75%|███████▌  | 354/469 [10:41<03:27,  1.80s/it]
 76%|███████▌  | 355/469 [10:43<03:21,  1.77s/it]
 76%|███████▌  | 356/469 [10:45<03:27,  1.83s/it]
 76%|███████▌  | 357/469 [10:47<03:18,  1.77s/it]
 76%|███████▋  | 358/469 [10:48<03:14,  1.75s/it]
 77%|██��████▋  | 359/469 [10:50<03:09,  1.73s/it]
 77%|███████▋  | 360/469 [10:52<03:07,  1.72s/it]
 77%|███████▋  | 361/469 [10:53<03:06,  1.72s/it]
 77%|███████▋  | 362/469 [10:55<03:07,  1.75s/it]
 77%|███████▋  | 363/469 [10:57<03:10,  1.79s/it]
 78%|███████▊  | 364/469 [10:59<03:06,  1.77s/it]
 78%|███████▊  | 365/469 [11:01<03:00,  1.73s/it]
 78%|███████▊  | 366/469 [11:02<02:55,  1.71s/it]
 78%|███████▊  | 367/469 [11:04<02:53,  1.70s/it]
 78%|███████▊  | 368/469 [11:06<02:53,  1.72s/it]
 79%|███████▊  | 369/469 [11:07<02:51,  1.71s/it]
 79%|███████▉  | 370/469 [11:09<02:48,  1.70s/it]
 79%|███████▉  | 371/469 [11:11<02:48,  1.71s/it]
 79%|███████▉  | 372/469 [11:13<02:51,  1.77s/it]
 80%|███████▉  | 373/469 [11:14<02:46,  1.73s/it]
 80%|███████▉  | 374/469 [11:16<02:45,  1.75s/it]
 80%|███████▉  | 375/469 [11:18<02:43,  1.74s/it]
 80%|████████  | 376/469 [11:20<02:42,  1.74s/it]
 80%|████████  | 377/469 [11:21<02:39,  1.74s/it]
 81%|████████  | 378/469 [11:23<02:36,  1.72s/it]
 81%|████████  | 379/469 [11:25<02:35,  1.72s/it]
 81%|████████  | 380/469 [11:26<02:32,  1.72s/it]
 81%|████████  | 381/469 [11:28<02:34,  1.76s/it]
 81%|████████▏ | 382/469 [11:30<02:43,  1.88s/it]
 82%|████████▏ | 383/469 [11:32<02:35,  1.81s/it]
 82%|████████▏ | 384/469 [11:34<02:29,  1.76s/it]
 82%|████████▏ | 385/469 [11:36<02:30,  1.80s/it]
 82%|████████▏ | 386/469 [11:37<02:26,  1.77s/it]
 83%|████████▎ | 387/469 [11:39<02:26,  1.78s/it]
 83%|████████▎ | 388/469 [11:41<02:23,  1.77s/it]
 83%|████████▎ | 389/469 [11:42<02:18,  1.73s/it]
 83%|████████▎ | 390/469 [11:44<02:18,  1.75s/it]
 83%|████████▎ | 391/469 [11:46<02:16,  1.76s/it]
 84%|████████▎ | 392/469 [11:49<02:33,  1.99s/it]
 84%|████████▍ | 393/469 [11:50<02:24,  1.90s/it]
 84%|████████▍ | 394/469 [11:52<02:18,  1.85s/it]
 84%|████████▍ | 395/469 [11:54<02:12,  1.79s/it]
 84%|████████▍ | 396/469 [11:58<03:02,  2.50s/it]
 85%|████████▍ | 397/469 [12:00<02:44,  2.29s/it]
 85%|████████▍ | 398/469 [12:01<02:31,  2.13s/it]
 85%|████████▌ | 399/469 [12:03<02:21,  2.02s/it]
 85%|████████▌ | 400/469 [12:05<02:15,  1.96s/it]
 86%|████████▌ | 401/469 [12:07<02:10,  1.92s/it]
 86%|████████▌ | 402/469 [12:08<02:02,  1.83s/it]
 86%|████████▌ | 403/469 [12:10<01:57,  1.78s/it]
 86%|████████▌ | 404/469 [12:12<01:53,  1.74s/it]
 86%|████████▋ | 405/469 [12:13<01:49,  1.72s/it]
 87%|████████▋ | 406/469 [12:15<01:50,  1.76s/it]
 87%|████████▋ | 407/469 [12:17<01:48,  1.74s/it]
 87%|████████▋ | 408/469 [12:19<01:44,  1.72s/it]
 87%|████████▋ | 409/469 [12:20<01:41,  1.70s/it]
 87%|████████▋ | 410/469 [12:22<01:45,  1.79s/it]
 88%|████████▊ | 411/469 [12:24<01:40,  1.74s/it]
 88%|████████▊ | 412/469 [12:26<01:41,  1.78s/it]
 88%|████████▊ | 413/469 [12:28<01:39,  1.78s/it]
 88%|████████▊ | 414/469 [12:29<01:35,  1.74s/it]
 88%|████████▊ | 415/469 [12:31<01:38,  1.83s/it]
 89%|████████▊ | 416/469 [12:33<01:36,  1.82s/it]
 89%|████████▉ | 417/469 [12:35<01:31,  1.76s/it]
 89%|████████▉ | 418/469 [12:36<01:31,  1.79s/it]
 89%|████████▉ | 419/469 [12:38<01:26,  1.74s/it]
 90%|████████▉ | 420/469 [12:40<01:25,  1.75s/it]
 90%|████████▉ | 421/469 [12:42<01:23,  1.75s/it]
 90%|████████▉ | 422/469 [12:43<01:20,  1.71s/it]
 90%|█████████ | 423/469 [12:45<01:20,  1.74s/it]
 90%|█████████ | 424/469 [12:47<01:18,  1.74s/it]
 91%|█████████ | 425/469 [12:49<01:17,  1.75s/it]
 91%|█████████ | 426/469 [12:50<01:15,  1.75s/it]
 91%|█████████ | 427/469 [12:52<01:14,  1.78s/it]
 91%|█████████▏| 428/469 [12:54<01:12,  1.77s/it]
 91%|█████████▏| 429/469 [12:56<01:09,  1.75s/it]
 92%|█████████▏| 430/469 [12:57<01:07,  1.72s/it]
 92%|█████████▏| 431/469 [12:59<01:04,  1.70s/it]
 92%|█████████▏| 432/469 [13:01<01:04,  1.74s/it]
 92%|█████████▏| 433/469 [13:03<01:07,  1.88s/it]
 93%|█████████▎| 434/469 [13:05<01:04,  1.86s/it]
 93%|█████████▎| 435/469 [13:07<01:03,  1.87s/it]
 93%|█████████▎| 436/469 [13:08<00:59,  1.81s/it]
 93%|█████████▎| 437/469 [13:10<00:57,  1.81s/it]
 93%|█████████▎| 438/469 [13:12<00:57,  1.86s/it]
 94%|█████████▎| 439/469 [13:14<00:54,  1.82s/it]
 94%|█████████▍| 440/469 [13:16<00:52,  1.82s/it]
 94%|█████████▍| 441/469 [13:18<00:51,  1.84s/it]
 94%|█████████▍| 442/469 [13:19<00:48,  1.80s/it]
 94%|█████████▍| 443/469 [13:21<00:48,  1.86s/it]
 95%|█████████▍| 444/469 [13:23<00:45,  1.80s/it]
 95%|█████████▍| 445/469 [13:25<00:42,  1.79s/it]
 95%|█████████▌| 446/469 [13:27<00:42,  1.85s/it]
 95%|█████████▌| 447/469 [13:29<00:41,  1.86s/it]
 96%|█████████▌| 448/469 [13:30<00:39,  1.88s/it]
 96%|█████████▌| 449/469 [13:32<00:37,  1.86s/it]
 96%|█████████▌| 450/469 [13:34<00:33,  1.79s/it]
 96%|█████████▌| 451/469 [13:36<00:31,  1.74s/it]
 96%|█████████▋| 452/469 [13:37<00:29,  1.73s/it]
 97%|█████████▋| 453/469 [13:39<00:28,  1.76s/it]
 97%|█████████▋| 454/469 [13:41<00:26,  1.74s/it]
 97%|█████████▋| 455/469 [13:42<00:23,  1.70s/it]
 97%|█████████▋| 456/469 [13:44<00:22,  1.71s/it]
 97%|█████████▋| 457/469 [13:46<00:20,  1.71s/it]
 98%|█████████▊| 458/469 [13:48<00:18,  1.71s/it]
 98%|█████████▊| 459/469 [13:49<00:16,  1.69s/it]
 98%|█████████▊| 460/469 [13:51<00:15,  1.68s/it]
 98%|█████████▊| 461/469 [13:53<00:14,  1.76s/it]
 99%|█████████▊| 462/469 [13:55<00:12,  1.78s/it]
 99%|█████████▊| 463/469 [13:57<00:10,  1.83s/it]
 99%|█████████▉| 464/469 [13:58<00:09,  1.86s/it]
 99%|█████████▉| 465/469 [14:00<00:07,  1.81s/it]
 99%|█████████▉| 466/469 [14:02<00:05,  1.78s/it]

+2026-03-21 21:11:32.431141: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
+2026-03-21 21:11:44.041066: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
+To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
+2026-03-21 21:11:44.095212: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
+2026-03-21 21:11:44.095324: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: d700126ec97dc07d69688b0430c49a6a-taskrole1-0
+2026-03-21 21:11:44.095377: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: d700126ec97dc07d69688b0430c49a6a-taskrole1-0
+2026-03-21 21:11:44.095515: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: NOT_FOUND: was unable to find libcuda.so DSO loaded into this program
+2026-03-21 21:11:44.095581: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 550.127.8
+2026-03-21 21:11:45.185895: W tensorflow/core/framework/op_def_util.cc:371] Op BatchNormWithGlobalNormalization is deprecated. It will cease to work in GraphDef version 9. Use tf.nn.batch_normalization().
+warming up TensorFlow...
  0%|          | 0/1 [00:00<?, ?it/s]2026-03-21 21:11:45.919794: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:354] MLIR V1 optimization pass is not enabled
+computing reference batch activations...
  0%|          | 0/211 [00:00<?, ?it/s]
  0%|          | 1/211 [00:02<07:04,  2.02s/it]
  1%|          | 2/211 [00:03<06:56,  2.00s/it]
  1%|▏         | 3/211 [00:05<06:50,  1.98s/it]
  2%|▏         | 4/211 [00:07<06:42,  1.95s/it]
  2%|▏         | 5/211 [00:09<06:25,  1.87s/it]
  3%|▎         | 6/211 [00:11<06:20,  1.85s/it]
  3%|▎         | 7/211 [00:13<06:46,  1.99s/it]
  4%|▍         | 8/211 [00:15<06:56,  2.05s/it]
  4%|▍         | 9/211 [00:30<20:28,  6.08s/it]
  5%|▍         | 10/211 [00:32<15:51,  4.73s/it]
  5%|▌         | 11/211 [00:34<12:42,  3.81s/it]
  6%|▌         | 12/211 [00:35<10:26,  3.15s/it]
  6%|▌         | 13/211 [00:37<08:55,  2.71s/it]
  7%|▋         | 14/211 [00:39<08:09,  2.49s/it]
  7%|▋         | 15/211 [00:41<07:32,  2.31s/it]
  8%|▊         | 16/211 [00:43<06:52,  2.12s/it]
  8%|▊         | 17/211 [00:44<06:35,  2.04s/it]
  9%|▊         | 18/211 [00:46<06:18,  1.96s/it]
  9%|▉         | 19/211 [00:48<06:16,  1.96s/it]
  9%|▉         | 20/211 [00:50<06:20,  1.99s/it]
 10%|▉         | 21/211 [00:52<06:01,  1.91s/it]
 10%|█         | 22/211 [00:54<05:51,  1.86s/it]
 11%|█         | 23/211 [00:56<05:48,  1.85s/it]
 11%|█▏        | 24/211 [00:57<05:44,  1.84s/it]
 12%|█▏        | 25/211 [00:59<05:44,  1.85s/it]
 12%|█▏        | 26/211 [01:01<05:38,  1.83s/it]
 13%|█▎        | 27/211 [01:03<05:33,  1.81s/it]
 13%|█▎        | 28/211 [01:05<06:02,  1.98s/it]
 14%|█▎        | 29/211 [01:07<05:49,  1.92s/it]
 14%|█▍        | 30/211 [01:09<05:33,  1.84s/it]
 15%|█▍        | 31/211 [01:11<05:38,  1.88s/it]
 15%|█▌        | 32/211 [01:12<05:32,  1.86s/it]
 16%|█▌        | 33/211 [01:14<05:26,  1.83s/it]
 16%|█▌        | 34/211 [01:16<05:20,  1.81s/it]
 17%|█▋        | 35/211 [01:18<05:16,  1.80s/it]
 17%|█▋        | 36/211 [01:19<05:11,  1.78s/it]
 18%|█▊        | 37/211 [01:21<05:06,  1.76s/it]
 18%|█▊        | 38/211 [01:23<05:22,  1.86s/it]
 18%|█▊        | 39/211 [01:25<05:12,  1.82s/it]
 19%|█▉        | 40/211 [01:35<11:51,  4.16s/it]
 19%|█▉        | 41/211 [01:36<09:45,  3.44s/it]
 20%|█▉        | 42/211 [01:38<08:27,  3.00s/it]
 20%|██        | 43/211 [01:40<07:25,  2.65s/it]
 21%|██        | 44/211 [01:42<06:46,  2.43s/it]
 21%|██▏       | 45/211 [01:44<06:28,  2.34s/it]
 22%|██▏       | 46/211 [01:46<05:56,  2.16s/it]
 22%|██▏       | 47/211 [01:48<05:55,  2.17s/it]
 23%|██▎       | 48/211 [01:50<05:30,  2.03s/it]
 23%|██▎       | 49/211 [01:52<05:22,  1.99s/it]
 24%|██▎       | 50/211 [01:54<05:11,  1.94s/it]
 24%|██▍       | 51/211 [01:56<05:18,  1.99s/it]
 25%|██▍       | 52/211 [01:57<05:02,  1.90s/it]
 25%|██▌       | 53/211 [01:59<04:52,  1.85s/it]
 26%|██▌       | 54/211 [02:01<04:42,  1.80s/it]
 26%|██▌       | 55/211 [02:03<04:40,  1.80s/it]
 27%|██▋       | 56/211 [02:05<04:47,  1.85s/it]
 27%|██▋       | 57/211 [02:06<04:38,  1.81s/it]
 27%|██▋       | 58/211 [02:08<04:40,  1.83s/it]
 28%|██▊       | 59/211 [02:10<04:34,  1.81s/it]
 28%|██▊       | 60/211 [02:12<04:31,  1.80s/it]
 29%|██▉       | 61/211 [02:13<04:24,  1.76s/it]
 29%|██▉       | 62/211 [02:15<04:19,  1.74s/it]
 30%|██▉       | 63/211 [02:17<04:25,  1.79s/it]
 30%|███       | 64/211 [02:19<04:22,  1.78s/it]
 31%|███       | 65/211 [02:21<04:21,  1.79s/it]
 31%|███▏      | 66/211 [02:22<04:16,  1.77s/it]
 32%|███▏      | 67/211 [02:24<04:27,  1.85s/it]
 32%|███▏      | 68/211 [02:26<04:23,  1.84s/it]
 33%|███▎      | 69/211 [02:28<04:21,  1.84s/it]
 33%|███▎      | 70/211 [02:30<04:15,  1.81s/it]
 34%|███▎      | 71/211 [02:32<04:13,  1.81s/it]
 34%|███▍      | 72/211 [02:33<04:09,  1.79s/it]
 35%|███▍      | 73/211 [02:35<04:04,  1.77s/it]
 35%|███▌      | 74/211 [02:53<15:24,  6.75s/it]
 36%|███▌      | 75/211 [02:55<11:58,  5.28s/it]
 36%|███▌      | 76/211 [02:57<09:31,  4.24s/it]
 36%|███▋      | 77/211 [02:59<07:46,  3.48s/it]
 37%|███▋      | 78/211 [03:00<06:33,  2.96s/it]
 37%|███▋      | 79/211 [03:02<05:47,  2.64s/it]
 38%|███▊      | 80/211 [03:04<05:19,  2.44s/it]
 38%|███▊      | 81/211 [03:06<04:47,  2.22s/it]
 39%|███▉      | 82/211 [03:08<04:27,  2.07s/it]
 39%|███▉      | 83/211 [03:09<04:12,  1.97s/it]
 40%|███▉      | 84/211 [03:11<04:02,  1.91s/it]
 40%|████      | 85/211 [03:13<03:53,  1.85s/it]
 41%|████      | 86/211 [03:15<03:48,  1.83s/it]
 41%|████      | 87/211 [03:16<03:42,  1.79s/it]
 42%|████▏     | 88/211 [03:18<03:40,  1.79s/it]
 42%|████▏     | 89/211 [03:20<03:35,  1.77s/it]
 43%|████▎     | 90/211 [03:22<03:33,  1.77s/it]
 43%|████▎     | 91/211 [03:23<03:30,  1.76s/it]
 44%|████▎     | 92/211 [03:25<03:35,  1.81s/it]
 44%|████▍     | 93/211 [03:27<03:28,  1.76s/it]
 45%|████▍     | 94/211 [03:29<03:25,  1.75s/it]
 45%|████▌     | 95/211 [03:31<03:22,  1.74s/it]
 45%|████▌     | 96/211 [03:32<03:24,  1.77s/it]
 46%|████▌     | 97/211 [03:34<03:21,  1.77s/it]
 46%|████▋     | 98/211 [03:36<03:19,  1.77s/it]
 47%|████▋     | 99/211 [03:38<03:18,  1.77s/it]
 47%|████▋     | 100/211 [03:39<03:14,  1.75s/it]
 48%|████▊     | 101/211 [03:41<03:13,  1.75s/it]
 48%|████▊     | 102/211 [03:43<03:13,  1.77s/it]
 49%|████▉     | 103/211 [03:45<03:08,  1.75s/it]
 49%|████▉     | 104/211 [03:47<03:12,  1.79s/it]
 50%|████▉     | 105/211 [03:48<03:07,  1.77s/it]
 50%|█████     | 106/211 [04:10<13:22,  7.64s/it]
 51%|█████     | 107/211 [04:12<10:18,  5.94s/it]
 51%|█████     | 108/211 [04:13<08:02,  4.68s/it]
 52%|█████▏    | 109/211 [04:15<06:26,  3.79s/it]
 52%|█████▏    | 110/211 [04:17<05:18,  3.16s/it]
 53%|█████▎    | 111/211 [04:18<04:33,  2.74s/it]
 53%|█████▎    | 112/211 [04:20<04:09,  2.52s/it]
 54%|█████▎    | 113/211 [04:22<03:50,  2.35s/it]
 54%|█████▍    | 114/211 [04:24<03:30,  2.17s/it]
 55%|█████▍    | 115/211 [04:26<03:22,  2.11s/it]
 55%|█████▍    | 116/211 [04:46<11:49,  7.47s/it]
 55%|█████▌    | 117/211 [04:48<08:58,  5.73s/it]
 56%|█████▌    | 118/211 [04:50<07:08,  4.61s/it]
 56%|█████▋    | 119/211 [04:51<05:41,  3.71s/it]
 57%|█████▋    | 120/211 [04:53<04:42,  3.10s/it]
 57%|█████▋    | 121/211 [04:55<04:01,  2.69s/it]
 58%|█████▊    | 122/211 [04:57<03:33,  2.40s/it]
 58%|█████▊    | 123/211 [04:58<03:13,  2.20s/it]
 59%|█████▉    | 124/211 [05:00<03:01,  2.08s/it]
 59%|█████▉    | 125/211 [05:02<02:50,  1.98s/it]
 60%|█████▉    | 126/211 [05:03<02:38,  1.87s/it]
 60%|██████    | 127/211 [05:05<02:31,  1.80s/it]
 61%|██████    | 128/211 [05:07<02:25,  1.75s/it]
 61%|██████    | 129/211 [05:09<02:29,  1.82s/it]
 62%|██████▏   | 130/211 [05:10<02:24,  1.79s/it]
 62%|██████▏   | 131/211 [05:12<02:20,  1.76s/it]
 63%|██████▎   | 132/211 [05:14<02:17,  1.74s/it]
 63%|██████▎   | 133/211 [05:15<02:14,  1.73s/it]
 64%|██████▎   | 134/211 [05:18<02:23,  1.87s/it]
 64%|██████▍   | 135/211 [05:20<02:22,  1.87s/it]
 64%|██████▍   | 136/211 [05:21<02:15,  1.80s/it]
 65%|██████▍   | 137/211 [05:23<02:13,  1.81s/it]
 65%|██████▌   | 138/211 [05:25<02:10,  1.79s/it]
 66%|██████▌   | 139/211 [05:26<02:06,  1.75s/it]
 66%|██████▋   | 140/211 [05:28<02:03,  1.73s/it]
 67%|██████▋   | 141/211 [05:30<02:04,  1.78s/it]
 67%|██████▋   | 142/211 [05:32<02:05,  1.82s/it]
 68%|██████▊   | 143/211 [05:34<02:04,  1.83s/it]
 68%|██��███▊   | 144/211 [05:35<01:59,  1.78s/it]
 69%|██████▊   | 145/211 [05:37<01:57,  1.78s/it]
 69%|██████▉   | 146/211 [05:55<07:10,  6.63s/it]
 70%|██████▉   | 147/211 [05:57<05:30,  5.17s/it]
 70%|███████   | 148/211 [05:59<04:23,  4.17s/it]
 71%|███████   | 149/211 [06:00<03:31,  3.42s/it]
 71%|███████   | 150/211 [06:02<02:58,  2.92s/it]
 72%|███████▏  | 151/211 [06:04<02:33,  2.56s/it]
 72%|███████▏  | 152/211 [06:06<02:15,  2.29s/it]
 73%|███████▎  | 153/211 [06:07<02:02,  2.12s/it]
 73%|███████▎  | 154/211 [06:10<02:15,  2.38s/it]
 73%|███████▎  | 155/211 [06:12<02:00,  2.15s/it]
 74%|███████▍  | 156/211 [06:14<01:50,  2.00s/it]
 74%|███████▍  | 157/211 [06:15<01:43,  1.91s/it]
 75%|███████▍  | 158/211 [06:17<01:38,  1.86s/it]
 75%|███████▌  | 159/211 [06:19<01:37,  1.88s/it]
 76%|███████▌  | 160/211 [06:21<01:32,  1.82s/it]
 76%|███████▋  | 161/211 [06:22<01:29,  1.79s/it]
 77%|███████▋  | 162/211 [06:24<01:31,  1.87s/it]
 77%|███████▋  | 163/211 [06:26<01:27,  1.83s/it]
 78%|███████▊  | 164/211 [06:29<01:36,  2.05s/it]
 78%|███████▊  | 165/211 [06:31<01:32,  2.01s/it]
 79%|███████▊  | 166/211 [06:32<01:27,  1.94s/it]
 79%|███████▉  | 167/211 [06:34<01:21,  1.85s/it]
 80%|███████▉  | 168/211 [06:36<01:17,  1.79s/it]
 80%|████████  | 169/211 [06:38<01:19,  1.89s/it]
 81%|████████  | 170/211 [06:40<01:17,  1.88s/it]
 81%|████████  | 171/211 [06:41<01:14,  1.85s/it]
 82%|████████▏ | 172/211 [06:43<01:11,  1.84s/it]
 82%|████████▏ | 173/211 [06:45<01:08,  1.81s/it]
 82%|████████▏ | 174/211 [06:47<01:07,  1.84s/it]
 83%|████████▎ | 175/211 [06:49<01:04,  1.78s/it]
 83%|████████▎ | 176/211 [06:50<01:02,  1.79s/it]
 84%|████████▍ | 177/211 [06:52<01:00,  1.77s/it]
 84%|████████▍ | 178/211 [06:54<01:03,  1.92s/it]
 85%|████████▍ | 179/211 [06:56<00:59,  1.87s/it]
 85%|████████▌ | 180/211 [06:58<00:56,  1.81s/it]
 86%|████████▌ | 181/211 [06:59<00:53,  1.77s/it]
 86%|████████▋ | 182/211 [07:01<00:53,  1.84s/it]
 87%|████████▋ | 183/211 [07:03<00:50,  1.81s/it]
 87%|████████▋ | 184/211 [07:05<00:49,  1.83s/it]
 88%|████████▊ | 185/211 [07:07<00:48,  1.86s/it]
 88%|████████▊ | 186/211 [07:09<00:45,  1.83s/it]
 89%|████████▊ | 187/211 [07:11<00:43,  1.82s/it]
 89%|████████▉ | 188/211 [07:12<00:42,  1.83s/it]
 90%|████████▉ | 189/211 [07:14<00:39,  1.77s/it]
 90%|█████████ | 190/211 [07:16<00:37,  1.77s/it]
 91%|█████████ | 191/211 [07:17<00:34,  1.73s/it]
 91%|█████████ | 192/211 [07:19<00:32,  1.71s/it]
 91%|█████████▏| 193/211 [07:21<00:30,  1.72s/it]
 92%|█████████▏| 194/211 [07:23<00:29,  1.72s/it]
 92%|█████████▏| 195/211 [07:24<00:27,  1.72s/it]
 93%|█████████▎| 196/211 [07:26<00:25,  1.70s/it]
 93%|█████████▎| 197/211 [07:28<00:24,  1.75s/it]
 94%|█████████▍| 198/211 [07:30<00:23,  1.78s/it]
 94%|█████████▍| 199/211 [07:31<00:21,  1.76s/it]
 95%|█████████▍| 200/211 [07:33<00:19,  1.78s/it]
 95%|█████████▌| 201/211 [07:35<00:17,  1.76s/it]
 96%|█████████▌| 202/211 [07:37<00:16,  1.80s/it]
 96%|█████████▌| 203/211 [07:39<00:14,  1.79s/it]
 97%|█████████▋| 204/211 [07:40<00:12,  1.76s/it]
 97%|█████████▋| 205/211 [07:42<00:10,  1.74s/it]
 98%|█████████▊| 206/211 [07:44<00:08,  1.75s/it]
 98%|█████████▊| 207/211 [07:45<00:06,  1.72s/it]
 99%|█████████▊| 208/211 [07:47<00:05,  1.70s/it]
 99%|█████████▉| 209/211 [07:49<00:03,  1.68s/it]
+computing/reading reference batch statistics...
+computing sample batch activations...
  0%|          | 0/469 [00:00<?, ?it/s]
  0%|          | 1/469 [00:02<18:09,  2.33s/it]
  0%|          | 2/469 [00:04<15:10,  1.95s/it]
  1%|          | 3/469 [00:05<14:14,  1.83s/it]
  1%|          | 4/469 [00:07<13:52,  1.79s/it]
  1%|          | 5/469 [00:09<13:23,  1.73s/it]
  1%|▏         | 6/469 [00:12<17:01,  2.21s/it]
  1%|▏         | 7/469 [00:14<16:26,  2.14s/it]
  2%|▏         | 8/469 [00:15<15:33,  2.02s/it]
  2%|▏         | 9/469 [00:17<15:18,  2.00s/it]
  2%|▏         | 10/469 [00:19<14:43,  1.92s/it]
  2%|▏         | 11/469 [00:21<14:04,  1.84s/it]
  3%|▎         | 12/469 [00:23<14:21,  1.89s/it]
  3%|▎         | 13/469 [00:25<14:16,  1.88s/it]
  3%|▎         | 14/469 [00:27<14:15,  1.88s/it]
  3%|▎         | 15/469 [00:28<14:05,  1.86s/it]
  3%|▎         | 16/469 [00:30<14:24,  1.91s/it]
  4%|▎         | 17/469 [00:32<14:05,  1.87s/it]
  4%|▍         | 18/469 [00:34<14:13,  1.89s/it]
  4%|▍         | 19/469 [00:36<13:44,  1.83s/it]
  4%|▍         | 20/469 [00:37<13:14,  1.77s/it]
  4%|▍         | 21/469 [00:39<13:07,  1.76s/it]
  5%|▍         | 22/469 [00:41<12:51,  1.72s/it]
  5%|▍         | 23/469 [00:43<12:56,  1.74s/it]
  5%|▌         | 24/469 [00:44<12:53,  1.74s/it]
  5%|▌         | 25/469 [00:46<12:43,  1.72s/it]
  6%|▌         | 26/469 [00:48<12:47,  1.73s/it]
  6%|▌         | 27/469 [00:49<12:34,  1.71s/it]
  6%|▌         | 28/469 [00:51<12:53,  1.75s/it]
  6%|▌         | 29/469 [00:53<12:38,  1.72s/it]
  6%|▋         | 30/469 [00:55<12:33,  1.72s/it]
  7%|▋         | 31/469 [00:56<12:17,  1.68s/it]
  7%|▋         | 32/469 [00:58<12:27,  1.71s/it]
  7%|▋         | 33/469 [01:00<12:22,  1.70s/it]
  7%|▋         | 34/469 [01:03<16:09,  2.23s/it]
  7%|▋         | 35/469 [01:05<15:00,  2.08s/it]
  8%|▊         | 36/469 [01:07<14:10,  1.96s/it]
  8%|▊         | 37/469 [01:08<13:48,  1.92s/it]
  8%|▊         | 38/469 [01:10<13:13,  1.84s/it]
  8%|▊         | 39/469 [01:12<13:28,  1.88s/it]
  9%|▊         | 40/469 [01:14<13:04,  1.83s/it]
  9%|▊         | 41/469 [01:16<13:04,  1.83s/it]
  9%|▉         | 42/469 [01:19<15:53,  2.23s/it]
  9%|▉         | 43/469 [01:21<15:36,  2.20s/it]
  9%|▉         | 44/469 [01:23<14:48,  2.09s/it]
 10%|▉         | 45/469 [01:24<13:55,  1.97s/it]
 10%|▉         | 46/469 [01:26<13:20,  1.89s/it]
 10%|█         | 47/469 [01:28<13:24,  1.91s/it]
 10%|█         | 48/469 [01:30<13:15,  1.89s/it]
 10%|█         | 49/469 [01:31<12:39,  1.81s/it]
 11%|█         | 50/469 [01:33<12:26,  1.78s/it]
 11%|█         | 51/469 [01:35<12:47,  1.84s/it]
 11%|█         | 52/469 [01:37<12:28,  1.79s/it]
 11%|█▏        | 53/469 [01:39<12:09,  1.75s/it]
 12%|█▏        | 54/469 [01:40<12:13,  1.77s/it]
 12%|█▏        | 55/469 [01:42<12:03,  1.75s/it]
 12%|█▏        | 56/469 [01:44<12:13,  1.78s/it]
 12%|█▏        | 57/469 [01:46<12:27,  1.81s/it]
 12%|█▏        | 58/469 [01:48<12:53,  1.88s/it]
 13%|█▎        | 59/469 [01:50<12:46,  1.87s/it]
 13%|█▎        | 60/469 [01:51<12:25,  1.82s/it]
 13%|█▎        | 61/469 [01:53<12:01,  1.77s/it]
 13%|█▎        | 62/469 [01:55<11:49,  1.74s/it]
 13%|█▎        | 63/469 [01:56<11:43,  1.73s/it]
 14%|█▎        | 64/469 [01:58<11:37,  1.72s/it]
 14%|█▍        | 65/469 [02:00<11:38,  1.73s/it]
 14%|█▍        | 66/469 [02:02<11:28,  1.71s/it]
 14%|█▍        | 67/469 [02:03<11:30,  1.72s/it]
 14%|█▍        | 68/469 [02:05<11:30,  1.72s/it]
 15%|█▍        | 69/469 [02:07<12:03,  1.81s/it]
 15%|█▍        | 70/469 [02:09<11:46,  1.77s/it]
 15%|█▌        | 71/469 [02:10<11:36,  1.75s/it]
 15%|█▌        | 72/469 [02:12<11:34,  1.75s/it]
 16%|█▌        | 73/469 [02:14<12:02,  1.82s/it]
 16%|█▌        | 74/469 [02:16<11:41,  1.78s/it]
 16%|█▌        | 75/469 [02:18<12:10,  1.85s/it]
 16%|█▌        | 76/469 [02:20<12:04,  1.84s/it]
 16%|█▋        | 77/469 [02:21<11:44,  1.80s/it]
 17%|█▋        | 78/469 [02:23<11:44,  1.80s/it]
 17%|█▋        | 79/469 [02:25<11:44,  1.81s/it]
 17%|█▋        | 80/469 [02:27<11:43,  1.81s/it]
 17%|█▋        | 81/469 [02:28<11:22,  1.76s/it]
 17%|█▋        | 82/469 [02:30<11:10,  1.73s/it]
 18%|█▊        | 83/469 [02:32<11:12,  1.74s/it]
 18%|█▊        | 84/469 [02:34<11:11,  1.75s/it]
 18%|█▊        | 85/469 [02:36<11:32,  1.80s/it]
 18%|█▊        | 86/469 [02:37<11:32,  1.81s/it]
 19%|█▊        | 87/469 [02:39<11:16,  1.77s/it]
 19%|█▉        | 88/469 [02:41<11:07,  1.75s/it]
 19%|█▉        | 89/469 [02:42<11:00,  1.74s/it]
 19%|█▉        | 90/469 [02:44<11:09,  1.77s/it]
 19%|█▉        | 91/469 [02:46<11:15,  1.79s/it]
 20%|█▉        | 92/469 [02:48<11:09,  1.78s/it]
 20%|█▉        | 93/469 [02:50<10:57,  1.75s/it]
 20%|██        | 94/469 [02:51<10:55,  1.75s/it]
 20%|██        | 95/469 [02:53<10:50,  1.74s/it]
 20%|██        | 96/469 [02:55<10:42,  1.72s/it]
 21%|██        | 97/469 [02:56<10:37,  1.71s/it]
 21%|██        | 98/469 [02:58<10:51,  1.76s/it]
 21%|██        | 99/469 [03:00<10:52,  1.76s/it]
 21%|██▏       | 100/469 [03:02<11:08,  1.81s/it]
 22%|██▏       | 101/469 [03:04<10:52,  1.77s/it]
 22%|██▏       | 102/469 [03:05<10:48,  1.77s/it]
 22%|██▏       | 103/469 [03:07<10:35,  1.74s/it]
 22%|██▏       | 104/469 [03:09<10:28,  1.72s/it]
 22%|██▏       | 105/469 [03:11<10:38,  1.75s/it]
 23%|██▎       | 106/469 [03:12<10:29,  1.73s/it]
 23%|██▎       | 107/469 [03:14<10:22,  1.72s/it]
 23%|██▎       | 108/469 [03:16<10:41,  1.78s/it]
 23%|██▎       | 109/469 [03:18<10:29,  1.75s/it]
 23%|██▎       | 110/469 [03:19<10:32,  1.76s/it]
 24%|██▎       | 111/469 [03:21<10:25,  1.75s/it]
 24%|██▍       | 112/469 [03:23<10:24,  1.75s/it]
 24%|██▍       | 113/469 [03:25<10:37,  1.79s/it]
 24%|██▍       | 114/469 [03:26<10:24,  1.76s/it]
 25%|██▍       | 115/469 [03:28<10:27,  1.77s/it]
 25%|██▍       | 116/469 [03:30<10:22,  1.76s/it]
 25%|██▍       | 117/469 [03:32<10:28,  1.79s/it]
 25%|██▌       | 118/469 [03:34<10:52,  1.86s/it]
 25%|██▌       | 119/469 [03:36<11:00,  1.89s/it]
 26%|██▌       | 120/469 [03:37<10:42,  1.84s/it]
 26%|██▌       | 121/469 [03:39<10:37,  1.83s/it]
 26%|██▌       | 122/469 [03:41<10:17,  1.78s/it]
 26%|██▌       | 123/469 [03:43<10:21,  1.80s/it]
 26%|██▋       | 124/469 [03:45<10:14,  1.78s/it]
 27%|██▋       | 125/469 [03:46<10:15,  1.79s/it]
 27%|██▋       | 126/469 [03:48<10:12,  1.79s/it]
 27%|██▋       | 127/469 [03:50<09:59,  1.75s/it]
 27%|██▋       | 128/469 [03:51<09:49,  1.73s/it]
 28%|██▊       | 129/469 [03:53<09:53,  1.75s/it]
 28%|██▊       | 130/469 [03:55<09:42,  1.72s/it]
 28%|██▊       | 131/469 [03:57<09:37,  1.71s/it]
 28%|██▊       | 132/469 [03:58<09:49,  1.75s/it]
 28%|██▊       | 133/469 [04:00<09:59,  1.78s/it]
 29%|██▊       | 134/469 [04:02<10:22,  1.86s/it]
 29%|██▉       | 135/469 [04:04<10:20,  1.86s/it]
 29%|██▉       | 136/469 [04:06<10:26,  1.88s/it]
 29%|██▉       | 137/469 [04:08<10:02,  1.81s/it]
 29%|██▉       | 138/469 [04:10<10:16,  1.86s/it]
 30%|██▉       | 139/469 [04:11<09:58,  1.81s/it]
 30%|██▉       | 140/469 [04:13<09:59,  1.82s/it]
 30%|███       | 141/469 [04:15<09:40,  1.77s/it]
 30%|███       | 142/469 [04:17<09:28,  1.74s/it]
 30%|███       | 143/469 [04:18<09:16,  1.71s/it]
 31%|███       | 144/469 [04:20<09:13,  1.70s/it]
 31%|███       | 145/469 [04:22<09:28,  1.75s/it]
 31%|███       | 146/469 [04:24<09:18,  1.73s/it]
 31%|███▏      | 147/469 [04:25<09:35,  1.79s/it]
 32%|███▏      | 148/469 [04:27<09:28,  1.77s/it]
 32%|███▏      | 149/469 [04:29<09:12,  1.73s/it]
 32%|███▏      | 150/469 [04:31<09:11,  1.73s/it]
 32%|███▏      | 151/469 [04:32<09:18,  1.76s/it]
 32%|███▏      | 152/469 [04:34<09:23,  1.78s/it]
 33%|███▎      | 153/469 [04:36<09:21,  1.78s/it]
 33%|███▎      | 154/469 [04:38<09:11,  1.75s/it]
 33%|███▎      | 155/469 [04:39<09:09,  1.75s/it]
 33%|███▎      | 156/469 [04:42<10:07,  1.94s/it]
 33%|███▎      | 157/469 [04:44<10:39,  2.05s/it]
 34%|███▎      | 158/469 [04:46<10:12,  1.97s/it]
 34%|███▍      | 159/469 [04:48<09:42,  1.88s/it]
 34%|███▍      | 160/469 [04:49<09:42,  1.88s/it]
 34%|███▍      | 161/469 [04:51<09:24,  1.83s/it]
 35%|███▍      | 162/469 [04:53<09:08,  1.79s/it]
 35%|███▍      | 163/469 [04:55<09:01,  1.77s/it]
 35%|███▍      | 164/469 [04:56<09:08,  1.80s/it]
 35%|███▌      | 165/469 [04:58<09:07,  1.80s/it]
 35%|███▌      | 166/469 [05:00<09:06,  1.80s/it]
 36%|███▌      | 167/469 [05:02<09:00,  1.79s/it]
 36%|███▌      | 168/469 [05:03<08:47,  1.75s/it]
 36%|███▌      | 169/469 [05:05<08:51,  1.77s/it]
 36%|███▌      | 170/469 [05:07<08:35,  1.73s/it]
 36%|███▋      | 171/469 [05:09<08:25,  1.70s/it]
 37%|███▋      | 172/469 [05:10<08:24,  1.70s/it]
 37%|███▋      | 173/469 [05:12<08:20,  1.69s/it]
 37%|███▋      | 174/469 [05:14<08:20,  1.70s/it]
 37%|███▋      | 175/469 [05:16<08:42,  1.78s/it]
 38%|███▊      | 176/469 [05:17<08:34,  1.76s/it]
 38%|███▊      | 177/469 [05:19<08:31,  1.75s/it]
 38%|███▊      | 178/469 [05:21<08:34,  1.77s/it]
 38%|███▊      | 179/469 [05:22<08:22,  1.73s/it]
 38%|███▊      | 180/469 [05:24<08:22,  1.74s/it]
 39%|███▊      | 181/469 [05:26<08:14,  1.72s/it]
 39%|███▉      | 182/469 [05:28<08:19,  1.74s/it]
 39%|███▉      | 183/469 [05:29<08:17,  1.74s/it]
 39%|███▉      | 184/469 [05:31<08:29,  1.79s/it]
 39%|███▉      | 185/469 [05:33<08:36,  1.82s/it]
 40%|███▉      | 186/469 [05:35<08:36,  1.82s/it]
 40%|███▉      | 187/469 [05:37<08:29,  1.81s/it]
 40%|████      | 188/469 [05:39<08:45,  1.87s/it]
 40%|████      | 189/469 [05:41<08:29,  1.82s/it]
 41%|████      | 190/469 [05:42<08:28,  1.82s/it]
 41%|████      | 191/469 [05:44<08:18,  1.79s/it]
 41%|████      | 192/469 [05:46<08:10,  1.77s/it]
 41%|████      | 193/469 [05:47<07:58,  1.73s/it]
 41%|████▏     | 194/469 [05:49<07:52,  1.72s/it]
 42%|████▏     | 195/469 [05:51<08:08,  1.78s/it]
 42%|████▏     | 196/469 [05:53<08:19,  1.83s/it]
 42%|████▏     | 197/469 [05:55<08:35,  1.90s/it]
 42%|████▏     | 198/469 [05:57<08:28,  1.88s/it]
 42%|████▏     | 199/469 [05:59<08:17,  1.84s/it]
 43%|████▎     | 200/469 [06:01<08:22,  1.87s/it]
 43%|████▎     | 201/469 [06:02<08:09,  1.83s/it]
 43%|████▎     | 202/469 [06:04<08:08,  1.83s/it]
 43%|████▎     | 203/469 [06:06<08:21,  1.88s/it]
 43%|████▎     | 204/469 [06:08<07:59,  1.81s/it]
 44%|████▎     | 205/469 [06:09<07:46,  1.77s/it]
 44%|████▍     | 206/469 [06:11<07:38,  1.74s/it]
 44%|████▍     | 207/469 [06:13<07:27,  1.71s/it]
 44%|████▍     | 208/469 [06:15<07:40,  1.76s/it]
 45%|████▍     | 209/469 [06:16<07:35,  1.75s/it]
 45%|████▍     | 210/469 [06:18<07:30,  1.74s/it]
 45%|████▍     | 211/469 [06:20<07:35,  1.77s/it]
 45%|████▌     | 212/469 [06:22<07:32,  1.76s/it]
 45%|████▌     | 213/469 [06:23<07:29,  1.76s/it]
 46%|████▌     | 214/469 [06:25<07:38,  1.80s/it]
 46%|████▌     | 215/469 [06:27<07:34,  1.79s/it]
 46%|████▌     | 216/469 [06:29<07:28,  1.77s/it]
 46%|████▋     | 217/469 [06:30<07:20,  1.75s/it]
 46%|████▋     | 218/469 [06:32<07:08,  1.71s/it]
 47%|████▋     | 219/469 [06:34<07:04,  1.70s/it]
 47%|████▋     | 220/469 [06:35<07:02,  1.70s/it]
 47%|████▋     | 221/469 [06:38<07:54,  1.91s/it]
 47%|████▋     | 222/469 [06:40<07:49,  1.90s/it]
 48%|████▊     | 223/469 [06:42<07:37,  1.86s/it]
 48%|████▊     | 224/469 [06:43<07:18,  1.79s/it]
 48%|████▊     | 225/469 [06:45<07:42,  1.90s/it]
 48%|████▊     | 226/469 [06:48<09:05,  2.25s/it]
 48%|████▊     | 227/469 [06:50<08:33,  2.12s/it]
 49%|████▊     | 228/469 [06:52<07:55,  1.97s/it]
 49%|████▉     | 229/469 [06:54<07:40,  1.92s/it]
 49%|████▉     | 230/469 [06:55<07:33,  1.90s/it]
 49%|████▉     | 231/469 [06:57<07:17,  1.84s/it]
 49%|████▉     | 232/469 [06:59<07:05,  1.79s/it]
 50%|████▉     | 233/469 [07:01<07:03,  1.80s/it]
 50%|████▉     | 234/469 [07:02<06:59,  1.78s/it]
 50%|█████     | 235/469 [07:05<08:13,  2.11s/it]
 50%|█████     | 236/469 [07:07<07:39,  1.97s/it]
 51%|█████     | 237/469 [07:09<07:15,  1.88s/it]
 51%|█████     | 238/469 [07:10<07:11,  1.87s/it]
 51%|█████     | 239/469 [07:12<06:55,  1.81s/it]
 51%|█████     | 240/469 [07:14<07:03,  1.85s/it]
 51%|█████▏    | 241/469 [07:16<06:54,  1.82s/it]
 52%|█████▏    | 242/469 [07:17<06:41,  1.77s/it]
 52%|█████▏    | 243/469 [07:19<06:33,  1.74s/it]
 52%|█████▏    | 244/469 [07:21<06:28,  1.73s/it]
 52%|█████▏    | 245/469 [07:24<07:39,  2.05s/it]
 52%|█████▏    | 246/469 [07:25<07:19,  1.97s/it]
 53%|█████▎    | 247/469 [07:27<07:08,  1.93s/it]
 53%|█████▎    | 248/469 [07:29<06:57,  1.89s/it]
 53%|█████▎    | 249/469 [07:31<06:40,  1.82s/it]
 53%|█████▎    | 250/469 [07:33<06:38,  1.82s/it]
 54%|█████▎    | 251/469 [07:34<06:25,  1.77s/it]
 54%|█████▎    | 252/469 [07:36<06:15,  1.73s/it]
 54%|█████▍    | 253/469 [07:37<06:09,  1.71s/it]
 54%|█████▍    | 254/469 [07:39<06:05,  1.70s/it]
 54%|█████▍    | 255/469 [07:41<06:02,  1.70s/it]
 55%|█████▍    | 256/469 [07:42<05:58,  1.68s/it]
 55%|█████▍    | 257/469 [07:44<06:01,  1.71s/it]
 55%|█████▌    | 258/469 [07:47<06:54,  1.96s/it]
 55%|█████▌    | 259/469 [07:49<06:35,  1.88s/it]
 55%|█████▌    | 260/469 [07:52<08:26,  2.42s/it]
 56%|█████▌    | 261/469 [07:54<07:35,  2.19s/it]
 56%|█████▌    | 262/469 [07:56<07:07,  2.07s/it]
 56%|█████▌    | 263/469 [07:57<06:47,  1.98s/it]
 56%|█████▋    | 264/469 [07:59<06:25,  1.88s/it]
 57%|█████▋    | 265/469 [08:01<06:21,  1.87s/it]
 57%|█████▋    | 266/469 [08:03<06:20,  1.87s/it]
 57%|█████▋    | 267/469 [08:04<06:08,  1.82s/it]
 57%|█████▋    | 268/469 [08:06<06:07,  1.83s/it]
 57%|█████▋    | 269/469 [08:08<05:54,  1.77s/it]
 58%|█████▊    | 270/469 [08:10<05:59,  1.80s/it]
 58%|█████▊    | 271/469 [08:12<05:51,  1.77s/it]
 58%|█████▊    | 272/469 [08:13<05:58,  1.82s/it]
 58%|█████▊    | 273/469 [08:15<06:06,  1.87s/it]
 58%|█████▊    | 274/469 [08:17<05:52,  1.81s/it]
 59%|█████▊    | 275/469 [08:19<05:50,  1.81s/it]
 59%|█████▉    | 276/469 [08:21<05:42,  1.77s/it]
 59%|█████▉    | 277/469 [08:22<05:37,  1.76s/it]
 59%|█████▉    | 278/469 [08:24<05:34,  1.75s/it]
 59%|█████▉    | 279/469 [08:26<05:38,  1.78s/it]
 60%|█████▉    | 280/469 [08:28<05:38,  1.79s/it]
 60%|█████▉    | 281/469 [08:30<05:39,  1.81s/it]
 60%|██████    | 282/469 [08:31<05:38,  1.81s/it]
 60%|██████    | 283/469 [08:33<05:30,  1.78s/it]
 61%|██████    | 284/469 [08:35<05:26,  1.77s/it]
 61%|██████    | 285/469 [08:36<05:17,  1.73s/it]
 61%|██████    | 286/469 [08:38<05:14,  1.72s/it]
 61%|██████    | 287/469 [08:40<05:10,  1.71s/it]
 61%|██████▏   | 288/469 [08:41<05:05,  1.69s/it]
 62%|██████▏   | 289/469 [08:43<05:04,  1.69s/it]
 62%|██████▏   | 290/469 [08:45<05:11,  1.74s/it]
 62%|██████▏   | 291/469 [08:47<05:11,  1.75s/it]
 62%|██████▏   | 292/469 [08:48<05:04,  1.72s/it]
 62%|██████▏   | 293/469 [08:50<05:04,  1.73s/it]
 63%|██████▎   | 294/469 [08:52<05:19,  1.83s/it]
 63%|██████▎   | 295/469 [08:54<05:14,  1.81s/it]
 63%|██████▎   | 296/469 [08:56<05:04,  1.76s/it]
 63%|██████▎   | 297/469 [08:57<04:58,  1.74s/it]
 64%|██████▎   | 298/469 [08:59<05:01,  1.76s/it]
 64%|██████▍   | 299/469 [09:01<04:56,  1.74s/it]
 64%|██████▍   | 300/469 [09:03<04:52,  1.73s/it]
 64%|██████▍   | 301/469 [09:04<04:48,  1.72s/it]
 64%|██████▍   | 302/469 [09:06<04:43,  1.70s/it]
 65%|██████▍   | 303/469 [09:08<04:50,  1.75s/it]
 65%|██████▍   | 304/469 [09:10<04:46,  1.74s/it]
 65%|██████▌   | 305/469 [09:11<04:47,  1.75s/it]
 65%|██████▌   | 306/469 [09:13<04:42,  1.73s/it]
 65%|██████▌   | 307/469 [09:15<04:58,  1.84s/it]
 66%|██████▌   | 308/469 [09:17<04:50,  1.81s/it]
 66%|██████▌   | 309/469 [09:19<04:46,  1.79s/it]
 66%|██████▌   | 310/469 [09:20<04:39,  1.76s/it]
 66%|██████▋   | 311/469 [09:22<04:34,  1.74s/it]
 67%|██████▋   | 312/469 [09:24<04:36,  1.76s/it]
 67%|██████▋   | 313/469 [09:26<04:52,  1.88s/it]
 67%|██████▋   | 314/469 [09:28<04:55,  1.91s/it]
 67%|██████▋   | 315/469 [09:30<04:43,  1.84s/it]
 67%|██████▋   | 316/469 [09:31<04:36,  1.81s/it]
 68%|██████▊   | 317/469 [09:33<04:42,  1.86s/it]
 68%|██████▊   | 318/469 [09:36<05:38,  2.24s/it]
 68%|██████▊   | 319/469 [09:38<05:18,  2.13s/it]
 68%|██████▊   | 320/469 [09:40<04:59,  2.01s/it]
 68%|██████▊   | 321/469 [09:42<04:49,  1.95s/it]
 69%|██████▊   | 322/469 [09:43<04:32,  1.86s/it]
 69%|██████▉   | 323/469 [09:47<05:51,  2.41s/it]
 69%|██████▉   | 324/469 [09:49<05:17,  2.19s/it]
 69%|██████▉   | 325/469 [09:51<04:58,  2.08s/it]
 70%|██████▉   | 326/469 [09:52<04:42,  1.97s/it]
 70%|██████▉   | 327/469 [09:54<04:29,  1.89s/it]
 70%|██████▉   | 328/469 [09:56<04:17,  1.82s/it]
 70%|███████   | 329/469 [09:58<04:12,  1.80s/it]
 70%|███████   | 330/469 [09:59<04:07,  1.78s/it]
 71%|███████   | 331/469 [10:01<04:12,  1.83s/it]
 71%|███████   | 332/469 [10:03<04:06,  1.80s/it]
 71%|███████   | 333/469 [10:05<03:57,  1.75s/it]
 71%|███████   | 334/469 [10:06<03:51,  1.71s/it]
 71%|███████▏  | 335/469 [10:08<03:47,  1.70s/it]
 72%|███████▏  | 336/469 [10:09<03:44,  1.69s/it]
 72%|███████▏  | 337/469 [10:11<03:42,  1.68s/it]
 72%|███████▏  | 338/469 [10:13<03:45,  1.72s/it]
 72%|███████▏  | 339/469 [10:15<03:41,  1.70s/it]
 72%|███████▏  | 340/469 [10:16<03:38,  1.70s/it]
 73%|███████▎  | 341/469 [10:18<03:37,  1.70s/it]
 73%|███████▎  | 342/469 [10:20<03:47,  1.79s/it]
 73%|███████▎  | 343/469 [10:22<03:42,  1.77s/it]
 73%|███████▎  | 344/469 [10:24<03:40,  1.77s/it]
 74%|███████▎  | 345/469 [10:25<03:39,  1.77s/it]
 74%|███████▍  | 346/469 [10:27<03:36,  1.76s/it]
 74%|███████▍  | 347/469 [10:29<03:31,  1.73s/it]
 74%|███████▍  | 348/469 [10:31<03:35,  1.78s/it]
 74%|███████▍  | 349/469 [10:32<03:34,  1.78s/it]
 75%|███████▍  | 350/469 [10:34<03:30,  1.76s/it]
 75%|███████▍  | 351/469 [10:36<03:23,  1.72s/it]
 75%|███████▌  | 352/469 [10:38<03:33,  1.82s/it]
 75%|███████▌  | 353/469 [10:40<03:29,  1.81s/it]
 75%|███████▌  | 354/469 [10:41<03:27,  1.80s/it]
 76%|███████▌  | 355/469 [10:43<03:21,  1.77s/it]
 76%|███████▌  | 356/469 [10:45<03:27,  1.83s/it]
 76%|███████▌  | 357/469 [10:47<03:18,  1.77s/it]
 76%|███████▋  | 358/469 [10:48<03:14,  1.75s/it]
 77%|██��████▋  | 359/469 [10:50<03:09,  1.73s/it]
 77%|███████▋  | 360/469 [10:52<03:07,  1.72s/it]
 77%|███████▋  | 361/469 [10:53<03:06,  1.72s/it]
 77%|███████▋  | 362/469 [10:55<03:07,  1.75s/it]
 77%|███████▋  | 363/469 [10:57<03:10,  1.79s/it]
 78%|███████▊  | 364/469 [10:59<03:06,  1.77s/it]
 78%|███████▊  | 365/469 [11:01<03:00,  1.73s/it]
 78%|███████▊  | 366/469 [11:02<02:55,  1.71s/it]
 78%|███████▊  | 367/469 [11:04<02:53,  1.70s/it]
 78%|███████▊  | 368/469 [11:06<02:53,  1.72s/it]
 79%|███████▊  | 369/469 [11:07<02:51,  1.71s/it]
 79%|███████▉  | 370/469 [11:09<02:48,  1.70s/it]
 79%|███████▉  | 371/469 [11:11<02:48,  1.71s/it]
 79%|███████▉  | 372/469 [11:13<02:51,  1.77s/it]
 80%|███████▉  | 373/469 [11:14<02:46,  1.73s/it]
 80%|███████▉  | 374/469 [11:16<02:45,  1.75s/it]
 80%|███████▉  | 375/469 [11:18<02:43,  1.74s/it]
 80%|████████  | 376/469 [11:20<02:42,  1.74s/it]
 80%|████████  | 377/469 [11:21<02:39,  1.74s/it]
 81%|████████  | 378/469 [11:23<02:36,  1.72s/it]
 81%|████████  | 379/469 [11:25<02:35,  1.72s/it]
 81%|████████  | 380/469 [11:26<02:32,  1.72s/it]
 81%|████████  | 381/469 [11:28<02:34,  1.76s/it]
 81%|████████▏ | 382/469 [11:30<02:43,  1.88s/it]
 82%|████████▏ | 383/469 [11:32<02:35,  1.81s/it]
 82%|████████▏ | 384/469 [11:34<02:29,  1.76s/it]
 82%|████████▏ | 385/469 [11:36<02:30,  1.80s/it]
 82%|████████▏ | 386/469 [11:37<02:26,  1.77s/it]
 83%|████████▎ | 387/469 [11:39<02:26,  1.78s/it]
 83%|████████▎ | 388/469 [11:41<02:23,  1.77s/it]
 83%|████████▎ | 389/469 [11:42<02:18,  1.73s/it]
 83%|████████▎ | 390/469 [11:44<02:18,  1.75s/it]
 83%|████████▎ | 391/469 [11:46<02:16,  1.76s/it]
 84%|████████▎ | 392/469 [11:49<02:33,  1.99s/it]
 84%|████████▍ | 393/469 [11:50<02:24,  1.90s/it]
 84%|████████▍ | 394/469 [11:52<02:18,  1.85s/it]
 84%|████████▍ | 395/469 [11:54<02:12,  1.79s/it]
 84%|████████▍ | 396/469 [11:58<03:02,  2.50s/it]
 85%|████████▍ | 397/469 [12:00<02:44,  2.29s/it]
 85%|████████▍ | 398/469 [12:01<02:31,  2.13s/it]
 85%|████████▌ | 399/469 [12:03<02:21,  2.02s/it]
 85%|████████▌ | 400/469 [12:05<02:15,  1.96s/it]
 86%|████████▌ | 401/469 [12:07<02:10,  1.92s/it]
 86%|████████▌ | 402/469 [12:08<02:02,  1.83s/it]
 86%|████████▌ | 403/469 [12:10<01:57,  1.78s/it]
 86%|████████▌ | 404/469 [12:12<01:53,  1.74s/it]
 86%|████████▋ | 405/469 [12:13<01:49,  1.72s/it]
 87%|████████▋ | 406/469 [12:15<01:50,  1.76s/it]
 87%|████████▋ | 407/469 [12:17<01:48,  1.74s/it]
 87%|████████▋ | 408/469 [12:19<01:44,  1.72s/it]
 87%|████████▋ | 409/469 [12:20<01:41,  1.70s/it]
 87%|████████▋ | 410/469 [12:22<01:45,  1.79s/it]
 88%|████████▊ | 411/469 [12:24<01:40,  1.74s/it]
 88%|████████▊ | 412/469 [12:26<01:41,  1.78s/it]
 88%|████████▊ | 413/469 [12:28<01:39,  1.78s/it]
 88%|████████▊ | 414/469 [12:29<01:35,  1.74s/it]
 88%|████████▊ | 415/469 [12:31<01:38,  1.83s/it]
 89%|████████▊ | 416/469 [12:33<01:36,  1.82s/it]
 89%|████████▉ | 417/469 [12:35<01:31,  1.76s/it]
 89%|████████▉ | 418/469 [12:36<01:31,  1.79s/it]
 89%|████████▉ | 419/469 [12:38<01:26,  1.74s/it]
 90%|████████▉ | 420/469 [12:40<01:25,  1.75s/it]
 90%|████████▉ | 421/469 [12:42<01:23,  1.75s/it]
 90%|████████▉ | 422/469 [12:43<01:20,  1.71s/it]
 90%|█████████ | 423/469 [12:45<01:20,  1.74s/it]
 90%|█████████ | 424/469 [12:47<01:18,  1.74s/it]
 91%|█████████ | 425/469 [12:49<01:17,  1.75s/it]
 91%|█████████ | 426/469 [12:50<01:15,  1.75s/it]
 91%|█████████ | 427/469 [12:52<01:14,  1.78s/it]
 91%|█████████▏| 428/469 [12:54<01:12,  1.77s/it]
 91%|█████████▏| 429/469 [12:56<01:09,  1.75s/it]
 92%|█████████▏| 430/469 [12:57<01:07,  1.72s/it]
 92%|█████████▏| 431/469 [12:59<01:04,  1.70s/it]
 92%|█████████▏| 432/469 [13:01<01:04,  1.74s/it]
 92%|█████████▏| 433/469 [13:03<01:07,  1.88s/it]
 93%|█████████▎| 434/469 [13:05<01:04,  1.86s/it]
 93%|█████████▎| 435/469 [13:07<01:03,  1.87s/it]
 93%|█████████▎| 436/469 [13:08<00:59,  1.81s/it]
 93%|█████████▎| 437/469 [13:10<00:57,  1.81s/it]
 93%|█████████▎| 438/469 [13:12<00:57,  1.86s/it]
 94%|█████████▎| 439/469 [13:14<00:54,  1.82s/it]
 94%|█████████▍| 440/469 [13:16<00:52,  1.82s/it]
 94%|█████████▍| 441/469 [13:18<00:51,  1.84s/it]
 94%|█████████▍| 442/469 [13:19<00:48,  1.80s/it]
 94%|█████████▍| 443/469 [13:21<00:48,  1.86s/it]
 95%|█████████▍| 444/469 [13:23<00:45,  1.80s/it]
 95%|█████████▍| 445/469 [13:25<00:42,  1.79s/it]
 95%|█████████▌| 446/469 [13:27<00:42,  1.85s/it]
 95%|█████████▌| 447/469 [13:29<00:41,  1.86s/it]
 96%|█████████▌| 448/469 [13:30<00:39,  1.88s/it]
 96%|█████████▌| 449/469 [13:32<00:37,  1.86s/it]
 96%|█████████▌| 450/469 [13:34<00:33,  1.79s/it]
 96%|█████████▌| 451/469 [13:36<00:31,  1.74s/it]
 96%|█████████▋| 452/469 [13:37<00:29,  1.73s/it]
 97%|█████████▋| 453/469 [13:39<00:28,  1.76s/it]
 97%|█████████▋| 454/469 [13:41<00:26,  1.74s/it]
 97%|█████████▋| 455/469 [13:42<00:23,  1.70s/it]
 97%|█████████▋| 456/469 [13:44<00:22,  1.71s/it]
 97%|█████████▋| 457/469 [13:46<00:20,  1.71s/it]
 98%|█████████▊| 458/469 [13:48<00:18,  1.71s/it]
 98%|█████████▊| 459/469 [13:49<00:16,  1.69s/it]
 98%|█████████▊| 460/469 [13:51<00:15,  1.68s/it]
 98%|█████████▊| 461/469 [13:53<00:14,  1.76s/it]
 99%|█████████▊| 462/469 [13:55<00:12,  1.78s/it]
 99%|█████████▊| 463/469 [13:57<00:10,  1.83s/it]
 99%|█████████▉| 464/469 [13:58<00:09,  1.86s/it]
 99%|█████████▉| 465/469 [14:00<00:07,  1.81s/it]
 99%|█████████▉| 466/469 [14:02<00:05,  1.78s/it]
+computing/reading sample batch statistics...
+Computing evaluations...
+Inception Score: 38.328826904296875
+FID: 21.82574123258769
+sFID: 70.92829349483634
+Precision: 0.6937
+Recall: 0.3517815963698579

eval_rectified_noise_new_batch_2.log ADDED Viewed

@@ -0,0 +1,24 @@
  0%|          | 0/1 [00:00<?, ?it/s]2026-03-23 16:28:27.563125: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:354] MLIR V1 optimization pass is not enabled
  0%|          | 0/211 [00:00<?, ?it/s]
  0%|          | 1/211 [00:02<10:16,  2.94s/it]
  1%|          | 2/211 [00:04<08:15,  2.37s/it]
  1%|▏         | 3/211 [00:06<07:38,  2.21s/it]
  2%|▏         | 4/211 [00:08<07:20,  2.13s/it]
  2%|▏         | 5/211 [00:11<07:25,  2.16s/it]
  3%|▎         | 6/211 [00:12<06:59,  2.04s/it]
  3%|▎         | 7/211 [00:15<07:15,  2.13s/it]
  4%|▍         | 8/211 [00:17<06:55,  2.05s/it]
  4%|▍         | 9/211 [00:19<06:53,  2.05s/it]
  5%|▍         | 10/211 [00:21<06:37,  1.98s/it]
  5%|▌         | 11/211 [00:22<06:18,  1.89s/it]
  6%|▌         | 12/211 [00:24<06:13,  1.88s/it]
  6%|▌         | 13/211 [00:26<06:08,  1.86s/it]
  7%|▋         | 14/211 [00:28<06:02,  1.84s/it]
  7%|▋         | 15/211 [00:29<05:57,  1.82s/it]
  8%|▊         | 16/211 [00:31<05:55,  1.82s/it]
  8%|▊         | 17/211 [00:33<06:00,  1.86s/it]
  9%|▊         | 18/211 [00:35<06:01,  1.87s/it]
  9%|▉         | 19/211 [00:37<05:59,  1.87s/it]
  9%|▉         | 20/211 [00:39<05:51,  1.84s/it]
 10%|▉         | 21/211 [00:41<05:45,  1.82s/it]
 10%|█         | 22/211 [00:42<05:43,  1.82s/it]
 11%|█         | 23/211 [00:44<05:41,  1.82s/it]
 11%|█▏        | 24/211 [00:47<06:23,  2.05s/it]
 12%|█▏        | 25/211 [00:50<07:07,  2.30s/it]
 12%|█▏        | 26/211 [00:51<06:37,  2.15s/it]
 13%|█▎        | 27/211 [00:53<06:14,  2.04s/it]
 13%|█▎        | 28/211 [00:55<05:56,  1.95s/it]
 14%|█▎        | 29/211 [00:57<05:56,  1.96s/it]
 14%|█▍        | 30/211 [00:59<05:49,  1.93s/it]
 15%|█▍        | 31/211 [01:01<05:49,  1.94s/it]
 15%|█▌        | 32/211 [01:03<05:41,  1.91s/it]
 16%|█▌        | 33/211 [01:05<05:42,  1.93s/it]
 16%|█▌        | 34/211 [01:07<05:49,  1.97s/it]
 17%|█▋        | 35/211 [01:09<06:11,  2.11s/it]
 17%|█▋        | 36/211 [01:11<06:00,  2.06s/it]
 18%|█▊        | 37/211 [01:13<05:46,  1.99s/it]
 18%|█▊        | 38/211 [01:15<05:57,  2.07s/it]
 18%|█▊        | 39/211 [01:17<05:44,  2.01s/it]
 19%|█▉        | 40/211 [01:19<05:31,  1.94s/it]
 19%|█▉        | 41/211 [01:21<05:24,  1.91s/it]
 20%|█▉        | 42/211 [01:22<05:19,  1.89s/it]
 20%|██        | 43/211 [01:24<05:20,  1.91s/it]
 21%|██        | 44/211 [01:26<05:09,  1.86s/it]
 21%|██▏       | 45/211 [01:29<05:50,  2.11s/it]
 22%|██▏       | 46/211 [01:31<05:46,  2.10s/it]
 22%|██▏       | 47/211 [01:33<05:33,  2.03s/it]
 23%|██▎       | 48/211 [01:35<05:17,  1.95s/it]
 23%|██▎       | 49/211 [01:36<05:14,  1.94s/it]
 24%|██▎       | 50/211 [01:38<05:03,  1.88s/it]
 24%|██▍       | 51/211 [01:40<05:19,  2.00s/it]
 25%|██▍       | 52/211 [01:42<05:13,  1.97s/it]
 25%|██▌       | 53/211 [01:45<05:19,  2.02s/it]
 26%|██▌       | 54/211 [01:46<05:05,  1.95s/it]
 26%|██▌       | 55/211 [01:48<04:55,  1.89s/it]
 27%|██▋       | 56/211 [01:50<04:56,  1.92s/it]
 27%|██▋       | 57/211 [01:52<04:48,  1.88s/it]
 27%|██▋       | 58/211 [01:54<04:44,  1.86s/it]
 28%|██▊       | 59/211 [01:56<04:47,  1.89s/it]
 28%|██▊       | 60/211 [01:58<04:57,  1.97s/it]
 29%|██▉       | 61/211 [02:00<04:46,  1.91s/it]
 29%|██▉       | 62/211 [02:01<04:38,  1.87s/it]
 30%|██▉       | 63/211 [02:04<05:05,  2.07s/it]
 30%|███       | 64/211 [02:06<04:59,  2.04s/it]
 31%|███       | 65/211 [02:08<05:05,  2.09s/it]
 31%|███▏      | 66/211 [02:10<04:48,  1.99s/it]
 32%|███▏      | 67/211 [02:12<04:37,  1.93s/it]
 32%|███▏      | 68/211 [02:13<04:27,  1.87s/it]
 33%|███▎      | 69/211 [02:15<04:25,  1.87s/it]
 33%|███▎      | 70/211 [02:17<04:18,  1.83s/it]
 34%|███▎      | 71/211 [02:19<04:15,  1.83s/it]
 34%|███▍      | 72/211 [02:20<04:10,  1.80s/it]
 35%|███▍      | 73/211 [02:22<04:08,  1.80s/it]
 35%|███▌      | 74/211 [02:24<04:12,  1.84s/it]
 36%|███▌      | 75/211 [02:26<04:10,  1.84s/it]
 36%|███▌      | 76/211 [02:28<04:06,  1.83s/it]
 36%|███▋      | 77/211 [02:30<04:11,  1.88s/it]
 37%|███▋      | 78/211 [02:32<04:07,  1.86s/it]
 37%|███▋      | 79/211 [02:34<04:08,  1.88s/it]
 38%|███▊      | 80/211 [02:35<04:06,  1.88s/it]
 38%|███▊      | 81/211 [02:37<04:11,  1.93s/it]
 39%|███▉      | 82/211 [02:39<04:04,  1.89s/it]
 39%|███▉      | 83/211 [02:41<04:06,  1.92s/it]
 40%|███▉      | 84/211 [02:43<04:03,  1.92s/it]
 40%|████      | 85/211 [02:45<03:55,  1.87s/it]
 41%|████      | 86/211 [02:47<03:50,  1.84s/it]
 41%|████      | 87/211 [02:48<03:45,  1.82s/it]
 42%|████▏     | 88/211 [02:50<03:41,  1.80s/it]
 42%|████▏     | 89/211 [02:52<03:36,  1.78s/it]
 43%|████▎     | 90/211 [02:54<03:51,  1.91s/it]
 43%|████▎     | 91/211 [02:56<03:45,  1.88s/it]
 44%|████▎     | 92/211 [02:58<03:38,  1.84s/it]
 44%|████▍     | 93/211 [03:00<03:37,  1.85s/it]
 45%|████▍     | 94/211 [03:02<03:46,  1.94s/it]
 45%|████▌     | 95/211 [03:03<03:36,  1.87s/it]
 45%|████▌     | 96/211 [03:05<03:34,  1.87s/it]
 46%|████▌     | 97/211 [03:08<03:58,  2.09s/it]
 46%|████▋     | 98/211 [03:10<03:45,  1.99s/it]
 47%|████▋     | 99/211 [03:11<03:34,  1.91s/it]
 47%|████▋     | 100/211 [03:13<03:31,  1.91s/it]
 48%|████▊     | 101/211 [03:15<03:28,  1.89s/it]
 48%|████▊     | 102/211 [03:21<05:46,  3.18s/it]
 49%|████▉     | 103/211 [03:23<04:58,  2.76s/it]
 49%|████▉     | 104/211 [03:25<04:22,  2.45s/it]
 50%|████▉     | 105/211 [03:27<04:03,  2.30s/it]
 50%|█████     | 106/211 [03:29<03:45,  2.15s/it]
 51%|█████     | 107/211 [03:31<03:35,  2.07s/it]
 51%|█████     | 108/211 [03:32<03:29,  2.03s/it]
 52%|█████▏    | 109/211 [03:34<03:21,  1.97s/it]
 52%|█████▏    | 110/211 [03:37<03:36,  2.15s/it]
 53%|█████▎    | 111/211 [03:39<03:23,  2.03s/it]
 53%|█████▎    | 112/211 [03:41<03:17,  1.99s/it]
 54%|█████▎    | 113/211 [03:42<03:10,  1.95s/it]
 54%|█████▍    | 114/211 [03:44<03:03,  1.90s/it]
 55%|█████▍    | 115/211 [03:46<03:02,  1.91s/it]
 55%|█████▍    | 116/211 [03:48<02:59,  1.89s/it]
 55%|█████▌    | 117/211 [03:50<03:03,  1.96s/it]
 56%|█████▌    | 118/211 [03:52<02:56,  1.90s/it]
 56%|█████▋    | 119/211 [03:54<02:51,  1.86s/it]
 57%|█████▋    | 120/211 [03:55<02:49,  1.86s/it]
 57%|█████▋    | 121/211 [03:57<02:52,  1.91s/it]
 58%|█████▊    | 122/211 [03:59<02:47,  1.89s/it]
 58%|█████▊    | 123/211 [04:01<02:51,  1.94s/it]
 59%|█████▉    | 124/211 [04:03<02:47,  1.93s/it]
 59%|█████▉    | 125/211 [04:05<02:40,  1.87s/it]
 60%|█████▉    | 126/211 [04:07<02:46,  1.96s/it]
 60%|██████    | 127/211 [04:09<02:43,  1.95s/it]
 61%|██████    | 128/211 [04:11<02:42,  1.95s/it]
 61%|██████    | 129/211 [04:13<02:47,  2.04s/it]
 62%|██████▏   | 130/211 [04:15<02:41,  2.00s/it]
 62%|██████▏   | 131/211 [04:17<02:39,  2.00s/it]
 63%|██████▎   | 132/211 [04:19<02:36,  1.98s/it]
 63%|██████▎   | 133/211 [04:21<02:31,  1.94s/it]
 64%|██████▎   | 134/211 [04:23<02:26,  1.91s/it]
 64%|██████▍   | 135/211 [04:25<02:21,  1.86s/it]
 64%|██████▍   | 136/211 [04:26<02:20,  1.87s/it]
 65%|██████▍   | 137/211 [04:29<02:23,  1.94s/it]
 65%|██████▌   | 138/211 [04:30<02:18,  1.90s/it]
 66%|██████▌   | 139/211 [04:32<02:13,  1.86s/it]
 66%|██████▋   | 140/211 [04:34<02:10,  1.84s/it]
 67%|██████▋   | 141/211 [04:36<02:07,  1.82s/it]
 67%|██████▋   | 142/211 [04:38<02:06,  1.83s/it]
 68%|██████▊   | 143/211 [04:39<02:06,  1.86s/it]
 68%|██��███▊   | 144/211 [04:41<02:02,  1.83s/it]
 69%|██████▊   | 145/211 [04:43<02:00,  1.82s/it]
 69%|██████▉   | 146/211 [04:45<01:58,  1.82s/it]
 70%|██████▉   | 147/211 [04:47<02:06,  1.98s/it]
 70%|███████   | 148/211 [04:49<02:07,  2.02s/it]
 71%|███████   | 149/211 [04:51<02:02,  1.97s/it]
 71%|███████   | 150/211 [04:53<01:59,  1.96s/it]
 72%|███████▏  | 151/211 [04:55<01:56,  1.95s/it]
 72%|███████▏  | 152/211 [04:57<01:54,  1.94s/it]
 73%|███████▎  | 153/211 [04:59<01:49,  1.88s/it]
 73%|███████▎  | 154/211 [05:01<01:56,  2.04s/it]
 73%|███████▎  | 155/211 [05:03<01:50,  1.97s/it]
 74%|███████▍  | 156/211 [05:05<01:48,  1.97s/it]
 74%|███████▍  | 157/211 [05:07<01:46,  1.96s/it]
 75%|███████▍  | 158/211 [05:09<01:41,  1.92s/it]
 75%|███████▌  | 159/211 [05:10<01:38,  1.90s/it]
 76%|███████▌  | 160/211 [05:12<01:35,  1.88s/it]
 76%|███████▋  | 161/211 [05:14<01:32,  1.85s/it]
 77%|███████▋  | 162/211 [05:16<01:32,  1.89s/it]
 77%|███████▋  | 163/211 [05:18<01:31,  1.90s/it]
 78%|███████▊  | 164/211 [05:20<01:28,  1.88s/it]
 78%|███████▊  | 165/211 [05:22<01:26,  1.88s/it]
 79%|███████▊  | 166/211 [05:24<01:25,  1.89s/it]
 79%|███████▉  | 167/211 [05:25<01:22,  1.88s/it]
 80%|███████▉  | 168/211 [05:27<01:22,  1.92s/it]
 80%|████████  | 169/211 [05:29<01:21,  1.94s/it]
 81%|████████  | 170/211 [05:31<01:18,  1.92s/it]
 81%|████████  | 171/211 [05:34<01:26,  2.15s/it]
 82%|████████▏ | 172/211 [05:36<01:23,  2.14s/it]
 82%|████████▏ | 173/211 [05:38<01:17,  2.05s/it]
 82%|████████▏ | 174/211 [05:40<01:13,  1.99s/it]
 83%|████████▎ | 175/211 [05:42<01:09,  1.93s/it]
 83%|████████▎ | 176/211 [05:44<01:10,  2.02s/it]
 84%|████████▍ | 177/211 [05:46<01:10,  2.06s/it]
 84%|████████▍ | 178/211 [05:49<01:16,  2.33s/it]
 85%|████████▍ | 179/211 [05:51<01:09,  2.16s/it]
 85%|████████▌ | 180/211 [05:53<01:03,  2.05s/it]
 86%|████████▌ | 181/211 [05:54<00:58,  1.96s/it]
 86%|████████▋ | 182/211 [05:56<00:56,  1.95s/it]
 87%|████████▋ | 183/211 [05:59<00:57,  2.05s/it]
 87%|████████▋ | 184/211 [06:00<00:54,  2.01s/it]
 88%|████████▊ | 185/211 [06:02<00:50,  1.95s/it]
 88%|████████▊ | 186/211 [06:04<00:47,  1.89s/it]
 89%|████████▊ | 187/211 [06:06<00:45,  1.88s/it]
 89%|████████▉ | 188/211 [06:08<00:42,  1.86s/it]
 90%|████████▉ | 189/211 [06:09<00:40,  1.83s/it]
 90%|█████████ | 190/211 [06:11<00:38,  1.85s/it]
 91%|█████████ | 191/211 [06:13<00:36,  1.85s/it]
 91%|█████████ | 192/211 [06:15<00:34,  1.84s/it]
 91%|█████████▏| 193/211 [06:17<00:34,  1.92s/it]
 92%|█████████▏| 194/211 [06:19<00:32,  1.91s/it]
 92%|█████████▏| 195/211 [06:21<00:31,  1.96s/it]
 93%|█████████▎| 196/211 [06:23<00:28,  1.92s/it]
 93%|█████████▎| 197/211 [06:25<00:27,  1.96s/it]
 94%|█████████▍| 198/211 [06:27<00:25,  1.93s/it]
 94%|█████████▍| 199/211 [06:29<00:22,  1.88s/it]
 95%|█████████▍| 200/211 [06:30<00:20,  1.89s/it]
 95%|█████████▌| 201/211 [06:32<00:18,  1.88s/it]
 96%|█████████▌| 202/211 [06:34<00:16,  1.86s/it]
 96%|█████████▌| 203/211 [06:36<00:15,  1.88s/it]
 97%|█████████▋| 204/211 [06:38<00:13,  1.89s/it]
 97%|█████████▋| 205/211 [06:40<00:11,  1.91s/it]
 98%|█████████▊| 206/211 [06:42<00:10,  2.10s/it]
 98%|█████████▊| 207/211 [06:44<00:08,  2.07s/it]
 99%|█████████▊| 208/211 [06:46<00:05,  1.98s/it]
 99%|█████████▉| 209/211 [06:49<00:04,  2.12s/it]
  0%|          | 0/469 [00:00<?, ?it/s]
  0%|          | 1/469 [00:01<14:40,  1.88s/it]
  0%|          | 2/469 [00:04<16:02,  2.06s/it]
  1%|          | 3/469 [00:05<15:06,  1.95s/it]
  1%|          | 4/469 [00:07<14:44,  1.90s/it]
  1%|          | 5/469 [00:09<14:34,  1.88s/it]
  1%|▏         | 6/469 [00:11<14:24,  1.87s/it]
  1%|▏         | 7/469 [00:13<14:05,  1.83s/it]
  2%|▏         | 8/469 [00:15<14:19,  1.87s/it]
  2%|▏         | 9/469 [00:16<14:13,  1.85s/it]
  2%|▏         | 10/469 [00:18<14:09,  1.85s/it]
  2%|▏         | 11/469 [00:20<14:04,  1.84s/it]
  3%|▎         | 12/469 [00:22<13:59,  1.84s/it]
  3%|▎         | 13/469 [00:24<14:09,  1.86s/it]
  3%|▎         | 14/469 [00:26<14:18,  1.89s/it]
  3%|▎         | 15/469 [00:28<14:01,  1.85s/it]
  3%|▎         | 16/469 [00:30<14:36,  1.94s/it]
  4%|▎         | 17/469 [00:31<14:12,  1.89s/it]
  4%|▍         | 18/469 [00:33<13:58,  1.86s/it]
  4%|▍         | 19/469 [00:36<14:55,  1.99s/it]
  4%|▍         | 20/469 [00:38<14:53,  1.99s/it]
  4%|▍         | 21/469 [00:40<14:50,  1.99s/it]
  5%|▍         | 22/469 [00:41<14:43,  1.98s/it]
  5%|▍         | 23/469 [00:44<15:04,  2.03s/it]
  5%|▌         | 24/469 [00:45<14:34,  1.97s/it]
  5%|▌         | 25/469 [00:47<14:03,  1.90s/it]
  6%|▌         | 26/469 [00:49<13:42,  1.86s/it]
  6%|▌         | 27/469 [00:51<13:31,  1.84s/it]
  6%|▌         | 28/469 [00:53<13:35,  1.85s/it]
  6%|▌         | 29/469 [00:54<13:24,  1.83s/it]
  6%|▋         | 30/469 [00:56<13:30,  1.85s/it]
  7%|▋         | 31/469 [00:58<13:10,  1.80s/it]
  7%|▋         | 32/469 [01:00<13:43,  1.88s/it]
  7%|▋         | 33/469 [01:02<13:23,  1.84s/it]
  7%|▋         | 34/469 [01:05<15:15,  2.10s/it]
  7%|▋         | 35/469 [01:06<14:40,  2.03s/it]
  8%|▊         | 36/469 [01:08<14:23,  1.99s/it]
  8%|▊         | 37/469 [01:10<13:51,  1.93s/it]
  8%|▊         | 38/469 [01:12<13:26,  1.87s/it]
  8%|▊         | 39/469 [01:14<13:13,  1.85s/it]
  9%|▊         | 40/469 [01:15<13:15,  1.85s/it]
  9%|▊         | 41/469 [01:17<13:14,  1.86s/it]
  9%|▉         | 42/469 [01:19<12:58,  1.82s/it]
  9%|▉         | 43/469 [01:21<13:04,  1.84s/it]
  9%|▉         | 44/469 [01:23<12:55,  1.82s/it]
 10%|▉         | 45/469 [01:25<12:56,  1.83s/it]
 10%|▉         | 46/469 [01:26<12:50,  1.82s/it]
 10%|█         | 47/469 [01:28<12:47,  1.82s/it]
 10%|█         | 48/469 [01:30<12:41,  1.81s/it]
 10%|█         | 49/469 [01:32<12:52,  1.84s/it]
 11%|█         | 50/469 [01:34<13:00,  1.86s/it]
 11%|█         | 51/469 [01:36<12:46,  1.83s/it]
 11%|█         | 52/469 [01:37<12:41,  1.83s/it]
 11%|█▏        | 53/469 [01:39<12:40,  1.83s/it]
 12%|█▏        | 54/469 [01:41<12:46,  1.85s/it]
 12%|█▏        | 55/469 [01:44<15:33,  2.25s/it]
 12%|█▏        | 56/469 [01:46<14:24,  2.09s/it]
 12%|█▏        | 57/469 [01:48<13:41,  1.99s/it]
 12%|█▏        | 58/469 [01:50<13:22,  1.95s/it]
 13%|█▎        | 59/469 [01:51<12:54,  1.89s/it]
 13%|█▎        | 60/469 [01:53<12:33,  1.84s/it]
 13%|█▎        | 61/469 [01:55<12:23,  1.82s/it]
 13%|█▎        | 62/469 [01:57<12:29,  1.84s/it]
 13%|█▎        | 63/469 [01:59<12:42,  1.88s/it]
 14%|█▎        | 64/469 [02:01<12:43,  1.89s/it]
 14%|█▍        | 65/469 [02:02<12:29,  1.86s/it]
 14%|█▍        | 66/469 [02:04<12:49,  1.91s/it]
 14%|█▍        | 67/469 [02:06<12:50,  1.92s/it]
 14%|█▍        | 68/469 [02:08<12:34,  1.88s/it]
 15%|█▍        | 69/469 [02:10<12:22,  1.86s/it]
 15%|█▍        | 70/469 [02:12<12:20,  1.86s/it]
 15%|█▌        | 71/469 [02:14<12:10,  1.84s/it]
 15%|█▌        | 72/469 [02:15<12:06,  1.83s/it]
 16%|█▌        | 73/469 [02:17<12:00,  1.82s/it]
 16%|█▌        | 74/469 [02:19<12:09,  1.85s/it]
 16%|█▌        | 75/469 [02:21<12:18,  1.87s/it]
 16%|█▌        | 76/469 [02:23<12:13,  1.87s/it]
 16%|█▋        | 77/469 [02:25<12:16,  1.88s/it]
 17%|█▋        | 78/469 [02:27<12:45,  1.96s/it]
 17%|█▋        | 79/469 [02:29<12:17,  1.89s/it]
 17%|█▋        | 80/469 [02:31<12:07,  1.87s/it]
 17%|█▋        | 81/469 [02:32<11:53,  1.84s/it]
 17%|█▋        | 82/469 [02:34<12:09,  1.89s/it]
 18%|█▊        | 83/469 [02:36<12:01,  1.87s/it]
 18%|█▊        | 84/469 [02:38<11:45,  1.83s/it]
 18%|█▊        | 85/469 [02:40<12:13,  1.91s/it]
 18%|█▊        | 86/469 [02:42<12:12,  1.91s/it]
 19%|█▊        | 87/469 [02:44<11:53,  1.87s/it]
 19%|█▉        | 88/469 [02:45<11:32,  1.82s/it]
 19%|█▉        | 89/469 [02:47<11:28,  1.81s/it]
 19%|█▉        | 90/469 [02:49<11:52,  1.88s/it]
 19%|█▉        | 91/469 [02:51<11:34,  1.84s/it]
 20%|█▉        | 92/469 [02:53<11:36,  1.85s/it]
 20%|█▉        | 93/469 [02:55<11:36,  1.85s/it]
 20%|██        | 94/469 [02:56<11:27,  1.83s/it]
 20%|██        | 95/469 [02:58<11:24,  1.83s/it]
 20%|██        | 96/469 [03:00<11:19,  1.82s/it]
 21%|██        | 97/469 [03:02<11:25,  1.84s/it]
 21%|██        | 98/469 [03:04<11:45,  1.90s/it]
 21%|██        | 99/469 [03:06<11:45,  1.91s/it]
 21%|██▏       | 100/469 [03:08<11:30,  1.87s/it]
 22%|██▏       | 101/469 [03:10<12:35,  2.05s/it]
 22%|██▏       | 102/469 [03:12<12:40,  2.07s/it]
 22%|██▏       | 103/469 [03:14<12:10,  2.00s/it]
 22%|██▏       | 104/469 [03:16<11:49,  1.94s/it]
 22%|██▏       | 105/469 [03:18<11:38,  1.92s/it]
 23%|██▎       | 106/469 [03:20<11:50,  1.96s/it]
 23%|██▎       | 107/469 [03:22<11:29,  1.91s/it]
 23%|██▎       | 108/469 [03:24<12:19,  2.05s/it]
 23%|██▎       | 109/469 [03:26<11:50,  1.97s/it]
 23%|██▎       | 110/469 [03:28<12:21,  2.06s/it]
 24%|██▎       | 111/469 [03:30<12:00,  2.01s/it]
 24%|██▍       | 112/469 [03:32<11:37,  1.95s/it]
 24%|██▍       | 113/469 [03:34<11:13,  1.89s/it]
 24%|██▍       | 114/469 [03:36<11:26,  1.93s/it]
 25%|██▍       | 115/469 [03:37<11:06,  1.88s/it]
 25%|██▍       | 116/469 [03:39<11:27,  1.95s/it]
 25%|██▍       | 117/469 [03:42<12:08,  2.07s/it]
 25%|██▌       | 118/469 [03:44<11:37,  1.99s/it]
 25%|██▌       | 119/469 [03:45<11:14,  1.93s/it]
 26%|██▌       | 120/469 [03:47<10:53,  1.87s/it]
 26%|██▌       | 121/469 [03:49<10:39,  1.84s/it]
 26%|██▌       | 122/469 [03:51<10:28,  1.81s/it]
 26%|██▌       | 123/469 [03:52<10:23,  1.80s/it]
 26%|██▋       | 124/469 [03:54<10:16,  1.79s/it]
 27%|██▋       | 125/469 [03:56<10:26,  1.82s/it]
 27%|██▋       | 126/469 [03:58<10:19,  1.81s/it]
 27%|██▋       | 127/469 [04:00<10:05,  1.77s/it]
 27%|██▋       | 128/469 [04:01<10:13,  1.80s/it]
 28%|██▊       | 129/469 [04:04<10:47,  1.91s/it]
 28%|██▊       | 130/469 [04:05<10:30,  1.86s/it]
 28%|██▊       | 131/469 [04:08<11:02,  1.96s/it]
 28%|██▊       | 132/469 [04:10<11:07,  1.98s/it]
 28%|██▊       | 133/469 [04:11<11:00,  1.97s/it]
 29%|██▊       | 134/469 [04:13<10:36,  1.90s/it]
 29%|██▉       | 135/469 [04:15<10:33,  1.90s/it]
 29%|██▉       | 136/469 [04:17<10:28,  1.89s/it]
 29%|██▉       | 137/469 [04:23<16:46,  3.03s/it]
 29%|██▉       | 138/469 [04:24<14:39,  2.66s/it]
 30%|██▉       | 139/469 [04:27<13:43,  2.49s/it]
 30%|██▉       | 140/469 [04:29<13:38,  2.49s/it]
 30%|███       | 141/469 [04:31<12:21,  2.26s/it]
 30%|███       | 142/469 [04:33<11:27,  2.10s/it]
 30%|███       | 143/469 [04:34<10:47,  1.98s/it]
 31%|███       | 144/469 [04:36<10:23,  1.92s/it]
 31%|███       | 145/469 [04:38<10:18,  1.91s/it]
 31%|███       | 146/469 [04:40<10:36,  1.97s/it]
 31%|███▏      | 147/469 [04:42<10:39,  1.99s/it]
 32%|███▏      | 148/469 [04:44<10:50,  2.03s/it]
 32%|███▏      | 149/469 [04:46<10:30,  1.97s/it]
 32%|███▏      | 150/469 [04:48<10:12,  1.92s/it]
 32%|███▏      | 151/469 [04:50<09:55,  1.87s/it]
 32%|███▏      | 152/469 [04:51<10:00,  1.89s/it]
 33%|███▎      | 153/469 [04:53<10:00,  1.90s/it]
 33%|███▎      | 154/469 [04:55<09:56,  1.89s/it]
 33%|███▎      | 155/469 [04:57<09:55,  1.90s/it]
 33%|███▎      | 156/469 [04:59<09:38,  1.85s/it]
 33%|███▎      | 157/469 [05:01<09:29,  1.83s/it]
 34%|███▎      | 158/469 [05:03<09:30,  1.84s/it]
 34%|███▍      | 159/469 [05:04<09:18,  1.80s/it]
 34%|███▍      | 160/469 [05:07<09:59,  1.94s/it]
 34%|███▍      | 161/469 [05:08<09:52,  1.93s/it]
 35%|███▍      | 162/469 [05:10<09:53,  1.93s/it]
 35%|███▍      | 163/469 [05:12<09:41,  1.90s/it]
 35%|███▍      | 164/469 [05:14<09:34,  1.88s/it]
 35%|███▌      | 165/469 [05:16<09:32,  1.88s/it]
 35%|███▌      | 166/469 [05:18<09:23,  1.86s/it]
 36%|███▌      | 167/469 [05:20<09:39,  1.92s/it]
 36%|███▌      | 168/469 [05:22<09:30,  1.90s/it]
 36%|███▌      | 169/469 [05:23<09:21,  1.87s/it]
 36%|███▌      | 170/469 [05:25<09:10,  1.84s/it]
 36%|███▋      | 171/469 [05:27<09:20,  1.88s/it]
 37%|███▋      | 172/469 [05:29<09:07,  1.84s/it]
 37%|███▋      | 173/469 [05:31<09:01,  1.83s/it]
 37%|███▋      | 174/469 [05:33<09:00,  1.83s/it]
 37%|███▋      | 175/469 [05:34<08:49,  1.80s/it]
 38%|███▊      | 176/469 [05:36<09:09,  1.88s/it]
 38%|███▊      | 177/469 [05:38<09:08,  1.88s/it]
 38%|███▊      | 178/469 [05:40<09:00,  1.86s/it]
 38%|███▊      | 179/469 [05:42<08:51,  1.83s/it]
 38%|███▊      | 180/469 [05:44<08:51,  1.84s/it]
 39%|███▊      | 181/469 [05:46<10:03,  2.10s/it]
 39%|███▉      | 182/469 [05:48<09:44,  2.04s/it]
 39%|███▉      | 183/469 [05:50<09:21,  1.96s/it]
 39%|███▉      | 184/469 [05:52<09:23,  1.98s/it]
 39%|███▉      | 185/469 [05:54<09:03,  1.92s/it]
 40%|███▉      | 186/469 [05:56<08:53,  1.88s/it]
 40%|███▉      | 187/469 [05:57<08:40,  1.85s/it]
 40%|████      | 188/469 [05:59<08:58,  1.92s/it]
 40%|████      | 189/469 [06:01<08:50,  1.89s/it]
 41%|████      | 190/469 [06:03<08:55,  1.92s/it]
 41%|████      | 191/469 [06:05<08:49,  1.91s/it]
 41%|████      | 192/469 [06:07<08:48,  1.91s/it]
 41%|████      | 193/469 [06:09<08:43,  1.90s/it]
 41%|████▏     | 194/469 [06:11<08:31,  1.86s/it]
 42%|████▏     | 195/469 [06:12<08:18,  1.82s/it]
 42%|████▏     | 196/469 [06:14<08:06,  1.78s/it]
 42%|████▏     | 197/469 [06:16<08:05,  1.79s/it]
 42%|████▏     | 198/469 [06:18<08:14,  1.83s/it]
 42%|████▏     | 199/469 [06:20<08:11,  1.82s/it]
 43%|████▎     | 200/469 [06:22<08:36,  1.92s/it]
 43%|████▎     | 201/469 [06:24<08:25,  1.89s/it]
 43%|████▎     | 202/469 [06:26<08:28,  1.91s/it]
 43%|████▎     | 203/469 [06:27<08:24,  1.90s/it]
 43%|████▎     | 204/469 [06:29<08:31,  1.93s/it]
 44%|████▎     | 205/469 [06:31<08:20,  1.89s/it]
 44%|████▍     | 206/469 [06:33<08:09,  1.86s/it]
 44%|████▍     | 207/469 [06:35<08:13,  1.89s/it]
 44%|████▍     | 208/469 [06:37<08:21,  1.92s/it]
 45%|████▍     | 209/469 [06:39<08:55,  2.06s/it]
 45%|████▍     | 210/469 [06:41<08:32,  1.98s/it]
 45%|████▍     | 211/469 [06:43<08:21,  1.94s/it]
 45%|████▌     | 212/469 [06:45<08:19,  1.94s/it]
 45%|████▌     | 213/469 [06:47<08:13,  1.93s/it]
 46%|████▌     | 214/469 [06:49<08:23,  1.98s/it]
 46%|████▌     | 215/469 [06:51<08:15,  1.95s/it]
 46%|████▌     | 216/469 [06:53<08:02,  1.91s/it]
 46%|████▋     | 217/469 [06:55<07:57,  1.90s/it]
 46%|████▋     | 218/469 [06:56<07:51,  1.88s/it]
 47%|████▋     | 219/469 [06:58<07:40,  1.84s/it]
 47%|████▋     | 220/469 [07:00<07:52,  1.90s/it]
 47%|████▋     | 221/469 [07:02<07:44,  1.87s/it]
 47%|████▋     | 222/469 [07:04<08:09,  1.98s/it]
 48%|████▊     | 223/469 [07:07<09:19,  2.27s/it]
 48%|████▊     | 224/469 [07:09<08:42,  2.13s/it]
 48%|████▊     | 225/469 [07:11<08:22,  2.06s/it]
 48%|████▊     | 226/469 [07:13<08:01,  1.98s/it]
 48%|████▊     | 227/469 [07:15<07:51,  1.95s/it]
 49%|████▊     | 228/469 [07:16<07:43,  1.92s/it]
 49%|████▉     | 229/469 [07:18<07:35,  1.90s/it]
 49%|████▉     | 230/469 [07:20<07:28,  1.88s/it]
 49%|████▉     | 231/469 [07:22<07:44,  1.95s/it]
 49%|████▉     | 232/469 [07:24<07:32,  1.91s/it]
 50%|████▉     | 233/469 [07:26<07:24,  1.88s/it]
 50%|████▉     | 234/469 [07:28<07:26,  1.90s/it]
 50%|█████     | 235/469 [07:30<07:18,  1.88s/it]
 50%|█████     | 236/469 [07:32<07:21,  1.89s/it]
 51%|█████     | 237/469 [07:33<07:15,  1.88s/it]
 51%|█████     | 238/469 [07:35<07:31,  1.95s/it]
 51%|█████     | 239/469 [07:37<07:25,  1.94s/it]
 51%|█████     | 240/469 [07:39<07:26,  1.95s/it]
 51%|█████▏    | 241/469 [07:42<08:39,  2.28s/it]
 52%|█████▏    | 242/469 [07:44<08:17,  2.19s/it]
 52%|█████▏    | 243/469 [07:47<08:19,  2.21s/it]
 52%|█████▏    | 244/469 [07:49<07:57,  2.12s/it]
 52%|█████▏    | 245/469 [07:50<07:35,  2.03s/it]
 52%|█████▏    | 246/469 [07:52<07:26,  2.00s/it]
 53%|█████▎    | 247/469 [07:54<07:19,  1.98s/it]
 53%|█████▎    | 248/469 [07:56<07:11,  1.95s/it]
 53%|█████▎    | 249/469 [07:58<07:15,  1.98s/it]
 53%|█████▎    | 250/469 [08:00<06:58,  1.91s/it]
 54%|█████▎    | 251/469 [08:02<06:51,  1.89s/it]
 54%|█████▎    | 252/469 [08:04<06:42,  1.85s/it]
 54%|█████▍    | 253/469 [08:05<06:33,  1.82s/it]
 54%|█████▍    | 254/469 [08:07<06:29,  1.81s/it]
 54%|█████▍    | 255/469 [08:09<06:32,  1.83s/it]
 55%|█████▍    | 256/469 [08:11<06:24,  1.80s/it]
 55%|█████▍    | 257/469 [08:12<06:18,  1.79s/it]
 55%|█████▌    | 258/469 [08:14<06:25,  1.83s/it]
 55%|█████▌    | 259/469 [08:16<06:25,  1.83s/it]
 55%|█████▌    | 260/469 [08:18<06:19,  1.82s/it]
 56%|█████▌    | 261/469 [08:20<06:42,  1.94s/it]
 56%|█████▌    | 262/469 [08:22<06:32,  1.90s/it]
 56%|█████▌    | 263/469 [08:24<06:25,  1.87s/it]
 56%|█████▋    | 264/469 [08:26<06:17,  1.84s/it]
 57%|█████▋    | 265/469 [08:28<06:20,  1.87s/it]
 57%|█████▋    | 266/469 [08:29<06:17,  1.86s/it]
 57%|█████▋    | 267/469 [08:31<06:16,  1.86s/it]
 57%|█████▋    | 268/469 [08:33<06:12,  1.86s/it]
 57%|█████▋    | 269/469 [08:35<06:41,  2.01s/it]
 58%|█████▊    | 270/469 [08:37<06:24,  1.93s/it]
 58%|█████▊    | 271/469 [08:39<06:10,  1.87s/it]
 58%|█████▊    | 272/469 [08:41<06:09,  1.87s/it]
 58%|█████▊    | 273/469 [08:43<06:24,  1.96s/it]
 58%|█████▊    | 274/469 [08:45<06:11,  1.90s/it]
 59%|█████▊    | 275/469 [08:47<06:10,  1.91s/it]
 59%|█████▉    | 276/469 [08:48<06:03,  1.88s/it]
 59%|█████▉    | 277/469 [08:50<06:01,  1.88s/it]
 59%|█████▉    | 278/469 [08:52<06:08,  1.93s/it]
 59%|█████▉    | 279/469 [08:54<05:57,  1.88s/it]
 60%|█████▉    | 280/469 [08:56<06:11,  1.97s/it]
 60%|█████▉    | 281/469 [08:58<06:07,  1.95s/it]
 60%|██████    | 282/469 [09:00<05:57,  1.91s/it]
 60%|██████    | 283/469 [09:02<05:48,  1.87s/it]
 61%|██████    | 284/469 [09:04<05:44,  1.86s/it]
 61%|██████    | 285/469 [09:06<05:45,  1.88s/it]
 61%|██████    | 286/469 [09:08<06:11,  2.03s/it]
 61%|██████    | 287/469 [09:10<06:16,  2.07s/it]
 61%|██████▏   | 288/469 [09:12<05:59,  1.99s/it]
 62%|██████▏   | 289/469 [09:14<05:50,  1.95s/it]
 62%|██████▏   | 290/469 [09:16<05:39,  1.90s/it]
 62%|██████▏   | 291/469 [09:17<05:28,  1.85s/it]
 62%|██████▏   | 292/469 [09:19<05:25,  1.84s/it]
 62%|██████▏   | 293/469 [09:21<05:23,  1.84s/it]
 63%|██████▎   | 294/469 [09:23<05:31,  1.89s/it]
 63%|██████▎   | 295/469 [09:25<05:20,  1.84s/it]
 63%|██████▎   | 296/469 [09:26<05:13,  1.81s/it]
 63%|██████▎   | 297/469 [09:28<05:08,  1.80s/it]
 64%|██████▎   | 298/469 [09:30<05:04,  1.78s/it]
 64%|██████▍   | 299/469 [09:32<05:05,  1.80s/it]
 64%|██████▍   | 300/469 [09:34<05:08,  1.82s/it]
 64%|██████▍   | 301/469 [09:36<05:07,  1.83s/it]
 64%|██████▍   | 302/469 [09:37<05:04,  1.82s/it]
 65%|██████▍   | 303/469 [09:39<05:06,  1.84s/it]
 65%|██████▍   | 304/469 [09:41<05:06,  1.86s/it]
 65%|██████▌   | 305/469 [09:43<05:02,  1.84s/it]
 65%|██████▌   | 306/469 [09:45<05:01,  1.85s/it]
 65%|██████▌   | 307/469 [09:47<05:00,  1.85s/it]
 66%|██████▌   | 308/469 [09:49<05:02,  1.88s/it]
 66%|██████▌   | 309/469 [09:50<05:00,  1.88s/it]
 66%|██████▌   | 310/469 [09:52<04:53,  1.85s/it]
 66%|██████▋   | 311/469 [09:54<04:54,  1.86s/it]
 67%|██████▋   | 312/469 [09:56<04:58,  1.90s/it]
 67%|██████▋   | 313/469 [09:58<04:50,  1.86s/it]
 67%|██████▋   | 314/469 [10:00<04:51,  1.88s/it]
 67%|██████▋   | 315/469 [10:02<04:52,  1.90s/it]
 67%|██████▋   | 316/469 [10:04<04:53,  1.92s/it]
 68%|██████▊   | 317/469 [10:06<04:45,  1.88s/it]
 68%|██████▊   | 318/469 [10:08<04:57,  1.97s/it]
 68%|██████▊   | 319/469 [10:09<04:46,  1.91s/it]
 68%|██████▊   | 320/469 [10:11<04:46,  1.92s/it]
 68%|██████▊   | 321/469 [10:14<05:08,  2.09s/it]
 69%|██████▊   | 322/469 [10:16<05:13,  2.14s/it]
 69%|██████▉   | 323/469 [10:18<05:02,  2.07s/it]
 69%|██████▉   | 324/469 [10:20<04:52,  2.02s/it]
 69%|██████▉   | 325/469 [10:22<04:42,  1.96s/it]
 70%|██████▉   | 326/469 [10:24<04:34,  1.92s/it]
 70%|██████▉   | 327/469 [10:26<04:41,  1.98s/it]
 70%|██████▉   | 328/469 [10:28<04:31,  1.93s/it]
 70%|███████   | 329/469 [10:29<04:24,  1.89s/it]
 70%|███████   | 330/469 [10:31<04:19,  1.87s/it]
 71%|███████   | 331/469 [10:33<04:13,  1.84s/it]
 71%|███████   | 332/469 [10:35<04:24,  1.93s/it]
 71%|███████   | 333/469 [10:37<04:16,  1.89s/it]
 71%|███████   | 334/469 [10:42<06:23,  2.84s/it]
 71%|███████▏  | 335/469 [10:44<05:35,  2.51s/it]
 72%|███████▏  | 336/469 [10:45<05:04,  2.29s/it]
 72%|███████▏  | 337/469 [10:47<04:42,  2.14s/it]
 72%|███████▏  | 338/469 [10:49<04:41,  2.15s/it]
 72%|███████▏  | 339/469 [10:51<04:33,  2.11s/it]
 72%|███████▏  | 340/469 [10:53<04:21,  2.03s/it]
 73%|███████▎  | 341/469 [10:55<04:17,  2.01s/it]
 73%|███████▎  | 342/469 [10:57<04:19,  2.04s/it]
 73%|███████▎  | 343/469 [10:59<04:18,  2.05s/it]
 73%|███████▎  | 344/469 [11:01<04:04,  1.96s/it]
 74%|███████▎  | 345/469 [11:03<04:00,  1.94s/it]
 74%|███████▍  | 346/469 [11:05<03:52,  1.89s/it]
 74%|███████▍  | 347/469 [11:07<03:45,  1.85s/it]
 74%|███████▍  | 348/469 [11:08<03:39,  1.82s/it]
 74%|███████▍  | 349/469 [11:10<03:38,  1.82s/it]
 75%|███████▍  | 350/469 [11:12<03:42,  1.87s/it]
 75%|███████▍  | 351/469 [11:14<03:44,  1.90s/it]
 75%|███████▌  | 352/469 [11:16<03:40,  1.88s/it]
 75%|███████▌  | 353/469 [11:25<07:38,  3.95s/it]
 75%|███████▌  | 354/469 [11:27<06:26,  3.36s/it]
 76%|███████▌  | 355/469 [11:29<05:30,  2.90s/it]
 76%|███████▌  | 356/469 [11:31<05:08,  2.73s/it]
 76%|███████▌  | 357/469 [11:33<04:45,  2.55s/it]
 76%|███████▋  | 358/469 [11:35<04:23,  2.37s/it]
 77%|██��████▋  | 359/469 [11:37<04:10,  2.28s/it]
 77%|███████▋  | 360/469 [11:39<03:51,  2.12s/it]
 77%|███████▋  | 361/469 [11:41<03:37,  2.02s/it]
 77%|███████▋  | 362/469 [12:00<13:03,  7.32s/it]
 77%|███████▋  | 363/469 [12:02<10:03,  5.69s/it]
 78%|███████▊  | 364/469 [12:04<07:55,  4.53s/it]
 78%|███████▊  | 365/469 [12:06<06:27,  3.73s/it]
 78%|███████▊  | 366/469 [12:08<05:26,  3.17s/it]
 78%|███████▊  | 367/469 [12:09<04:40,  2.75s/it]
 78%|███████▊  | 368/469 [12:12<04:21,  2.59s/it]
 79%|███████▊  | 369/469 [12:13<03:55,  2.36s/it]
 79%|███████▉  | 370/469 [12:15<03:43,  2.25s/it]
 79%|███████▉  | 371/469 [12:17<03:28,  2.13s/it]
 79%|███████▉  | 372/469 [12:19<03:21,  2.07s/it]
 80%|███████▉  | 373/469 [12:21<03:10,  1.99s/it]
 80%|███████▉  | 374/469 [12:23<03:02,  1.92s/it]
 80%|███████▉  | 375/469 [12:25<02:55,  1.86s/it]
 80%|████████  | 376/469 [12:26<02:51,  1.84s/it]
 80%|████████  | 377/469 [12:28<02:51,  1.86s/it]
 81%|████████  | 378/469 [12:30<02:47,  1.84s/it]
 81%|████████  | 379/469 [12:32<02:43,  1.81s/it]
 81%|████████  | 380/469 [12:34<02:48,  1.89s/it]
 81%|████████  | 381/469 [12:36<02:45,  1.88s/it]
 81%|████████▏ | 382/469 [12:38<02:43,  1.88s/it]
 82%|████████▏ | 383/469 [12:39<02:38,  1.85s/it]
 82%|████████▏ | 384/469 [12:41<02:36,  1.84s/it]
 82%|████████▏ | 385/469 [12:43<02:37,  1.88s/it]
 82%|████████▏ | 386/469 [12:45<02:35,  1.87s/it]
 83%|████████▎ | 387/469 [12:47<02:45,  2.02s/it]
 83%|████████▎ | 388/469 [12:49<02:37,  1.94s/it]
 83%|████████▎ | 389/469 [12:51<02:30,  1.88s/it]
 83%|████████▎ | 390/469 [12:53<02:32,  1.93s/it]
 83%|████████▎ | 391/469 [12:55<02:29,  1.92s/it]
 84%|████████▎ | 392/469 [12:57<02:23,  1.87s/it]
 84%|████████▍ | 393/469 [13:00<03:00,  2.38s/it]
 84%|████████▍ | 394/469 [13:02<02:47,  2.23s/it]
 84%|████████▍ | 395/469 [13:04<02:36,  2.12s/it]
 84%|████████▍ | 396/469 [13:06<02:27,  2.02s/it]
 85%|████████▍ | 397/469 [13:08<02:24,  2.00s/it]
 85%|████████▍ | 398/469 [13:09<02:19,  1.96s/it]
 85%|████████▌ | 399/469 [13:11<02:14,  1.92s/it]
 85%|████████▌ | 400/469 [13:13<02:08,  1.86s/it]
 86%|████████▌ | 401/469 [13:15<02:05,  1.85s/it]
 86%|████████▌ | 402/469 [13:17<02:02,  1.84s/it]
 86%|████████▌ | 403/469 [13:18<01:59,  1.81s/it]
 86%|████████▌ | 404/469 [13:20<02:02,  1.89s/it]
 86%|████████▋ | 405/469 [13:22<01:59,  1.87s/it]
 87%|████████▋ | 406/469 [13:24<01:56,  1.85s/it]
 87%|████████▋ | 407/469 [13:26<01:52,  1.81s/it]
 87%|████████▋ | 408/469 [13:28<01:50,  1.81s/it]
 87%|████████▋ | 409/469 [13:30<01:50,  1.84s/it]
 87%|████████▋ | 410/469 [13:32<01:55,  1.95s/it]
 88%|████████▊ | 411/469 [13:33<01:49,  1.89s/it]
 88%|████████▊ | 412/469 [13:35<01:48,  1.90s/it]
 88%|████████▊ | 413/469 [13:37<01:45,  1.89s/it]
 88%|████████▊ | 414/469 [13:39<01:42,  1.86s/it]
 88%|████████▊ | 415/469 [13:41<01:38,  1.82s/it]
 89%|████████▊ | 416/469 [13:43<01:39,  1.87s/it]
 89%|████████▉ | 417/469 [13:45<01:38,  1.90s/it]
 89%|████████▉ | 418/469 [13:50<02:30,  2.96s/it]
 89%|████████▉ | 419/469 [13:52<02:12,  2.65s/it]
 90%|████████▉ | 420/469 [13:54<02:01,  2.47s/it]
 90%|████████▉ | 421/469 [13:56<01:48,  2.26s/it]
 90%|████████▉ | 422/469 [13:58<01:39,  2.12s/it]
 90%|█████████ | 423/469 [14:00<01:33,  2.02s/it]
 90%|█████████ | 424/469 [14:01<01:29,  2.00s/it]
 91%|█████████ | 425/469 [14:03<01:24,  1.93s/it]
 91%|█████████ | 426/469 [14:05<01:22,  1.93s/it]
 91%|█████████ | 427/469 [14:07<01:19,  1.90s/it]
 91%|█████████▏| 428/469 [14:10<01:30,  2.21s/it]
 91%|█████████▏| 429/469 [14:12<01:23,  2.09s/it]
 92%|█████████▏| 430/469 [14:13<01:17,  2.00s/it]
 92%|█████████▏| 431/469 [14:16<01:17,  2.05s/it]
 92%|█████████▏| 432/469 [14:17<01:12,  1.96s/it]
 92%|█████████▏| 433/469 [14:19<01:09,  1.93s/it]
 93%|█████████▎| 434/469 [14:21<01:06,  1.89s/it]
 93%|█████████▎| 435/469 [14:23<01:03,  1.88s/it]
 93%|█████████▎| 436/469 [14:25<01:06,  2.01s/it]
 93%|█████████▎| 437/469 [14:27<01:02,  1.95s/it]
 93%|█████████▎| 438/469 [14:29<00:59,  1.93s/it]
 94%|█████████▎| 439/469 [14:31<00:57,  1.91s/it]
 94%|█████████▍| 440/469 [14:33<00:59,  2.04s/it]
 94%|█████████▍| 441/469 [14:35<00:55,  1.99s/it]
 94%|█████████▍| 442/469 [14:37<00:53,  1.97s/it]
 94%|█████████▍| 443/469 [14:39<00:49,  1.92s/it]
 95%|█████████▍| 444/469 [14:41<00:47,  1.89s/it]
 95%|█████████▍| 445/469 [14:42<00:44,  1.87s/it]
 95%|█████████▌| 446/469 [14:44<00:44,  1.94s/it]
 95%|█████████▌| 447/469 [14:46<00:41,  1.88s/it]
 96%|█████████▌| 448/469 [14:48<00:38,  1.83s/it]
 96%|█████████▌| 449/469 [14:50<00:37,  1.89s/it]
 96%|█████████▌| 450/469 [14:52<00:34,  1.84s/it]
 96%|█████████▌| 451/469 [14:53<00:32,  1.81s/it]
 96%|█████████▋| 452/469 [14:55<00:30,  1.80s/it]
 97%|█████████▋| 453/469 [14:57<00:28,  1.77s/it]
 97%|█████████▋| 454/469 [14:59<00:27,  1.85s/it]
 97%|█████████▋| 455/469 [15:01<00:25,  1.84s/it]
 97%|█████████▋| 456/469 [15:03<00:24,  1.92s/it]
 97%|█████████▋| 457/469 [15:05<00:22,  1.88s/it]
 98%|█████████▊| 458/469 [15:07<00:20,  1.88s/it]
 98%|█████████▊| 459/469 [15:09<00:19,  1.92s/it]
 98%|█████████▊| 460/469 [15:10<00:17,  1.92s/it]
 98%|█████████▊| 461/469 [15:12<00:15,  1.90s/it]
 99%|█████████▊| 462/469 [15:15<00:14,  2.11s/it]
 99%|█████████▊| 463/469 [15:17<00:13,  2.18s/it]
 99%|█████████▉| 464/469 [15:19<00:10,  2.09s/it]
 99%|█████████▉| 465/469 [15:21<00:08,  2.07s/it]
 99%|█████████▉| 466/469 [15:23<00:06,  2.02s/it]

+2026-03-23 16:28:17.115582: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
+2026-03-23 16:28:26.392554: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
+To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
+2026-03-23 16:28:26.438747: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
+2026-03-23 16:28:26.438805: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: d9a147cbff28e342e3a570e4cd1afa4e-taskrole1-0
+2026-03-23 16:28:26.438830: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: d9a147cbff28e342e3a570e4cd1afa4e-taskrole1-0
+2026-03-23 16:28:26.438915: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: NOT_FOUND: was unable to find libcuda.so DSO loaded into this program
+2026-03-23 16:28:26.438959: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 535.154.5
+2026-03-23 16:28:26.776048: W tensorflow/core/framework/op_def_util.cc:371] Op BatchNormWithGlobalNormalization is deprecated. It will cease to work in GraphDef version 9. Use tf.nn.batch_normalization().
+warming up TensorFlow...
  0%|          | 0/1 [00:00<?, ?it/s]2026-03-23 16:28:27.563125: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:354] MLIR V1 optimization pass is not enabled
+computing reference batch activations...
  0%|          | 0/211 [00:00<?, ?it/s]
  0%|          | 1/211 [00:02<10:16,  2.94s/it]
  1%|          | 2/211 [00:04<08:15,  2.37s/it]
  1%|▏         | 3/211 [00:06<07:38,  2.21s/it]
  2%|▏         | 4/211 [00:08<07:20,  2.13s/it]
  2%|▏         | 5/211 [00:11<07:25,  2.16s/it]
  3%|▎         | 6/211 [00:12<06:59,  2.04s/it]
  3%|▎         | 7/211 [00:15<07:15,  2.13s/it]
  4%|▍         | 8/211 [00:17<06:55,  2.05s/it]
  4%|▍         | 9/211 [00:19<06:53,  2.05s/it]
  5%|▍         | 10/211 [00:21<06:37,  1.98s/it]
  5%|▌         | 11/211 [00:22<06:18,  1.89s/it]
  6%|▌         | 12/211 [00:24<06:13,  1.88s/it]
  6%|▌         | 13/211 [00:26<06:08,  1.86s/it]
  7%|▋         | 14/211 [00:28<06:02,  1.84s/it]
  7%|▋         | 15/211 [00:29<05:57,  1.82s/it]
  8%|▊         | 16/211 [00:31<05:55,  1.82s/it]
  8%|▊         | 17/211 [00:33<06:00,  1.86s/it]
  9%|▊         | 18/211 [00:35<06:01,  1.87s/it]
  9%|▉         | 19/211 [00:37<05:59,  1.87s/it]
  9%|▉         | 20/211 [00:39<05:51,  1.84s/it]
 10%|▉         | 21/211 [00:41<05:45,  1.82s/it]
 10%|█         | 22/211 [00:42<05:43,  1.82s/it]
 11%|█         | 23/211 [00:44<05:41,  1.82s/it]
 11%|█▏        | 24/211 [00:47<06:23,  2.05s/it]
 12%|█▏        | 25/211 [00:50<07:07,  2.30s/it]
 12%|█▏        | 26/211 [00:51<06:37,  2.15s/it]
 13%|█▎        | 27/211 [00:53<06:14,  2.04s/it]
 13%|█▎        | 28/211 [00:55<05:56,  1.95s/it]
 14%|█▎        | 29/211 [00:57<05:56,  1.96s/it]
 14%|█▍        | 30/211 [00:59<05:49,  1.93s/it]
 15%|█▍        | 31/211 [01:01<05:49,  1.94s/it]
 15%|█▌        | 32/211 [01:03<05:41,  1.91s/it]
 16%|█▌        | 33/211 [01:05<05:42,  1.93s/it]
 16%|█▌        | 34/211 [01:07<05:49,  1.97s/it]
 17%|█▋        | 35/211 [01:09<06:11,  2.11s/it]
 17%|█▋        | 36/211 [01:11<06:00,  2.06s/it]
 18%|█▊        | 37/211 [01:13<05:46,  1.99s/it]
 18%|█▊        | 38/211 [01:15<05:57,  2.07s/it]
 18%|█▊        | 39/211 [01:17<05:44,  2.01s/it]
 19%|█▉        | 40/211 [01:19<05:31,  1.94s/it]
 19%|█▉        | 41/211 [01:21<05:24,  1.91s/it]
 20%|█▉        | 42/211 [01:22<05:19,  1.89s/it]
 20%|██        | 43/211 [01:24<05:20,  1.91s/it]
 21%|██        | 44/211 [01:26<05:09,  1.86s/it]
 21%|██▏       | 45/211 [01:29<05:50,  2.11s/it]
 22%|██▏       | 46/211 [01:31<05:46,  2.10s/it]
 22%|██▏       | 47/211 [01:33<05:33,  2.03s/it]
 23%|██▎       | 48/211 [01:35<05:17,  1.95s/it]
 23%|██▎       | 49/211 [01:36<05:14,  1.94s/it]
 24%|██▎       | 50/211 [01:38<05:03,  1.88s/it]
 24%|██▍       | 51/211 [01:40<05:19,  2.00s/it]
 25%|██▍       | 52/211 [01:42<05:13,  1.97s/it]
 25%|██▌       | 53/211 [01:45<05:19,  2.02s/it]
 26%|██▌       | 54/211 [01:46<05:05,  1.95s/it]
 26%|██▌       | 55/211 [01:48<04:55,  1.89s/it]
 27%|██▋       | 56/211 [01:50<04:56,  1.92s/it]
 27%|██▋       | 57/211 [01:52<04:48,  1.88s/it]
 27%|██▋       | 58/211 [01:54<04:44,  1.86s/it]
 28%|██▊       | 59/211 [01:56<04:47,  1.89s/it]
 28%|██▊       | 60/211 [01:58<04:57,  1.97s/it]
 29%|██▉       | 61/211 [02:00<04:46,  1.91s/it]
 29%|██▉       | 62/211 [02:01<04:38,  1.87s/it]
 30%|██▉       | 63/211 [02:04<05:05,  2.07s/it]
 30%|███       | 64/211 [02:06<04:59,  2.04s/it]
 31%|███       | 65/211 [02:08<05:05,  2.09s/it]
 31%|███▏      | 66/211 [02:10<04:48,  1.99s/it]
 32%|███▏      | 67/211 [02:12<04:37,  1.93s/it]
 32%|███▏      | 68/211 [02:13<04:27,  1.87s/it]
 33%|███▎      | 69/211 [02:15<04:25,  1.87s/it]
 33%|███▎      | 70/211 [02:17<04:18,  1.83s/it]
 34%|███▎      | 71/211 [02:19<04:15,  1.83s/it]
 34%|███▍      | 72/211 [02:20<04:10,  1.80s/it]
 35%|███▍      | 73/211 [02:22<04:08,  1.80s/it]
 35%|███▌      | 74/211 [02:24<04:12,  1.84s/it]
 36%|███▌      | 75/211 [02:26<04:10,  1.84s/it]
 36%|███▌      | 76/211 [02:28<04:06,  1.83s/it]
 36%|███▋      | 77/211 [02:30<04:11,  1.88s/it]
 37%|███▋      | 78/211 [02:32<04:07,  1.86s/it]
 37%|███▋      | 79/211 [02:34<04:08,  1.88s/it]
 38%|███▊      | 80/211 [02:35<04:06,  1.88s/it]
 38%|███▊      | 81/211 [02:37<04:11,  1.93s/it]
 39%|███▉      | 82/211 [02:39<04:04,  1.89s/it]
 39%|███▉      | 83/211 [02:41<04:06,  1.92s/it]
 40%|███▉      | 84/211 [02:43<04:03,  1.92s/it]
 40%|████      | 85/211 [02:45<03:55,  1.87s/it]
 41%|████      | 86/211 [02:47<03:50,  1.84s/it]
 41%|████      | 87/211 [02:48<03:45,  1.82s/it]
 42%|████▏     | 88/211 [02:50<03:41,  1.80s/it]
 42%|████▏     | 89/211 [02:52<03:36,  1.78s/it]
 43%|████▎     | 90/211 [02:54<03:51,  1.91s/it]
 43%|████▎     | 91/211 [02:56<03:45,  1.88s/it]
 44%|████▎     | 92/211 [02:58<03:38,  1.84s/it]
 44%|████▍     | 93/211 [03:00<03:37,  1.85s/it]
 45%|████▍     | 94/211 [03:02<03:46,  1.94s/it]
 45%|████▌     | 95/211 [03:03<03:36,  1.87s/it]
 45%|████▌     | 96/211 [03:05<03:34,  1.87s/it]
 46%|████▌     | 97/211 [03:08<03:58,  2.09s/it]
 46%|████▋     | 98/211 [03:10<03:45,  1.99s/it]
 47%|████▋     | 99/211 [03:11<03:34,  1.91s/it]
 47%|████▋     | 100/211 [03:13<03:31,  1.91s/it]
 48%|████▊     | 101/211 [03:15<03:28,  1.89s/it]
 48%|████▊     | 102/211 [03:21<05:46,  3.18s/it]
 49%|████▉     | 103/211 [03:23<04:58,  2.76s/it]
 49%|████▉     | 104/211 [03:25<04:22,  2.45s/it]
 50%|████▉     | 105/211 [03:27<04:03,  2.30s/it]
 50%|█████     | 106/211 [03:29<03:45,  2.15s/it]
 51%|█████     | 107/211 [03:31<03:35,  2.07s/it]
 51%|█████     | 108/211 [03:32<03:29,  2.03s/it]
 52%|█████▏    | 109/211 [03:34<03:21,  1.97s/it]
 52%|█████▏    | 110/211 [03:37<03:36,  2.15s/it]
 53%|█████▎    | 111/211 [03:39<03:23,  2.03s/it]
 53%|█████▎    | 112/211 [03:41<03:17,  1.99s/it]
 54%|█████▎    | 113/211 [03:42<03:10,  1.95s/it]
 54%|█████▍    | 114/211 [03:44<03:03,  1.90s/it]
 55%|█████▍    | 115/211 [03:46<03:02,  1.91s/it]
 55%|█████▍    | 116/211 [03:48<02:59,  1.89s/it]
 55%|█████▌    | 117/211 [03:50<03:03,  1.96s/it]
 56%|█████▌    | 118/211 [03:52<02:56,  1.90s/it]
 56%|█████▋    | 119/211 [03:54<02:51,  1.86s/it]
 57%|█████▋    | 120/211 [03:55<02:49,  1.86s/it]
 57%|█████▋    | 121/211 [03:57<02:52,  1.91s/it]
 58%|█████▊    | 122/211 [03:59<02:47,  1.89s/it]
 58%|█████▊    | 123/211 [04:01<02:51,  1.94s/it]
 59%|█████▉    | 124/211 [04:03<02:47,  1.93s/it]
 59%|█████▉    | 125/211 [04:05<02:40,  1.87s/it]
 60%|█████▉    | 126/211 [04:07<02:46,  1.96s/it]
 60%|██████    | 127/211 [04:09<02:43,  1.95s/it]
 61%|██████    | 128/211 [04:11<02:42,  1.95s/it]
 61%|██████    | 129/211 [04:13<02:47,  2.04s/it]
 62%|██████▏   | 130/211 [04:15<02:41,  2.00s/it]
 62%|██████▏   | 131/211 [04:17<02:39,  2.00s/it]
 63%|██████▎   | 132/211 [04:19<02:36,  1.98s/it]
 63%|██████▎   | 133/211 [04:21<02:31,  1.94s/it]
 64%|██████▎   | 134/211 [04:23<02:26,  1.91s/it]
 64%|██████▍   | 135/211 [04:25<02:21,  1.86s/it]
 64%|██████▍   | 136/211 [04:26<02:20,  1.87s/it]
 65%|██████▍   | 137/211 [04:29<02:23,  1.94s/it]
 65%|██████▌   | 138/211 [04:30<02:18,  1.90s/it]
 66%|██████▌   | 139/211 [04:32<02:13,  1.86s/it]
 66%|██████▋   | 140/211 [04:34<02:10,  1.84s/it]
 67%|██████▋   | 141/211 [04:36<02:07,  1.82s/it]
 67%|██████▋   | 142/211 [04:38<02:06,  1.83s/it]
 68%|██████▊   | 143/211 [04:39<02:06,  1.86s/it]
 68%|██��███▊   | 144/211 [04:41<02:02,  1.83s/it]
 69%|██████▊   | 145/211 [04:43<02:00,  1.82s/it]
 69%|██████▉   | 146/211 [04:45<01:58,  1.82s/it]
 70%|██████▉   | 147/211 [04:47<02:06,  1.98s/it]
 70%|███████   | 148/211 [04:49<02:07,  2.02s/it]
 71%|███████   | 149/211 [04:51<02:02,  1.97s/it]
 71%|███████   | 150/211 [04:53<01:59,  1.96s/it]
 72%|███████▏  | 151/211 [04:55<01:56,  1.95s/it]
 72%|███████▏  | 152/211 [04:57<01:54,  1.94s/it]
 73%|███████▎  | 153/211 [04:59<01:49,  1.88s/it]
 73%|███████▎  | 154/211 [05:01<01:56,  2.04s/it]
 73%|███████▎  | 155/211 [05:03<01:50,  1.97s/it]
 74%|███████▍  | 156/211 [05:05<01:48,  1.97s/it]
 74%|███████▍  | 157/211 [05:07<01:46,  1.96s/it]
 75%|███████▍  | 158/211 [05:09<01:41,  1.92s/it]
 75%|███████▌  | 159/211 [05:10<01:38,  1.90s/it]
 76%|███████▌  | 160/211 [05:12<01:35,  1.88s/it]
 76%|███████▋  | 161/211 [05:14<01:32,  1.85s/it]
 77%|███████▋  | 162/211 [05:16<01:32,  1.89s/it]
 77%|███████▋  | 163/211 [05:18<01:31,  1.90s/it]
 78%|███████▊  | 164/211 [05:20<01:28,  1.88s/it]
 78%|███████▊  | 165/211 [05:22<01:26,  1.88s/it]
 79%|███████▊  | 166/211 [05:24<01:25,  1.89s/it]
 79%|███████▉  | 167/211 [05:25<01:22,  1.88s/it]
 80%|███████▉  | 168/211 [05:27<01:22,  1.92s/it]
 80%|████████  | 169/211 [05:29<01:21,  1.94s/it]
 81%|████████  | 170/211 [05:31<01:18,  1.92s/it]
 81%|████████  | 171/211 [05:34<01:26,  2.15s/it]
 82%|████████▏ | 172/211 [05:36<01:23,  2.14s/it]
 82%|████████▏ | 173/211 [05:38<01:17,  2.05s/it]
 82%|████████▏ | 174/211 [05:40<01:13,  1.99s/it]
 83%|████████▎ | 175/211 [05:42<01:09,  1.93s/it]
 83%|████████▎ | 176/211 [05:44<01:10,  2.02s/it]
 84%|████████▍ | 177/211 [05:46<01:10,  2.06s/it]
 84%|████████▍ | 178/211 [05:49<01:16,  2.33s/it]
 85%|████████▍ | 179/211 [05:51<01:09,  2.16s/it]
 85%|████████▌ | 180/211 [05:53<01:03,  2.05s/it]
 86%|████████▌ | 181/211 [05:54<00:58,  1.96s/it]
 86%|████████▋ | 182/211 [05:56<00:56,  1.95s/it]
 87%|████████▋ | 183/211 [05:59<00:57,  2.05s/it]
 87%|████████▋ | 184/211 [06:00<00:54,  2.01s/it]
 88%|████████▊ | 185/211 [06:02<00:50,  1.95s/it]
 88%|████████▊ | 186/211 [06:04<00:47,  1.89s/it]
 89%|████████▊ | 187/211 [06:06<00:45,  1.88s/it]
 89%|████████▉ | 188/211 [06:08<00:42,  1.86s/it]
 90%|████████▉ | 189/211 [06:09<00:40,  1.83s/it]
 90%|█████████ | 190/211 [06:11<00:38,  1.85s/it]
 91%|█████████ | 191/211 [06:13<00:36,  1.85s/it]
 91%|█████████ | 192/211 [06:15<00:34,  1.84s/it]
 91%|█████████▏| 193/211 [06:17<00:34,  1.92s/it]
 92%|█████████▏| 194/211 [06:19<00:32,  1.91s/it]
 92%|█████████▏| 195/211 [06:21<00:31,  1.96s/it]
 93%|█████████▎| 196/211 [06:23<00:28,  1.92s/it]
 93%|█████████▎| 197/211 [06:25<00:27,  1.96s/it]
 94%|█████████▍| 198/211 [06:27<00:25,  1.93s/it]
 94%|█████████▍| 199/211 [06:29<00:22,  1.88s/it]
 95%|█████████▍| 200/211 [06:30<00:20,  1.89s/it]
 95%|█████████▌| 201/211 [06:32<00:18,  1.88s/it]
 96%|█████████▌| 202/211 [06:34<00:16,  1.86s/it]
 96%|█████████▌| 203/211 [06:36<00:15,  1.88s/it]
 97%|█████████▋| 204/211 [06:38<00:13,  1.89s/it]
 97%|█████████▋| 205/211 [06:40<00:11,  1.91s/it]
 98%|█████████▊| 206/211 [06:42<00:10,  2.10s/it]
 98%|█████████▊| 207/211 [06:44<00:08,  2.07s/it]
 99%|█████████▊| 208/211 [06:46<00:05,  1.98s/it]
 99%|█████████▉| 209/211 [06:49<00:04,  2.12s/it]
+computing/reading reference batch statistics...
+computing sample batch activations...
  0%|          | 0/469 [00:00<?, ?it/s]
  0%|          | 1/469 [00:01<14:40,  1.88s/it]
  0%|          | 2/469 [00:04<16:02,  2.06s/it]
  1%|          | 3/469 [00:05<15:06,  1.95s/it]
  1%|          | 4/469 [00:07<14:44,  1.90s/it]
  1%|          | 5/469 [00:09<14:34,  1.88s/it]
  1%|▏         | 6/469 [00:11<14:24,  1.87s/it]
  1%|▏         | 7/469 [00:13<14:05,  1.83s/it]
  2%|▏         | 8/469 [00:15<14:19,  1.87s/it]
  2%|▏         | 9/469 [00:16<14:13,  1.85s/it]
  2%|▏         | 10/469 [00:18<14:09,  1.85s/it]
  2%|▏         | 11/469 [00:20<14:04,  1.84s/it]
  3%|▎         | 12/469 [00:22<13:59,  1.84s/it]
  3%|▎         | 13/469 [00:24<14:09,  1.86s/it]
  3%|▎         | 14/469 [00:26<14:18,  1.89s/it]
  3%|▎         | 15/469 [00:28<14:01,  1.85s/it]
  3%|▎         | 16/469 [00:30<14:36,  1.94s/it]
  4%|▎         | 17/469 [00:31<14:12,  1.89s/it]
  4%|▍         | 18/469 [00:33<13:58,  1.86s/it]
  4%|▍         | 19/469 [00:36<14:55,  1.99s/it]
  4%|▍         | 20/469 [00:38<14:53,  1.99s/it]
  4%|▍         | 21/469 [00:40<14:50,  1.99s/it]
  5%|▍         | 22/469 [00:41<14:43,  1.98s/it]
  5%|▍         | 23/469 [00:44<15:04,  2.03s/it]
  5%|▌         | 24/469 [00:45<14:34,  1.97s/it]
  5%|▌         | 25/469 [00:47<14:03,  1.90s/it]
  6%|▌         | 26/469 [00:49<13:42,  1.86s/it]
  6%|▌         | 27/469 [00:51<13:31,  1.84s/it]
  6%|▌         | 28/469 [00:53<13:35,  1.85s/it]
  6%|▌         | 29/469 [00:54<13:24,  1.83s/it]
  6%|▋         | 30/469 [00:56<13:30,  1.85s/it]
  7%|▋         | 31/469 [00:58<13:10,  1.80s/it]
  7%|▋         | 32/469 [01:00<13:43,  1.88s/it]
  7%|▋         | 33/469 [01:02<13:23,  1.84s/it]
  7%|▋         | 34/469 [01:05<15:15,  2.10s/it]
  7%|▋         | 35/469 [01:06<14:40,  2.03s/it]
  8%|▊         | 36/469 [01:08<14:23,  1.99s/it]
  8%|▊         | 37/469 [01:10<13:51,  1.93s/it]
  8%|▊         | 38/469 [01:12<13:26,  1.87s/it]
  8%|▊         | 39/469 [01:14<13:13,  1.85s/it]
  9%|▊         | 40/469 [01:15<13:15,  1.85s/it]
  9%|▊         | 41/469 [01:17<13:14,  1.86s/it]
  9%|▉         | 42/469 [01:19<12:58,  1.82s/it]
  9%|▉         | 43/469 [01:21<13:04,  1.84s/it]
  9%|▉         | 44/469 [01:23<12:55,  1.82s/it]
 10%|▉         | 45/469 [01:25<12:56,  1.83s/it]
 10%|▉         | 46/469 [01:26<12:50,  1.82s/it]
 10%|█         | 47/469 [01:28<12:47,  1.82s/it]
 10%|█         | 48/469 [01:30<12:41,  1.81s/it]
 10%|█         | 49/469 [01:32<12:52,  1.84s/it]
 11%|█         | 50/469 [01:34<13:00,  1.86s/it]
 11%|█         | 51/469 [01:36<12:46,  1.83s/it]
 11%|█         | 52/469 [01:37<12:41,  1.83s/it]
 11%|█▏        | 53/469 [01:39<12:40,  1.83s/it]
 12%|█▏        | 54/469 [01:41<12:46,  1.85s/it]
 12%|█▏        | 55/469 [01:44<15:33,  2.25s/it]
 12%|█▏        | 56/469 [01:46<14:24,  2.09s/it]
 12%|█▏        | 57/469 [01:48<13:41,  1.99s/it]
 12%|█▏        | 58/469 [01:50<13:22,  1.95s/it]
 13%|█▎        | 59/469 [01:51<12:54,  1.89s/it]
 13%|█▎        | 60/469 [01:53<12:33,  1.84s/it]
 13%|█▎        | 61/469 [01:55<12:23,  1.82s/it]
 13%|█▎        | 62/469 [01:57<12:29,  1.84s/it]
 13%|█▎        | 63/469 [01:59<12:42,  1.88s/it]
 14%|█▎        | 64/469 [02:01<12:43,  1.89s/it]
 14%|█▍        | 65/469 [02:02<12:29,  1.86s/it]
 14%|█▍        | 66/469 [02:04<12:49,  1.91s/it]
 14%|█▍        | 67/469 [02:06<12:50,  1.92s/it]
 14%|█▍        | 68/469 [02:08<12:34,  1.88s/it]
 15%|█▍        | 69/469 [02:10<12:22,  1.86s/it]
 15%|█▍        | 70/469 [02:12<12:20,  1.86s/it]
 15%|█▌        | 71/469 [02:14<12:10,  1.84s/it]
 15%|█▌        | 72/469 [02:15<12:06,  1.83s/it]
 16%|█▌        | 73/469 [02:17<12:00,  1.82s/it]
 16%|█▌        | 74/469 [02:19<12:09,  1.85s/it]
 16%|█▌        | 75/469 [02:21<12:18,  1.87s/it]
 16%|█▌        | 76/469 [02:23<12:13,  1.87s/it]
 16%|█▋        | 77/469 [02:25<12:16,  1.88s/it]
 17%|█▋        | 78/469 [02:27<12:45,  1.96s/it]
 17%|█▋        | 79/469 [02:29<12:17,  1.89s/it]
 17%|█▋        | 80/469 [02:31<12:07,  1.87s/it]
 17%|█▋        | 81/469 [02:32<11:53,  1.84s/it]
 17%|█▋        | 82/469 [02:34<12:09,  1.89s/it]
 18%|█▊        | 83/469 [02:36<12:01,  1.87s/it]
 18%|█▊        | 84/469 [02:38<11:45,  1.83s/it]
 18%|█▊        | 85/469 [02:40<12:13,  1.91s/it]
 18%|█▊        | 86/469 [02:42<12:12,  1.91s/it]
 19%|█▊        | 87/469 [02:44<11:53,  1.87s/it]
 19%|█▉        | 88/469 [02:45<11:32,  1.82s/it]
 19%|█▉        | 89/469 [02:47<11:28,  1.81s/it]
 19%|█▉        | 90/469 [02:49<11:52,  1.88s/it]
 19%|█▉        | 91/469 [02:51<11:34,  1.84s/it]
 20%|█▉        | 92/469 [02:53<11:36,  1.85s/it]
 20%|█▉        | 93/469 [02:55<11:36,  1.85s/it]
 20%|██        | 94/469 [02:56<11:27,  1.83s/it]
 20%|██        | 95/469 [02:58<11:24,  1.83s/it]
 20%|██        | 96/469 [03:00<11:19,  1.82s/it]
 21%|██        | 97/469 [03:02<11:25,  1.84s/it]
 21%|██        | 98/469 [03:04<11:45,  1.90s/it]
 21%|██        | 99/469 [03:06<11:45,  1.91s/it]
 21%|██▏       | 100/469 [03:08<11:30,  1.87s/it]
 22%|██▏       | 101/469 [03:10<12:35,  2.05s/it]
 22%|██▏       | 102/469 [03:12<12:40,  2.07s/it]
 22%|██▏       | 103/469 [03:14<12:10,  2.00s/it]
 22%|██▏       | 104/469 [03:16<11:49,  1.94s/it]
 22%|██▏       | 105/469 [03:18<11:38,  1.92s/it]
 23%|██▎       | 106/469 [03:20<11:50,  1.96s/it]
 23%|██▎       | 107/469 [03:22<11:29,  1.91s/it]
 23%|██▎       | 108/469 [03:24<12:19,  2.05s/it]
 23%|██▎       | 109/469 [03:26<11:50,  1.97s/it]
 23%|██▎       | 110/469 [03:28<12:21,  2.06s/it]
 24%|██▎       | 111/469 [03:30<12:00,  2.01s/it]
 24%|██▍       | 112/469 [03:32<11:37,  1.95s/it]
 24%|██▍       | 113/469 [03:34<11:13,  1.89s/it]
 24%|██▍       | 114/469 [03:36<11:26,  1.93s/it]
 25%|██▍       | 115/469 [03:37<11:06,  1.88s/it]
 25%|██▍       | 116/469 [03:39<11:27,  1.95s/it]
 25%|██▍       | 117/469 [03:42<12:08,  2.07s/it]
 25%|██▌       | 118/469 [03:44<11:37,  1.99s/it]
 25%|██▌       | 119/469 [03:45<11:14,  1.93s/it]
 26%|██▌       | 120/469 [03:47<10:53,  1.87s/it]
 26%|██▌       | 121/469 [03:49<10:39,  1.84s/it]
 26%|██▌       | 122/469 [03:51<10:28,  1.81s/it]
 26%|██▌       | 123/469 [03:52<10:23,  1.80s/it]
 26%|██▋       | 124/469 [03:54<10:16,  1.79s/it]
 27%|██▋       | 125/469 [03:56<10:26,  1.82s/it]
 27%|██▋       | 126/469 [03:58<10:19,  1.81s/it]
 27%|██▋       | 127/469 [04:00<10:05,  1.77s/it]
 27%|██▋       | 128/469 [04:01<10:13,  1.80s/it]
 28%|██▊       | 129/469 [04:04<10:47,  1.91s/it]
 28%|██▊       | 130/469 [04:05<10:30,  1.86s/it]
 28%|██▊       | 131/469 [04:08<11:02,  1.96s/it]
 28%|██▊       | 132/469 [04:10<11:07,  1.98s/it]
 28%|██▊       | 133/469 [04:11<11:00,  1.97s/it]
 29%|██▊       | 134/469 [04:13<10:36,  1.90s/it]
 29%|██▉       | 135/469 [04:15<10:33,  1.90s/it]
 29%|██▉       | 136/469 [04:17<10:28,  1.89s/it]
 29%|██▉       | 137/469 [04:23<16:46,  3.03s/it]
 29%|██▉       | 138/469 [04:24<14:39,  2.66s/it]
 30%|██▉       | 139/469 [04:27<13:43,  2.49s/it]
 30%|██▉       | 140/469 [04:29<13:38,  2.49s/it]
 30%|███       | 141/469 [04:31<12:21,  2.26s/it]
 30%|███       | 142/469 [04:33<11:27,  2.10s/it]
 30%|███       | 143/469 [04:34<10:47,  1.98s/it]
 31%|███       | 144/469 [04:36<10:23,  1.92s/it]
 31%|███       | 145/469 [04:38<10:18,  1.91s/it]
 31%|███       | 146/469 [04:40<10:36,  1.97s/it]
 31%|███▏      | 147/469 [04:42<10:39,  1.99s/it]
 32%|███▏      | 148/469 [04:44<10:50,  2.03s/it]
 32%|███▏      | 149/469 [04:46<10:30,  1.97s/it]
 32%|███▏      | 150/469 [04:48<10:12,  1.92s/it]
 32%|███▏      | 151/469 [04:50<09:55,  1.87s/it]
 32%|███▏      | 152/469 [04:51<10:00,  1.89s/it]
 33%|███▎      | 153/469 [04:53<10:00,  1.90s/it]
 33%|███▎      | 154/469 [04:55<09:56,  1.89s/it]
 33%|███▎      | 155/469 [04:57<09:55,  1.90s/it]
 33%|███▎      | 156/469 [04:59<09:38,  1.85s/it]
 33%|███▎      | 157/469 [05:01<09:29,  1.83s/it]
 34%|███▎      | 158/469 [05:03<09:30,  1.84s/it]
 34%|███▍      | 159/469 [05:04<09:18,  1.80s/it]
 34%|███▍      | 160/469 [05:07<09:59,  1.94s/it]
 34%|███▍      | 161/469 [05:08<09:52,  1.93s/it]
 35%|███▍      | 162/469 [05:10<09:53,  1.93s/it]
 35%|███▍      | 163/469 [05:12<09:41,  1.90s/it]
 35%|███▍      | 164/469 [05:14<09:34,  1.88s/it]
 35%|███▌      | 165/469 [05:16<09:32,  1.88s/it]
 35%|███▌      | 166/469 [05:18<09:23,  1.86s/it]
 36%|███▌      | 167/469 [05:20<09:39,  1.92s/it]
 36%|███▌      | 168/469 [05:22<09:30,  1.90s/it]
 36%|███▌      | 169/469 [05:23<09:21,  1.87s/it]
 36%|███▌      | 170/469 [05:25<09:10,  1.84s/it]
 36%|███▋      | 171/469 [05:27<09:20,  1.88s/it]
 37%|███▋      | 172/469 [05:29<09:07,  1.84s/it]
 37%|███▋      | 173/469 [05:31<09:01,  1.83s/it]
 37%|███▋      | 174/469 [05:33<09:00,  1.83s/it]
 37%|███▋      | 175/469 [05:34<08:49,  1.80s/it]
 38%|███▊      | 176/469 [05:36<09:09,  1.88s/it]
 38%|███▊      | 177/469 [05:38<09:08,  1.88s/it]
 38%|███▊      | 178/469 [05:40<09:00,  1.86s/it]
 38%|███▊      | 179/469 [05:42<08:51,  1.83s/it]
 38%|███▊      | 180/469 [05:44<08:51,  1.84s/it]
 39%|███▊      | 181/469 [05:46<10:03,  2.10s/it]
 39%|███▉      | 182/469 [05:48<09:44,  2.04s/it]
 39%|███▉      | 183/469 [05:50<09:21,  1.96s/it]
 39%|███▉      | 184/469 [05:52<09:23,  1.98s/it]
 39%|███▉      | 185/469 [05:54<09:03,  1.92s/it]
 40%|███▉      | 186/469 [05:56<08:53,  1.88s/it]
 40%|███▉      | 187/469 [05:57<08:40,  1.85s/it]
 40%|████      | 188/469 [05:59<08:58,  1.92s/it]
 40%|████      | 189/469 [06:01<08:50,  1.89s/it]
 41%|████      | 190/469 [06:03<08:55,  1.92s/it]
 41%|████      | 191/469 [06:05<08:49,  1.91s/it]
 41%|████      | 192/469 [06:07<08:48,  1.91s/it]
 41%|████      | 193/469 [06:09<08:43,  1.90s/it]
 41%|████▏     | 194/469 [06:11<08:31,  1.86s/it]
 42%|████▏     | 195/469 [06:12<08:18,  1.82s/it]
 42%|████▏     | 196/469 [06:14<08:06,  1.78s/it]
 42%|████▏     | 197/469 [06:16<08:05,  1.79s/it]
 42%|████▏     | 198/469 [06:18<08:14,  1.83s/it]
 42%|████▏     | 199/469 [06:20<08:11,  1.82s/it]
 43%|████▎     | 200/469 [06:22<08:36,  1.92s/it]
 43%|████▎     | 201/469 [06:24<08:25,  1.89s/it]
 43%|████▎     | 202/469 [06:26<08:28,  1.91s/it]
 43%|████▎     | 203/469 [06:27<08:24,  1.90s/it]
 43%|████▎     | 204/469 [06:29<08:31,  1.93s/it]
 44%|████▎     | 205/469 [06:31<08:20,  1.89s/it]
 44%|████▍     | 206/469 [06:33<08:09,  1.86s/it]
 44%|████▍     | 207/469 [06:35<08:13,  1.89s/it]
 44%|████▍     | 208/469 [06:37<08:21,  1.92s/it]
 45%|████▍     | 209/469 [06:39<08:55,  2.06s/it]
 45%|████▍     | 210/469 [06:41<08:32,  1.98s/it]
 45%|████▍     | 211/469 [06:43<08:21,  1.94s/it]
 45%|████▌     | 212/469 [06:45<08:19,  1.94s/it]
 45%|████▌     | 213/469 [06:47<08:13,  1.93s/it]
 46%|████▌     | 214/469 [06:49<08:23,  1.98s/it]
 46%|████▌     | 215/469 [06:51<08:15,  1.95s/it]
 46%|████▌     | 216/469 [06:53<08:02,  1.91s/it]
 46%|████▋     | 217/469 [06:55<07:57,  1.90s/it]
 46%|████▋     | 218/469 [06:56<07:51,  1.88s/it]
 47%|████▋     | 219/469 [06:58<07:40,  1.84s/it]
 47%|████▋     | 220/469 [07:00<07:52,  1.90s/it]
 47%|████▋     | 221/469 [07:02<07:44,  1.87s/it]
 47%|████▋     | 222/469 [07:04<08:09,  1.98s/it]
 48%|████▊     | 223/469 [07:07<09:19,  2.27s/it]
 48%|████▊     | 224/469 [07:09<08:42,  2.13s/it]
 48%|████▊     | 225/469 [07:11<08:22,  2.06s/it]
 48%|████▊     | 226/469 [07:13<08:01,  1.98s/it]
 48%|████▊     | 227/469 [07:15<07:51,  1.95s/it]
 49%|████▊     | 228/469 [07:16<07:43,  1.92s/it]
 49%|████▉     | 229/469 [07:18<07:35,  1.90s/it]
 49%|████▉     | 230/469 [07:20<07:28,  1.88s/it]
 49%|████▉     | 231/469 [07:22<07:44,  1.95s/it]
 49%|████▉     | 232/469 [07:24<07:32,  1.91s/it]
 50%|████▉     | 233/469 [07:26<07:24,  1.88s/it]
 50%|████▉     | 234/469 [07:28<07:26,  1.90s/it]
 50%|█████     | 235/469 [07:30<07:18,  1.88s/it]
 50%|█████     | 236/469 [07:32<07:21,  1.89s/it]
 51%|█████     | 237/469 [07:33<07:15,  1.88s/it]
 51%|█████     | 238/469 [07:35<07:31,  1.95s/it]
 51%|█████     | 239/469 [07:37<07:25,  1.94s/it]
 51%|█████     | 240/469 [07:39<07:26,  1.95s/it]
 51%|█████▏    | 241/469 [07:42<08:39,  2.28s/it]
 52%|█████▏    | 242/469 [07:44<08:17,  2.19s/it]
 52%|█████▏    | 243/469 [07:47<08:19,  2.21s/it]
 52%|█████▏    | 244/469 [07:49<07:57,  2.12s/it]
 52%|█████▏    | 245/469 [07:50<07:35,  2.03s/it]
 52%|█████▏    | 246/469 [07:52<07:26,  2.00s/it]
 53%|█████▎    | 247/469 [07:54<07:19,  1.98s/it]
 53%|█████▎    | 248/469 [07:56<07:11,  1.95s/it]
 53%|█████▎    | 249/469 [07:58<07:15,  1.98s/it]
 53%|█████▎    | 250/469 [08:00<06:58,  1.91s/it]
 54%|█████▎    | 251/469 [08:02<06:51,  1.89s/it]
 54%|█████▎    | 252/469 [08:04<06:42,  1.85s/it]
 54%|█████▍    | 253/469 [08:05<06:33,  1.82s/it]
 54%|█████▍    | 254/469 [08:07<06:29,  1.81s/it]
 54%|█████▍    | 255/469 [08:09<06:32,  1.83s/it]
 55%|█████▍    | 256/469 [08:11<06:24,  1.80s/it]
 55%|█████▍    | 257/469 [08:12<06:18,  1.79s/it]
 55%|█████▌    | 258/469 [08:14<06:25,  1.83s/it]
 55%|█████▌    | 259/469 [08:16<06:25,  1.83s/it]
 55%|█████▌    | 260/469 [08:18<06:19,  1.82s/it]
 56%|█████▌    | 261/469 [08:20<06:42,  1.94s/it]
 56%|█████▌    | 262/469 [08:22<06:32,  1.90s/it]
 56%|█████▌    | 263/469 [08:24<06:25,  1.87s/it]
 56%|█████▋    | 264/469 [08:26<06:17,  1.84s/it]
 57%|█████▋    | 265/469 [08:28<06:20,  1.87s/it]
 57%|█████▋    | 266/469 [08:29<06:17,  1.86s/it]
 57%|█████▋    | 267/469 [08:31<06:16,  1.86s/it]
 57%|█████▋    | 268/469 [08:33<06:12,  1.86s/it]
 57%|█████▋    | 269/469 [08:35<06:41,  2.01s/it]
 58%|█████▊    | 270/469 [08:37<06:24,  1.93s/it]
 58%|█████▊    | 271/469 [08:39<06:10,  1.87s/it]
 58%|█████▊    | 272/469 [08:41<06:09,  1.87s/it]
 58%|█████▊    | 273/469 [08:43<06:24,  1.96s/it]
 58%|█████▊    | 274/469 [08:45<06:11,  1.90s/it]
 59%|█████▊    | 275/469 [08:47<06:10,  1.91s/it]
 59%|█████▉    | 276/469 [08:48<06:03,  1.88s/it]
 59%|█████▉    | 277/469 [08:50<06:01,  1.88s/it]
 59%|█████▉    | 278/469 [08:52<06:08,  1.93s/it]
 59%|█████▉    | 279/469 [08:54<05:57,  1.88s/it]
 60%|█████▉    | 280/469 [08:56<06:11,  1.97s/it]
 60%|█████▉    | 281/469 [08:58<06:07,  1.95s/it]
 60%|██████    | 282/469 [09:00<05:57,  1.91s/it]
 60%|██████    | 283/469 [09:02<05:48,  1.87s/it]
 61%|██████    | 284/469 [09:04<05:44,  1.86s/it]
 61%|██████    | 285/469 [09:06<05:45,  1.88s/it]
 61%|██████    | 286/469 [09:08<06:11,  2.03s/it]
 61%|██████    | 287/469 [09:10<06:16,  2.07s/it]
 61%|██████▏   | 288/469 [09:12<05:59,  1.99s/it]
 62%|██████▏   | 289/469 [09:14<05:50,  1.95s/it]
 62%|██████▏   | 290/469 [09:16<05:39,  1.90s/it]
 62%|██████▏   | 291/469 [09:17<05:28,  1.85s/it]
 62%|██████▏   | 292/469 [09:19<05:25,  1.84s/it]
 62%|██████▏   | 293/469 [09:21<05:23,  1.84s/it]
 63%|██████▎   | 294/469 [09:23<05:31,  1.89s/it]
 63%|██████▎   | 295/469 [09:25<05:20,  1.84s/it]
 63%|██████▎   | 296/469 [09:26<05:13,  1.81s/it]
 63%|██████▎   | 297/469 [09:28<05:08,  1.80s/it]
 64%|██████▎   | 298/469 [09:30<05:04,  1.78s/it]
 64%|██████▍   | 299/469 [09:32<05:05,  1.80s/it]
 64%|██████▍   | 300/469 [09:34<05:08,  1.82s/it]
 64%|██████▍   | 301/469 [09:36<05:07,  1.83s/it]
 64%|██████▍   | 302/469 [09:37<05:04,  1.82s/it]
 65%|██████▍   | 303/469 [09:39<05:06,  1.84s/it]
 65%|██████▍   | 304/469 [09:41<05:06,  1.86s/it]
 65%|██████▌   | 305/469 [09:43<05:02,  1.84s/it]
 65%|██████▌   | 306/469 [09:45<05:01,  1.85s/it]
 65%|██████▌   | 307/469 [09:47<05:00,  1.85s/it]
 66%|██████▌   | 308/469 [09:49<05:02,  1.88s/it]
 66%|██████▌   | 309/469 [09:50<05:00,  1.88s/it]
 66%|██████▌   | 310/469 [09:52<04:53,  1.85s/it]
 66%|██████▋   | 311/469 [09:54<04:54,  1.86s/it]
 67%|██████▋   | 312/469 [09:56<04:58,  1.90s/it]
 67%|██████▋   | 313/469 [09:58<04:50,  1.86s/it]
 67%|██████▋   | 314/469 [10:00<04:51,  1.88s/it]
 67%|██████▋   | 315/469 [10:02<04:52,  1.90s/it]
 67%|██████▋   | 316/469 [10:04<04:53,  1.92s/it]
 68%|██████▊   | 317/469 [10:06<04:45,  1.88s/it]
 68%|██████▊   | 318/469 [10:08<04:57,  1.97s/it]
 68%|██████▊   | 319/469 [10:09<04:46,  1.91s/it]
 68%|██████▊   | 320/469 [10:11<04:46,  1.92s/it]
 68%|██████▊   | 321/469 [10:14<05:08,  2.09s/it]
 69%|██████▊   | 322/469 [10:16<05:13,  2.14s/it]
 69%|██████▉   | 323/469 [10:18<05:02,  2.07s/it]
 69%|██████▉   | 324/469 [10:20<04:52,  2.02s/it]
 69%|██████▉   | 325/469 [10:22<04:42,  1.96s/it]
 70%|██████▉   | 326/469 [10:24<04:34,  1.92s/it]
 70%|██████▉   | 327/469 [10:26<04:41,  1.98s/it]
 70%|██████▉   | 328/469 [10:28<04:31,  1.93s/it]
 70%|███████   | 329/469 [10:29<04:24,  1.89s/it]
 70%|███████   | 330/469 [10:31<04:19,  1.87s/it]
 71%|███████   | 331/469 [10:33<04:13,  1.84s/it]
 71%|███████   | 332/469 [10:35<04:24,  1.93s/it]
 71%|███████   | 333/469 [10:37<04:16,  1.89s/it]
 71%|███████   | 334/469 [10:42<06:23,  2.84s/it]
 71%|███████▏  | 335/469 [10:44<05:35,  2.51s/it]
 72%|███████▏  | 336/469 [10:45<05:04,  2.29s/it]
 72%|███████▏  | 337/469 [10:47<04:42,  2.14s/it]
 72%|███████▏  | 338/469 [10:49<04:41,  2.15s/it]
 72%|███████▏  | 339/469 [10:51<04:33,  2.11s/it]
 72%|███████▏  | 340/469 [10:53<04:21,  2.03s/it]
 73%|███████▎  | 341/469 [10:55<04:17,  2.01s/it]
 73%|███████▎  | 342/469 [10:57<04:19,  2.04s/it]
 73%|███████▎  | 343/469 [10:59<04:18,  2.05s/it]
 73%|███████▎  | 344/469 [11:01<04:04,  1.96s/it]
 74%|███████▎  | 345/469 [11:03<04:00,  1.94s/it]
 74%|███████▍  | 346/469 [11:05<03:52,  1.89s/it]
 74%|███████▍  | 347/469 [11:07<03:45,  1.85s/it]
 74%|███████▍  | 348/469 [11:08<03:39,  1.82s/it]
 74%|███████▍  | 349/469 [11:10<03:38,  1.82s/it]
 75%|███████▍  | 350/469 [11:12<03:42,  1.87s/it]
 75%|███████▍  | 351/469 [11:14<03:44,  1.90s/it]
 75%|███████▌  | 352/469 [11:16<03:40,  1.88s/it]
 75%|███████▌  | 353/469 [11:25<07:38,  3.95s/it]
 75%|███████▌  | 354/469 [11:27<06:26,  3.36s/it]
 76%|███████▌  | 355/469 [11:29<05:30,  2.90s/it]
 76%|███████▌  | 356/469 [11:31<05:08,  2.73s/it]
 76%|███████▌  | 357/469 [11:33<04:45,  2.55s/it]
 76%|███████▋  | 358/469 [11:35<04:23,  2.37s/it]
 77%|███████▋  | 359/469 [11:37<04:10,  2.28s/it]
 77%|███████▋  | 360/469 [11:39<03:51,  2.12s/it]
 77%|███████▋  | 361/469 [11:41<03:37,  2.02s/it]
 77%|███████▋  | 362/469 [12:00<13:03,  7.32s/it]
 77%|███████▋  | 363/469 [12:02<10:03,  5.69s/it]
 78%|███████▊  | 364/469 [12:04<07:55,  4.53s/it]
 78%|███████▊  | 365/469 [12:06<06:27,  3.73s/it]
 78%|███████▊  | 366/469 [12:08<05:26,  3.17s/it]
 78%|███████▊  | 367/469 [12:09<04:40,  2.75s/it]
 78%|███████▊  | 368/469 [12:12<04:21,  2.59s/it]
 79%|███████▊  | 369/469 [12:13<03:55,  2.36s/it]
 79%|███████▉  | 370/469 [12:15<03:43,  2.25s/it]
 79%|███████▉  | 371/469 [12:17<03:28,  2.13s/it]
 79%|███████▉  | 372/469 [12:19<03:21,  2.07s/it]
 80%|███████▉  | 373/469 [12:21<03:10,  1.99s/it]
 80%|███████▉  | 374/469 [12:23<03:02,  1.92s/it]
 80%|███████▉  | 375/469 [12:25<02:55,  1.86s/it]
 80%|████████  | 376/469 [12:26<02:51,  1.84s/it]
 80%|████████  | 377/469 [12:28<02:51,  1.86s/it]
 81%|████████  | 378/469 [12:30<02:47,  1.84s/it]
 81%|████████  | 379/469 [12:32<02:43,  1.81s/it]
 81%|████████  | 380/469 [12:34<02:48,  1.89s/it]
 81%|████████  | 381/469 [12:36<02:45,  1.88s/it]
 81%|████████▏ | 382/469 [12:38<02:43,  1.88s/it]
 82%|████████▏ | 383/469 [12:39<02:38,  1.85s/it]
 82%|████████▏ | 384/469 [12:41<02:36,  1.84s/it]
 82%|████████▏ | 385/469 [12:43<02:37,  1.88s/it]
 82%|████████▏ | 386/469 [12:45<02:35,  1.87s/it]
 83%|████████▎ | 387/469 [12:47<02:45,  2.02s/it]
 83%|████████▎ | 388/469 [12:49<02:37,  1.94s/it]
 83%|████████▎ | 389/469 [12:51<02:30,  1.88s/it]
 83%|████████▎ | 390/469 [12:53<02:32,  1.93s/it]
 83%|████████▎ | 391/469 [12:55<02:29,  1.92s/it]
 84%|████████▎ | 392/469 [12:57<02:23,  1.87s/it]
 84%|████████▍ | 393/469 [13:00<03:00,  2.38s/it]
 84%|████████▍ | 394/469 [13:02<02:47,  2.23s/it]
 84%|████████▍ | 395/469 [13:04<02:36,  2.12s/it]
 84%|████████▍ | 396/469 [13:06<02:27,  2.02s/it]
 85%|████████▍ | 397/469 [13:08<02:24,  2.00s/it]
 85%|████████▍ | 398/469 [13:09<02:19,  1.96s/it]
 85%|████████▌ | 399/469 [13:11<02:14,  1.92s/it]
 85%|████████▌ | 400/469 [13:13<02:08,  1.86s/it]
 86%|████████▌ | 401/469 [13:15<02:05,  1.85s/it]
 86%|████████▌ | 402/469 [13:17<02:02,  1.84s/it]
 86%|████████▌ | 403/469 [13:18<01:59,  1.81s/it]
 86%|████████▌ | 404/469 [13:20<02:02,  1.89s/it]
 86%|████████▋ | 405/469 [13:22<01:59,  1.87s/it]
 87%|████████▋ | 406/469 [13:24<01:56,  1.85s/it]
 87%|████████▋ | 407/469 [13:26<01:52,  1.81s/it]
 87%|████████▋ | 408/469 [13:28<01:50,  1.81s/it]
 87%|████████▋ | 409/469 [13:30<01:50,  1.84s/it]
 87%|████████▋ | 410/469 [13:32<01:55,  1.95s/it]
 88%|████████▊ | 411/469 [13:33<01:49,  1.89s/it]
 88%|████████▊ | 412/469 [13:35<01:48,  1.90s/it]
 88%|████████▊ | 413/469 [13:37<01:45,  1.89s/it]
 88%|████████▊ | 414/469 [13:39<01:42,  1.86s/it]
 88%|████████▊ | 415/469 [13:41<01:38,  1.82s/it]
 89%|████████▊ | 416/469 [13:43<01:39,  1.87s/it]
 89%|████████▉ | 417/469 [13:45<01:38,  1.90s/it]
 89%|████████▉ | 418/469 [13:50<02:30,  2.96s/it]
 89%|████████▉ | 419/469 [13:52<02:12,  2.65s/it]
 90%|████████▉ | 420/469 [13:54<02:01,  2.47s/it]
 90%|████████▉ | 421/469 [13:56<01:48,  2.26s/it]
 90%|████████▉ | 422/469 [13:58<01:39,  2.12s/it]
 90%|█████████ | 423/469 [14:00<01:33,  2.02s/it]
 90%|█████████ | 424/469 [14:01<01:29,  2.00s/it]
 91%|█████████ | 425/469 [14:03<01:24,  1.93s/it]
 91%|█████████ | 426/469 [14:05<01:22,  1.93s/it]
 91%|█████████ | 427/469 [14:07<01:19,  1.90s/it]
 91%|█████████▏| 428/469 [14:10<01:30,  2.21s/it]
 91%|█████████▏| 429/469 [14:12<01:23,  2.09s/it]
 92%|█████████▏| 430/469 [14:13<01:17,  2.00s/it]
 92%|█████████▏| 431/469 [14:16<01:17,  2.05s/it]
 92%|█████████▏| 432/469 [14:17<01:12,  1.96s/it]
 92%|█████████▏| 433/469 [14:19<01:09,  1.93s/it]
 93%|█████████▎| 434/469 [14:21<01:06,  1.89s/it]
 93%|█████████▎| 435/469 [14:23<01:03,  1.88s/it]
 93%|█████████▎| 436/469 [14:25<01:06,  2.01s/it]
 93%|█████████▎| 437/469 [14:27<01:02,  1.95s/it]
 93%|█████████▎| 438/469 [14:29<00:59,  1.93s/it]
 94%|█████████▎| 439/469 [14:31<00:57,  1.91s/it]
 94%|█████████▍| 440/469 [14:33<00:59,  2.04s/it]
 94%|█████████▍| 441/469 [14:35<00:55,  1.99s/it]
 94%|█████████▍| 442/469 [14:37<00:53,  1.97s/it]
 94%|█████████▍| 443/469 [14:39<00:49,  1.92s/it]
 95%|█████████▍| 444/469 [14:41<00:47,  1.89s/it]
 95%|█████████▍| 445/469 [14:42<00:44,  1.87s/it]
 95%|█████████▌| 446/469 [14:44<00:44,  1.94s/it]
 95%|█████████▌| 447/469 [14:46<00:41,  1.88s/it]
 96%|█████████▌| 448/469 [14:48<00:38,  1.83s/it]
 96%|█████████▌| 449/469 [14:50<00:37,  1.89s/it]
 96%|█████████▌| 450/469 [14:52<00:34,  1.84s/it]
 96%|█████████▌| 451/469 [14:53<00:32,  1.81s/it]
 96%|█████████▋| 452/469 [14:55<00:30,  1.80s/it]
 97%|█████████▋| 453/469 [14:57<00:28,  1.77s/it]
 97%|█████████▋| 454/469 [14:59<00:27,  1.85s/it]
 97%|█████████▋| 455/469 [15:01<00:25,  1.84s/it]
 97%|█████████▋| 456/469 [15:03<00:24,  1.92s/it]
 97%|█████████▋| 457/469 [15:05<00:22,  1.88s/it]
 98%|█████████▊| 458/469 [15:07<00:20,  1.88s/it]
 98%|█████████▊| 459/469 [15:09<00:19,  1.92s/it]
 98%|█████████▊| 460/469 [15:10<00:17,  1.92s/it]
 98%|█████████▊| 461/469 [15:12<00:15,  1.90s/it]
 99%|█████████▊| 462/469 [15:15<00:14,  2.11s/it]
 99%|█████████▊| 463/469 [15:17<00:13,  2.18s/it]
 99%|█████████▉| 464/469 [15:19<00:10,  2.09s/it]
 99%|█████████▉| 465/469 [15:21<00:08,  2.07s/it]
 99%|█████████▉| 466/469 [15:23<00:06,  2.02s/it]
+computing/reading sample batch statistics...
+Computing evaluations...
+Inception Score: 37.95753860473633
+FID: 21.04736987276152
+sFID: 71.53442942455422
+Precision: 0.6907333333333333
+Recall: 0.35639366212898904

evaluate.sh ADDED Viewed

	@@ -0,0 +1,11 @@

+#!/usr/bin/env bash
+REF_BATCH="/gemini/space/hsd/project/dataset/cc3m-wds/validation/metadata.npz"
+CUDA_VISIBLE_DEVICES=1 nohup python evaluator_rf.py \
+    --ref_batch ${REF_BATCH} \
+    --sample_batch /gemini/space/gzy_new/models/Sida/sd3_rectified_samples_new_batch_2.npz \
+    > eval_rectified_noise_new_batch_2.log 2>&1 &
+# CUDA_VISIBLE_DEVICES=0 nohup python evaluator_rf.py \
+#     --ref_batch ${REF_BATCH} \
+#     --sample_batch "/gemini/space/gzy_new/models/Sida/sd3_lora_samples_3w/checkpoint-checkpoint-500000-rank32-guidance-7.0-steps-40-size-512x512.npz" \
+#     > eval_baseline.log 2>&1 &

evaluator_base copy.py ADDED Viewed

	@@ -0,0 +1,680 @@

+import argparse
+import io
+import os
+import random
+import warnings
+import zipfile
+from abc import ABC, abstractmethod
+from contextlib import contextmanager
+from functools import partial
+from multiprocessing import cpu_count
+from multiprocessing.pool import ThreadPool
+from typing import Iterable, Optional, Tuple
+import numpy as np
+import requests
+import tensorflow.compat.v1 as tf
+from scipy import linalg
+from tqdm.auto import tqdm
+INCEPTION_V3_URL = "https://openaipublic.blob.core.windows.net/diffusion/jul-2021/ref_batches/classify_image_graph_def.pb"
+INCEPTION_V3_PATH = "classify_image_graph_def.pb"
+FID_POOL_NAME = "pool_3:0"
+FID_SPATIAL_NAME = "mixed_6/conv:0"
+def main():
+    parser = argparse.ArgumentParser()
+    #/gemini/space/gzy_new/Rectified_Noise/Finetune/finetune-coco/sd3_rectified_samples.npz
+    parser.add_argument("--ref_batch", default='/gemini/space/dataset/coco/coco_train_3w.npz',help="path to reference batch npz file")
+    parser.add_argument("--sample_batch", default='/gemini/space/gzy_new/Rectified_Noise/Finetune/finetune-coco/sd3_lora_samples/batch32-rank64-last-sd3-sd3-lora-finetuned-batch-32-guidance-7.0-steps-20-size-512x512.npz', help="path to sample batch npz file")
+    parser.add_argument("--save_path",  default='/gemini/space/gzy/w_w_last/w_w_sit_last1/temp/',help="path to sample batch npz file")
+    parser.add_argument("--cfg_cond", default=1, type=int)
+    parser.add_argument("--step", default=1, type=int)
+    parser.add_argument("--cfg", default=1.0, type=float)
+    parser.add_argument("--cls_cfg", default=1.0, type=float)
+    parser.add_argument("--gh", default=1.0, type=float)
+    parser.add_argument("--num_steps", default=250, type=int)
+    args = parser.parse_args()
+    if not os.path.exists(args.save_path):
+        os.mkdir(args.save_path)
+    config = tf.ConfigProto(
+        allow_soft_placement=True  # allows DecodeJpeg to run on CPU in Inception graph
+    )
+    config.gpu_options.allow_growth = True
+    evaluator = Evaluator(tf.Session(config=config))
+    print("warming up TensorFlow...")
+    # This will cause TF to print a bunch of verbose stuff now rather
+    # than after the next print(), to help prevent confusion.
+    evaluator.warmup()
+    print("computing reference batch activations...")
+    ref_acts = evaluator.read_activations(args.ref_batch)
+    print("computing/reading reference batch statistics...")
+    ref_stats, ref_stats_spatial = evaluator.read_statistics(args.ref_batch, ref_acts)
+    print("computing sample batch activations...")
+    sample_acts = evaluator.read_activations(args.sample_batch)
+    print("computing/reading sample batch statistics...")
+    sample_stats, sample_stats_spatial = evaluator.read_statistics(args.sample_batch, sample_acts)
+    print("Computing evaluations...")
+    Inception_Score = evaluator.compute_inception_score(sample_acts[0])
+    FID = sample_stats.frechet_distance(ref_stats)
+    sFID = sample_stats_spatial.frechet_distance(ref_stats_spatial)
+    prec, recall = evaluator.compute_prec_recall(ref_acts[0], sample_acts[0])
+    print("Inception Score:", Inception_Score)
+    print("FID:", FID)
+    print("sFID:", sFID)
+    print("Precision:", prec)
+    print("Recall:", recall)
+    if args.cfg_cond:
+        file_path = args.save_path + str(args.num_steps) + str(args.step) + str(args.cfg) + str(args.gh) + str(args.cls_cfg)+ "cfg_cond_true.txt"
+    else:
+        file_path = args.save_path + str(args.num_steps) + str(args.step) + str(args.cfg) + str(args.gh) + str(args.cls_cfg)+ "cfg_cond_false.txt"
+    with open(file_path, "w") as file:
+        file.write("Inception Score: {}\n".format(Inception_Score))
+        file.write("FID: {}\n".format(FID))
+        file.write("sFID: {}\n".format(sFID))
+        file.write("Precision: {}\n".format(prec))
+        file.write("Recall: {}\n".format(recall))
+class InvalidFIDException(Exception):
+    pass
+class FIDStatistics:
+    def __init__(self, mu: np.ndarray, sigma: np.ndarray):
+        self.mu = mu
+        self.sigma = sigma
+    def frechet_distance(self, other, eps=1e-6):
+        """
+        Compute the Frechet distance between two sets of statistics.
+        """
+        # https://github.com/bioinf-jku/TTUR/blob/73ab375cdf952a12686d9aa7978567771084da42/fid.py#L132
+        mu1, sigma1 = self.mu, self.sigma
+        mu2, sigma2 = other.mu, other.sigma
+        mu1 = np.atleast_1d(mu1)
+        mu2 = np.atleast_1d(mu2)
+        sigma1 = np.atleast_2d(sigma1)
+        sigma2 = np.atleast_2d(sigma2)
+        assert (
+            mu1.shape == mu2.shape
+        ), f"Training and test mean vectors have different lengths: {mu1.shape}, {mu2.shape}"
+        assert (
+            sigma1.shape == sigma2.shape
+        ), f"Training and test covariances have different dimensions: {sigma1.shape}, {sigma2.shape}"
+        diff = mu1 - mu2
+        # product might be almost singular
+        covmean, _ = linalg.sqrtm(sigma1.dot(sigma2), disp=False)
+        if not np.isfinite(covmean).all():
+            msg = (
+                "fid calculation produces singular product; adding %s to diagonal of cov estimates"
+                % eps
+            )
+            warnings.warn(msg)
+            offset = np.eye(sigma1.shape[0]) * eps
+            covmean = linalg.sqrtm((sigma1 + offset).dot(sigma2 + offset))
+        # numerical error might give slight imaginary component
+        if np.iscomplexobj(covmean):
+            if not np.allclose(np.diagonal(covmean).imag, 0, atol=1e-3):
+                m = np.max(np.abs(covmean.imag))
+                raise ValueError("Imaginary component {}".format(m))
+            covmean = covmean.real
+        tr_covmean = np.trace(covmean)
+        return diff.dot(diff) + np.trace(sigma1) + np.trace(sigma2) - 2 * tr_covmean
+class Evaluator:
+    def __init__(
+        self,
+        session,
+        batch_size=64,
+        softmax_batch_size=512,
+    ):
+        self.sess = session
+        self.batch_size = batch_size
+        self.softmax_batch_size = softmax_batch_size
+        self.manifold_estimator = ManifoldEstimator(session)
+        with self.sess.graph.as_default():
+            self.image_input = tf.placeholder(tf.float32, shape=[None, None, None, 3])
+            self.softmax_input = tf.placeholder(tf.float32, shape=[None, 2048])
+            self.pool_features, self.spatial_features = _create_feature_graph(self.image_input)
+            self.softmax = _create_softmax_graph(self.softmax_input)
+    def warmup(self):
+        self.compute_activations(np.zeros([1, 8, 64, 64, 3]))
+    def read_activations(self, npz_path: str) -> Tuple[np.ndarray, np.ndarray]:
+        with open_npz_array(npz_path, "arr_0") as reader:
+            return self.compute_activations(reader.read_batches(self.batch_size))
+    def compute_activations(self, batches: Iterable[np.ndarray]) -> Tuple[np.ndarray, np.ndarray]:
+        """
+        Compute image features for downstream evals.
+        :param batches: a iterator over NHWC numpy arrays in [0, 255].
+        :return: a tuple of numpy arrays of shape [N x X], where X is a feature
+                 dimension. The tuple is (pool_3, spatial).
+        """
+        preds = []
+        spatial_preds = []
+        for batch in tqdm(batches):
+            batch = batch.astype(np.float32)
+            pred, spatial_pred = self.sess.run(
+                [self.pool_features, self.spatial_features], {self.image_input: batch}
+            )
+            preds.append(pred.reshape([pred.shape[0], -1]))
+            spatial_preds.append(spatial_pred.reshape([spatial_pred.shape[0], -1]))
+        return (
+            np.concatenate(preds, axis=0),
+            np.concatenate(spatial_preds, axis=0),
+        )
+    def read_statistics(
+        self, npz_path: str, activations: Tuple[np.ndarray, np.ndarray]
+    ) -> Tuple[FIDStatistics, FIDStatistics]:
+        obj = np.load(npz_path)
+        if "mu" in list(obj.keys()):
+            return FIDStatistics(obj["mu"], obj["sigma"]), FIDStatistics(
+                obj["mu_s"], obj["sigma_s"]
+            )
+        return tuple(self.compute_statistics(x) for x in activations)
+    def compute_statistics(self, activations: np.ndarray) -> FIDStatistics:
+        mu = np.mean(activations, axis=0)
+        sigma = np.cov(activations, rowvar=False)
+        return FIDStatistics(mu, sigma)
+    def compute_inception_score(self, activations: np.ndarray, split_size: int = 5000) -> float:
+        softmax_out = []
+        for i in range(0, len(activations), self.softmax_batch_size):
+            acts = activations[i : i + self.softmax_batch_size]
+            softmax_out.append(self.sess.run(self.softmax, feed_dict={self.softmax_input: acts}))
+        preds = np.concatenate(softmax_out, axis=0)
+        # https://github.com/openai/improved-gan/blob/4f5d1ec5c16a7eceb206f42bfc652693601e1d5c/inception_score/model.py#L46
+        scores = []
+        for i in range(0, len(preds), split_size):
+            part = preds[i : i + split_size]
+            kl = part * (np.log(part) - np.log(np.expand_dims(np.mean(part, 0), 0)))
+            kl = np.mean(np.sum(kl, 1))
+            scores.append(np.exp(kl))
+        return float(np.mean(scores))
+    def compute_prec_recall(
+        self, activations_ref: np.ndarray, activations_sample: np.ndarray
+    ) -> Tuple[float, float]:
+        radii_1 = self.manifold_estimator.manifold_radii(activations_ref)
+        radii_2 = self.manifold_estimator.manifold_radii(activations_sample)
+        pr = self.manifold_estimator.evaluate_pr(
+            activations_ref, radii_1, activations_sample, radii_2
+        )
+        return (float(pr[0][0]), float(pr[1][0]))
+class ManifoldEstimator:
+    """
+    A helper for comparing manifolds of feature vectors.
+    Each feature vector gets one radius per neighborhood size (its distance to
+    the k-th nearest neighbor); another vector counts as "in the manifold" when
+    it falls within the radius of at least one such vector.
+    Adapted from https://github.com/kynkaat/improved-precision-and-recall-metric/blob/f60f25e5ad933a79135c783fcda53de30f42c9b9/precision_recall.py#L57
+    """
+    def __init__(
+        self,
+        session,
+        row_batch_size=10000,
+        col_batch_size=10000,
+        nhood_sizes=(3,),
+        clamp_to_percentile=None,
+        eps=1e-5,
+    ):
+        """
+        Estimate the manifold of given feature vectors.
+        :param session: the TensorFlow session.
+        :param row_batch_size: row batch size to compute pairwise distances
+                               (parameter to trade-off between memory usage and performance).
+        :param col_batch_size: column batch size to compute pairwise distances.
+        :param nhood_sizes: number of neighbors used to estimate the manifold.
+        :param clamp_to_percentile: prune hyperspheres that have radius larger than
+                                    the given percentile.
+        :param eps: small number for numerical stability.
+        """
+        self.distance_block = DistanceBlock(session)
+        self.row_batch_size = row_batch_size
+        self.col_batch_size = col_batch_size
+        self.nhood_sizes = nhood_sizes
+        self.num_nhoods = len(nhood_sizes)
+        self.clamp_to_percentile = clamp_to_percentile
+        self.eps = eps
+    def warmup(self):
+        """
+        Run evaluate_pr once on a single all-zero feature vector and radius.
+        """
+        feats, radii = (
+            np.zeros([1, 2048], dtype=np.float32),
+            np.zeros([1, 1], dtype=np.float32),
+        )
+        self.evaluate_pr(feats, radii, feats, radii)
+    def manifold_radii(self, features: np.ndarray) -> np.ndarray:
+        """
+        Compute, for every feature vector, its distance to the k-th nearest
+        neighbor for each k in `nhood_sizes`.
+        :param features: array of feature vectors, one per row.
+        :return: float32 array of shape [len(features), len(nhood_sizes)].
+        """
+        num_images = len(features)
+        # Estimate manifold of features by calculating distances to k-NN of each sample.
+        radii = np.zeros([num_images, self.num_nhoods], dtype=np.float32)
+        # Scratch buffer reused for every row batch: one row batch against all features.
+        distance_batch = np.zeros([self.row_batch_size, num_images], dtype=np.float32)
+        # Partitioning on indices 0..max(nhood_sizes) puts each of those columns in
+        # its sorted position, so column k holds the k-th smallest distance.
+        seq = np.arange(max(self.nhood_sizes) + 1, dtype=np.int32)
+        for begin1 in range(0, num_images, self.row_batch_size):
+            end1 = min(begin1 + self.row_batch_size, num_images)
+            row_batch = features[begin1:end1]
+            for begin2 in range(0, num_images, self.col_batch_size):
+                end2 = min(begin2 + self.col_batch_size, num_images)
+                col_batch = features[begin2:end2]
+                # Compute distances between batches.
+                distance_batch[
+                    0 : end1 - begin1, begin2:end2
+                ] = self.distance_block.pairwise_distances(row_batch, col_batch)
+            # Find the k-nearest neighbor from the current batch.
+            # NOTE(review): features are compared against themselves, so column 0 is
+            # presumably each sample's zero self-distance -- confirm.
+            radii[begin1:end1, :] = np.concatenate(
+                [
+                    x[:, self.nhood_sizes]
+                    for x in _numpy_partition(distance_batch[0 : end1 - begin1, :], seq, axis=1)
+                ],
+                axis=0,
+            )
+        if self.clamp_to_percentile is not None:
+            # Radii above the per-column percentile are zeroed out.
+            max_distances = np.percentile(radii, self.clamp_to_percentile, axis=0)
+            radii[radii > max_distances] = 0
+        return radii
+    def evaluate(self, features: np.ndarray, radii: np.ndarray, eval_features: np.ndarray):
+        """
+        Evaluate if new feature vectors are at the manifold.
+        :param features: reference feature vectors, one per row.
+        :param radii: radii of the reference vectors, one row per reference vector.
+        :param eval_features: feature vectors to test against the reference manifold.
+        :return: a dict with the mean of the predictions ("fraction"), the
+                 per-vector predictions, a per-vector max of radius / distance,
+                 and the index of the nearest reference vector.
+        """
+        num_eval_images = eval_features.shape[0]
+        num_ref_images = radii.shape[0]
+        # Scratch buffer reused for every batch of evaluated vectors.
+        distance_batch = np.zeros([self.row_batch_size, num_ref_images], dtype=np.float32)
+        batch_predictions = np.zeros([num_eval_images, self.num_nhoods], dtype=np.int32)
+        max_realism_score = np.zeros([num_eval_images], dtype=np.float32)
+        nearest_indices = np.zeros([num_eval_images], dtype=np.int32)
+        for begin1 in range(0, num_eval_images, self.row_batch_size):
+            end1 = min(begin1 + self.row_batch_size, num_eval_images)
+            feature_batch = eval_features[begin1:end1]
+            for begin2 in range(0, num_ref_images, self.col_batch_size):
+                end2 = min(begin2 + self.col_batch_size, num_ref_images)
+                ref_batch = features[begin2:end2]
+                distance_batch[
+                    0 : end1 - begin1, begin2:end2
+                ] = self.distance_block.pairwise_distances(feature_batch, ref_batch)
+            # From the minibatch of new feature vectors, determine if they are in the estimated manifold.
+            # If a feature vector is inside a hypersphere of some reference sample, then
+            # the new sample lies at the estimated manifold.
+            # The radii of the hyperspheres are determined from distances of neighborhood size k.
+            # The trailing None axis broadcasts the distances against the radii columns.
+            samples_in_manifold = distance_batch[0 : end1 - begin1, :, None] <= radii
+            batch_predictions[begin1:end1] = np.any(samples_in_manifold, axis=1).astype(np.int32)
+            # eps keeps the ratio finite when a distance is zero.
+            max_realism_score[begin1:end1] = np.max(
+                radii[:, 0] / (distance_batch[0 : end1 - begin1, :] + self.eps), axis=1
+            )
+            nearest_indices[begin1:end1] = np.argmin(distance_batch[0 : end1 - begin1, :], axis=1)
+        # NOTE: the key spelling "max_realisim_score" is part of the returned
+        # interface and is left unchanged.
+        return {
+            "fraction": float(np.mean(batch_predictions)),
+            "batch_predictions": batch_predictions,
+            "max_realisim_score": max_realism_score,
+            "nearest_indices": nearest_indices,
+        }
+    def evaluate_pr(
+        self,
+        features_1: np.ndarray,
+        radii_1: np.ndarray,
+        features_2: np.ndarray,
+        radii_2: np.ndarray,
+    ) -> Tuple[np.ndarray, np.ndarray]:
+        """
+        Evaluate precision and recall efficiently.
+        :param features_1: [N1 x D] feature vectors for reference batch.
+        :param radii_1: [N1 x K1] radii for reference vectors.
+        :param features_2: [N2 x D] feature vectors for the other batch.
+        :param radii_2: [N2 x K2] radii for other vectors.
+        :return: a tuple of arrays for (precision, recall):
+                 - precision: an np.ndarray of length K1
+                 - recall: an np.ndarray of length K2
+        """
+        # Per-vector flags, OR-ed together across all batch pairs below.
+        features_1_status = np.zeros([len(features_1), radii_2.shape[1]], dtype=np.bool_)
+        features_2_status = np.zeros([len(features_2), radii_1.shape[1]], dtype=np.bool_)
+        for begin_1 in range(0, len(features_1), self.row_batch_size):
+            # end_1 / end_2 may run past the array end; slicing clamps them.
+            end_1 = begin_1 + self.row_batch_size
+            batch_1 = features_1[begin_1:end_1]
+            for begin_2 in range(0, len(features_2), self.col_batch_size):
+                end_2 = begin_2 + self.col_batch_size
+                batch_2 = features_2[begin_2:end_2]
+                batch_1_in, batch_2_in = self.distance_block.less_thans(
+                    batch_1, radii_1[begin_1:end_1], batch_2, radii_2[begin_2:end_2]
+                )
+                features_1_status[begin_1:end_1] |= batch_1_in
+                features_2_status[begin_2:end_2] |= batch_2_in
+        # Precision: fraction of batch-2 vectors covered by batch-1 radii.
+        # Recall: fraction of batch-1 vectors covered by batch-2 radii.
+        return (
+            np.mean(features_2_status.astype(np.float64), axis=0),
+            np.mean(features_1_status.astype(np.float64), axis=0),
+        )
+class DistanceBlock:
+    """
+    Calculate pairwise distances between vectors.
+    The TF ops are built once in the constructor and evaluated through the
+    given session by `pairwise_distances` and `less_thans`.
+    Adapted from https://github.com/kynkaat/improved-precision-and-recall-metric/blob/f60f25e5ad933a79135c783fcda53de30f42c9b9/precision_recall.py#L34
+    """
+    def __init__(self, session):
+        """
+        Build the distance and comparison ops in the session's graph.
+        :param session: the TensorFlow session used to run the ops.
+        """
+        self.session = session
+        # Initialize TF graph to calculate pairwise distances.
+        with session.graph.as_default():
+            self._features_batch1 = tf.placeholder(tf.float32, shape=[None, None])
+            self._features_batch2 = tf.placeholder(tf.float32, shape=[None, None])
+            # Try float16 first; fall back to float32 when any float16 result is
+            # not finite.
+            distance_block_16 = _batch_pairwise_distances(
+                tf.cast(self._features_batch1, tf.float16),
+                tf.cast(self._features_batch2, tf.float16),
+            )
+            self.distance_block = tf.cond(
+                tf.reduce_all(tf.math.is_finite(distance_block_16)),
+                lambda: tf.cast(distance_block_16, tf.float32),
+                lambda: _batch_pairwise_distances(self._features_batch1, self._features_batch2),
+            )
+            # Extra logic for less thans.
+            self._radii1 = tf.placeholder(tf.float32, shape=[None, None])
+            self._radii2 = tf.placeholder(tf.float32, shape=[None, None])
+            # Trailing axis lets the distance matrix broadcast against radii columns.
+            dist32 = tf.cast(self.distance_block, tf.float32)[..., None]
+            # Per batch-1 row: within the radius of any batch-2 vector.
+            self._batch_1_in = tf.math.reduce_any(dist32 <= self._radii2, axis=1)
+            # Per batch-2 row: within the radius of any batch-1 vector.
+            self._batch_2_in = tf.math.reduce_any(dist32 <= self._radii1[:, None], axis=0)
+    def pairwise_distances(self, U, V):
+        """
+        Evaluate pairwise distances between two batches of feature vectors.
+        :param U: first batch of feature vectors, one per row.
+        :param V: second batch of feature vectors, one per row.
+        :return: the evaluated distance matrix between rows of U and rows of V.
+        """
+        return self.session.run(
+            self.distance_block,
+            feed_dict={self._features_batch1: U, self._features_batch2: V},
+        )
+    def less_thans(self, batch_1, radii_1, batch_2, radii_2):
+        """
+        Evaluate, in one session run, which vectors of each batch lie within
+        the radii of the other batch.
+        :param batch_1: first batch of feature vectors.
+        :param radii_1: radii belonging to the rows of batch_1.
+        :param batch_2: second batch of feature vectors.
+        :param radii_2: radii belonging to the rows of batch_2.
+        :return: a list [batch_1_in, batch_2_in] of boolean arrays.
+        """
+        return self.session.run(
+            [self._batch_1_in, self._batch_2_in],
+            feed_dict={
+                self._features_batch1: batch_1,
+                self._features_batch2: batch_2,
+                self._radii1: radii_1,
+                self._radii2: radii_2,
+            },
+        )
+def _batch_pairwise_distances(U, V):
+    """
+    Compute pairwise distances between two batches of feature vectors.
+    The result is the squared Euclidean distance (no square root is taken),
+    built as |u|^2 - 2 u.v + |v|^2 and clamped at zero.
+    :param U: tensor of feature vectors, one per row.
+    :param V: tensor of feature vectors, one per row.
+    :return: tensor D where D[i, j] is the squared distance between U[i] and V[j].
+    """
+    with tf.variable_scope("pairwise_dist_block"):
+        # Squared norms of each row in U and V.
+        norm_u = tf.reduce_sum(tf.square(U), 1)
+        norm_v = tf.reduce_sum(tf.square(V), 1)
+        # norm_u as a column and norm_v as a row vectors.
+        norm_u = tf.reshape(norm_u, [-1, 1])
+        norm_v = tf.reshape(norm_v, [1, -1])
+        # Pairwise squared Euclidean distances.
+        # The maximum with 0.0 removes small negative values from rounding.
+        D = tf.maximum(norm_u - 2 * tf.matmul(U, V, False, True) + norm_v, 0.0)
+    return D
+class NpzArrayReader(ABC):
+    @abstractmethod
+    def read_batch(self, batch_size: int) -> Optional[np.ndarray]:
+        pass
+    @abstractmethod
+    def remaining(self) -> int:
+        pass
+    def read_batches(self, batch_size: int) -> Iterable[np.ndarray]:
+        def gen_fn():
+            while True:
+                batch = self.read_batch(batch_size)
+                if batch is None:
+                    break
+                yield batch
+        rem = self.remaining()
+        num_batches = rem // batch_size + int(rem % batch_size != 0)
+        return BatchIterator(gen_fn, num_batches)
+class BatchIterator:
+    def __init__(self, gen_fn, length):
+        self.gen_fn = gen_fn
+        self.length = length
+    def __len__(self):
+        return self.length
+    def __iter__(self):
+        return self.gen_fn()
+class StreamingNpzArrayReader(NpzArrayReader):
+    def __init__(self, arr_f, shape, dtype):
+        self.arr_f = arr_f
+        self.shape = shape
+        self.dtype = dtype
+        self.idx = 0
+    def read_batch(self, batch_size: int) -> Optional[np.ndarray]:
+        if self.idx >= self.shape[0]:
+            return None
+        bs = min(batch_size, self.shape[0] - self.idx)
+        self.idx += bs
+        if self.dtype.itemsize == 0:
+            return np.ndarray([bs, *self.shape[1:]], dtype=self.dtype)
+        read_count = bs * np.prod(self.shape[1:])
+        read_size = int(read_count * self.dtype.itemsize)
+        data = _read_bytes(self.arr_f, read_size, "array data")
+        return np.frombuffer(data, dtype=self.dtype).reshape([bs, *self.shape[1:]])
+    def remaining(self) -> int:
+        return max(0, self.shape[0] - self.idx)
+class MemoryNpzArrayReader(NpzArrayReader):
+    def __init__(self, arr):
+        self.arr = arr
+        self.idx = 0
+    @classmethod
+    def load(cls, path: str, arr_name: str):
+        with open(path, "rb") as f:
+            arr = np.load(f)[arr_name]
+        return cls(arr)
+    def read_batch(self, batch_size: int) -> Optional[np.ndarray]:
+        if self.idx >= self.arr.shape[0]:
+            return None
+        res = self.arr[self.idx : self.idx + batch_size]
+        self.idx += batch_size
+        return res
+    def remaining(self) -> int:
+        return max(0, self.arr.shape[0] - self.idx)
+@contextmanager
+def open_npz_array(path: str, arr_name: str) -> NpzArrayReader:
+    """
+    Context manager yielding a reader over array `arr_name` in the npz file at `path`.
+    A StreamingNpzArrayReader over the open archive member is yielded when the
+    npy header version is 1.0 or 2.0 and the array is neither Fortran-ordered
+    nor of object dtype; in every other case the array is loaded whole through
+    MemoryNpzArrayReader.load.
+    NOTE(review): because of @contextmanager, calling this returns a context
+    manager that yields the annotated NpzArrayReader rather than the reader itself.
+    :param path: path to the npz file.
+    :param arr_name: name of the array inside the archive (without ".npy").
+    """
+    with _open_npy_file(path, arr_name) as arr_f:
+        version = np.lib.format.read_magic(arr_f)
+        if version == (1, 0):
+            header = np.lib.format.read_array_header_1_0(arr_f)
+        elif version == (2, 0):
+            header = np.lib.format.read_array_header_2_0(arr_f)
+        else:
+            # Header version not handled here: fall back to loading in memory.
+            yield MemoryNpzArrayReader.load(path, arr_name)
+            return
+        shape, fortran, dtype = header
+        if fortran or dtype.hasobject:
+            yield MemoryNpzArrayReader.load(path, arr_name)
+        else:
+            # The streaming reader reads from arr_f, which stays open only
+            # while the caller is inside the with-block.
+            yield StreamingNpzArrayReader(arr_f, shape, dtype)
+def _read_bytes(fp, size, error_template="ran out of data"):
+    """
+    Copied from: https://github.com/numpy/numpy/blob/fb215c76967739268de71aa4bda55dd1b062bc2e/numpy/lib/format.py#L788-L886
+    Read from file-like object until size bytes are read.
+    Raises ValueError if not EOF is encountered before size bytes are read.
+    Non-blocking objects only supported if they derive from io objects.
+    Required as e.g. ZipExtFile in python 2.6 can return less data than
+    requested.
+    """
+    data = bytes()
+    while True:
+        # io files (default in python3) return None or raise on
+        # would-block, python2 file will truncate, probably nothing can be
+        # done about that.  note that regular files can't be non-blocking
+        try:
+            r = fp.read(size - len(data))
+            data += r
+            if len(r) == 0 or len(data) == size:
+                break
+        except io.BlockingIOError:
+            pass
+    if len(data) != size:
+        msg = "EOF: reading %s, expected %d bytes got %d"
+        raise ValueError(msg % (error_template, size, len(data)))
+    else:
+        return data
+@contextmanager
+def _open_npy_file(path: str, arr_name: str):
+    with open(path, "rb") as f:
+        with zipfile.ZipFile(f, "r") as zip_f:
+            if f"{arr_name}.npy" not in zip_f.namelist():
+                raise ValueError(f"missing {arr_name} in npz file")
+            with zip_f.open(f"{arr_name}.npy", "r") as arr_f:
+                yield arr_f
+def _download_inception_model():
+    if os.path.exists(INCEPTION_V3_PATH):
+        return
+    print("downloading InceptionV3 model...")
+    with requests.get(INCEPTION_V3_URL, stream=True) as r:
+        r.raise_for_status()
+        tmp_path = INCEPTION_V3_PATH + ".tmp"
+        with open(tmp_path, "wb") as f:
+            for chunk in tqdm(r.iter_content(chunk_size=8192)):
+                f.write(chunk)
+        os.rename(tmp_path, INCEPTION_V3_PATH)
+def _create_feature_graph(input_batch):
+    """
+    Import the graph stored at INCEPTION_V3_PATH with `input_batch` wired in
+    as its "ExpandDims:0" tensor, and return its feature tensors.
+    :param input_batch: tensor substituted for the graph's "ExpandDims:0" input.
+    :return: (pool3, spatial), the FID_POOL_NAME tensor and the first 7
+             channels (last axis) of the FID_SPATIAL_NAME tensor.
+    """
+    _download_inception_model()
+    # Random import prefix -- presumably so repeated imports into the same
+    # graph do not clash on names.
+    prefix = f"{random.randrange(2**32)}_{random.randrange(2**32)}"
+    with open(INCEPTION_V3_PATH, "rb") as f:
+        graph_def = tf.GraphDef()
+        graph_def.ParseFromString(f.read())
+    pool3, spatial = tf.import_graph_def(
+        graph_def,
+        input_map={f"ExpandDims:0": input_batch},
+        return_elements=[FID_POOL_NAME, FID_SPATIAL_NAME],
+        name=prefix,
+    )
+    # Relax static shapes whose leading dimension is fixed to 1.
+    _update_shapes(pool3)
+    # Keep only the first 7 channels of the spatial features.
+    spatial = spatial[..., :7]
+    return pool3, spatial
+def _create_softmax_graph(input_batch):
+    """
+    Build a softmax over `input_batch` multiplied by the weight taken from
+    the imported graph's "softmax/logits/MatMul" op.
+    :param input_batch: tensor multiplied with the weight to form the logits.
+    :return: the softmax tensor of those logits.
+    """
+    _download_inception_model()
+    # Random import prefix -- presumably so repeated imports into the same
+    # graph do not clash on names.
+    prefix = f"{random.randrange(2**32)}_{random.randrange(2**32)}"
+    with open(INCEPTION_V3_PATH, "rb") as f:
+        graph_def = tf.GraphDef()
+        graph_def.ParseFromString(f.read())
+    (matmul,) = tf.import_graph_def(
+        graph_def, return_elements=[f"softmax/logits/MatMul"], name=prefix
+    )
+    # The second input of the MatMul op is used as the weight.
+    w = matmul.inputs[1]
+    logits = tf.matmul(input_batch, w)
+    return tf.nn.softmax(logits)
+def _update_shapes(pool3):
+    """
+    For every output of every op in pool3's graph, replace a leading static
+    dimension of 1 with None (unknown).
+    :param pool3: a tensor; only its graph is used, and it is returned unchanged.
+    :return: the `pool3` argument.
+    """
+    # https://github.com/bioinf-jku/TTUR/blob/73ab375cdf952a12686d9aa7978567771084da42/fid.py#L50-L63
+    ops = pool3.graph.get_operations()
+    for op in ops:
+        for o in op.outputs:
+            shape = o.get_shape()
+            # Outputs of unknown rank have no dims to rewrite.
+            if shape._dims is not None:  # pylint: disable=protected-access
+                # shape = [s.value for s in shape] TF 1.x
+                shape = [s for s in shape]  # TF 2.x
+                new_shape = []
+                for j, s in enumerate(shape):
+                    if s == 1 and j == 0:
+                        new_shape.append(None)
+                    else:
+                        new_shape.append(s)
+                # Overwrites the tensor's cached static shape through a private attribute.
+                o.__dict__["_shape_val"] = tf.TensorShape(new_shape)
+    return pool3
+def _numpy_partition(arr, kth, **kwargs):
+    num_workers = min(cpu_count(), len(arr))
+    chunk_size = len(arr) // num_workers
+    extra = len(arr) % num_workers
+    start_idx = 0
+    batches = []
+    for i in range(num_workers):
+        size = chunk_size + (1 if i < extra else 0)
+        batches.append(arr[start_idx : start_idx + size])
+        start_idx += size
+    with ThreadPool(num_workers) as pool:
+        return list(pool.map(partial(np.partition, kth=kth, **kwargs), batches))
+# Run the evaluation only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()

evaluator_base.log ADDED Viewed

	@@ -0,0 +1,5 @@

+nohup: ignoring input
+Traceback (most recent call last):
+  File "/gemini/space/gzy_new/models/Sida/evaluator_base.py", line 16, in <module>
+    import tensorflow.compat.v1 as tf
+ModuleNotFoundError: No module named 'tensorflow'

evaluator_base.py ADDED Viewed

	@@ -0,0 +1,685 @@

+import argparse
+import io
+import os
+import random
+import warnings
+import zipfile
+from abc import ABC, abstractmethod
+from contextlib import contextmanager
+from functools import partial
+from multiprocessing import cpu_count
+from multiprocessing.pool import ThreadPool
+from typing import Iterable, Optional, Tuple
+import numpy as np
+import requests
+import tensorflow.compat.v1 as tf
+from scipy import linalg
+from tqdm.auto import tqdm
+# Remote location of the graph definition (.pb) file.
+INCEPTION_V3_URL = "https://openaipublic.blob.core.windows.net/diffusion/jul-2021/ref_batches/classify_image_graph_def.pb"
+# Local file name for the graph definition; relative, so it resolves against
+# the current working directory.
+INCEPTION_V3_PATH = "classify_image_graph_def.pb"
+# Presumably the names of the pooled and spatial feature tensors inside the
+# graph -- verify against where the graph is imported.
+FID_POOL_NAME = "pool_3:0"
+FID_SPATIAL_NAME = "mixed_6/conv:0"
+def main():
+    parser = argparse.ArgumentParser()
+    #/gemini/space/gzy_new/Rectified_Noise/Finetune/finetune-coco/sd3_rectified_samples.npz
+    parser.add_argument("--ref_batch", default='/gemini/space/hsd/project/dataset/cc3m-wds/validation/metadata.npz',help="path to reference batch npz file")
+    parser.add_argument("--sample_batch", default='/gemini/space/gzy_new/models/Sida/sd3_lora_samples_3w/checkpoint-checkpoint-500000-rank32-guidance-7.0-steps-40-size-512x512.npz', help="path to sample batch npz file")
+    parser.add_argument("--save_path",  default='/gemini/space/gzy_new/models/Sida/sd3_lora_samples_3w/checkpoint-checkpoint-500000-rank32-guidance-7.0-steps-40-size-512x512/result',help="path to sample batch npz file")
+    parser.add_argument("--cfg_cond", default=1, type=int)
+    parser.add_argument("--step", default=1, type=int)
+    parser.add_argument("--cfg", default=1.0, type=float)
+    parser.add_argument("--cls_cfg", default=1.0, type=float)
+    parser.add_argument("--gh", default=1.0, type=float)
+    parser.add_argument("--num_steps", default=50, type=int)
+    args = parser.parse_args()
+    if not os.path.exists(args.save_path):
+        os.mkdir(args.save_path)
+    # NOTE: 当前环境中 TensorFlow 与 CUDA/cuDNN 可能版本不匹配（例如报 "No DNN in stream executor"），
+    # 这会导致 GPU 计算失败。这里强制使用 CPU 进行评估（会慢一些，但能保证运行）。
+    os.environ["CUDA_VISIBLE_DEVICES"] = ""
+    config = tf.ConfigProto(
+        allow_soft_placement=True,  # allows DecodeJpeg to run on CPU in Inception graph
+        device_count={"GPU": 0},
+    )
+    evaluator = Evaluator(tf.Session(config=config))
+    print("warming up TensorFlow...")
+    # This will cause TF to print a bunch of verbose stuff now rather
+    # than after the next print(), to help prevent confusion.
+    evaluator.warmup()
+    print("computing reference batch activations...")
+    ref_acts = evaluator.read_activations(args.ref_batch)
+    print("computing/reading reference batch statistics...")
+    ref_stats, ref_stats_spatial = evaluator.read_statistics(args.ref_batch, ref_acts)
+    print("computing sample batch activations...")
+    sample_acts = evaluator.read_activations(args.sample_batch)
+    print("computing/reading sample batch statistics...")
+    sample_stats, sample_stats_spatial = evaluator.read_statistics(args.sample_batch, sample_acts)
+    print("Computing evaluations...")
+    Inception_Score = evaluator.compute_inception_score(sample_acts[0])
+    FID = sample_stats.frechet_distance(ref_stats)
+    sFID = sample_stats_spatial.frechet_distance(ref_stats_spatial)
+    prec, recall = evaluator.compute_prec_recall(ref_acts[0], sample_acts[0])
+    print("Inception Score:", Inception_Score)
+    print("FID:", FID)
+    print("sFID:", sFID)
+    print("Precision:", prec)
+    print("Recall:", recall)
+    if args.cfg_cond:
+        file_path = args.save_path + str(args.num_steps) + str(args.step) + str(args.cfg) + str(args.gh) + str(args.cls_cfg)+ "cfg_cond_true.txt"
+    else:
+        file_path = args.save_path + str(args.num_steps) + str(args.step) + str(args.cfg) + str(args.gh) + str(args.cls_cfg)+ "cfg_cond_false.txt"
+    with open(file_path, "w") as file:
+        file.write("Inception Score: {}\n".format(Inception_Score))
+        file.write("FID: {}\n".format(FID))
+        file.write("sFID: {}\n".format(sFID))
+        file.write("Precision: {}\n".format(prec))
+        file.write("Recall: {}\n".format(recall))
+class InvalidFIDException(Exception):
+    pass
+class FIDStatistics:
+    def __init__(self, mu: np.ndarray, sigma: np.ndarray):
+        self.mu = mu
+        self.sigma = sigma
+    def frechet_distance(self, other, eps=1e-6):
+        """
+        Compute the Frechet distance between two sets of statistics.
+        """
+        # https://github.com/bioinf-jku/TTUR/blob/73ab375cdf952a12686d9aa7978567771084da42/fid.py#L132
+        mu1, sigma1 = self.mu, self.sigma
+        mu2, sigma2 = other.mu, other.sigma
+        mu1 = np.atleast_1d(mu1)
+        mu2 = np.atleast_1d(mu2)
+        sigma1 = np.atleast_2d(sigma1)
+        sigma2 = np.atleast_2d(sigma2)
+        assert (
+            mu1.shape == mu2.shape
+        ), f"Training and test mean vectors have different lengths: {mu1.shape}, {mu2.shape}"
+        assert (
+            sigma1.shape == sigma2.shape
+        ), f"Training and test covariances have different dimensions: {sigma1.shape}, {sigma2.shape}"
+        diff = mu1 - mu2
+        # product might be almost singular
+        covmean, _ = linalg.sqrtm(sigma1.dot(sigma2), disp=False)
+        if not np.isfinite(covmean).all():
+            msg = (
+                "fid calculation produces singular product; adding %s to diagonal of cov estimates"
+                % eps
+            )
+            warnings.warn(msg)
+            offset = np.eye(sigma1.shape[0]) * eps
+            covmean = linalg.sqrtm((sigma1 + offset).dot(sigma2 + offset))
+        # numerical error might give slight imaginary component
+        if np.iscomplexobj(covmean):
+            if not np.allclose(np.diagonal(covmean).imag, 0, atol=1e-3):
+                m = np.max(np.abs(covmean.imag))
+                raise ValueError("Imaginary component {}".format(m))
+            covmean = covmean.real
+        tr_covmean = np.trace(covmean)
+        return diff.dot(diff) + np.trace(sigma1) + np.trace(sigma2) - 2 * tr_covmean
+class Evaluator:
+    def __init__(
+        self,
+        session,
+        batch_size=64,
+        softmax_batch_size=512,
+    ):
+        self.sess = session
+        self.batch_size = batch_size
+        self.softmax_batch_size = softmax_batch_size
+        self.manifold_estimator = ManifoldEstimator(session)
+        with self.sess.graph.as_default():
+            self.image_input = tf.placeholder(tf.float32, shape=[None, None, None, 3])
+            self.softmax_input = tf.placeholder(tf.float32, shape=[None, 2048])
+            self.pool_features, self.spatial_features = _create_feature_graph(self.image_input)
+            self.softmax = _create_softmax_graph(self.softmax_input)
+    def warmup(self):
+        self.compute_activations(np.zeros([1, 8, 64, 64, 3]))
+    def read_activations(self, npz_path: str) -> Tuple[np.ndarray, np.ndarray]:
+        with open_npz_array(npz_path, "arr_0") as reader:
+            return self.compute_activations(reader.read_batches(self.batch_size))
+    def compute_activations(self, batches: Iterable[np.ndarray]) -> Tuple[np.ndarray, np.ndarray]:
+        """
+        Compute image features for downstream evals.
+        :param batches: a iterator over NHWC numpy arrays in [0, 255].
+        :return: a tuple of numpy arrays of shape [N x X], where X is a feature
+                 dimension. The tuple is (pool_3, spatial).
+        """
+        preds = []
+        spatial_preds = []
+        for batch in tqdm(batches):
+            batch = batch.astype(np.float32)
+            pred, spatial_pred = self.sess.run(
+                [self.pool_features, self.spatial_features], {self.image_input: batch}
+            )
+            preds.append(pred.reshape([pred.shape[0], -1]))
+            spatial_preds.append(spatial_pred.reshape([spatial_pred.shape[0], -1]))
+        return (
+            np.concatenate(preds, axis=0),
+            np.concatenate(spatial_preds, axis=0),
+        )
+    def read_statistics(
+        self, npz_path: str, activations: Tuple[np.ndarray, np.ndarray]
+    ) -> Tuple[FIDStatistics, FIDStatistics]:
+        obj = np.load(npz_path)
+        if "mu" in list(obj.keys()):
+            return FIDStatistics(obj["mu"], obj["sigma"]), FIDStatistics(
+                obj["mu_s"], obj["sigma_s"]
+            )
+        return tuple(self.compute_statistics(x) for x in activations)
+    def compute_statistics(self, activations: np.ndarray) -> FIDStatistics:
+        mu = np.mean(activations, axis=0)
+        sigma = np.cov(activations, rowvar=False)
+        return FIDStatistics(mu, sigma)
+    def compute_inception_score(self, activations: np.ndarray, split_size: int = 5000) -> float:
+        softmax_out = []
+        for i in range(0, len(activations), self.softmax_batch_size):
+            acts = activations[i : i + self.softmax_batch_size]
+            softmax_out.append(self.sess.run(self.softmax, feed_dict={self.softmax_input: acts}))
+        preds = np.concatenate(softmax_out, axis=0)
+        # https://github.com/openai/improved-gan/blob/4f5d1ec5c16a7eceb206f42bfc652693601e1d5c/inception_score/model.py#L46
+        scores = []
+        for i in range(0, len(preds), split_size):
+            part = preds[i : i + split_size]
+            kl = part * (np.log(part) - np.log(np.expand_dims(np.mean(part, 0), 0)))
+            kl = np.mean(np.sum(kl, 1))
+            scores.append(np.exp(kl))
+        return float(np.mean(scores))
+    def compute_prec_recall(
+        self, activations_ref: np.ndarray, activations_sample: np.ndarray
+    ) -> Tuple[float, float]:
+        radii_1 = self.manifold_estimator.manifold_radii(activations_ref)
+        radii_2 = self.manifold_estimator.manifold_radii(activations_sample)
+        pr = self.manifold_estimator.evaluate_pr(
+            activations_ref, radii_1, activations_sample, radii_2
+        )
+        return (float(pr[0][0]), float(pr[1][0]))
+class ManifoldEstimator:
+    """
+    A helper for comparing manifolds of feature vectors.
+    Adapted from https://github.com/kynkaat/improved-precision-and-recall-metric/blob/f60f25e5ad933a79135c783fcda53de30f42c9b9/precision_recall.py#L57
+    """
+    def __init__(
+        self,
+        session,
+        row_batch_size=10000,
+        col_batch_size=10000,
+        nhood_sizes=(3,),
+        clamp_to_percentile=None,
+        eps=1e-5,
+    ):
+        """
+        Estimate the manifold of given feature vectors.
+        :param session: the TensorFlow session.
+        :param row_batch_size: row batch size to compute pairwise distances
+                               (parameter to trade-off between memory usage and performance).
+        :param col_batch_size: column batch size to compute pairwise distances.
+        :param nhood_sizes: number of neighbors used to estimate the manifold.
+        :param clamp_to_percentile: prune hyperspheres that have radius larger than
+                                    the given percentile.
+        :param eps: small number for numerical stability.
+        """
+        # Helper through which the other methods compute distances in the session.
+        self.distance_block = DistanceBlock(session)
+        self.row_batch_size = row_batch_size
+        self.col_batch_size = col_batch_size
+        self.nhood_sizes = nhood_sizes
+        # One radius column is produced per neighborhood size.
+        self.num_nhoods = len(nhood_sizes)
+        self.clamp_to_percentile = clamp_to_percentile
+        self.eps = eps
+    def warmup(self):
+        feats, radii = (
+            np.zeros([1, 2048], dtype=np.float32),
+            np.zeros([1, 1], dtype=np.float32),
+        )
+        self.evaluate_pr(feats, radii, feats, radii)
+    def manifold_radii(self, features: np.ndarray) -> np.ndarray:
+        """
+        Compute, for every feature vector, its distance to the k-th nearest
+        neighbor for each k in `nhood_sizes`.
+        :param features: array of feature vectors, one per row.
+        :return: float32 array of shape [len(features), len(nhood_sizes)].
+        """
+        num_images = len(features)
+        # Estimate manifold of features by calculating distances to k-NN of each sample.
+        radii = np.zeros([num_images, self.num_nhoods], dtype=np.float32)
+        # Scratch buffer reused for every row batch: one row batch against all features.
+        distance_batch = np.zeros([self.row_batch_size, num_images], dtype=np.float32)
+        # Partitioning on indices 0..max(nhood_sizes) puts each of those columns in
+        # its sorted position, so column k holds the k-th smallest distance.
+        seq = np.arange(max(self.nhood_sizes) + 1, dtype=np.int32)
+        for begin1 in range(0, num_images, self.row_batch_size):
+            end1 = min(begin1 + self.row_batch_size, num_images)
+            row_batch = features[begin1:end1]
+            for begin2 in range(0, num_images, self.col_batch_size):
+                end2 = min(begin2 + self.col_batch_size, num_images)
+                col_batch = features[begin2:end2]
+                # Compute distances between batches.
+                distance_batch[
+                    0 : end1 - begin1, begin2:end2
+                ] = self.distance_block.pairwise_distances(row_batch, col_batch)
+            # Find the k-nearest neighbor from the current batch.
+            # NOTE(review): features are compared against themselves, so column 0 is
+            # presumably each sample's zero self-distance -- confirm.
+            radii[begin1:end1, :] = np.concatenate(
+                [
+                    x[:, self.nhood_sizes]
+                    for x in _numpy_partition(distance_batch[0 : end1 - begin1, :], seq, axis=1)
+                ],
+                axis=0,
+            )
+        if self.clamp_to_percentile is not None:
+            # Radii above the per-column percentile are zeroed out.
+            max_distances = np.percentile(radii, self.clamp_to_percentile, axis=0)
+            radii[radii > max_distances] = 0
+        return radii
+    def evaluate(self, features: np.ndarray, radii: np.ndarray, eval_features: np.ndarray):
+        """
+        Evaluate if new feature vectors are at the manifold.
+        :param features: reference feature vectors, one per row.
+        :param radii: radii of the reference vectors, one row per reference vector.
+        :param eval_features: feature vectors to test against the reference manifold.
+        :return: a dict with the mean of the predictions ("fraction"), the
+                 per-vector predictions, a per-vector max of radius / distance,
+                 and the index of the nearest reference vector.
+        """
+        num_eval_images = eval_features.shape[0]
+        num_ref_images = radii.shape[0]
+        # Scratch buffer reused for every batch of evaluated vectors.
+        distance_batch = np.zeros([self.row_batch_size, num_ref_images], dtype=np.float32)
+        batch_predictions = np.zeros([num_eval_images, self.num_nhoods], dtype=np.int32)
+        max_realism_score = np.zeros([num_eval_images], dtype=np.float32)
+        nearest_indices = np.zeros([num_eval_images], dtype=np.int32)
+        for begin1 in range(0, num_eval_images, self.row_batch_size):
+            end1 = min(begin1 + self.row_batch_size, num_eval_images)
+            feature_batch = eval_features[begin1:end1]
+            for begin2 in range(0, num_ref_images, self.col_batch_size):
+                end2 = min(begin2 + self.col_batch_size, num_ref_images)
+                ref_batch = features[begin2:end2]
+                distance_batch[
+                    0 : end1 - begin1, begin2:end2
+                ] = self.distance_block.pairwise_distances(feature_batch, ref_batch)
+            # From the minibatch of new feature vectors, determine if they are in the estimated manifold.
+            # If a feature vector is inside a hypersphere of some reference sample, then
+            # the new sample lies at the estimated manifold.
+            # The radii of the hyperspheres are determined from distances of neighborhood size k.
+            # The trailing None axis broadcasts the distances against the radii columns.
+            samples_in_manifold = distance_batch[0 : end1 - begin1, :, None] <= radii
+            batch_predictions[begin1:end1] = np.any(samples_in_manifold, axis=1).astype(np.int32)
+            # eps keeps the ratio finite when a distance is zero.
+            max_realism_score[begin1:end1] = np.max(
+                radii[:, 0] / (distance_batch[0 : end1 - begin1, :] + self.eps), axis=1
+            )
+            nearest_indices[begin1:end1] = np.argmin(distance_batch[0 : end1 - begin1, :], axis=1)
+        # NOTE: the key spelling "max_realisim_score" is part of the returned
+        # interface and is left unchanged.
+        return {
+            "fraction": float(np.mean(batch_predictions)),
+            "batch_predictions": batch_predictions,
+            "max_realisim_score": max_realism_score,
+            "nearest_indices": nearest_indices,
+        }
+    def evaluate_pr(
+        self,
+        features_1: np.ndarray,
+        radii_1: np.ndarray,
+        features_2: np.ndarray,
+        radii_2: np.ndarray,
+    ) -> Tuple[np.ndarray, np.ndarray]:
+        """
+        Evaluate precision and recall efficiently.
+        :param features_1: [N1 x D] feature vectors for reference batch.
+        :param radii_1: [N1 x K1] radii for reference vectors.
+        :param features_2: [N2 x D] feature vectors for the other batch.
+        :param radii_2: [N x K2] radii for other vectors.
+        :return: a tuple of arrays for (precision, recall):
+                 - precision: an np.ndarray of length K1
+                 - recall: an np.ndarray of length K2
+        """
+        features_1_status = np.zeros([len(features_1), radii_2.shape[1]], dtype=np.bool_)
+        features_2_status = np.zeros([len(features_2), radii_1.shape[1]], dtype=np.bool_)
+        for begin_1 in range(0, len(features_1), self.row_batch_size):
+            end_1 = begin_1 + self.row_batch_size
+            batch_1 = features_1[begin_1:end_1]
+            for begin_2 in range(0, len(features_2), self.col_batch_size):
+                end_2 = begin_2 + self.col_batch_size
+                batch_2 = features_2[begin_2:end_2]
+                batch_1_in, batch_2_in = self.distance_block.less_thans(
+                    batch_1, radii_1[begin_1:end_1], batch_2, radii_2[begin_2:end_2]
+                )
+                features_1_status[begin_1:end_1] |= batch_1_in
+                features_2_status[begin_2:end_2] |= batch_2_in
+        return (
+            np.mean(features_2_status.astype(np.float64), axis=0),
+            np.mean(features_1_status.astype(np.float64), axis=0),
+        )
+class DistanceBlock:
+    """
+    Calculate pairwise distances between vectors.
+    Adapted from https://github.com/kynkaat/improved-precision-and-recall-metric/blob/f60f25e5ad933a79135c783fcda53de30f42c9b9/precision_recall.py#L34
+    """
+    def __init__(self, session):
+        self.session = session
+        # Initialize TF graph to calculate pairwise distances.
+        with session.graph.as_default():
+            self._features_batch1 = tf.placeholder(tf.float32, shape=[None, None])
+            self._features_batch2 = tf.placeholder(tf.float32, shape=[None, None])
+            distance_block_16 = _batch_pairwise_distances(
+                tf.cast(self._features_batch1, tf.float16),
+                tf.cast(self._features_batch2, tf.float16),
+            )
+            self.distance_block = tf.cond(
+                tf.reduce_all(tf.math.is_finite(distance_block_16)),
+                lambda: tf.cast(distance_block_16, tf.float32),
+                lambda: _batch_pairwise_distances(self._features_batch1, self._features_batch2),
+            )
+            # Extra logic for less thans.
+            self._radii1 = tf.placeholder(tf.float32, shape=[None, None])
+            self._radii2 = tf.placeholder(tf.float32, shape=[None, None])
+            dist32 = tf.cast(self.distance_block, tf.float32)[..., None]
+            self._batch_1_in = tf.math.reduce_any(dist32 <= self._radii2, axis=1)
+            self._batch_2_in = tf.math.reduce_any(dist32 <= self._radii1[:, None], axis=0)
+    def pairwise_distances(self, U, V):
+        """
+        Evaluate pairwise distances between two batches of feature vectors.
+        """
+        return self.session.run(
+            self.distance_block,
+            feed_dict={self._features_batch1: U, self._features_batch2: V},
+        )
+    def less_thans(self, batch_1, radii_1, batch_2, radii_2):
+        return self.session.run(
+            [self._batch_1_in, self._batch_2_in],
+            feed_dict={
+                self._features_batch1: batch_1,
+                self._features_batch2: batch_2,
+                self._radii1: radii_1,
+                self._radii2: radii_2,
+            },
+        )
+def _batch_pairwise_distances(U, V):
+    """
+    Compute pairwise distances between two batches of feature vectors.
+    """
+    with tf.variable_scope("pairwise_dist_block"):
+        # Squared norms of each row in U and V.
+        norm_u = tf.reduce_sum(tf.square(U), 1)
+        norm_v = tf.reduce_sum(tf.square(V), 1)
+        # norm_u as a column and norm_v as a row vectors.
+        norm_u = tf.reshape(norm_u, [-1, 1])
+        norm_v = tf.reshape(norm_v, [1, -1])
+        # Pairwise squared Euclidean distances.
+        D = tf.maximum(norm_u - 2 * tf.matmul(U, V, False, True) + norm_v, 0.0)
+    return D
+class NpzArrayReader(ABC):
+    @abstractmethod
+    def read_batch(self, batch_size: int) -> Optional[np.ndarray]:
+        pass
+    @abstractmethod
+    def remaining(self) -> int:
+        pass
+    def read_batches(self, batch_size: int) -> Iterable[np.ndarray]:
+        def gen_fn():
+            while True:
+                batch = self.read_batch(batch_size)
+                if batch is None:
+                    break
+                yield batch
+        rem = self.remaining()
+        num_batches = rem // batch_size + int(rem % batch_size != 0)
+        return BatchIterator(gen_fn, num_batches)
+class BatchIterator:
+    def __init__(self, gen_fn, length):
+        self.gen_fn = gen_fn
+        self.length = length
+    def __len__(self):
+        return self.length
+    def __iter__(self):
+        return self.gen_fn()
+class StreamingNpzArrayReader(NpzArrayReader):
+    def __init__(self, arr_f, shape, dtype):
+        self.arr_f = arr_f
+        self.shape = shape
+        self.dtype = dtype
+        self.idx = 0
+    def read_batch(self, batch_size: int) -> Optional[np.ndarray]:
+        if self.idx >= self.shape[0]:
+            return None
+        bs = min(batch_size, self.shape[0] - self.idx)
+        self.idx += bs
+        if self.dtype.itemsize == 0:
+            return np.ndarray([bs, *self.shape[1:]], dtype=self.dtype)
+        read_count = bs * np.prod(self.shape[1:])
+        read_size = int(read_count * self.dtype.itemsize)
+        data = _read_bytes(self.arr_f, read_size, "array data")
+        return np.frombuffer(data, dtype=self.dtype).reshape([bs, *self.shape[1:]])
+    def remaining(self) -> int:
+        return max(0, self.shape[0] - self.idx)
+class MemoryNpzArrayReader(NpzArrayReader):
+    def __init__(self, arr):
+        self.arr = arr
+        self.idx = 0
+    @classmethod
+    def load(cls, path: str, arr_name: str):
+        with open(path, "rb") as f:
+            arr = np.load(f)[arr_name]
+        return cls(arr)
+    def read_batch(self, batch_size: int) -> Optional[np.ndarray]:
+        if self.idx >= self.arr.shape[0]:
+            return None
+        res = self.arr[self.idx : self.idx + batch_size]
+        self.idx += batch_size
+        return res
+    def remaining(self) -> int:
+        return max(0, self.arr.shape[0] - self.idx)
+@contextmanager
+def open_npz_array(path: str, arr_name: str) -> NpzArrayReader:
+    with _open_npy_file(path, arr_name) as arr_f:
+        version = np.lib.format.read_magic(arr_f)
+        if version == (1, 0):
+            header = np.lib.format.read_array_header_1_0(arr_f)
+        elif version == (2, 0):
+            header = np.lib.format.read_array_header_2_0(arr_f)
+        else:
+            yield MemoryNpzArrayReader.load(path, arr_name)
+            return
+        shape, fortran, dtype = header
+        if fortran or dtype.hasobject:
+            yield MemoryNpzArrayReader.load(path, arr_name)
+        else:
+            yield StreamingNpzArrayReader(arr_f, shape, dtype)
+def _read_bytes(fp, size, error_template="ran out of data"):
+    """
+    Copied from: https://github.com/numpy/numpy/blob/fb215c76967739268de71aa4bda55dd1b062bc2e/numpy/lib/format.py#L788-L886
+    Read from file-like object until size bytes are read.
+    Raises ValueError if not EOF is encountered before size bytes are read.
+    Non-blocking objects only supported if they derive from io objects.
+    Required as e.g. ZipExtFile in python 2.6 can return less data than
+    requested.
+    """
+    data = bytes()
+    while True:
+        # io files (default in python3) return None or raise on
+        # would-block, python2 file will truncate, probably nothing can be
+        # done about that.  note that regular files can't be non-blocking
+        try:
+            r = fp.read(size - len(data))
+            data += r
+            if len(r) == 0 or len(data) == size:
+                break
+        except io.BlockingIOError:
+            pass
+    if len(data) != size:
+        msg = "EOF: reading %s, expected %d bytes got %d"
+        raise ValueError(msg % (error_template, size, len(data)))
+    else:
+        return data
+@contextmanager
+def _open_npy_file(path: str, arr_name: str):
+    with open(path, "rb") as f:
+        with zipfile.ZipFile(f, "r") as zip_f:
+            if f"{arr_name}.npy" not in zip_f.namelist():
+                raise ValueError(f"missing {arr_name} in npz file")
+            with zip_f.open(f"{arr_name}.npy", "r") as arr_f:
+                yield arr_f
+def _download_inception_model():
+    if os.path.exists(INCEPTION_V3_PATH):
+        return
+    print("downloading InceptionV3 model...")
+    with requests.get(INCEPTION_V3_URL, stream=True) as r:
+        r.raise_for_status()
+        tmp_path = INCEPTION_V3_PATH + ".tmp"
+        with open(tmp_path, "wb") as f:
+            for chunk in tqdm(r.iter_content(chunk_size=8192)):
+                f.write(chunk)
+        os.rename(tmp_path, INCEPTION_V3_PATH)
+def _create_feature_graph(input_batch):
+    _download_inception_model()
+    prefix = f"{random.randrange(2**32)}_{random.randrange(2**32)}"
+    with open(INCEPTION_V3_PATH, "rb") as f:
+        graph_def = tf.GraphDef()
+        graph_def.ParseFromString(f.read())
+    pool3, spatial = tf.import_graph_def(
+        graph_def,
+        input_map={f"ExpandDims:0": input_batch},
+        return_elements=[FID_POOL_NAME, FID_SPATIAL_NAME],
+        name=prefix,
+    )
+    _update_shapes(pool3)
+    spatial = spatial[..., :7]
+    return pool3, spatial
+def _create_softmax_graph(input_batch):
+    _download_inception_model()
+    prefix = f"{random.randrange(2**32)}_{random.randrange(2**32)}"
+    with open(INCEPTION_V3_PATH, "rb") as f:
+        graph_def = tf.GraphDef()
+        graph_def.ParseFromString(f.read())
+    (matmul,) = tf.import_graph_def(
+        graph_def, return_elements=[f"softmax/logits/MatMul"], name=prefix
+    )
+    w = matmul.inputs[1]
+    logits = tf.matmul(input_batch, w)
+    return tf.nn.softmax(logits)
+def _update_shapes(pool3):
+    # https://github.com/bioinf-jku/TTUR/blob/73ab375cdf952a12686d9aa7978567771084da42/fid.py#L50-L63
+    ops = pool3.graph.get_operations()
+    for op in ops:
+        for o in op.outputs:
+            shape = o.get_shape()
+            if shape._dims is not None:  # pylint: disable=protected-access
+                # shape = [s.value for s in shape] TF 1.x
+                shape = [s for s in shape]  # TF 2.x
+                new_shape = []
+                for j, s in enumerate(shape):
+                    if s == 1 and j == 0:
+                        new_shape.append(None)
+                    else:
+                        new_shape.append(s)
+                o.__dict__["_shape_val"] = tf.TensorShape(new_shape)
+    return pool3
+def _numpy_partition(arr, kth, **kwargs):
+    num_workers = min(cpu_count(), len(arr))
+    chunk_size = len(arr) // num_workers
+    extra = len(arr) % num_workers
+    start_idx = 0
+    batches = []
+    for i in range(num_workers):
+        size = chunk_size + (1 if i < extra else 0)
+        batches.append(arr[start_idx : start_idx + size])
+        start_idx += size
+    with ThreadPool(num_workers) as pool:
+        return list(pool.map(partial(np.partition, kth=kth, **kwargs), batches))
+if __name__ == "__main__":
+    main()
+# nohup python evaluator_base.py > evaluator_base.log 2>&1 &

evaluator_rf.py ADDED Viewed

	@@ -0,0 +1,685 @@

+import argparse
+import io
+import os
+import random
+import warnings
+import zipfile
+from abc import ABC, abstractmethod
+from contextlib import contextmanager
+from functools import partial
+from multiprocessing import cpu_count
+from multiprocessing.pool import ThreadPool
+from typing import Iterable, Optional, Tuple
+import numpy as np
+import requests
+import tensorflow.compat.v1 as tf
+from scipy import linalg
+from tqdm.auto import tqdm
+INCEPTION_V3_URL = "https://openaipublic.blob.core.windows.net/diffusion/jul-2021/ref_batches/classify_image_graph_def.pb"
+INCEPTION_V3_PATH = "classify_image_graph_def.pb"
+FID_POOL_NAME = "pool_3:0"
+FID_SPATIAL_NAME = "mixed_6/conv:0"
+def main():
+    parser = argparse.ArgumentParser()
+    #/gemini/space/gzy_new/Rectified_Noise/Finetune/finetune-coco/sd3_rectified_samples.npz
+    parser.add_argument("--ref_batch", default='/gemini/space/hsd/project/dataset/cc3m-wds/validation/metadata.npz',help="path to reference batch npz file")
+    parser.add_argument("--sample_batch", default='/gemini/space/gzy_new/models/Sida/sd3_rectified_samples_batch2_220000.npz', help="path to sample batch npz file")
+    parser.add_argument("--save_path",  default='/gemini/space/gzy_new/models/Sida/sd3_rectified_samples_batch2_220000',help="path to sample batch npz file")
+    parser.add_argument("--cfg_cond", default=1, type=int)
+    parser.add_argument("--step", default=1, type=int)
+    parser.add_argument("--cfg", default=1.0, type=float)
+    parser.add_argument("--cls_cfg", default=1.0, type=float)
+    parser.add_argument("--gh", default=1.0, type=float)
+    parser.add_argument("--num_steps", default=50, type=int)
+    args = parser.parse_args()
+    if not os.path.exists(args.save_path):
+        os.mkdir(args.save_path)
+    # NOTE: 当前环境中 TensorFlow 与 CUDA/cuDNN 可能版本不匹配（例如报 "No DNN in stream executor"），
+    # 这会导致 GPU 计算失败。这里强制使用 CPU 进行评估（会慢一些，但能保证运行）。
+    os.environ["CUDA_VISIBLE_DEVICES"] = ""
+    config = tf.ConfigProto(
+        allow_soft_placement=True,  # allows DecodeJpeg to run on CPU in Inception graph
+        device_count={"GPU": 0},
+    )
+    evaluator = Evaluator(tf.Session(config=config))
+    print("warming up TensorFlow...")
+    # This will cause TF to print a bunch of verbose stuff now rather
+    # than after the next print(), to help prevent confusion.
+    evaluator.warmup()
+    print("computing reference batch activations...")
+    ref_acts = evaluator.read_activations(args.ref_batch)
+    print("computing/reading reference batch statistics...")
+    ref_stats, ref_stats_spatial = evaluator.read_statistics(args.ref_batch, ref_acts)
+    print("computing sample batch activations...")
+    sample_acts = evaluator.read_activations(args.sample_batch)
+    print("computing/reading sample batch statistics...")
+    sample_stats, sample_stats_spatial = evaluator.read_statistics(args.sample_batch, sample_acts)
+    print("Computing evaluations...")
+    Inception_Score = evaluator.compute_inception_score(sample_acts[0])
+    FID = sample_stats.frechet_distance(ref_stats)
+    sFID = sample_stats_spatial.frechet_distance(ref_stats_spatial)
+    prec, recall = evaluator.compute_prec_recall(ref_acts[0], sample_acts[0])
+    print("Inception Score:", Inception_Score)
+    print("FID:", FID)
+    print("sFID:", sFID)
+    print("Precision:", prec)
+    print("Recall:", recall)
+    if args.cfg_cond:
+        file_path = args.save_path + str(args.num_steps) + str(args.step) + str(args.cfg) + str(args.gh) + str(args.cls_cfg)+ "cfg_cond_true.txt"
+    else:
+        file_path = args.save_path + str(args.num_steps) + str(args.step) + str(args.cfg) + str(args.gh) + str(args.cls_cfg)+ "cfg_cond_false.txt"
+    with open(file_path, "w") as file:
+        file.write("Inception Score: {}\n".format(Inception_Score))
+        file.write("FID: {}\n".format(FID))
+        file.write("sFID: {}\n".format(sFID))
+        file.write("Precision: {}\n".format(prec))
+        file.write("Recall: {}\n".format(recall))
+class InvalidFIDException(Exception):
+    pass
+class FIDStatistics:
+    def __init__(self, mu: np.ndarray, sigma: np.ndarray):
+        self.mu = mu
+        self.sigma = sigma
+    def frechet_distance(self, other, eps=1e-6):
+        """
+        Compute the Frechet distance between two sets of statistics.
+        """
+        # https://github.com/bioinf-jku/TTUR/blob/73ab375cdf952a12686d9aa7978567771084da42/fid.py#L132
+        mu1, sigma1 = self.mu, self.sigma
+        mu2, sigma2 = other.mu, other.sigma
+        mu1 = np.atleast_1d(mu1)
+        mu2 = np.atleast_1d(mu2)
+        sigma1 = np.atleast_2d(sigma1)
+        sigma2 = np.atleast_2d(sigma2)
+        assert (
+            mu1.shape == mu2.shape
+        ), f"Training and test mean vectors have different lengths: {mu1.shape}, {mu2.shape}"
+        assert (
+            sigma1.shape == sigma2.shape
+        ), f"Training and test covariances have different dimensions: {sigma1.shape}, {sigma2.shape}"
+        diff = mu1 - mu2
+        # product might be almost singular
+        covmean, _ = linalg.sqrtm(sigma1.dot(sigma2), disp=False)
+        if not np.isfinite(covmean).all():
+            msg = (
+                "fid calculation produces singular product; adding %s to diagonal of cov estimates"
+                % eps
+            )
+            warnings.warn(msg)
+            offset = np.eye(sigma1.shape[0]) * eps
+            covmean = linalg.sqrtm((sigma1 + offset).dot(sigma2 + offset))
+        # numerical error might give slight imaginary component
+        if np.iscomplexobj(covmean):
+            if not np.allclose(np.diagonal(covmean).imag, 0, atol=1e-3):
+                m = np.max(np.abs(covmean.imag))
+                raise ValueError("Imaginary component {}".format(m))
+            covmean = covmean.real
+        tr_covmean = np.trace(covmean)
+        return diff.dot(diff) + np.trace(sigma1) + np.trace(sigma2) - 2 * tr_covmean
+class Evaluator:
+    def __init__(
+        self,
+        session,
+        batch_size=64,
+        softmax_batch_size=512,
+    ):
+        self.sess = session
+        self.batch_size = batch_size
+        self.softmax_batch_size = softmax_batch_size
+        self.manifold_estimator = ManifoldEstimator(session)
+        with self.sess.graph.as_default():
+            self.image_input = tf.placeholder(tf.float32, shape=[None, None, None, 3])
+            self.softmax_input = tf.placeholder(tf.float32, shape=[None, 2048])
+            self.pool_features, self.spatial_features = _create_feature_graph(self.image_input)
+            self.softmax = _create_softmax_graph(self.softmax_input)
+    def warmup(self):
+        self.compute_activations(np.zeros([1, 8, 64, 64, 3]))
+    def read_activations(self, npz_path: str) -> Tuple[np.ndarray, np.ndarray]:
+        with open_npz_array(npz_path, "arr_0") as reader:
+            return self.compute_activations(reader.read_batches(self.batch_size))
+    def compute_activations(self, batches: Iterable[np.ndarray]) -> Tuple[np.ndarray, np.ndarray]:
+        """
+        Compute image features for downstream evals.
+        :param batches: a iterator over NHWC numpy arrays in [0, 255].
+        :return: a tuple of numpy arrays of shape [N x X], where X is a feature
+                 dimension. The tuple is (pool_3, spatial).
+        """
+        preds = []
+        spatial_preds = []
+        for batch in tqdm(batches):
+            batch = batch.astype(np.float32)
+            pred, spatial_pred = self.sess.run(
+                [self.pool_features, self.spatial_features], {self.image_input: batch}
+            )
+            preds.append(pred.reshape([pred.shape[0], -1]))
+            spatial_preds.append(spatial_pred.reshape([spatial_pred.shape[0], -1]))
+        return (
+            np.concatenate(preds, axis=0),
+            np.concatenate(spatial_preds, axis=0),
+        )
+    def read_statistics(
+        self, npz_path: str, activations: Tuple[np.ndarray, np.ndarray]
+    ) -> Tuple[FIDStatistics, FIDStatistics]:
+        obj = np.load(npz_path)
+        if "mu" in list(obj.keys()):
+            return FIDStatistics(obj["mu"], obj["sigma"]), FIDStatistics(
+                obj["mu_s"], obj["sigma_s"]
+            )
+        return tuple(self.compute_statistics(x) for x in activations)
+    def compute_statistics(self, activations: np.ndarray) -> FIDStatistics:
+        mu = np.mean(activations, axis=0)
+        sigma = np.cov(activations, rowvar=False)
+        return FIDStatistics(mu, sigma)
+    def compute_inception_score(self, activations: np.ndarray, split_size: int = 5000) -> float:
+        softmax_out = []
+        for i in range(0, len(activations), self.softmax_batch_size):
+            acts = activations[i : i + self.softmax_batch_size]
+            softmax_out.append(self.sess.run(self.softmax, feed_dict={self.softmax_input: acts}))
+        preds = np.concatenate(softmax_out, axis=0)
+        # https://github.com/openai/improved-gan/blob/4f5d1ec5c16a7eceb206f42bfc652693601e1d5c/inception_score/model.py#L46
+        scores = []
+        for i in range(0, len(preds), split_size):
+            part = preds[i : i + split_size]
+            kl = part * (np.log(part) - np.log(np.expand_dims(np.mean(part, 0), 0)))
+            kl = np.mean(np.sum(kl, 1))
+            scores.append(np.exp(kl))
+        return float(np.mean(scores))
+    def compute_prec_recall(
+        self, activations_ref: np.ndarray, activations_sample: np.ndarray
+    ) -> Tuple[float, float]:
+        radii_1 = self.manifold_estimator.manifold_radii(activations_ref)
+        radii_2 = self.manifold_estimator.manifold_radii(activations_sample)
+        pr = self.manifold_estimator.evaluate_pr(
+            activations_ref, radii_1, activations_sample, radii_2
+        )
+        return (float(pr[0][0]), float(pr[1][0]))
+class ManifoldEstimator:
+    """
+    A helper for comparing manifolds of feature vectors.
+    Adapted from https://github.com/kynkaat/improved-precision-and-recall-metric/blob/f60f25e5ad933a79135c783fcda53de30f42c9b9/precision_recall.py#L57
+    """
+    def __init__(
+        self,
+        session,
+        row_batch_size=10000,
+        col_batch_size=10000,
+        nhood_sizes=(3,),
+        clamp_to_percentile=None,
+        eps=1e-5,
+    ):
+        """
+        Estimate the manifold of given feature vectors.
+        :param session: the TensorFlow session.
+        :param row_batch_size: row batch size to compute pairwise distances
+                               (parameter to trade-off between memory usage and performance).
+        :param col_batch_size: column batch size to compute pairwise distances.
+        :param nhood_sizes: number of neighbors used to estimate the manifold.
+        :param clamp_to_percentile: prune hyperspheres that have radius larger than
+                                    the given percentile.
+        :param eps: small number for numerical stability.
+        """
+        self.distance_block = DistanceBlock(session)
+        self.row_batch_size = row_batch_size
+        self.col_batch_size = col_batch_size
+        self.nhood_sizes = nhood_sizes
+        self.num_nhoods = len(nhood_sizes)
+        self.clamp_to_percentile = clamp_to_percentile
+        self.eps = eps
+    def warmup(self):
+        feats, radii = (
+            np.zeros([1, 2048], dtype=np.float32),
+            np.zeros([1, 1], dtype=np.float32),
+        )
+        self.evaluate_pr(feats, radii, feats, radii)
+    def manifold_radii(self, features: np.ndarray) -> np.ndarray:
+        num_images = len(features)
+        # Estimate manifold of features by calculating distances to k-NN of each sample.
+        radii = np.zeros([num_images, self.num_nhoods], dtype=np.float32)
+        distance_batch = np.zeros([self.row_batch_size, num_images], dtype=np.float32)
+        seq = np.arange(max(self.nhood_sizes) + 1, dtype=np.int32)
+        for begin1 in range(0, num_images, self.row_batch_size):
+            end1 = min(begin1 + self.row_batch_size, num_images)
+            row_batch = features[begin1:end1]
+            for begin2 in range(0, num_images, self.col_batch_size):
+                end2 = min(begin2 + self.col_batch_size, num_images)
+                col_batch = features[begin2:end2]
+                # Compute distances between batches.
+                distance_batch[
+                    0 : end1 - begin1, begin2:end2
+                ] = self.distance_block.pairwise_distances(row_batch, col_batch)
+            # Find the k-nearest neighbor from the current batch.
+            radii[begin1:end1, :] = np.concatenate(
+                [
+                    x[:, self.nhood_sizes]
+                    for x in _numpy_partition(distance_batch[0 : end1 - begin1, :], seq, axis=1)
+                ],
+                axis=0,
+            )
+        if self.clamp_to_percentile is not None:
+            max_distances = np.percentile(radii, self.clamp_to_percentile, axis=0)
+            radii[radii > max_distances] = 0
+        return radii
+    def evaluate(self, features: np.ndarray, radii: np.ndarray, eval_features: np.ndarray):
+        """
+        Evaluate if new feature vectors are at the manifold.
+        """
+        num_eval_images = eval_features.shape[0]
+        num_ref_images = radii.shape[0]
+        distance_batch = np.zeros([self.row_batch_size, num_ref_images], dtype=np.float32)
+        batch_predictions = np.zeros([num_eval_images, self.num_nhoods], dtype=np.int32)
+        max_realism_score = np.zeros([num_eval_images], dtype=np.float32)
+        nearest_indices = np.zeros([num_eval_images], dtype=np.int32)
+        for begin1 in range(0, num_eval_images, self.row_batch_size):
+            end1 = min(begin1 + self.row_batch_size, num_eval_images)
+            feature_batch = eval_features[begin1:end1]
+            for begin2 in range(0, num_ref_images, self.col_batch_size):
+                end2 = min(begin2 + self.col_batch_size, num_ref_images)
+                ref_batch = features[begin2:end2]
+                distance_batch[
+                    0 : end1 - begin1, begin2:end2
+                ] = self.distance_block.pairwise_distances(feature_batch, ref_batch)
+            # From the minibatch of new feature vectors, determine if they are in the estimated manifold.
+            # If a feature vector is inside a hypersphere of some reference sample, then
+            # the new sample lies at the estimated manifold.
+            # The radii of the hyperspheres are determined from distances of neighborhood size k.
+            samples_in_manifold = distance_batch[0 : end1 - begin1, :, None] <= radii
+            batch_predictions[begin1:end1] = np.any(samples_in_manifold, axis=1).astype(np.int32)
+            max_realism_score[begin1:end1] = np.max(
+                radii[:, 0] / (distance_batch[0 : end1 - begin1, :] + self.eps), axis=1
+            )
+            nearest_indices[begin1:end1] = np.argmin(distance_batch[0 : end1 - begin1, :], axis=1)
+        return {
+            "fraction": float(np.mean(batch_predictions)),
+            "batch_predictions": batch_predictions,
+            "max_realisim_score": max_realism_score,
+            "nearest_indices": nearest_indices,
+        }
+    def evaluate_pr(
+        self,
+        features_1: np.ndarray,
+        radii_1: np.ndarray,
+        features_2: np.ndarray,
+        radii_2: np.ndarray,
+    ) -> Tuple[np.ndarray, np.ndarray]:
+        """
+        Evaluate precision and recall efficiently.
+        :param features_1: [N1 x D] feature vectors for reference batch.
+        :param radii_1: [N1 x K1] radii for reference vectors.
+        :param features_2: [N2 x D] feature vectors for the other batch.
+        :param radii_2: [N x K2] radii for other vectors.
+        :return: a tuple of arrays for (precision, recall):
+                 - precision: an np.ndarray of length K1
+                 - recall: an np.ndarray of length K2
+        """
+        features_1_status = np.zeros([len(features_1), radii_2.shape[1]], dtype=np.bool_)
+        features_2_status = np.zeros([len(features_2), radii_1.shape[1]], dtype=np.bool_)
+        for begin_1 in range(0, len(features_1), self.row_batch_size):
+            end_1 = begin_1 + self.row_batch_size
+            batch_1 = features_1[begin_1:end_1]
+            for begin_2 in range(0, len(features_2), self.col_batch_size):
+                end_2 = begin_2 + self.col_batch_size
+                batch_2 = features_2[begin_2:end_2]
+                batch_1_in, batch_2_in = self.distance_block.less_thans(
+                    batch_1, radii_1[begin_1:end_1], batch_2, radii_2[begin_2:end_2]
+                )
+                features_1_status[begin_1:end_1] |= batch_1_in
+                features_2_status[begin_2:end_2] |= batch_2_in
+        return (
+            np.mean(features_2_status.astype(np.float64), axis=0),
+            np.mean(features_1_status.astype(np.float64), axis=0),
+        )
+class DistanceBlock:
+    """
+    Calculate pairwise distances between vectors.
+    Adapted from https://github.com/kynkaat/improved-precision-and-recall-metric/blob/f60f25e5ad933a79135c783fcda53de30f42c9b9/precision_recall.py#L34
+    All ops are created once in __init__ on the session's graph; the methods below only
+    feed the placeholders and run those ops.
+    """
+    def __init__(self, session):
+        """
+        Build the distance and radius-comparison ops inside session.graph.
+        The session is stored and reused by pairwise_distances() and less_thans().
+        """
+        self.session = session
+        # Initialize TF graph to calculate pairwise distances.
+        with session.graph.as_default():
+            # Two 2-D float32 inputs; both dimensions are left unspecified.
+            self._features_batch1 = tf.placeholder(tf.float32, shape=[None, None])
+            self._features_batch2 = tf.placeholder(tf.float32, shape=[None, None])
+            # First attempt: compute the distances on float16 casts of the inputs.
+            distance_block_16 = _batch_pairwise_distances(
+                tf.cast(self._features_batch1, tf.float16),
+                tf.cast(self._features_batch2, tf.float16),
+            )
+            # Keep the float16 result (cast back to float32) only when every entry is
+            # finite; otherwise recompute from the original float32 inputs.
+            self.distance_block = tf.cond(
+                tf.reduce_all(tf.math.is_finite(distance_block_16)),
+                lambda: tf.cast(distance_block_16, tf.float32),
+                lambda: _batch_pairwise_distances(self._features_batch1, self._features_batch2),
+            )
+            # Extra logic for less thans.
+            # Radii are fed as 2-D float32 arrays.
+            # NOTE(review): presumably one row per feature vector and one column per
+            # neighbourhood size -- confirm against the caller.
+            self._radii1 = tf.placeholder(tf.float32, shape=[None, None])
+            self._radii2 = tf.placeholder(tf.float32, shape=[None, None])
+            # Trailing singleton axis so the distance matrix broadcasts against the radii.
+            dist32 = tf.cast(self.distance_block, tf.float32)[..., None]
+            # True where any distance along axis 1 (the batch-2 axis of the distance
+            # matrix) is <= the matching radii2 entry.
+            self._batch_1_in = tf.math.reduce_any(dist32 <= self._radii2, axis=1)
+            # Same comparison against radii1, reduced over axis 0 (the batch-1 axis).
+            self._batch_2_in = tf.math.reduce_any(dist32 <= self._radii1[:, None], axis=0)
+    def pairwise_distances(self, U, V):
+        """
+        Evaluate pairwise distances between two batches of feature vectors.
+        U and V are fed to the two feature placeholders; the result of running
+        distance_block in the stored session is returned.
+        """
+        return self.session.run(
+            self.distance_block,
+            feed_dict={self._features_batch1: U, self._features_batch2: V},
+        )
+    def less_thans(self, batch_1, radii_1, batch_2, radii_2):
+        """
+        Run both radius-membership ops in a single session call.
+        Returns the evaluated [_batch_1_in, _batch_2_in] pair: the first compares the
+        distances against radii_2, the second against radii_1.
+        """
+        return self.session.run(
+            [self._batch_1_in, self._batch_2_in],
+            feed_dict={
+                self._features_batch1: batch_1,
+                self._features_batch2: batch_2,
+                self._radii1: radii_1,
+                self._radii2: radii_2,
+            },
+        )
+def _batch_pairwise_distances(U, V):
+    """
+    Compute pairwise distances between two batches of feature vectors.
+    """
+    with tf.variable_scope("pairwise_dist_block"):
+        # Squared norms of each row in U and V.
+        norm_u = tf.reduce_sum(tf.square(U), 1)
+        norm_v = tf.reduce_sum(tf.square(V), 1)
+        # norm_u as a column and norm_v as a row vectors.
+        norm_u = tf.reshape(norm_u, [-1, 1])
+        norm_v = tf.reshape(norm_v, [1, -1])
+        # Pairwise squared Euclidean distances.
+        D = tf.maximum(norm_u - 2 * tf.matmul(U, V, False, True) + norm_v, 0.0)
+    return D
+class NpzArrayReader(ABC):
+    @abstractmethod
+    def read_batch(self, batch_size: int) -> Optional[np.ndarray]:
+        pass
+    @abstractmethod
+    def remaining(self) -> int:
+        pass
+    def read_batches(self, batch_size: int) -> Iterable[np.ndarray]:
+        def gen_fn():
+            while True:
+                batch = self.read_batch(batch_size)
+                if batch is None:
+                    break
+                yield batch
+        rem = self.remaining()
+        num_batches = rem // batch_size + int(rem % batch_size != 0)
+        return BatchIterator(gen_fn, num_batches)
+class BatchIterator:
+    def __init__(self, gen_fn, length):
+        self.gen_fn = gen_fn
+        self.length = length
+    def __len__(self):
+        return self.length
+    def __iter__(self):
+        return self.gen_fn()
+class StreamingNpzArrayReader(NpzArrayReader):
+    def __init__(self, arr_f, shape, dtype):
+        self.arr_f = arr_f
+        self.shape = shape
+        self.dtype = dtype
+        self.idx = 0
+    def read_batch(self, batch_size: int) -> Optional[np.ndarray]:
+        if self.idx >= self.shape[0]:
+            return None
+        bs = min(batch_size, self.shape[0] - self.idx)
+        self.idx += bs
+        if self.dtype.itemsize == 0:
+            return np.ndarray([bs, *self.shape[1:]], dtype=self.dtype)
+        read_count = bs * np.prod(self.shape[1:])
+        read_size = int(read_count * self.dtype.itemsize)
+        data = _read_bytes(self.arr_f, read_size, "array data")
+        return np.frombuffer(data, dtype=self.dtype).reshape([bs, *self.shape[1:]])
+    def remaining(self) -> int:
+        return max(0, self.shape[0] - self.idx)
+class MemoryNpzArrayReader(NpzArrayReader):
+    def __init__(self, arr):
+        self.arr = arr
+        self.idx = 0
+    @classmethod
+    def load(cls, path: str, arr_name: str):
+        with open(path, "rb") as f:
+            arr = np.load(f)[arr_name]
+        return cls(arr)
+    def read_batch(self, batch_size: int) -> Optional[np.ndarray]:
+        if self.idx >= self.arr.shape[0]:
+            return None
+        res = self.arr[self.idx : self.idx + batch_size]
+        self.idx += batch_size
+        return res
+    def remaining(self) -> int:
+        return max(0, self.arr.shape[0] - self.idx)
+@contextmanager
+def open_npz_array(path: str, arr_name: str) -> NpzArrayReader:
+    with _open_npy_file(path, arr_name) as arr_f:
+        version = np.lib.format.read_magic(arr_f)
+        if version == (1, 0):
+            header = np.lib.format.read_array_header_1_0(arr_f)
+        elif version == (2, 0):
+            header = np.lib.format.read_array_header_2_0(arr_f)
+        else:
+            yield MemoryNpzArrayReader.load(path, arr_name)
+            return
+        shape, fortran, dtype = header
+        if fortran or dtype.hasobject:
+            yield MemoryNpzArrayReader.load(path, arr_name)
+        else:
+            yield StreamingNpzArrayReader(arr_f, shape, dtype)
+def _read_bytes(fp, size, error_template="ran out of data"):
+    """
+    Copied from: https://github.com/numpy/numpy/blob/fb215c76967739268de71aa4bda55dd1b062bc2e/numpy/lib/format.py#L788-L886
+    Read from file-like object until size bytes are read.
+    Raises ValueError if not EOF is encountered before size bytes are read.
+    Non-blocking objects only supported if they derive from io objects.
+    Required as e.g. ZipExtFile in python 2.6 can return less data than
+    requested.
+    """
+    data = bytes()
+    while True:
+        # io files (default in python3) return None or raise on
+        # would-block, python2 file will truncate, probably nothing can be
+        # done about that.  note that regular files can't be non-blocking
+        try:
+            r = fp.read(size - len(data))
+            data += r
+            if len(r) == 0 or len(data) == size:
+                break
+        except io.BlockingIOError:
+            pass
+    if len(data) != size:
+        msg = "EOF: reading %s, expected %d bytes got %d"
+        raise ValueError(msg % (error_template, size, len(data)))
+    else:
+        return data
+@contextmanager
+def _open_npy_file(path: str, arr_name: str):
+    with open(path, "rb") as f:
+        with zipfile.ZipFile(f, "r") as zip_f:
+            if f"{arr_name}.npy" not in zip_f.namelist():
+                raise ValueError(f"missing {arr_name} in npz file")
+            with zip_f.open(f"{arr_name}.npy", "r") as arr_f:
+                yield arr_f
+def _download_inception_model():
+    if os.path.exists(INCEPTION_V3_PATH):
+        return
+    print("downloading InceptionV3 model...")
+    with requests.get(INCEPTION_V3_URL, stream=True) as r:
+        r.raise_for_status()
+        tmp_path = INCEPTION_V3_PATH + ".tmp"
+        with open(tmp_path, "wb") as f:
+            for chunk in tqdm(r.iter_content(chunk_size=8192)):
+                f.write(chunk)
+        os.rename(tmp_path, INCEPTION_V3_PATH)
+def _create_feature_graph(input_batch):
+    _download_inception_model()
+    prefix = f"{random.randrange(2**32)}_{random.randrange(2**32)}"
+    with open(INCEPTION_V3_PATH, "rb") as f:
+        graph_def = tf.GraphDef()
+        graph_def.ParseFromString(f.read())
+    pool3, spatial = tf.import_graph_def(
+        graph_def,
+        input_map={f"ExpandDims:0": input_batch},
+        return_elements=[FID_POOL_NAME, FID_SPATIAL_NAME],
+        name=prefix,
+    )
+    _update_shapes(pool3)
+    spatial = spatial[..., :7]
+    return pool3, spatial
+def _create_softmax_graph(input_batch):
+    _download_inception_model()
+    prefix = f"{random.randrange(2**32)}_{random.randrange(2**32)}"
+    with open(INCEPTION_V3_PATH, "rb") as f:
+        graph_def = tf.GraphDef()
+        graph_def.ParseFromString(f.read())
+    (matmul,) = tf.import_graph_def(
+        graph_def, return_elements=[f"softmax/logits/MatMul"], name=prefix
+    )
+    w = matmul.inputs[1]
+    logits = tf.matmul(input_batch, w)
+    return tf.nn.softmax(logits)
+def _update_shapes(pool3):
+    # https://github.com/bioinf-jku/TTUR/blob/73ab375cdf952a12686d9aa7978567771084da42/fid.py#L50-L63
+    ops = pool3.graph.get_operations()
+    for op in ops:
+        for o in op.outputs:
+            shape = o.get_shape()
+            if shape._dims is not None:  # pylint: disable=protected-access
+                # shape = [s.value for s in shape] TF 1.x
+                shape = [s for s in shape]  # TF 2.x
+                new_shape = []
+                for j, s in enumerate(shape):
+                    if s == 1 and j == 0:
+                        new_shape.append(None)
+                    else:
+                        new_shape.append(s)
+                o.__dict__["_shape_val"] = tf.TensorShape(new_shape)
+    return pool3
+def _numpy_partition(arr, kth, **kwargs):
+    num_workers = min(cpu_count(), len(arr))
+    chunk_size = len(arr) // num_workers
+    extra = len(arr) % num_workers
+    start_idx = 0
+    batches = []
+    for i in range(num_workers):
+        size = chunk_size + (1 if i < extra else 0)
+        batches.append(arr[start_idx : start_idx + size])
+        start_idx += size
+    with ThreadPool(num_workers) as pool:
+        return list(pool.map(partial(np.partition, kth=kth, **kwargs), batches))
+# Script entry point: main() is called only when this module is executed directly,
+# not when it is imported.
+if __name__ == "__main__":
+    main()
+# Example invocation: run in the background with stdout and stderr captured to a log file.
+# nohup python evaluator_rf.py > evaluator_rf_iter22.log 2>&1 &

evaluator_rf_iter22.log ADDED Viewed

@@ -0,0 +1,25 @@
  0%|          | 0/1 [00:00<?, ?it/s]2026-03-25 14:55:37.841840: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:354] MLIR V1 optimization pass is not enabled
  0%|          | 0/211 [00:00<?, ?it/s]
  0%|          | 1/211 [00:02<07:28,  2.13s/it]
  1%|          | 2/211 [00:03<06:40,  1.92s/it]
  1%|▏         | 3/211 [00:06<08:10,  2.36s/it]
  2%|▏         | 4/211 [00:08<07:14,  2.10s/it]
  2%|▏         | 5/211 [00:10<06:43,  1.96s/it]
  3%|▎         | 6/211 [00:11<06:27,  1.89s/it]
  3%|▎         | 7/211 [00:13<06:15,  1.84s/it]
  4%|▍         | 8/211 [00:15<06:05,  1.80s/it]
  4%|▍         | 9/211 [00:17<06:00,  1.79s/it]
  5%|▍         | 10/211 [00:18<05:53,  1.76s/it]
  5%|▌         | 11/211 [00:20<05:47,  1.74s/it]
  6%|▌         | 12/211 [00:22<05:40,  1.71s/it]
  6%|▌         | 13/211 [00:23<05:34,  1.69s/it]
  7%|▋         | 14/211 [00:25<05:32,  1.69s/it]
  7%|▋         | 15/211 [00:27<05:39,  1.73s/it]
  8%|▊         | 16/211 [00:29<05:34,  1.71s/it]
  8%|▊         | 17/211 [00:30<05:34,  1.72s/it]
  9%|▊         | 18/211 [00:32<06:00,  1.87s/it]
  9%|▉         | 19/211 [00:34<05:48,  1.82s/it]
  9%|▉         | 20/211 [00:36<05:44,  1.80s/it]
 10%|▉         | 21/211 [00:38<05:39,  1.78s/it]
 10%|█         | 22/211 [00:40<05:40,  1.80s/it]
 11%|█         | 23/211 [00:41<05:45,  1.84s/it]
 11%|█▏        | 24/211 [00:43<05:43,  1.84s/it]
 12%|█▏        | 25/211 [00:45<05:35,  1.80s/it]
 12%|█▏        | 26/211 [00:47<05:41,  1.84s/it]
 13%|█▎        | 27/211 [00:49<05:42,  1.86s/it]
 13%|█▎        | 28/211 [00:51<05:36,  1.84s/it]
 14%|█▎        | 29/211 [00:52<05:27,  1.80s/it]
 14%|█▍        | 30/211 [00:54<05:18,  1.76s/it]
 15%|█▍        | 31/211 [00:56<05:28,  1.83s/it]
 15%|█▌        | 32/211 [00:58<05:24,  1.81s/it]
 16%|█▌        | 33/211 [01:00<05:28,  1.85s/it]
 16%|█▌        | 34/211 [01:01<05:20,  1.81s/it]
 17%|█▋        | 35/211 [01:03<05:19,  1.82s/it]
 17%|█▋        | 36/211 [01:05<05:14,  1.80s/it]
 18%|█▊        | 37/211 [01:07<05:07,  1.77s/it]
 18%|█▊        | 38/211 [01:08<05:04,  1.76s/it]
 18%|█▊        | 39/211 [01:10<05:17,  1.85s/it]
 19%|█▉        | 40/211 [01:12<05:10,  1.82s/it]
 19%|█▉        | 41/211 [01:14<05:23,  1.90s/it]
 20%|█▉        | 42/211 [01:16<05:15,  1.87s/it]
 20%|██        | 43/211 [01:18<05:06,  1.82s/it]
 21%|██        | 44/211 [01:20<05:01,  1.81s/it]
 21%|██▏       | 45/211 [01:21<04:55,  1.78s/it]
 22%|██▏       | 46/211 [01:23<04:54,  1.78s/it]
 22%|██▏       | 47/211 [01:25<04:49,  1.77s/it]
 23%|██▎       | 48/211 [01:27<04:54,  1.81s/it]
 23%|██▎       | 49/211 [01:28<04:48,  1.78s/it]
 24%|██▎       | 50/211 [01:30<04:42,  1.75s/it]
 24%|██▍       | 51/211 [01:32<04:36,  1.73s/it]
 25%|██▍       | 52/211 [01:34<04:44,  1.79s/it]
 25%|██▌       | 53/211 [01:36<05:01,  1.91s/it]
 26%|██▌       | 54/211 [01:38<04:49,  1.84s/it]
 26%|██▌       | 55/211 [01:40<05:11,  2.00s/it]
 27%|██▋       | 56/211 [01:42<04:57,  1.92s/it]
 27%|██▋       | 57/211 [01:43<04:43,  1.84s/it]
 27%|██▋       | 58/211 [01:45<04:48,  1.88s/it]
 28%|██▊       | 59/211 [01:47<04:34,  1.81s/it]
 28%|██▊       | 60/211 [01:49<04:36,  1.83s/it]
 29%|██▉       | 61/211 [01:51<04:25,  1.77s/it]
 29%|██▉       | 62/211 [01:52<04:17,  1.73s/it]
 30%|██▉       | 63/211 [01:54<04:13,  1.71s/it]
 30%|███       | 64/211 [01:55<04:08,  1.69s/it]
 31%|███       | 65/211 [01:57<04:09,  1.71s/it]
 31%|███▏      | 66/211 [01:59<04:11,  1.73s/it]
 32%|███▏      | 67/211 [02:01<04:11,  1.75s/it]
 32%|███▏      | 68/211 [02:03<04:16,  1.79s/it]
 33%|███▎      | 69/211 [02:04<04:11,  1.77s/it]
 33%|███▎      | 70/211 [02:07<04:30,  1.92s/it]
 34%|███▎      | 71/211 [02:08<04:20,  1.86s/it]
 34%|███▍      | 72/211 [02:10<04:12,  1.82s/it]
 35%|███▍      | 73/211 [02:12<04:07,  1.79s/it]
 35%|███▌      | 74/211 [02:14<04:12,  1.84s/it]
 36%|███▌      | 75/211 [02:16<04:19,  1.91s/it]
 36%|███▌      | 76/211 [02:18<04:06,  1.83s/it]
 36%|███▋      | 77/211 [02:20<04:17,  1.92s/it]
 37%|███▋      | 78/211 [02:21<04:06,  1.86s/it]
 37%|███▋      | 79/211 [02:23<04:03,  1.84s/it]
 38%|███▊      | 80/211 [02:26<04:34,  2.10s/it]
 38%|███▊      | 81/211 [02:28<04:16,  1.97s/it]
 39%|███▉      | 82/211 [02:29<04:01,  1.88s/it]
 39%|███▉      | 83/211 [02:31<03:54,  1.84s/it]
 40%|███▉      | 84/211 [02:33<03:48,  1.80s/it]
 40%|████      | 85/211 [02:34<03:42,  1.77s/it]
 41%|████      | 86/211 [02:36<03:39,  1.75s/it]
 41%|████      | 87/211 [02:38<03:37,  1.76s/it]
 42%|████▏     | 88/211 [02:39<03:33,  1.73s/it]
 42%|████▏     | 89/211 [02:41<03:27,  1.70s/it]
 43%|████▎     | 90/211 [02:43<03:27,  1.71s/it]
 43%|████▎     | 91/211 [02:45<03:27,  1.73s/it]
 44%|████▎     | 92/211 [02:46<03:23,  1.71s/it]
 44%|████▍     | 93/211 [02:48<03:22,  1.71s/it]
 45%|████▍     | 94/211 [02:57<07:25,  3.81s/it]
 45%|████▌     | 95/211 [02:58<06:11,  3.20s/it]
 45%|████▌     | 96/211 [03:00<05:17,  2.76s/it]
 46%|████▌     | 97/211 [03:02<04:47,  2.52s/it]
 46%|████▋     | 98/211 [03:04<04:15,  2.27s/it]
 47%|████▋     | 99/211 [03:06<03:56,  2.11s/it]
 47%|████▋     | 100/211 [03:07<03:42,  2.00s/it]
 48%|████▊     | 101/211 [03:09<03:28,  1.90s/it]
 48%|████▊     | 102/211 [03:11<03:17,  1.81s/it]
 49%|████▉     | 103/211 [03:12<03:12,  1.78s/it]
 49%|████▉     | 104/211 [03:14<03:05,  1.74s/it]
 50%|████▉     | 105/211 [03:16<03:01,  1.71s/it]
 50%|█████     | 106/211 [03:17<03:05,  1.76s/it]
 51%|█████     | 107/211 [03:19<03:03,  1.76s/it]
 51%|█████     | 108/211 [03:21<02:58,  1.74s/it]
 52%|█████▏    | 109/211 [03:23<02:58,  1.75s/it]
 52%|█████▏    | 110/211 [03:25<02:58,  1.77s/it]
 53%|█████▎    | 111/211 [03:26<02:51,  1.72s/it]
 53%|█████▎    | 112/211 [03:28<02:49,  1.71s/it]
 54%|█████▎    | 113/211 [03:29<02:46,  1.70s/it]
 54%|█████▍    | 114/211 [03:31<02:44,  1.69s/it]
 55%|█████▍    | 115/211 [03:33<02:41,  1.68s/it]
 55%|█████▍    | 116/211 [03:35<02:42,  1.71s/it]
 55%|█████▌    | 117/211 [03:36<02:38,  1.69s/it]
 56%|█████▌    | 118/211 [03:38<02:38,  1.70s/it]
 56%|█████▋    | 119/211 [03:40<02:35,  1.69s/it]
 57%|█████▋    | 120/211 [03:41<02:36,  1.72s/it]
 57%|█████▋    | 121/211 [03:43<02:36,  1.74s/it]
 58%|█████▊    | 122/211 [03:45<02:31,  1.70s/it]
 58%|█████▊    | 123/211 [03:47<02:34,  1.76s/it]
 59%|█████▉    | 124/211 [03:48<02:30,  1.73s/it]
 59%|█████▉    | 125/211 [03:50<02:25,  1.70s/it]
 60%|█████▉    | 126/211 [03:52<02:22,  1.68s/it]
 60%|██████    | 127/211 [03:53<02:21,  1.69s/it]
 61%|██████    | 128/211 [03:55<02:19,  1.68s/it]
 61%|██████    | 129/211 [03:57<02:17,  1.68s/it]
 62%|██████▏   | 130/211 [03:59<02:21,  1.75s/it]
 62%|██████▏   | 131/211 [04:00<02:20,  1.75s/it]
 63%|██████▎   | 132/211 [04:02<02:18,  1.75s/it]
 63%|██████▎   | 133/211 [04:04<02:21,  1.81s/it]
 64%|██████▎   | 134/211 [04:06<02:15,  1.76s/it]
 64%|██████▍   | 135/211 [04:08<02:17,  1.81s/it]
 64%|██████▍   | 136/211 [04:10<02:20,  1.88s/it]
 65%|██████▍   | 137/211 [04:11<02:13,  1.80s/it]
 65%|██████▌   | 138/211 [04:13<02:07,  1.74s/it]
 66%|██████▌   | 139/211 [04:16<02:38,  2.20s/it]
 66%|██████▋   | 140/211 [04:18<02:27,  2.08s/it]
 67%|██████▋   | 141/211 [04:20<02:16,  1.95s/it]
 67%|██████▋   | 142/211 [04:21<02:11,  1.91s/it]
 68%|██████▊   | 143/211 [04:23<02:06,  1.86s/it]
 68%|██████▊   | 144/211 [04:25<02:02,  1.82s/it]
 69%|██████▊   | 145/211 [04:27<01:58,  1.80s/it]
 69%|██████▉   | 146/211 [04:28<01:53,  1.75s/it]
 70%|██████▉   | 147/211 [04:30<01:54,  1.78s/it]
 70%|███████   | 148/211 [04:32<01:53,  1.80s/it]
 71%|███████   | 149/211 [04:34<01:54,  1.85s/it]
 71%|███████   | 150/211 [04:36<01:49,  1.80s/it]
 72%|███████▏  | 151/211 [04:37<01:46,  1.77s/it]
 72%|███████▏  | 152/211 [04:39<01:45,  1.79s/it]
 73%|███████▎  | 153/211 [04:41<01:43,  1.78s/it]
 73%|███████▎  | 154/211 [04:43<01:41,  1.78s/it]
 73%|███████▎  | 155/211 [04:44<01:37,  1.73s/it]
 74%|███████▍  | 156/211 [04:46<01:35,  1.73s/it]
 74%|███████▍  | 157/211 [04:48<01:33,  1.73s/it]
 75%|███████▍  | 158/211 [04:50<01:33,  1.77s/it]
 75%|███████▌  | 159/211 [04:51<01:30,  1.74s/it]
 76%|███████▌  | 160/211 [04:53<01:27,  1.72s/it]
 76%|███████▋  | 161/211 [04:55<01:28,  1.77s/it]
 77%|███████▋  | 162/211 [04:57<01:28,  1.80s/it]
 77%|███████▋  | 163/211 [04:58<01:24,  1.75s/it]
 78%|███████▊  | 164/211 [05:00<01:26,  1.84s/it]
 78%|███████▊  | 165/211 [05:02<01:21,  1.77s/it]
 79%|███████▊  | 166/211 [05:04<01:17,  1.72s/it]
 79%|███████▉  | 167/211 [05:05<01:15,  1.72s/it]
 80%|███████▉  | 168/211 [05:07<01:16,  1.79s/it]
 80%|████████  | 169/211 [05:09<01:17,  1.84s/it]
 81%|████████  | 170/211 [05:11<01:15,  1.83s/it]
 81%|████████  | 171/211 [05:13<01:12,  1.81s/it]
 82%|████████▏ | 172/211 [05:15<01:09,  1.79s/it]
 82%|████████▏ | 173/211 [05:16<01:06,  1.74s/it]
 82%|████████▏ | 174/211 [05:18<01:04,  1.75s/it]
 83%|████████▎ | 175/211 [05:20<01:02,  1.74s/it]
 83%|████████▎ | 176/211 [05:41<04:21,  7.47s/it]
 84%|████████▍ | 177/211 [05:42<03:17,  5.81s/it]
 84%|████████▍ | 178/211 [05:44<02:32,  4.63s/it]
 85%|████████▍ | 179/211 [05:46<02:00,  3.75s/it]
 85%|████████▌ | 180/211 [05:48<01:36,  3.13s/it]
 86%|████████▌ | 181/211 [05:49<01:21,  2.72s/it]
 86%|████████▋ | 182/211 [05:51<01:10,  2.42s/it]
 87%|████████▋ | 183/211 [05:53<01:01,  2.19s/it]
 87%|████████▋ | 184/211 [05:55<00:54,  2.03s/it]
 88%|████████▊ | 185/211 [05:56<00:50,  1.94s/it]
 88%|████████▊ | 186/211 [05:58<00:46,  1.88s/it]
 89%|████████▊ | 187/211 [06:00<00:43,  1.82s/it]
 89%|████████▉ | 188/211 [06:01<00:41,  1.79s/it]
 90%|████████▉ | 189/211 [06:03<00:40,  1.86s/it]
 90%|█████████ | 190/211 [06:06<00:45,  2.14s/it]
 91%|█████████ | 191/211 [06:08<00:40,  2.02s/it]
 91%|█████████ | 192/211 [06:10<00:36,  1.91s/it]
 91%|█████████▏| 193/211 [06:11<00:33,  1.85s/it]
 92%|█████████▏| 194/211 [06:13<00:30,  1.82s/it]
 92%|█████████▏| 195/211 [06:15<00:28,  1.78s/it]
 93%|█████████▎| 196/211 [06:17<00:28,  1.92s/it]
 93%|█████████▎| 197/211 [06:19<00:25,  1.84s/it]
 94%|█████████▍| 198/211 [06:20<00:23,  1.80s/it]
 94%|█████████▍| 199/211 [06:22<00:22,  1.84s/it]
 95%|█████████▍| 200/211 [06:24<00:19,  1.80s/it]
 95%|█████████▌| 201/211 [06:26<00:17,  1.77s/it]
 96%|█████████▌| 202/211 [06:28<00:16,  1.82s/it]
 96%|█████████▌| 203/211 [06:29<00:14,  1.77s/it]
 97%|█████████▋| 204/211 [06:31<00:12,  1.73s/it]
 97%|█████████▋| 205/211 [06:33<00:10,  1.72s/it]
 98%|█████████▊| 206/211 [06:34<00:08,  1.73s/it]
 98%|█████████▊| 207/211 [06:36<00:06,  1.71s/it]
 99%|█████████▊| 208/211 [06:39<00:05,  1.97s/it]
 99%|█████████▉| 209/211 [06:40<00:03,  1.93s/it]
  0%|          | 0/469 [00:00<?, ?it/s]
  0%|          | 1/469 [00:02<15:45,  2.02s/it]
  0%|          | 2/469 [00:03<14:20,  1.84s/it]
  1%|          | 3/469 [00:05<13:51,  1.78s/it]
  1%|          | 4/469 [00:07<13:37,  1.76s/it]
  1%|          | 5/469 [00:08<13:23,  1.73s/it]
  1%|▏         | 6/469 [00:10<13:11,  1.71s/it]
  1%|▏         | 7/469 [00:12<13:03,  1.70s/it]
  2%|▏         | 8/469 [00:13<12:57,  1.69s/it]
  2%|▏         | 9/469 [00:15<12:53,  1.68s/it]
  2%|▏         | 10/469 [00:17<13:08,  1.72s/it]
  2%|▏         | 11/469 [00:26<31:14,  4.09s/it]
  3%|▎         | 12/469 [00:28<25:29,  3.35s/it]
  3%|▎         | 13/469 [00:30<21:32,  2.84s/it]
  3%|▎         | 14/469 [00:31<18:50,  2.49s/it]
  3%|▎         | 15/469 [00:33<16:50,  2.23s/it]
  3%|▎         | 16/469 [00:35<15:32,  2.06s/it]
  4%|▎         | 17/469 [00:36<14:57,  1.99s/it]
  4%|▍         | 18/469 [00:38<14:12,  1.89s/it]
  4%|▍         | 19/469 [00:40<13:45,  1.83s/it]
  4%|▍         | 20/469 [00:41<13:17,  1.78s/it]
  4%|▍         | 21/469 [00:43<13:08,  1.76s/it]
  5%|▍         | 22/469 [00:45<12:51,  1.73s/it]
  5%|▍         | 23/469 [00:47<12:51,  1.73s/it]
  5%|▌         | 24/469 [00:48<12:38,  1.70s/it]
  5%|▌         | 25/469 [00:50<12:33,  1.70s/it]
  6%|▌         | 26/469 [00:51<12:26,  1.68s/it]
  6%|▌         | 27/469 [00:53<12:26,  1.69s/it]
  6%|▌         | 28/469 [00:55<12:17,  1.67s/it]
  6%|▌         | 29/469 [00:57<12:18,  1.68s/it]
  6%|▋         | 30/469 [00:58<12:12,  1.67s/it]
  7%|▋         | 31/469 [01:00<12:15,  1.68s/it]
  7%|▋         | 32/469 [01:02<12:10,  1.67s/it]
  7%|▋         | 33/469 [01:04<12:55,  1.78s/it]
  7%|▋         | 34/469 [01:05<12:41,  1.75s/it]
  7%|▋         | 35/469 [01:07<12:24,  1.72s/it]
  8%|▊         | 36/469 [01:09<12:34,  1.74s/it]
  8%|▊         | 37/469 [01:10<12:19,  1.71s/it]
  8%|▊         | 38/469 [01:12<12:27,  1.73s/it]
  8%|▊         | 39/469 [01:14<12:14,  1.71s/it]
  9%|▊         | 40/469 [01:15<12:10,  1.70s/it]
  9%|▊         | 41/469 [01:17<12:07,  1.70s/it]
  9%|▉         | 42/469 [01:19<12:09,  1.71s/it]
  9%|▉         | 43/469 [01:21<12:18,  1.73s/it]
  9%|▉         | 44/469 [01:22<12:13,  1.72s/it]
 10%|▉         | 45/469 [01:24<12:06,  1.71s/it]
 10%|▉         | 46/469 [01:26<12:22,  1.75s/it]
 10%|█         | 47/469 [01:28<12:35,  1.79s/it]
 10%|█         | 48/469 [01:29<12:12,  1.74s/it]
 10%|█         | 49/469 [01:31<12:01,  1.72s/it]
 11%|█         | 50/469 [01:33<12:24,  1.78s/it]
 11%|█         | 51/469 [01:35<12:32,  1.80s/it]
 11%|█         | 52/469 [01:37<12:29,  1.80s/it]
 11%|█▏        | 53/469 [01:39<13:30,  1.95s/it]
 12%|█▏        | 54/469 [01:41<12:50,  1.86s/it]
 12%|█▏        | 55/469 [01:42<12:27,  1.81s/it]
 12%|█▏        | 56/469 [01:44<12:05,  1.76s/it]
 12%|█▏        | 57/469 [01:46<12:17,  1.79s/it]
 12%|█▏        | 58/469 [01:47<12:08,  1.77s/it]
 13%|█▎        | 59/469 [01:49<12:09,  1.78s/it]
 13%|█▎        | 60/469 [01:51<12:17,  1.80s/it]
 13%|█▎        | 61/469 [01:53<12:07,  1.78s/it]
 13%|█▎        | 62/469 [01:55<12:24,  1.83s/it]
 13%|█▎        | 63/469 [01:56<12:00,  1.77s/it]
 14%|█▎        | 64/469 [01:58<11:42,  1.73s/it]
 14%|█▍        | 65/469 [02:00<11:35,  1.72s/it]
 14%|█▍        | 66/469 [02:02<11:34,  1.72s/it]
 14%|█▍        | 67/469 [02:04<12:11,  1.82s/it]
 14%|█▍        | 68/469 [02:05<11:45,  1.76s/it]
 15%|█▍        | 69/469 [02:07<11:31,  1.73s/it]
 15%|█▍        | 70/469 [02:09<11:27,  1.72s/it]
 15%|█▌        | 71/469 [02:10<11:25,  1.72s/it]
 15%|█▌        | 72/469 [02:12<11:16,  1.70s/it]
 16%|█▌        | 73/469 [02:14<11:09,  1.69s/it]
 16%|█▌        | 74/469 [02:15<11:32,  1.75s/it]
 16%|█▌        | 75/469 [02:17<11:47,  1.80s/it]
 16%|█▌        | 76/469 [02:19<11:43,  1.79s/it]
 16%|█▋        | 77/469 [02:21<12:10,  1.86s/it]
 17%|█▋        | 78/469 [02:23<11:45,  1.80s/it]
 17%|█▋        | 79/469 [02:25<12:10,  1.87s/it]
 17%|█▋        | 80/469 [02:27<11:48,  1.82s/it]
 17%|█▋        | 81/469 [02:28<11:30,  1.78s/it]
 17%|█▋        | 82/469 [02:30<11:36,  1.80s/it]
 18%|█▊        | 83/469 [02:32<11:18,  1.76s/it]
 18%|█▊        | 84/469 [02:34<11:12,  1.75s/it]
 18%|█▊        | 85/469 [02:35<11:03,  1.73s/it]
 18%|█▊        | 86/469 [02:37<11:06,  1.74s/it]
 19%|█▊        | 87/469 [02:39<11:04,  1.74s/it]
 19%|█▉        | 88/469 [02:40<10:56,  1.72s/it]
 19%|█▉        | 89/469 [02:42<11:07,  1.76s/it]
 19%|█▉        | 90/469 [02:45<12:56,  2.05s/it]
 19%|█▉        | 91/469 [02:47<12:10,  1.93s/it]
 20%|█▉        | 92/469 [02:48<11:35,  1.84s/it]
 20%|█▉        | 93/469 [02:50<11:19,  1.81s/it]
 20%|██        | 94/469 [02:52<11:04,  1.77s/it]
 20%|██        | 95/469 [02:53<10:48,  1.73s/it]
 20%|██        | 96/469 [02:55<10:38,  1.71s/it]
 21%|██        | 97/469 [02:57<11:01,  1.78s/it]
 21%|██        | 98/469 [02:59<10:50,  1.75s/it]
 21%|██        | 99/469 [03:00<10:33,  1.71s/it]
 21%|██▏       | 100/469 [03:02<10:35,  1.72s/it]
 22%|██▏       | 101/469 [03:04<11:42,  1.91s/it]
 22%|██▏       | 102/469 [03:06<11:19,  1.85s/it]
 22%|██▏       | 103/469 [03:08<11:48,  1.94s/it]
 22%|██▏       | 104/469 [03:10<11:23,  1.87s/it]
 22%|██▏       | 105/469 [03:11<10:53,  1.80s/it]
 23%|██▎       | 106/469 [03:13<10:34,  1.75s/it]
 23%|██▎       | 107/469 [03:15<10:32,  1.75s/it]
 23%|██▎       | 108/469 [03:17<10:19,  1.71s/it]
 23%|██▎       | 109/469 [03:18<10:08,  1.69s/it]
 23%|██▎       | 110/469 [03:20<11:07,  1.86s/it]
 24%|██▎       | 111/469 [03:22<10:49,  1.81s/it]
 24%|██▍       | 112/469 [03:24<10:35,  1.78s/it]
 24%|██▍       | 113/469 [03:25<10:22,  1.75s/it]
 24%|██▍       | 114/469 [03:27<10:13,  1.73s/it]
 25%|██▍       | 115/469 [03:29<10:01,  1.70s/it]
 25%|██▍       | 116/469 [03:30<09:56,  1.69s/it]
 25%|██▍       | 117/469 [03:32<09:56,  1.69s/it]
 25%|██▌       | 118/469 [03:34<09:45,  1.67s/it]
 25%|██▌       | 119/469 [03:35<09:45,  1.67s/it]
 26%|██▌       | 120/469 [03:37<09:45,  1.68s/it]
 26%|██▌       | 121/469 [03:40<11:56,  2.06s/it]
 26%|██▌       | 122/469 [03:42<11:18,  1.95s/it]
 26%|██▌       | 123/469 [03:44<11:00,  1.91s/it]
 26%|██▋       | 124/469 [03:45<10:31,  1.83s/it]
 27%|██▋       | 125/469 [03:47<10:06,  1.76s/it]
 27%|██▋       | 126/469 [03:48<09:48,  1.72s/it]
 27%|██▋       | 127/469 [03:50<09:57,  1.75s/it]
 27%|██▋       | 128/469 [03:52<09:52,  1.74s/it]
 28%|██▊       | 129/469 [03:54<09:46,  1.72s/it]
 28%|██▊       | 130/469 [03:56<09:59,  1.77s/it]
 28%|██▊       | 131/469 [03:57<09:56,  1.76s/it]
 28%|██▊       | 132/469 [03:59<09:40,  1.72s/it]
 28%|██▊       | 133/469 [04:01<10:56,  1.95s/it]
 29%|██▊       | 134/469 [04:03<10:35,  1.90s/it]
 29%|██▉       | 135/469 [04:05<10:05,  1.81s/it]
 29%|██▉       | 136/469 [04:06<09:45,  1.76s/it]
 29%|██▉       | 137/469 [04:08<09:30,  1.72s/it]
 29%|██▉       | 138/469 [04:10<09:35,  1.74s/it]
 30%|██▉       | 139/469 [04:11<09:19,  1.69s/it]
 30%|██▉       | 140/469 [04:13<09:06,  1.66s/it]
 30%|███       | 141/469 [04:15<09:00,  1.65s/it]
 30%|███       | 142/469 [04:16<08:55,  1.64s/it]
 30%|███       | 143/469 [04:18<08:55,  1.64s/it]
 31%|███       | 144/469 [04:19<08:47,  1.62s/it]
 31%|███       | 145/469 [04:21<09:06,  1.69s/it]
 31%|███       | 146/469 [04:23<09:06,  1.69s/it]
 31%|███▏      | 147/469 [04:25<08:56,  1.67s/it]
 32%|███▏      | 148/469 [04:26<08:56,  1.67s/it]
 32%|███▏      | 149/469 [04:28<09:09,  1.72s/it]
 32%|███▏      | 150/469 [04:30<08:57,  1.69s/it]
 32%|███▏      | 151/469 [04:31<08:47,  1.66s/it]
 32%|███▏      | 152/469 [04:33<08:41,  1.64s/it]
 33%|███▎      | 153/469 [04:35<08:34,  1.63s/it]
 33%|███▎      | 154/469 [04:36<08:32,  1.63s/it]
 33%|███▎      | 155/469 [04:38<08:25,  1.61s/it]
 33%|███▎      | 156/469 [04:39<08:23,  1.61s/it]
 33%|███▎      | 157/469 [04:41<08:22,  1.61s/it]
 34%|███▎      | 158/469 [04:47<15:58,  3.08s/it]
 34%|███▍      | 159/469 [04:49<13:44,  2.66s/it]
 34%|███▍      | 160/469 [04:51<12:27,  2.42s/it]
 34%|███▍      | 161/469 [04:53<11:14,  2.19s/it]
 35%|███▍      | 162/469 [04:54<10:22,  2.03s/it]
 35%|███▍      | 163/469 [04:56<09:58,  1.96s/it]
 35%|███▍      | 164/469 [04:58<09:35,  1.89s/it]
 35%|███▌      | 165/469 [04:59<09:06,  1.80s/it]
 35%|███▌      | 166/469 [05:01<08:47,  1.74s/it]
 36%|███▌      | 167/469 [05:03<08:31,  1.69s/it]
 36%|███▌      | 168/469 [05:05<08:55,  1.78s/it]
 36%|███▌      | 169/469 [05:07<09:07,  1.82s/it]
 36%|███▌      | 170/469 [05:08<08:54,  1.79s/it]
 36%|███▋      | 171/469 [05:10<08:46,  1.77s/it]
 37%|███▋      | 172/469 [05:12<08:31,  1.72s/it]
 37%|███▋      | 173/469 [05:13<08:23,  1.70s/it]
 37%|███▋      | 174/469 [05:15<08:16,  1.68s/it]
 37%|███▋      | 175/469 [05:17<08:12,  1.67s/it]
 38%|███▊      | 176/469 [05:18<08:14,  1.69s/it]
 38%|███▊      | 177/469 [05:20<08:13,  1.69s/it]
 38%|███▊      | 178/469 [05:22<08:07,  1.68s/it]
 38%|███▊      | 179/469 [05:23<08:00,  1.66s/it]
 38%|███▊      | 180/469 [05:25<07:58,  1.66s/it]
 39%|███▊      | 181/469 [05:26<07:53,  1.64s/it]
 39%|███▉      | 182/469 [05:28<07:47,  1.63s/it]
 39%|███▉      | 183/469 [05:30<07:48,  1.64s/it]
 39%|███▉      | 184/469 [05:31<07:50,  1.65s/it]
 39%|███▉      | 185/469 [05:33<07:44,  1.64s/it]
 40%|███▉      | 186/469 [05:35<07:50,  1.66s/it]
 40%|███▉      | 187/469 [05:37<08:00,  1.70s/it]
 40%|████      | 188/469 [05:38<07:55,  1.69s/it]
 40%|████      | 189/469 [05:40<07:48,  1.67s/it]
 41%|████      | 190/469 [05:42<07:53,  1.70s/it]
 41%|████      | 191/469 [05:43<07:44,  1.67s/it]
 41%|████      | 192/469 [05:45<07:38,  1.65s/it]
 41%|████      | 193/469 [05:46<07:32,  1.64s/it]
 41%|████▏     | 194/469 [05:48<07:54,  1.73s/it]
 42%|████▏     | 195/469 [05:50<07:45,  1.70s/it]
 42%|████▏     | 196/469 [05:52<07:44,  1.70s/it]
 42%|████▏     | 197/469 [05:53<07:48,  1.72s/it]
 42%|████▏     | 198/469 [05:55<07:40,  1.70s/it]
 42%|████▏     | 199/469 [05:57<08:02,  1.79s/it]
 43%|████▎     | 200/469 [05:59<07:49,  1.74s/it]
 43%|████▎     | 201/469 [06:00<07:41,  1.72s/it]
 43%|████▎     | 202/469 [06:02<07:38,  1.72s/it]
 43%|████▎     | 203/469 [06:04<07:37,  1.72s/it]
 43%|████▎     | 204/469 [06:05<07:29,  1.70s/it]
 44%|████▎     | 205/469 [06:07<07:31,  1.71s/it]
 44%|████▍     | 206/469 [06:09<07:21,  1.68s/it]
 44%|████▍     | 207/469 [06:10<07:16,  1.67s/it]
 44%|████▍     | 208/469 [06:12<07:09,  1.64s/it]
 45%|████▍     | 209/469 [06:14<07:07,  1.64s/it]
 45%|████▍     | 210/469 [06:15<07:11,  1.67s/it]
 45%|████▍     | 211/469 [06:17<07:21,  1.71s/it]
 45%|████▌     | 212/469 [06:19<07:27,  1.74s/it]
 45%|████▌     | 213/469 [06:21<07:20,  1.72s/it]
 46%|████▌     | 214/469 [06:22<07:09,  1.69s/it]
 46%|████▌     | 215/469 [06:24<07:07,  1.68s/it]
 46%|████▌     | 216/469 [06:26<07:00,  1.66s/it]
 46%|████▋     | 217/469 [06:27<06:56,  1.65s/it]
 46%|████▋     | 218/469 [06:29<06:48,  1.63s/it]
 47%|████▋     | 219/469 [06:31<07:08,  1.72s/it]
 47%|████▋     | 220/469 [06:32<06:57,  1.68s/it]
 47%|████▋     | 221/469 [06:34<06:56,  1.68s/it]
 47%|████▋     | 222/469 [06:36<06:54,  1.68s/it]
 48%|████▊     | 223/469 [06:37<06:47,  1.66s/it]
 48%|████▊     | 224/469 [06:39<06:44,  1.65s/it]
 48%|████▊     | 225/469 [06:41<06:42,  1.65s/it]
 48%|████▊     | 226/469 [06:42<06:35,  1.63s/it]
 48%|████▊     | 227/469 [06:44<06:31,  1.62s/it]
 49%|████▊     | 228/469 [06:46<06:57,  1.73s/it]
 49%|████▉     | 229/469 [06:47<06:47,  1.70s/it]
 49%|████▉     | 230/469 [06:49<06:46,  1.70s/it]
 49%|████▉     | 231/469 [06:51<06:40,  1.68s/it]
 49%|████▉     | 232/469 [06:52<06:33,  1.66s/it]
 50%|████▉     | 233/469 [06:54<06:46,  1.72s/it]
 50%|████▉     | 234/469 [06:56<07:17,  1.86s/it]
 50%|█████     | 235/469 [06:58<07:00,  1.80s/it]
 50%|█████     | 236/469 [07:00<06:47,  1.75s/it]
 51%|█████     | 237/469 [07:01<06:40,  1.72s/it]
 51%|█████     | 238/469 [07:03<06:31,  1.69s/it]
 51%|█████     | 239/469 [07:05<06:23,  1.67s/it]
 51%|█████     | 240/469 [07:06<06:18,  1.65s/it]
 51%|█████▏    | 241/469 [07:08<06:14,  1.64s/it]
 52%|█████▏    | 242/469 [07:09<06:10,  1.63s/it]
 52%|█████▏    | 243/469 [07:11<06:41,  1.77s/it]
 52%|█████▏    | 244/469 [07:13<06:27,  1.72s/it]
 52%|█████▏    | 245/469 [07:15<06:49,  1.83s/it]
 52%|█████▏    | 246/469 [07:17<06:33,  1.76s/it]
 53%|█████▎    | 247/469 [07:18<06:22,  1.72s/it]
 53%|█████▎    | 248/469 [07:20<06:19,  1.72s/it]
 53%|█████▎    | 249/469 [07:22<06:12,  1.69s/it]
 53%|█████▎    | 250/469 [07:24<06:16,  1.72s/it]
 54%|█████▎    | 251/469 [07:25<06:10,  1.70s/it]
 54%|█████▎    | 252/469 [07:27<06:11,  1.71s/it]
 54%|█████▍    | 253/469 [07:29<06:03,  1.68s/it]
 54%|█████▍    | 254/469 [07:30<06:04,  1.70s/it]
 54%|█████▍    | 255/469 [07:32<05:56,  1.67s/it]
 55%|█████▍    | 256/469 [07:34<06:02,  1.70s/it]
 55%|█████▍    | 257/469 [07:35<05:56,  1.68s/it]
 55%|█████▌    | 258/469 [07:37<06:02,  1.72s/it]
 55%|█████▌    | 259/469 [07:39<05:57,  1.70s/it]
 55%|█████▌    | 260/469 [07:40<05:50,  1.68s/it]
 56%|█████▌    | 261/469 [07:42<05:50,  1.69s/it]
 56%|█████▌    | 262/469 [07:44<05:52,  1.70s/it]
 56%|█████▌    | 263/469 [07:46<05:50,  1.70s/it]
 56%|█████▋    | 264/469 [07:47<05:48,  1.70s/it]
 57%|█████▋    | 265/469 [07:49<05:45,  1.69s/it]
 57%|█████▋    | 266/469 [07:50<05:37,  1.66s/it]
 57%|█████▋    | 267/469 [07:52<05:33,  1.65s/it]
 57%|█████▋    | 268/469 [07:58<09:27,  2.82s/it]
 57%|█████▋    | 269/469 [07:59<08:23,  2.52s/it]
 58%|█████▊    | 270/469 [08:01<07:26,  2.24s/it]
 58%|█████▊    | 271/469 [08:03<06:53,  2.09s/it]
 58%|█████▊    | 272/469 [08:04<06:26,  1.96s/it]
 58%|█████▊    | 273/469 [08:06<06:03,  1.86s/it]
 58%|█████▊    | 274/469 [08:08<05:44,  1.77s/it]
 59%|█████▊    | 275/469 [08:09<05:33,  1.72s/it]
 59%|█████▉    | 276/469 [08:11<05:25,  1.69s/it]
 59%|█████▉    | 277/469 [08:12<05:21,  1.68s/it]
 59%|█████▉    | 278/469 [08:14<05:21,  1.68s/it]
 59%|█████▉    | 279/469 [08:16<05:26,  1.72s/it]
 60%|█████▉    | 280/469 [08:18<05:16,  1.67s/it]
 60%|█████▉    | 281/469 [08:20<05:45,  1.84s/it]
 60%|██████    | 282/469 [08:21<05:31,  1.77s/it]
 60%|██████    | 283/469 [08:23<05:26,  1.75s/it]
 61%|██████    | 284/469 [08:25<05:17,  1.72s/it]
 61%|██████    | 285/469 [08:26<05:10,  1.69s/it]
 61%|██████    | 286/469 [08:28<05:21,  1.76s/it]
 61%|██████    | 287/469 [08:30<05:16,  1.74s/it]
 61%|██████▏   | 288/469 [08:32<05:13,  1.73s/it]
 62%|██████▏   | 289/469 [08:33<05:03,  1.69s/it]
 62%|██████▏   | 290/469 [08:35<05:10,  1.74s/it]
 62%|██████▏   | 291/469 [08:37<05:17,  1.78s/it]
 62%|██████▏   | 292/469 [08:39<05:17,  1.79s/it]
 62%|██████▏   | 293/469 [08:41<05:19,  1.81s/it]
 63%|██████▎   | 294/469 [08:42<05:06,  1.75s/it]
 63%|██████▎   | 295/469 [08:44<04:55,  1.70s/it]
 63%|██████▎   | 296/469 [08:46<04:53,  1.70s/it]
 63%|██████▎   | 297/469 [08:47<04:46,  1.67s/it]
 64%|██████▎   | 298/469 [08:49<04:40,  1.64s/it]
 64%|██████▍   | 299/469 [08:50<04:39,  1.65s/it]
 64%|██████▍   | 300/469 [08:52<04:37,  1.64s/it]
 64%|██████▍   | 301/469 [08:54<04:39,  1.66s/it]
 64%|██████▍   | 302/469 [08:55<04:35,  1.65s/it]
 65%|██████▍   | 303/469 [08:57<04:36,  1.66s/it]
 65%|██████▍   | 304/469 [08:59<04:36,  1.67s/it]
 65%|██████▌   | 305/469 [09:00<04:34,  1.68s/it]
 65%|██████▌   | 306/469 [09:02<04:32,  1.67s/it]
 65%|██████▌   | 307/469 [09:04<04:30,  1.67s/it]
 66%|██████▌   | 308/469 [09:05<04:25,  1.65s/it]
 66%|██████▌   | 309/469 [09:07<04:23,  1.65s/it]
 66%|██████▌   | 310/469 [09:09<04:20,  1.64s/it]
 66%|██████▋   | 311/469 [09:10<04:18,  1.64s/it]
 67%|██████▋   | 312/469 [09:12<04:19,  1.65s/it]
 67%|██████▋   | 313/469 [09:14<04:15,  1.64s/it]
 67%|██████▋   | 314/469 [09:15<04:10,  1.62s/it]
 67%|██████▋   | 315/469 [09:17<04:08,  1.61s/it]
 67%|██████▋   | 316/469 [09:19<04:21,  1.71s/it]
 68%|██████▊   | 317/469 [09:20<04:23,  1.73s/it]
 68%|██████▊   | 318/469 [09:22<04:18,  1.71s/it]
 68%|██████▊   | 319/469 [09:24<04:20,  1.74s/it]
 68%|██████▊   | 320/469 [09:26<04:15,  1.71s/it]
 68%|██████▊   | 321/469 [09:27<04:21,  1.77s/it]
 69%|██████▊   | 322/469 [09:29<04:12,  1.72s/it]
 69%|██████▉   | 323/469 [09:31<04:04,  1.67s/it]
 69%|██████▉   | 324/469 [09:32<04:01,  1.67s/it]
 69%|██████▉   | 325/469 [09:34<03:55,  1.64s/it]
 70%|██████▉   | 326/469 [09:35<03:52,  1.62s/it]
 70%|██████▉   | 327/469 [09:37<03:49,  1.61s/it]
 70%|██████▉   | 328/469 [09:39<03:52,  1.65s/it]
 70%|███████   | 329/469 [09:41<04:01,  1.72s/it]
 70%|███████   | 330/469 [09:43<04:06,  1.77s/it]
 71%|███████   | 331/469 [09:44<04:03,  1.76s/it]
 71%|███████   | 332/469 [09:46<03:55,  1.72s/it]
 71%|███████   | 333/469 [09:47<03:48,  1.68s/it]
 71%|███████   | 334/469 [09:49<03:46,  1.68s/it]
 71%|███████▏  | 335/469 [09:51<03:42,  1.66s/it]
 72%|███████▏  | 336/469 [09:52<03:37,  1.63s/it]
 72%|███████▏  | 337/469 [09:54<03:33,  1.62s/it]
 72%|███████▏  | 338/469 [09:56<03:33,  1.63s/it]
 72%|███████▏  | 339/469 [09:57<03:31,  1.62s/it]
 72%|███████▏  | 340/469 [09:59<03:27,  1.61s/it]
 73%|███████▎  | 341/469 [10:01<03:32,  1.66s/it]
 73%|███████▎  | 342/469 [10:03<03:41,  1.75s/it]
 73%|███████▎  | 343/469 [10:04<03:35,  1.71s/it]
 73%|███████▎  | 344/469 [10:06<03:31,  1.70s/it]
 74%|███████▎  | 345/469 [10:27<15:27,  7.48s/it]
 74%|███████▍  | 346/469 [10:29<11:48,  5.76s/it]
 74%|███████▍  | 347/469 [10:30<09:19,  4.59s/it]
 74%|███████▍  | 348/469 [10:32<07:29,  3.71s/it]
 74%|███████▍  | 349/469 [10:34<06:11,  3.10s/it]
 75%|███████▍  | 350/469 [10:36<05:49,  2.93s/it]
 75%|███████▍  | 351/469 [10:38<05:09,  2.63s/it]
 75%|███████▌  | 352/469 [10:40<04:39,  2.39s/it]
 75%|███████▌  | 353/469 [10:42<04:16,  2.21s/it]
 75%|███████▌  | 354/469 [10:43<03:53,  2.03s/it]
 76%|███████▌  | 355/469 [10:45<03:35,  1.89s/it]
 76%|███████▌  | 356/469 [10:47<03:30,  1.86s/it]
 76%|███████▌  | 357/469 [10:48<03:23,  1.82s/it]
 76%|███████▋  | 358/469 [10:50<03:19,  1.80s/it]
 77%|███████▋  | 359/469 [10:52<03:20,  1.82s/it]
 77%|███████▋  | 360/469 [10:54<03:23,  1.87s/it]
 77%|███████▋  | 361/469 [10:56<03:16,  1.82s/it]
 77%|███████▋  | 362/469 [10:58<03:24,  1.91s/it]
 77%|███████▋  | 363/469 [11:00<03:12,  1.82s/it]
 78%|███████▊  | 364/469 [11:01<03:05,  1.77s/it]
 78%|███████▊  | 365/469 [11:03<02:58,  1.72s/it]
 78%|███████▊  | 366/469 [11:04<02:54,  1.69s/it]
 78%|███████▊  | 367/469 [11:06<02:52,  1.69s/it]
 78%|███████▊  | 368/469 [11:08<02:50,  1.68s/it]
 79%|███████▊  | 369/469 [11:10<02:50,  1.71s/it]
 79%|███████▉  | 370/469 [11:11<02:49,  1.71s/it]
 79%|███████▉  | 371/469 [11:13<02:46,  1.70s/it]
 79%|███████▉  | 372/469 [11:15<02:49,  1.74s/it]
 80%|███████▉  | 373/469 [11:16<02:43,  1.70s/it]
 80%|███████▉  | 374/469 [11:18<02:42,  1.71s/it]
 80%|███████▉  | 375/469 [11:20<02:44,  1.75s/it]
 80%|████████  | 376/469 [11:22<02:47,  1.80s/it]
 80%|████████  | 377/469 [11:24<02:41,  1.76s/it]
 81%|████████  | 378/469 [11:25<02:34,  1.70s/it]
 81%|████████  | 379/469 [11:27<02:30,  1.67s/it]
 81%|████████  | 380/469 [11:29<02:33,  1.72s/it]
 81%|████████  | 381/469 [11:30<02:28,  1.69s/it]
 81%|████████▏ | 382/469 [11:32<02:24,  1.66s/it]
 82%|████████▏ | 383/469 [11:34<02:29,  1.73s/it]
 82%|████████▏ | 384/469 [11:36<02:33,  1.80s/it]
 82%|████████▏ | 385/469 [11:38<02:34,  1.84s/it]
 82%|████████▏ | 386/469 [11:39<02:27,  1.77s/it]
 83%|████████▎ | 387/469 [11:41<02:26,  1.79s/it]
 83%|████████▎ | 388/469 [11:43<02:21,  1.74s/it]
 83%|████████▎ | 389/469 [11:45<02:23,  1.79s/it]
 83%|████████▎ | 390/469 [11:46<02:16,  1.73s/it]
 83%|████████▎ | 391/469 [11:48<02:12,  1.70s/it]
 84%|████████▎ | 392/469 [11:49<02:09,  1.69s/it]
 84%|████████▍ | 393/469 [11:51<02:05,  1.66s/it]
 84%|████████▍ | 394/469 [11:53<02:03,  1.65s/it]
 84%|████████▍ | 395/469 [11:54<02:01,  1.65s/it]
 84%|████████▍ | 396/469 [11:56<01:59,  1.63s/it]
 85%|████████▍ | 397/469 [11:58<01:58,  1.64s/it]
 85%|████████▍ | 398/469 [11:59<02:00,  1.70s/it]
 85%|████████▌ | 399/469 [12:01<01:57,  1.68s/it]
 85%|████████▌ | 400/469 [12:03<01:54,  1.66s/it]
 86%|████████▌ | 401/469 [12:04<01:51,  1.64s/it]
 86%|████████▌ | 402/469 [12:25<08:21,  7.49s/it]
 86%|████████▌ | 403/469 [12:27<06:17,  5.72s/it]
 86%|████████▌ | 404/469 [12:29<04:51,  4.49s/it]
 86%|████████▋ | 405/469 [12:30<03:53,  3.65s/it]
 87%|████████▋ | 406/469 [12:32<03:12,  3.06s/it]
 87%|████████▋ | 407/469 [12:34<02:42,  2.63s/it]
 87%|████████▋ | 408/469 [12:35<02:21,  2.32s/it]
 87%|████████▋ | 409/469 [12:37<02:06,  2.10s/it]
 87%|████████▋ | 410/469 [12:38<01:54,  1.94s/it]
 88%|████████▊ | 411/469 [12:40<01:46,  1.83s/it]
 88%|████████▊ | 412/469 [12:41<01:39,  1.75s/it]
 88%|████████▊ | 413/469 [12:43<01:36,  1.73s/it]
 88%|████████▊ | 414/469 [12:45<01:33,  1.69s/it]
 88%|████████▊ | 415/469 [12:46<01:29,  1.65s/it]
 89%|████████▊ | 416/469 [12:48<01:28,  1.68s/it]
 89%|████████▉ | 417/469 [12:50<01:26,  1.67s/it]
 89%|████████▉ | 418/469 [12:51<01:24,  1.65s/it]
 89%|████████▉ | 419/469 [12:53<01:21,  1.64s/it]
 90%|████████▉ | 420/469 [12:55<01:21,  1.66s/it]
 90%|████████▉ | 421/469 [12:56<01:19,  1.66s/it]
 90%|████████▉ | 422/469 [12:58<01:18,  1.67s/it]
 90%|█████████ | 423/469 [13:00<01:18,  1.71s/it]
 90%|█████████ | 424/469 [13:02<01:18,  1.73s/it]
 91%|█████████ | 425/469 [13:03<01:14,  1.69s/it]
 91%|█████████ | 426/469 [13:05<01:11,  1.66s/it]
 91%|█████████ | 427/469 [13:07<01:13,  1.74s/it]
 91%|█████████▏| 428/469 [13:08<01:09,  1.70s/it]
 91%|█████████▏| 429/469 [13:10<01:07,  1.68s/it]
 92%|█████████▏| 430/469 [13:12<01:05,  1.68s/it]
 92%|█████████▏| 431/469 [13:13<01:05,  1.71s/it]
 92%|█████████▏| 432/469 [13:15<01:02,  1.68s/it]
 92%|█████████▏| 433/469 [13:17<01:01,  1.71s/it]
 93%|█████████▎| 434/469 [13:18<00:58,  1.69s/it]
 93%|█████████▎| 435/469 [13:20<00:57,  1.69s/it]
 93%|█████████▎| 436/469 [13:22<00:55,  1.68s/it]
 93%|█████████▎| 437/469 [13:23<00:53,  1.67s/it]
 93%|█████████▎| 438/469 [13:25<00:51,  1.66s/it]
 94%|█████████▎| 439/469 [13:27<00:49,  1.64s/it]
 94%|█████████▍| 440/469 [13:28<00:47,  1.65s/it]
 94%|█████████▍| 441/469 [13:30<00:46,  1.65s/it]
 94%|█████████▍| 442/469 [13:32<00:44,  1.63s/it]
 94%|█████████▍| 443/469 [13:33<00:43,  1.67s/it]
 95%|█████████▍| 444/469 [13:35<00:41,  1.65s/it]
 95%|█████████▍| 445/469 [13:36<00:39,  1.64s/it]
 95%|█████████▌| 446/469 [13:39<00:40,  1.77s/it]
 95%|█████████▌| 447/469 [13:40<00:37,  1.72s/it]
 96%|█████████▌| 448/469 [13:42<00:35,  1.68s/it]
 96%|█████████▌| 449/469 [13:44<00:36,  1.84s/it]
 96%|█████████▌| 450/469 [13:46<00:33,  1.78s/it]
 96%|█████████▌| 451/469 [13:47<00:30,  1.72s/it]
 96%|█████████▋| 452/469 [13:49<00:30,  1.78s/it]
 97%|█████████▋| 453/469 [13:51<00:27,  1.73s/it]
 97%|█████████▋| 454/469 [13:52<00:25,  1.70s/it]
 97%|█████████▋| 455/469 [13:54<00:23,  1.70s/it]
 97%|█████████▋| 456/469 [13:56<00:21,  1.67s/it]
 97%|█████████▋| 457/469 [13:57<00:19,  1.66s/it]
 98%|█████████▊| 458/469 [13:59<00:18,  1.65s/it]
 98%|█████████▊| 459/469 [14:01<00:16,  1.64s/it]
 98%|█████████▊| 460/469 [14:02<00:14,  1.65s/it]
 98%|█████████▊| 461/469 [14:04<00:13,  1.65s/it]
 99%|█████████▊| 462/469 [14:06<00:11,  1.67s/it]
 99%|█████████▊| 463/469 [14:07<00:10,  1.67s/it]
 99%|█████████▉| 464/469 [14:09<00:09,  1.83s/it]
 99%|█████████▉| 465/469 [14:11<00:07,  1.84s/it]
 99%|█████████▉| 466/469 [14:13<00:05,  1.80s/it]

+nohup: ignoring input
+2026-03-25 14:55:31.459313: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
+2026-03-25 14:55:36.111147: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
+To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
+2026-03-25 14:55:36.137507: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
+2026-03-25 14:55:36.137574: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: 66d2d54653616c6252364513da490658-taskrole1-0
+2026-03-25 14:55:36.137623: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: 66d2d54653616c6252364513da490658-taskrole1-0
+2026-03-25 14:55:36.137710: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: NOT_FOUND: was unable to find libcuda.so DSO loaded into this program
+2026-03-25 14:55:36.137756: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 535.154.5
+2026-03-25 14:55:37.175397: W tensorflow/core/framework/op_def_util.cc:371] Op BatchNormWithGlobalNormalization is deprecated. It will cease to work in GraphDef version 9. Use tf.nn.batch_normalization().
+warming up TensorFlow...
  0%|          | 0/1 [00:00<?, ?it/s]2026-03-25 14:55:37.841840: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:354] MLIR V1 optimization pass is not enabled
+computing reference batch activations...
  0%|          | 0/211 [00:00<?, ?it/s]
  0%|          | 1/211 [00:02<07:28,  2.13s/it]
  1%|          | 2/211 [00:03<06:40,  1.92s/it]
  1%|▏         | 3/211 [00:06<08:10,  2.36s/it]
  2%|▏         | 4/211 [00:08<07:14,  2.10s/it]
  2%|▏         | 5/211 [00:10<06:43,  1.96s/it]
  3%|▎         | 6/211 [00:11<06:27,  1.89s/it]
  3%|▎         | 7/211 [00:13<06:15,  1.84s/it]
  4%|▍         | 8/211 [00:15<06:05,  1.80s/it]
  4%|▍         | 9/211 [00:17<06:00,  1.79s/it]
  5%|▍         | 10/211 [00:18<05:53,  1.76s/it]
  5%|▌         | 11/211 [00:20<05:47,  1.74s/it]
  6%|▌         | 12/211 [00:22<05:40,  1.71s/it]
  6%|▌         | 13/211 [00:23<05:34,  1.69s/it]
  7%|▋         | 14/211 [00:25<05:32,  1.69s/it]
  7%|▋         | 15/211 [00:27<05:39,  1.73s/it]
  8%|▊         | 16/211 [00:29<05:34,  1.71s/it]
  8%|▊         | 17/211 [00:30<05:34,  1.72s/it]
  9%|▊         | 18/211 [00:32<06:00,  1.87s/it]
  9%|▉         | 19/211 [00:34<05:48,  1.82s/it]
  9%|▉         | 20/211 [00:36<05:44,  1.80s/it]
 10%|▉         | 21/211 [00:38<05:39,  1.78s/it]
 10%|█         | 22/211 [00:40<05:40,  1.80s/it]
 11%|█         | 23/211 [00:41<05:45,  1.84s/it]
 11%|█▏        | 24/211 [00:43<05:43,  1.84s/it]
 12%|█▏        | 25/211 [00:45<05:35,  1.80s/it]
 12%|█▏        | 26/211 [00:47<05:41,  1.84s/it]
 13%|█▎        | 27/211 [00:49<05:42,  1.86s/it]
 13%|█▎        | 28/211 [00:51<05:36,  1.84s/it]
 14%|█▎        | 29/211 [00:52<05:27,  1.80s/it]
 14%|█▍        | 30/211 [00:54<05:18,  1.76s/it]
 15%|█▍        | 31/211 [00:56<05:28,  1.83s/it]
 15%|█▌        | 32/211 [00:58<05:24,  1.81s/it]
 16%|█▌        | 33/211 [01:00<05:28,  1.85s/it]
 16%|█▌        | 34/211 [01:01<05:20,  1.81s/it]
 17%|█▋        | 35/211 [01:03<05:19,  1.82s/it]
 17%|█▋        | 36/211 [01:05<05:14,  1.80s/it]
 18%|█▊        | 37/211 [01:07<05:07,  1.77s/it]
 18%|█▊        | 38/211 [01:08<05:04,  1.76s/it]
 18%|█▊        | 39/211 [01:10<05:17,  1.85s/it]
 19%|█▉        | 40/211 [01:12<05:10,  1.82s/it]
 19%|█▉        | 41/211 [01:14<05:23,  1.90s/it]
 20%|█▉        | 42/211 [01:16<05:15,  1.87s/it]
 20%|██        | 43/211 [01:18<05:06,  1.82s/it]
 21%|██        | 44/211 [01:20<05:01,  1.81s/it]
 21%|██▏       | 45/211 [01:21<04:55,  1.78s/it]
 22%|██▏       | 46/211 [01:23<04:54,  1.78s/it]
 22%|██▏       | 47/211 [01:25<04:49,  1.77s/it]
 23%|██▎       | 48/211 [01:27<04:54,  1.81s/it]
 23%|██▎       | 49/211 [01:28<04:48,  1.78s/it]
 24%|██▎       | 50/211 [01:30<04:42,  1.75s/it]
 24%|██▍       | 51/211 [01:32<04:36,  1.73s/it]
 25%|██▍       | 52/211 [01:34<04:44,  1.79s/it]
 25%|██▌       | 53/211 [01:36<05:01,  1.91s/it]
 26%|██▌       | 54/211 [01:38<04:49,  1.84s/it]
 26%|██▌       | 55/211 [01:40<05:11,  2.00s/it]
 27%|██▋       | 56/211 [01:42<04:57,  1.92s/it]
 27%|██▋       | 57/211 [01:43<04:43,  1.84s/it]
 27%|█��▋       | 58/211 [01:45<04:48,  1.88s/it]
 28%|██▊       | 59/211 [01:47<04:34,  1.81s/it]
 28%|██▊       | 60/211 [01:49<04:36,  1.83s/it]
 29%|██▉       | 61/211 [01:51<04:25,  1.77s/it]
 29%|██▉       | 62/211 [01:52<04:17,  1.73s/it]
 30%|██▉       | 63/211 [01:54<04:13,  1.71s/it]
 30%|███       | 64/211 [01:55<04:08,  1.69s/it]
 31%|███       | 65/211 [01:57<04:09,  1.71s/it]
 31%|███▏      | 66/211 [01:59<04:11,  1.73s/it]
 32%|███▏      | 67/211 [02:01<04:11,  1.75s/it]
 32%|███▏      | 68/211 [02:03<04:16,  1.79s/it]
 33%|███▎      | 69/211 [02:04<04:11,  1.77s/it]
 33%|███▎      | 70/211 [02:07<04:30,  1.92s/it]
 34%|███▎      | 71/211 [02:08<04:20,  1.86s/it]
 34%|███▍      | 72/211 [02:10<04:12,  1.82s/it]
 35%|███▍      | 73/211 [02:12<04:07,  1.79s/it]
 35%|███▌      | 74/211 [02:14<04:12,  1.84s/it]
 36%|███▌      | 75/211 [02:16<04:19,  1.91s/it]
 36%|███▌      | 76/211 [02:18<04:06,  1.83s/it]
 36%|███▋      | 77/211 [02:20<04:17,  1.92s/it]
 37%|███▋      | 78/211 [02:21<04:06,  1.86s/it]
 37%|███▋      | 79/211 [02:23<04:03,  1.84s/it]
 38%|███▊      | 80/211 [02:26<04:34,  2.10s/it]
 38%|███▊      | 81/211 [02:28<04:16,  1.97s/it]
 39%|███▉      | 82/211 [02:29<04:01,  1.88s/it]
 39%|███▉      | 83/211 [02:31<03:54,  1.84s/it]
 40%|███▉      | 84/211 [02:33<03:48,  1.80s/it]
 40%|████      | 85/211 [02:34<03:42,  1.77s/it]
 41%|████      | 86/211 [02:36<03:39,  1.75s/it]
 41%|████      | 87/211 [02:38<03:37,  1.76s/it]
 42%|████▏     | 88/211 [02:39<03:33,  1.73s/it]
 42%|████▏     | 89/211 [02:41<03:27,  1.70s/it]
 43%|████▎     | 90/211 [02:43<03:27,  1.71s/it]
 43%|████▎     | 91/211 [02:45<03:27,  1.73s/it]
 44%|████▎     | 92/211 [02:46<03:23,  1.71s/it]
 44%|████▍     | 93/211 [02:48<03:22,  1.71s/it]
 45%|████▍     | 94/211 [02:57<07:25,  3.81s/it]
 45%|████▌     | 95/211 [02:58<06:11,  3.20s/it]
 45%|████▌     | 96/211 [03:00<05:17,  2.76s/it]
 46%|████▌     | 97/211 [03:02<04:47,  2.52s/it]
 46%|████▋     | 98/211 [03:04<04:15,  2.27s/it]
 47%|████▋     | 99/211 [03:06<03:56,  2.11s/it]
 47%|████▋     | 100/211 [03:07<03:42,  2.00s/it]
 48%|████▊     | 101/211 [03:09<03:28,  1.90s/it]
 48%|████▊     | 102/211 [03:11<03:17,  1.81s/it]
 49%|████▉     | 103/211 [03:12<03:12,  1.78s/it]
 49%|████▉     | 104/211 [03:14<03:05,  1.74s/it]
 50%|████▉     | 105/211 [03:16<03:01,  1.71s/it]
 50%|█████     | 106/211 [03:17<03:05,  1.76s/it]
 51%|█████     | 107/211 [03:19<03:03,  1.76s/it]
 51%|█████     | 108/211 [03:21<02:58,  1.74s/it]
 52%|█████▏    | 109/211 [03:23<02:58,  1.75s/it]
 52%|█████▏    | 110/211 [03:25<02:58,  1.77s/it]
 53%|█████▎    | 111/211 [03:26<02:51,  1.72s/it]
 53%|█████▎    | 112/211 [03:28<02:49,  1.71s/it]
 54%|█████▎    | 113/211 [03:29<02:46,  1.70s/it]
 54%|█████▍    | 114/211 [03:31<02:44,  1.69s/it]
 55%|█████▍    | 115/211 [03:33<02:41,  1.68s/it]
 55%|█████▍    | 116/211 [03:35<02:42,  1.71s/it]
 55%|█████▌    | 117/211 [03:36<02:38,  1.69s/it]
 56%|█████▌    | 118/211 [03:38<02:38,  1.70s/it]
 56%|█████▋    | 119/211 [03:40<02:35,  1.69s/it]
 57%|█████▋    | 120/211 [03:41<02:36,  1.72s/it]
 57%|█████▋    | 121/211 [03:43<02:36,  1.74s/it]
 58%|█████▊    | 122/211 [03:45<02:31,  1.70s/it]
 58%|█████▊    | 123/211 [03:47<02:34,  1.76s/it]
 59%|█████▉    | 124/211 [03:48<02:30,  1.73s/it]
 59%|█████▉    | 125/211 [03:50<02:25,  1.70s/it]
 60%|█████▉    | 126/211 [03:52<02:22,  1.68s/it]
 60%|██████    | 127/211 [03:53<02:21,  1.69s/it]
 61%|██████    | 128/211 [03:55<02:19,  1.68s/it]
 61%|██████    | 129/211 [03:57<02:17,  1.68s/it]
 62%|██████▏   | 130/211 [03:59<02:21,  1.75s/it]
 62%|██████▏   | 131/211 [04:00<02:20,  1.75s/it]
 63%|██████▎   | 132/211 [04:02<02:18,  1.75s/it]
 63%|██████▎   | 133/211 [04:04<02:21,  1.81s/it]
 64%|██████▎   | 134/211 [04:06<02:15,  1.76s/it]
 64%|██████▍   | 135/211 [04:08<02:17,  1.81s/it]
 64%|██████▍   | 136/211 [04:10<02:20,  1.88s/it]
 65%|██████▍   | 137/211 [04:11<02:13,  1.80s/it]
 65%|██████▌   | 138/211 [04:13<02:07,  1.74s/it]
 66%|██████▌   | 139/211 [04:16<02:38,  2.20s/it]
 66%|██████▋   | 140/211 [04:18<02:27,  2.08s/it]
 67%|██████▋   | 141/211 [04:20<02:16,  1.95s/it]
 67%|██████▋   | 142/211 [04:21<02:11,  1.91s/it]
 68%|██████▊   | 143/211 [04:23<02:06,  1.86s/it]
 68%|██████▊   | 144/211 [04:25<02:02,  1.82s/it]
 69%|██████▊   | 145/211 [04:27<01:58,  1.80s/it]
 69%|██████▉   | 146/211 [04:28<01:53,  1.75s/it]
 70%|██████▉   | 147/211 [04:30<01:54,  1.78s/it]
 70%|███████   | 148/211 [04:32<01:53,  1.80s/it]
 71%|███████   | 149/211 [04:34<01:54,  1.85s/it]
 71%|███████   | 150/211 [04:36<01:49,  1.80s/it]
 72%|███████▏  | 151/211 [04:37<01:46,  1.77s/it]
 72%|███████▏  | 152/211 [04:39<01:45,  1.79s/it]
 73%|███████▎  | 153/211 [04:41<01:43,  1.78s/it]
 73%|███████▎  | 154/211 [04:43<01:41,  1.78s/it]
 73%|███████▎  | 155/211 [04:44<01:37,  1.73s/it]
 74%|███████▍  | 156/211 [04:46<01:35,  1.73s/it]
 74%|███████▍  | 157/211 [04:48<01:33,  1.73s/it]
 75%|███████▍  | 158/211 [04:50<01:33,  1.77s/it]
 75%|███████▌  | 159/211 [04:51<01:30,  1.74s/it]
 76%|███████▌  | 160/211 [04:53<01:27,  1.72s/it]
 76%|███████▋  | 161/211 [04:55<01:28,  1.77s/it]
 77%|███████▋  | 162/211 [04:57<01:28,  1.80s/it]
 77%|███████▋  | 163/211 [04:58<01:24,  1.75s/it]
 78%|███████▊  | 164/211 [05:00<01:26,  1.84s/it]
 78%|███████▊  | 165/211 [05:02<01:21,  1.77s/it]
 79%|███████▊  | 166/211 [05:04<01:17,  1.72s/it]
 79%|███████▉  | 167/211 [05:05<01:15,  1.72s/it]
 80%|███████▉  | 168/211 [05:07<01:16,  1.79s/it]
 80%|████████  | 169/211 [05:09<01:17,  1.84s/it]
 81%|████████  | 170/211 [05:11<01:15,  1.83s/it]
 81%|████████  | 171/211 [05:13<01:12,  1.81s/it]
 82%|████████▏ | 172/211 [05:15<01:09,  1.79s/it]
 82%|████████▏ | 173/211 [05:16<01:06,  1.74s/it]
 82%|████████▏ | 174/211 [05:18<01:04,  1.75s/it]
 83%|████████▎ | 175/211 [05:20<01:02,  1.74s/it]
 83%|████████▎ | 176/211 [05:41<04:21,  7.47s/it]
 84%|████████▍ | 177/211 [05:42<03:17,  5.81s/it]
 84%|████████▍ | 178/211 [05:44<02:32,  4.63s/it]
 85%|████████▍ | 179/211 [05:46<02:00,  3.75s/it]
 85%|████████▌ | 180/211 [05:48<01:36,  3.13s/it]
 86%|████████▌ | 181/211 [05:49<01:21,  2.72s/it]
 86%|████████▋ | 182/211 [05:51<01:10,  2.42s/it]
 87%|████████▋ | 183/211 [05:53<01:01,  2.19s/it]
 87%|████████▋ | 184/211 [05:55<00:54,  2.03s/it]
 88%|████████▊ | 185/211 [05:56<00:50,  1.94s/it]
 88%|████████▊ | 186/211 [05:58<00:46,  1.88s/it]
 89%|████████▊ | 187/211 [06:00<00:43,  1.82s/it]
 89%|████████▉ | 188/211 [06:01<00:41,  1.79s/it]
 90%|████████▉ | 189/211 [06:03<00:40,  1.86s/it]
 90%|█████████ | 190/211 [06:06<00:45,  2.14s/it]
 91%|█████████ | 191/211 [06:08<00:40,  2.02s/it]
 91%|█████████ | 192/211 [06:10<00:36,  1.91s/it]
 91%|█████████▏| 193/211 [06:11<00:33,  1.85s/it]
 92%|█████████▏| 194/211 [06:13<00:30,  1.82s/it]
 92%|█████████▏| 195/211 [06:15<00:28,  1.78s/it]
 93%|█████████▎| 196/211 [06:17<00:28,  1.92s/it]
 93%|█████████▎| 197/211 [06:19<00:25,  1.84s/it]
 94%|█████████▍| 198/211 [06:20<00:23,  1.80s/it]
 94%|█████████▍| 199/211 [06:22<00:22,  1.84s/it]
 95%|█████████▍| 200/211 [06:24<00:19,  1.80s/it]
 95%|█████████▌| 201/211 [06:26<00:17,  1.77s/it]
 96%|█████████▌| 202/211 [06:28<00:16,  1.82s/it]
 96%|█████████▌| 203/211 [06:29<00:14,  1.77s/it]
 97%|█████████▋| 204/211 [06:31<00:12,  1.73s/it]
 97%|█████████▋| 205/211 [06:33<00:10,  1.72s/it]
 98%|█████████▊| 206/211 [06:34<00:08,  1.73s/it]
 98%|█████████▊| 207/211 [06:36<00:06,  1.71s/it]
 99%|█████████▊| 208/211 [06:39<00:05,  1.97s/it]
 99%|█████████▉| 209/211 [06:40<00:03,  1.93s/it]
+computing/reading reference batch statistics...
+computing sample batch activations...
  0%|          | 0/469 [00:00<?, ?it/s]
  0%|          | 1/469 [00:02<15:45,  2.02s/it]
  0%|          | 2/469 [00:03<14:20,  1.84s/it]
  1%|          | 3/469 [00:05<13:51,  1.78s/it]
  1%|          | 4/469 [00:07<13:37,  1.76s/it]
  1%|          | 5/469 [00:08<13:23,  1.73s/it]
  1%|▏         | 6/469 [00:10<13:11,  1.71s/it]
  1%|▏         | 7/469 [00:12<13:03,  1.70s/it]
  2%|▏         | 8/469 [00:13<12:57,  1.69s/it]
  2%|▏         | 9/469 [00:15<12:53,  1.68s/it]
  2%|▏         | 10/469 [00:17<13:08,  1.72s/it]
  2%|▏         | 11/469 [00:26<31:14,  4.09s/it]
  3%|▎         | 12/469 [00:28<25:29,  3.35s/it]
  3%|▎         | 13/469 [00:30<21:32,  2.84s/it]
  3%|▎         | 14/469 [00:31<18:50,  2.49s/it]
  3%|▎         | 15/469 [00:33<16:50,  2.23s/it]
  3%|▎         | 16/469 [00:35<15:32,  2.06s/it]
  4%|▎         | 17/469 [00:36<14:57,  1.99s/it]
  4%|▍         | 18/469 [00:38<14:12,  1.89s/it]
  4%|▍         | 19/469 [00:40<13:45,  1.83s/it]
  4%|▍         | 20/469 [00:41<13:17,  1.78s/it]
  4%|▍         | 21/469 [00:43<13:08,  1.76s/it]
  5%|▍         | 22/469 [00:45<12:51,  1.73s/it]
  5%|▍         | 23/469 [00:47<12:51,  1.73s/it]
  5%|▌         | 24/469 [00:48<12:38,  1.70s/it]
  5%|▌         | 25/469 [00:50<12:33,  1.70s/it]
  6%|▌         | 26/469 [00:51<12:26,  1.68s/it]
  6%|▌         | 27/469 [00:53<12:26,  1.69s/it]
  6%|▌         | 28/469 [00:55<12:17,  1.67s/it]
  6%|▌         | 29/469 [00:57<12:18,  1.68s/it]
  6%|▋         | 30/469 [00:58<12:12,  1.67s/it]
  7%|▋         | 31/469 [01:00<12:15,  1.68s/it]
  7%|▋         | 32/469 [01:02<12:10,  1.67s/it]
  7%|▋         | 33/469 [01:04<12:55,  1.78s/it]
  7%|▋         | 34/469 [01:05<12:41,  1.75s/it]
  7%|▋         | 35/469 [01:07<12:24,  1.72s/it]
  8%|▊         | 36/469 [01:09<12:34,  1.74s/it]
  8%|▊         | 37/469 [01:10<12:19,  1.71s/it]
  8%|▊         | 38/469 [01:12<12:27,  1.73s/it]
  8%|▊         | 39/469 [01:14<12:14,  1.71s/it]
  9%|▊         | 40/469 [01:15<12:10,  1.70s/it]
  9%|▊         | 41/469 [01:17<12:07,  1.70s/it]
  9%|▉         | 42/469 [01:19<12:09,  1.71s/it]
  9%|▉         | 43/469 [01:21<12:18,  1.73s/it]
  9%|▉         | 44/469 [01:22<12:13,  1.72s/it]
 10%|▉         | 45/469 [01:24<12:06,  1.71s/it]
 10%|▉         | 46/469 [01:26<12:22,  1.75s/it]
 10%|█         | 47/469 [01:28<12:35,  1.79s/it]
 10%|█         | 48/469 [01:29<12:12,  1.74s/it]
 10%|█         | 49/469 [01:31<12:01,  1.72s/it]
 11%|█         | 50/469 [01:33<12:24,  1.78s/it]
 11%|█         | 51/469 [01:35<12:32,  1.80s/it]
 11%|█         | 52/469 [01:37<12:29,  1.80s/it]
 11%|█▏        | 53/469 [01:39<13:30,  1.95s/it]
 12%|█▏        | 54/469 [01:41<12:50,  1.86s/it]
 12%|█▏        | 55/469 [01:42<12:27,  1.81s/it]
 12%|█▏        | 56/469 [01:44<12:05,  1.76s/it]
 12%|█▏        | 57/469 [01:46<12:17,  1.79s/it]
 12%|█▏        | 58/469 [01:47<12:08,  1.77s/it]
 13%|█▎        | 59/469 [01:49<12:09,  1.78s/it]
 13%|█▎        | 60/469 [01:51<12:17,  1.80s/it]
 13%|█▎        | 61/469 [01:53<12:07,  1.78s/it]
 13%|█▎        | 62/469 [01:55<12:24,  1.83s/it]
 13%|█▎        | 63/469 [01:56<12:00,  1.77s/it]
 14%|█▎        | 64/469 [01:58<11:42,  1.73s/it]
 14%|█▍        | 65/469 [02:00<11:35,  1.72s/it]
 14%|█▍        | 66/469 [02:02<11:34,  1.72s/it]
 14%|█▍        | 67/469 [02:04<12:11,  1.82s/it]
 14%|█▍        | 68/469 [02:05<11:45,  1.76s/it]
 15%|█▍        | 69/469 [02:07<11:31,  1.73s/it]
 15%|█▍        | 70/469 [02:09<11:27,  1.72s/it]
 15%|█▌        | 71/469 [02:10<11:25,  1.72s/it]
 15%|█▌        | 72/469 [02:12<11:16,  1.70s/it]
 16%|█▌        | 73/469 [02:14<11:09,  1.69s/it]
 16%|█▌        | 74/469 [02:15<11:32,  1.75s/it]
 16%|█▌        | 75/469 [02:17<11:47,  1.80s/it]
 16%|█▌        | 76/469 [02:19<11:43,  1.79s/it]
 16%|█▋        | 77/469 [02:21<12:10,  1.86s/it]
 17%|█▋        | 78/469 [02:23<11:45,  1.80s/it]
 17%|█▋        | 79/469 [02:25<12:10,  1.87s/it]
 17%|█▋        | 80/469 [02:27<11:48,  1.82s/it]
 17%|█▋        | 81/469 [02:28<11:30,  1.78s/it]
 17%|█▋        | 82/469 [02:30<11:36,  1.80s/it]
 18%|█▊        | 83/469 [02:32<11:18,  1.76s/it]
 18%|█▊        | 84/469 [02:34<11:12,  1.75s/it]
 18%|█▊        | 85/469 [02:35<11:03,  1.73s/it]
 18%|█▊        | 86/469 [02:37<11:06,  1.74s/it]
 19%|█▊        | 87/469 [02:39<11:04,  1.74s/it]
 19%|█▉        | 88/469 [02:40<10:56,  1.72s/it]
 19%|█▉        | 89/469 [02:42<11:07,  1.76s/it]
 19%|█▉        | 90/469 [02:45<12:56,  2.05s/it]
 19%|█▉        | 91/469 [02:47<12:10,  1.93s/it]
 20%|█▉        | 92/469 [02:48<11:35,  1.84s/it]
 20%|█▉        | 93/469 [02:50<11:19,  1.81s/it]
 20%|██        | 94/469 [02:52<11:04,  1.77s/it]
 20%|██        | 95/469 [02:53<10:48,  1.73s/it]
 20%|██        | 96/469 [02:55<10:38,  1.71s/it]
 21%|██        | 97/469 [02:57<11:01,  1.78s/it]
 21%|██        | 98/469 [02:59<10:50,  1.75s/it]
 21%|██        | 99/469 [03:00<10:33,  1.71s/it]
 21%|██▏       | 100/469 [03:02<10:35,  1.72s/it]
 22%|██▏       | 101/469 [03:04<11:42,  1.91s/it]
 22%|██▏       | 102/469 [03:06<11:19,  1.85s/it]
 22%|██▏       | 103/469 [03:08<11:48,  1.94s/it]
 22%|██▏       | 104/469 [03:10<11:23,  1.87s/it]
 22%|██▏       | 105/469 [03:11<10:53,  1.80s/it]
 23%|██▎       | 106/469 [03:13<10:34,  1.75s/it]
 23%|██▎       | 107/469 [03:15<10:32,  1.75s/it]
 23%|██▎       | 108/469 [03:17<10:19,  1.71s/it]
 23%|██▎       | 109/469 [03:18<10:08,  1.69s/it]
 23%|██▎       | 110/469 [03:20<11:07,  1.86s/it]
 24%|██▎       | 111/469 [03:22<10:49,  1.81s/it]
 24%|██▍       | 112/469 [03:24<10:35,  1.78s/it]
 24%|██▍       | 113/469 [03:25<10:22,  1.75s/it]
 24%|██▍       | 114/469 [03:27<10:13,  1.73s/it]
 25%|██▍       | 115/469 [03:29<10:01,  1.70s/it]
 25%|██▍       | 116/469 [03:30<09:56,  1.69s/it]
 25%|██▍       | 117/469 [03:32<09:56,  1.69s/it]
 25%|██▌       | 118/469 [03:34<09:45,  1.67s/it]
 25%|██▌       | 119/469 [03:35<09:45,  1.67s/it]
 26%|██▌       | 120/469 [03:37<09:45,  1.68s/it]
 26%|██▌       | 121/469 [03:40<11:56,  2.06s/it]
 26%|██▌       | 122/469 [03:42<11:18,  1.95s/it]
 26%|██▌       | 123/469 [03:44<11:00,  1.91s/it]
 26%|██▋       | 124/469 [03:45<10:31,  1.83s/it]
 27%|██▋       | 125/469 [03:47<10:06,  1.76s/it]
 27%|██▋       | 126/469 [03:48<09:48,  1.72s/it]
 27%|██▋       | 127/469 [03:50<09:57,  1.75s/it]
 27%|██▋       | 128/469 [03:52<09:52,  1.74s/it]
 28%|██▊       | 129/469 [03:54<09:46,  1.72s/it]
 28%|██▊       | 130/469 [03:56<09:59,  1.77s/it]
 28%|██▊       | 131/469 [03:57<09:56,  1.76s/it]
 28%|██▊       | 132/469 [03:59<09:40,  1.72s/it]
 28%|██▊       | 133/469 [04:01<10:56,  1.95s/it]
 29%|██▊       | 134/469 [04:03<10:35,  1.90s/it]
 29%|██▉       | 135/469 [04:05<10:05,  1.81s/it]
 29%|██▉       | 136/469 [04:06<09:45,  1.76s/it]
 29%|██▉       | 137/469 [04:08<09:30,  1.72s/it]
 29%|██▉       | 138/469 [04:10<09:35,  1.74s/it]
 30%|██▉       | 139/469 [04:11<09:19,  1.69s/it]
 30%|██▉       | 140/469 [04:13<09:06,  1.66s/it]
 30%|███       | 141/469 [04:15<09:00,  1.65s/it]
 30%|███       | 142/469 [04:16<08:55,  1.64s/it]
 30%|███       | 143/469 [04:18<08:55,  1.64s/it]
 31%|███       | 144/469 [04:19<08:47,  1.62s/it]
 31%|███       | 145/469 [04:21<09:06,  1.69s/it]
 31%|███       | 146/469 [04:23<09:06,  1.69s/it]
 31%|███▏      | 147/469 [04:25<08:56,  1.67s/it]
 32%|███▏      | 148/469 [04:26<08:56,  1.67s/it]
 32%|███▏      | 149/469 [04:28<09:09,  1.72s/it]
 32%|███▏      | 150/469 [04:30<08:57,  1.69s/it]
 32%|███▏      | 151/469 [04:31<08:47,  1.66s/it]
 32%|███▏      | 152/469 [04:33<08:41,  1.64s/it]
 33%|███▎      | 153/469 [04:35<08:34,  1.63s/it]
 33%|███▎      | 154/469 [04:36<08:32,  1.63s/it]
 33%|███▎      | 155/469 [04:38<08:25,  1.61s/it]
 33%|███▎      | 156/469 [04:39<08:23,  1.61s/it]
 33%|███▎      | 157/469 [04:41<08:22,  1.61s/it]
 34%|███▎      | 158/469 [04:47<15:58,  3.08s/it]
 34%|███▍      | 159/469 [04:49<13:44,  2.66s/it]
 34%|███▍      | 160/469 [04:51<12:27,  2.42s/it]
 34%|███▍      | 161/469 [04:53<11:14,  2.19s/it]
 35%|███▍      | 162/469 [04:54<10:22,  2.03s/it]
 35%|███▍      | 163/469 [04:56<09:58,  1.96s/it]
 35%|███▍      | 164/469 [04:58<09:35,  1.89s/it]
 35%|███▌      | 165/469 [04:59<09:06,  1.80s/it]
 35%|███▌      | 166/469 [05:01<08:47,  1.74s/it]
 36%|███▌      | 167/469 [05:03<08:31,  1.69s/it]
 36%|███▌      | 168/469 [05:05<08:55,  1.78s/it]
 36%|███▌      | 169/469 [05:07<09:07,  1.82s/it]
 36%|███▌      | 170/469 [05:08<08:54,  1.79s/it]
 36%|███▋      | 171/469 [05:10<08:46,  1.77s/it]
 37%|███▋      | 172/469 [05:12<08:31,  1.72s/it]
 37%|███▋      | 173/469 [05:13<08:23,  1.70s/it]
 37%|███▋      | 174/469 [05:15<08:16,  1.68s/it]
 37%|███▋      | 175/469 [05:17<08:12,  1.67s/it]
 38%|███▊      | 176/469 [05:18<08:14,  1.69s/it]
 38%|███▊      | 177/469 [05:20<08:13,  1.69s/it]
 38%|███▊      | 178/469 [05:22<08:07,  1.68s/it]
 38%|███▊      | 179/469 [05:23<08:00,  1.66s/it]
 38%|███▊      | 180/469 [05:25<07:58,  1.66s/it]
 39%|███▊      | 181/469 [05:26<07:53,  1.64s/it]
 39%|███▉      | 182/469 [05:28<07:47,  1.63s/it]
 39%|███▉      | 183/469 [05:30<07:48,  1.64s/it]
 39%|███▉      | 184/469 [05:31<07:50,  1.65s/it]
 39%|███▉      | 185/469 [05:33<07:44,  1.64s/it]
 40%|███▉      | 186/469 [05:35<07:50,  1.66s/it]
 40%|███▉      | 187/469 [05:37<08:00,  1.70s/it]
 40%|████      | 188/469 [05:38<07:55,  1.69s/it]
 40%|████      | 189/469 [05:40<07:48,  1.67s/it]
 41%|████      | 190/469 [05:42<07:53,  1.70s/it]
 41%|████      | 191/469 [05:43<07:44,  1.67s/it]
 41%|████      | 192/469 [05:45<07:38,  1.65s/it]
 41%|████      | 193/469 [05:46<07:32,  1.64s/it]
 41%|████▏     | 194/469 [05:48<07:54,  1.73s/it]
 42%|████▏     | 195/469 [05:50<07:45,  1.70s/it]
 42%|████▏     | 196/469 [05:52<07:44,  1.70s/it]
 42%|████▏     | 197/469 [05:53<07:48,  1.72s/it]
 42%|████▏     | 198/469 [05:55<07:40,  1.70s/it]
 42%|████▏     | 199/469 [05:57<08:02,  1.79s/it]
 43%|████▎     | 200/469 [05:59<07:49,  1.74s/it]
 43%|████▎     | 201/469 [06:00<07:41,  1.72s/it]
 43%|████▎     | 202/469 [06:02<07:38,  1.72s/it]
 43%|████▎     | 203/469 [06:04<07:37,  1.72s/it]
 43%|████▎     | 204/469 [06:05<07:29,  1.70s/it]
 44%|████▎     | 205/469 [06:07<07:31,  1.71s/it]
 44%|████▍     | 206/469 [06:09<07:21,  1.68s/it]
 44%|████▍     | 207/469 [06:10<07:16,  1.67s/it]
 44%|████▍     | 208/469 [06:12<07:09,  1.64s/it]
 45%|████▍     | 209/469 [06:14<07:07,  1.64s/it]
 45%|████▍     | 210/469 [06:15<07:11,  1.67s/it]
 45%|████▍     | 211/469 [06:17<07:21,  1.71s/it]
 45%|████▌     | 212/469 [06:19<07:27,  1.74s/it]
 45%|████▌     | 213/469 [06:21<07:20,  1.72s/it]
 46%|████▌     | 214/469 [06:22<07:09,  1.69s/it]
 46%|████▌     | 215/469 [06:24<07:07,  1.68s/it]
 46%|████▌     | 216/469 [06:26<07:00,  1.66s/it]
 46%|████▋     | 217/469 [06:27<06:56,  1.65s/it]
 46%|████▋     | 218/469 [06:29<06:48,  1.63s/it]
 47%|████▋     | 219/469 [06:31<07:08,  1.72s/it]
 47%|████▋     | 220/469 [06:32<06:57,  1.68s/it]
 47%|████▋     | 221/469 [06:34<06:56,  1.68s/it]
 47%|████▋     | 222/469 [06:36<06:54,  1.68s/it]
 48%|████▊     | 223/469 [06:37<06:47,  1.66s/it]
 48%|████▊     | 224/469 [06:39<06:44,  1.65s/it]
 48%|████▊     | 225/469 [06:41<06:42,  1.65s/it]
 48%|████▊     | 226/469 [06:42<06:35,  1.63s/it]
 48%|████▊     | 227/469 [06:44<06:31,  1.62s/it]
 49%|████▊     | 228/469 [06:46<06:57,  1.73s/it]
 49%|████▉     | 229/469 [06:47<06:47,  1.70s/it]
 49%|████▉     | 230/469 [06:49<06:46,  1.70s/it]
 49%|████▉     | 231/469 [06:51<06:40,  1.68s/it]
 49%|████▉     | 232/469 [06:52<06:33,  1.66s/it]
 50%|████▉     | 233/469 [06:54<06:46,  1.72s/it]
 50%|████▉     | 234/469 [06:56<07:17,  1.86s/it]
 50%|█████     | 235/469 [06:58<07:00,  1.80s/it]
 50%|█████     | 236/469 [07:00<06:47,  1.75s/it]
 51%|█████     | 237/469 [07:01<06:40,  1.72s/it]
 51%|█████     | 238/469 [07:03<06:31,  1.69s/it]
 51%|█████     | 239/469 [07:05<06:23,  1.67s/it]
 51%|█████     | 240/469 [07:06<06:18,  1.65s/it]
 51%|█████▏    | 241/469 [07:08<06:14,  1.64s/it]
 52%|█████▏    | 242/469 [07:09<06:10,  1.63s/it]
 52%|█████▏    | 243/469 [07:11<06:41,  1.77s/it]
 52%|█████▏    | 244/469 [07:13<06:27,  1.72s/it]
 52%|█████▏    | 245/469 [07:15<06:49,  1.83s/it]
 52%|█████▏    | 246/469 [07:17<06:33,  1.76s/it]
 53%|█████▎    | 247/469 [07:18<06:22,  1.72s/it]
 53%|█████▎    | 248/469 [07:20<06:19,  1.72s/it]
 53%|█████▎    | 249/469 [07:22<06:12,  1.69s/it]
 53%|█████▎    | 250/469 [07:24<06:16,  1.72s/it]
 54%|█████▎    | 251/469 [07:25<06:10,  1.70s/it]
 54%|█████▎    | 252/469 [07:27<06:11,  1.71s/it]
 54%|█████▍    | 253/469 [07:29<06:03,  1.68s/it]
 54%|█████▍    | 254/469 [07:30<06:04,  1.70s/it]
 54%|█████▍    | 255/469 [07:32<05:56,  1.67s/it]
 55%|█████▍    | 256/469 [07:34<06:02,  1.70s/it]
 55%|█████▍    | 257/469 [07:35<05:56,  1.68s/it]
 55%|█████▌    | 258/469 [07:37<06:02,  1.72s/it]
 55%|█████▌    | 259/469 [07:39<05:57,  1.70s/it]
 55%|█████▌    | 260/469 [07:40<05:50,  1.68s/it]
 56%|█████▌    | 261/469 [07:42<05:50,  1.69s/it]
 56%|█████▌    | 262/469 [07:44<05:52,  1.70s/it]
 56%|█████▌    | 263/469 [07:46<05:50,  1.70s/it]
 56%|█████▋    | 264/469 [07:47<05:48,  1.70s/it]
 57%|█████▋    | 265/469 [07:49<05:45,  1.69s/it]
 57%|█████▋    | 266/469 [07:50<05:37,  1.66s/it]
 57%|█████▋    | 267/469 [07:52<05:33,  1.65s/it]
 57%|█████▋    | 268/469 [07:58<09:27,  2.82s/it]
 57%|█████▋    | 269/469 [07:59<08:23,  2.52s/it]
 58%|█████▊    | 270/469 [08:01<07:26,  2.24s/it]
 58%|█████▊    | 271/469 [08:03<06:53,  2.09s/it]
 58%|█████▊    | 272/469 [08:04<06:26,  1.96s/it]
 58%|█████▊    | 273/469 [08:06<06:03,  1.86s/it]
 58%|█████▊    | 274/469 [08:08<05:44,  1.77s/it]
 59%|█████▊    | 275/469 [08:09<05:33,  1.72s/it]
 59%|█████▉    | 276/469 [08:11<05:25,  1.69s/it]
 59%|█████▉    | 277/469 [08:12<05:21,  1.68s/it]
 59%|█████▉    | 278/469 [08:14<05:21,  1.68s/it]
 59%|█████▉    | 279/469 [08:16<05:26,  1.72s/it]
 60%|█████▉    | 280/469 [08:18<05:16,  1.67s/it]
 60%|█████▉    | 281/469 [08:20<05:45,  1.84s/it]
 60%|██████    | 282/469 [08:21<05:31,  1.77s/it]
 60%|██████    | 283/469 [08:23<05:26,  1.75s/it]
 61%|██████    | 284/469 [08:25<05:17,  1.72s/it]
 61%|██████    | 285/469 [08:26<05:10,  1.69s/it]
 61%|██████    | 286/469 [08:28<05:21,  1.76s/it]
 61%|██████    | 287/469 [08:30<05:16,  1.74s/it]
 61%|██████▏   | 288/469 [08:32<05:13,  1.73s/it]
 62%|██████▏   | 289/469 [08:33<05:03,  1.69s/it]
 62%|██████▏   | 290/469 [08:35<05:10,  1.74s/it]
 62%|██████▏   | 291/469 [08:37<05:17,  1.78s/it]
 62%|██████▏   | 292/469 [08:39<05:17,  1.79s/it]
 62%|██████▏   | 293/469 [08:41<05:19,  1.81s/it]
 63%|██████▎   | 294/469 [08:42<05:06,  1.75s/it]
 63%|██████▎   | 295/469 [08:44<04:55,  1.70s/it]
 63%|██████▎   | 296/469 [08:46<04:53,  1.70s/it]
 63%|██████▎   | 297/469 [08:47<04:46,  1.67s/it]
 64%|██████▎   | 298/469 [08:49<04:40,  1.64s/it]
 64%|██████▍   | 299/469 [08:50<04:39,  1.65s/it]
 64%|██████▍   | 300/469 [08:52<04:37,  1.64s/it]
 64%|██████▍   | 301/469 [08:54<04:39,  1.66s/it]
 64%|██████▍   | 302/469 [08:55<04:35,  1.65s/it]
 65%|██████▍   | 303/469 [08:57<04:36,  1.66s/it]
 65%|██████▍   | 304/469 [08:59<04:36,  1.67s/it]
 65%|██████▌   | 305/469 [09:00<04:34,  1.68s/it]
 65%|██████▌   | 306/469 [09:02<04:32,  1.67s/it]
 65%|██████▌   | 307/469 [09:04<04:30,  1.67s/it]
 66%|██████▌   | 308/469 [09:05<04:25,  1.65s/it]
 66%|██████▌   | 309/469 [09:07<04:23,  1.65s/it]
 66%|██████▌   | 310/469 [09:09<04:20,  1.64s/it]
 66%|██████▋   | 311/469 [09:10<04:18,  1.64s/it]
 67%|██████▋   | 312/469 [09:12<04:19,  1.65s/it]
 67%|██████▋   | 313/469 [09:14<04:15,  1.64s/it]
 67%|██████▋   | 314/469 [09:15<04:10,  1.62s/it]
 67%|██████▋   | 315/469 [09:17<04:08,  1.61s/it]
 67%|██████▋   | 316/469 [09:19<04:21,  1.71s/it]
 68%|██████▊   | 317/469 [09:20<04:23,  1.73s/it]
 68%|██████▊   | 318/469 [09:22<04:18,  1.71s/it]
 68%|██████▊   | 319/469 [09:24<04:20,  1.74s/it]
 68%|██████▊   | 320/469 [09:26<04:15,  1.71s/it]
 68%|██████▊   | 321/469 [09:27<04:21,  1.77s/it]
 69%|██████▊   | 322/469 [09:29<04:12,  1.72s/it]
 69%|██████▉   | 323/469 [09:31<04:04,  1.67s/it]
 69%|██████▉   | 324/469 [09:32<04:01,  1.67s/it]
 69%|██████▉   | 325/469 [09:34<03:55,  1.64s/it]
 70%|██████▉   | 326/469 [09:35<03:52,  1.62s/it]
 70%|██████▉   | 327/469 [09:37<03:49,  1.61s/it]
 70%|██████▉   | 328/469 [09:39<03:52,  1.65s/it]
 70%|███████   | 329/469 [09:41<04:01,  1.72s/it]
 70%|███████   | 330/469 [09:43<04:06,  1.77s/it]
 71%|███████   | 331/469 [09:44<04:03,  1.76s/it]
 71%|███████   | 332/469 [09:46<03:55,  1.72s/it]
 71%|███████   | 333/469 [09:47<03:48,  1.68s/it]
 71%|███████   | 334/469 [09:49<03:46,  1.68s/it]
 71%|███████▏  | 335/469 [09:51<03:42,  1.66s/it]
 72%|███████▏  | 336/469 [09:52<03:37,  1.63s/it]
 72%|███████▏  | 337/469 [09:54<03:33,  1.62s/it]
 72%|███████▏  | 338/469 [09:56<03:33,  1.63s/it]
 72%|███████▏  | 339/469 [09:57<03:31,  1.62s/it]
 72%|███████▏  | 340/469 [09:59<03:27,  1.61s/it]
 73%|███████▎  | 341/469 [10:01<03:32,  1.66s/it]
 73%|███████▎  | 342/469 [10:03<03:41,  1.75s/it]
 73%|███████▎  | 343/469 [10:04<03:35,  1.71s/it]
 73%|███████▎  | 344/469 [10:06<03:31,  1.70s/it]
 74%|███████▎  | 345/469 [10:27<15:27,  7.48s/it]
 74%|███████▍  | 346/469 [10:29<11:48,  5.76s/it]
 74%|███████▍  | 347/469 [10:30<09:19,  4.59s/it]
 74%|███████▍  | 348/469 [10:32<07:29,  3.71s/it]
 74%|███████▍  | 349/469 [10:34<06:11,  3.10s/it]
 75%|███████▍  | 350/469 [10:36<05:49,  2.93s/it]
 75%|███████▍  | 351/469 [10:38<05:09,  2.63s/it]
 75%|███████▌  | 352/469 [10:40<04:39,  2.39s/it]
 75%|███████▌  | 353/469 [10:42<04:16,  2.21s/it]
 75%|███████▌  | 354/469 [10:43<03:53,  2.03s/it]
 76%|███████▌  | 355/469 [10:45<03:35,  1.89s/it]
 76%|███████▌  | 356/469 [10:47<03:30,  1.86s/it]
 76%|███████▌  | 357/469 [10:48<03:23,  1.82s/it]
 76%|███████▋  | 358/469 [10:50<03:19,  1.80s/it]
 77%|███████▋  | 359/469 [10:52<03:20,  1.82s/it]
 77%|███████▋  | 360/469 [10:54<03:23,  1.87s/it]
 77%|███████▋  | 361/469 [10:56<03:16,  1.82s/it]
 77%|███████▋  | 362/469 [10:58<03:24,  1.91s/it]
 77%|███████▋  | 363/469 [11:00<03:12,  1.82s/it]
 78%|███████▊  | 364/469 [11:01<03:05,  1.77s/it]
 78%|███████▊  | 365/469 [11:03<02:58,  1.72s/it]
 78%|███████▊  | 366/469 [11:04<02:54,  1.69s/it]
 78%|███████▊  | 367/469 [11:06<02:52,  1.69s/it]
 78%|███████▊  | 368/469 [11:08<02:50,  1.68s/it]
 79%|███████▊  | 369/469 [11:10<02:50,  1.71s/it]
 79%|███████▉  | 370/469 [11:11<02:49,  1.71s/it]
 79%|███████▉  | 371/469 [11:13<02:46,  1.70s/it]
 79%|███████▉  | 372/469 [11:15<02:49,  1.74s/it]
 80%|███████▉  | 373/469 [11:16<02:43,  1.70s/it]
 80%|███████▉  | 374/469 [11:18<02:42,  1.71s/it]
 80%|███████▉  | 375/469 [11:20<02:44,  1.75s/it]
 80%|████████  | 376/469 [11:22<02:47,  1.80s/it]
 80%|████████  | 377/469 [11:24<02:41,  1.76s/it]
 81%|████████  | 378/469 [11:25<02:34,  1.70s/it]
 81%|████████  | 379/469 [11:27<02:30,  1.67s/it]
 81%|████████  | 380/469 [11:29<02:33,  1.72s/it]
 81%|████████  | 381/469 [11:30<02:28,  1.69s/it]
 81%|████████▏ | 382/469 [11:32<02:24,  1.66s/it]
 82%|████████▏ | 383/469 [11:34<02:29,  1.73s/it]
 82%|████████▏ | 384/469 [11:36<02:33,  1.80s/it]
 82%|████████▏ | 385/469 [11:38<02:34,  1.84s/it]
 82%|████████▏ | 386/469 [11:39<02:27,  1.77s/it]
 83%|████████▎ | 387/469 [11:41<02:26,  1.79s/it]
 83%|████████▎ | 388/469 [11:43<02:21,  1.74s/it]
 83%|████████▎ | 389/469 [11:45<02:23,  1.79s/it]
 83%|████████▎ | 390/469 [11:46<02:16,  1.73s/it]
 83%|████████▎ | 391/469 [11:48<02:12,  1.70s/it]
 84%|████████▎ | 392/469 [11:49<02:09,  1.69s/it]
 84%|████████▍ | 393/469 [11:51<02:05,  1.66s/it]
 84%|████████▍ | 394/469 [11:53<02:03,  1.65s/it]
 84%|████████▍ | 395/469 [11:54<02:01,  1.65s/it]
 84%|████████▍ | 396/469 [11:56<01:59,  1.63s/it]
 85%|████████▍ | 397/469 [11:58<01:58,  1.64s/it]
 85%|████████▍ | 398/469 [11:59<02:00,  1.70s/it]
 85%|████████▌ | 399/469 [12:01<01:57,  1.68s/it]
 85%|████████▌ | 400/469 [12:03<01:54,  1.66s/it]
 86%|████████▌ | 401/469 [12:04<01:51,  1.64s/it]
 86%|████████▌ | 402/469 [12:25<08:21,  7.49s/it]
 86%|████████▌ | 403/469 [12:27<06:17,  5.72s/it]
 86%|████████▌ | 404/469 [12:29<04:51,  4.49s/it]
 86%|████████▋ | 405/469 [12:30<03:53,  3.65s/it]
 87%|████████▋ | 406/469 [12:32<03:12,  3.06s/it]
 87%|████████▋ | 407/469 [12:34<02:42,  2.63s/it]
 87%|████████▋ | 408/469 [12:35<02:21,  2.32s/it]
 87%|████████▋ | 409/469 [12:37<02:06,  2.10s/it]
 87%|████████▋ | 410/469 [12:38<01:54,  1.94s/it]
 88%|████████▊ | 411/469 [12:40<01:46,  1.83s/it]
 88%|████████▊ | 412/469 [12:41<01:39,  1.75s/it]
 88%|████████▊ | 413/469 [12:43<01:36,  1.73s/it]
 88%|████████▊ | 414/469 [12:45<01:33,  1.69s/it]
 88%|████████▊ | 415/469 [12:46<01:29,  1.65s/it]
 89%|████████▊ | 416/469 [12:48<01:28,  1.68s/it]
 89%|████████▉ | 417/469 [12:50<01:26,  1.67s/it]
 89%|████████▉ | 418/469 [12:51<01:24,  1.65s/it]
 89%|████████▉ | 419/469 [12:53<01:21,  1.64s/it]
 90%|████████▉ | 420/469 [12:55<01:21,  1.66s/it]
 90%|████████▉ | 421/469 [12:56<01:19,  1.66s/it]
 90%|████████▉ | 422/469 [12:58<01:18,  1.67s/it]
 90%|█████████ | 423/469 [13:00<01:18,  1.71s/it]
 90%|█████████ | 424/469 [13:02<01:18,  1.73s/it]
 91%|█████████ | 425/469 [13:03<01:14,  1.69s/it]
 91%|█████████ | 426/469 [13:05<01:11,  1.66s/it]
 91%|█████████ | 427/469 [13:07<01:13,  1.74s/it]
 91%|█████████▏| 428/469 [13:08<01:09,  1.70s/it]
 91%|█████████▏| 429/469 [13:10<01:07,  1.68s/it]
 92%|█████████▏| 430/469 [13:12<01:05,  1.68s/it]
 92%|█████████▏| 431/469 [13:13<01:05,  1.71s/it]
 92%|█████████▏| 432/469 [13:15<01:02,  1.68s/it]
 92%|█████████▏| 433/469 [13:17<01:01,  1.71s/it]
 93%|█████████▎| 434/469 [13:18<00:58,  1.69s/it]
 93%|█████████▎| 435/469 [13:20<00:57,  1.69s/it]
 93%|█████████▎| 436/469 [13:22<00:55,  1.68s/it]
 93%|█████████▎| 437/469 [13:23<00:53,  1.67s/it]
 93%|█████████▎| 438/469 [13:25<00:51,  1.66s/it]
 94%|█████████▎| 439/469 [13:27<00:49,  1.64s/it]
 94%|█████████▍| 440/469 [13:28<00:47,  1.65s/it]
 94%|█████████▍| 441/469 [13:30<00:46,  1.65s/it]
 94%|█████████▍| 442/469 [13:32<00:44,  1.63s/it]
 94%|█████████▍| 443/469 [13:33<00:43,  1.67s/it]
 95%|█████████▍| 444/469 [13:35<00:41,  1.65s/it]
 95%|█████████▍| 445/469 [13:36<00:39,  1.64s/it]
 95%|█████████▌| 446/469 [13:39<00:40,  1.77s/it]
 95%|█████████▌| 447/469 [13:40<00:37,  1.72s/it]
 96%|█████████▌| 448/469 [13:42<00:35,  1.68s/it]
 96%|█████████▌| 449/469 [13:44<00:36,  1.84s/it]
 96%|█████████▌| 450/469 [13:46<00:33,  1.78s/it]
 96%|█████████▌| 451/469 [13:47<00:30,  1.72s/it]
 96%|█████████▋| 452/469 [13:49<00:30,  1.78s/it]
 97%|█████████▋| 453/469 [13:51<00:27,  1.73s/it]
 97%|█████████▋| 454/469 [13:52<00:25,  1.70s/it]
 97%|█████████▋| 455/469 [13:54<00:23,  1.70s/it]
 97%|█████████▋| 456/469 [13:56<00:21,  1.67s/it]
 97%|█████████▋| 457/469 [13:57<00:19,  1.66s/it]
 98%|█████████▊| 458/469 [13:59<00:18,  1.65s/it]
 98%|█████████▊| 459/469 [14:01<00:16,  1.64s/it]
 98%|█████████▊| 460/469 [14:02<00:14,  1.65s/it]
 98%|█████████▊| 461/469 [14:04<00:13,  1.65s/it]
 99%|█████████▊| 462/469 [14:06<00:11,  1.67s/it]
 99%|█████████▊| 463/469 [14:07<00:10,  1.67s/it]
 99%|█████████▉| 464/469 [14:09<00:09,  1.83s/it]
 99%|█████████▉| 465/469 [14:11<00:07,  1.84s/it]
 99%|█████████▉| 466/469 [14:13<00:05,  1.80s/it]
+computing/reading sample batch statistics...
+Computing evaluations...
+Inception Score: 37.646392822265625
+FID: 21.19386100577333
+sFID: 71.79977998851734
+Precision: 0.690407122136641
+Recall: 0.358997247638176

pic_npz copy.py ADDED Viewed

	@@ -0,0 +1,259 @@

+#!/usr/bin/env python3
+"""
+将文件夹下所有PNG或JPG文件读取并生成对应NPZ文件
+基于 sample_ddp_new.py 中的 create_npz_from_sample_folder 函数改进
+支持自动检测图片数量，支持PNG和JPG格式，输出到父级目录
+支持从 metadata.jsonl 文件读取图片路径
+"""
+import os
+import argparse
+import numpy as np
+from PIL import Image
+from tqdm import tqdm
+import glob
+import json
def create_npz_from_metadata(metadata_jsonl_path, output_path=None):
    """Build one ``.npz`` archive from the images listed in a ``metadata.jsonl`` file.

    Every non-empty line of the JSONL file is parsed as JSON. Lines holding a
    JSON object with a non-empty string ``file_name`` contribute one image
    path, resolved against the directory that contains the JSONL file. Each
    readable image is converted to RGB, resized to 512x512 with Lanczos
    resampling and collected as a ``uint8`` array. The stacked result is
    stored in the archive under the key ``arr_0``.

    Args:
        metadata_jsonl_path (str): Path to the ``metadata.jsonl`` file.
        output_path (str, optional): Destination of the archive. Defaults to
            ``<jsonl stem>.npz`` next to the JSONL file. A missing ``.npz``
            suffix is appended so the returned path matches the file that
            ``np.savez`` actually writes.

    Returns:
        str: Path of the written ``.npz`` file.

    Raises:
        ValueError: If the JSONL file does not exist, yields no usable
            ``file_name`` entry, or none of the listed images can be read.
    """
    if not os.path.exists(metadata_jsonl_path):
        raise ValueError(f"metadata.jsonl 文件不存在: {metadata_jsonl_path}")
    # Image paths inside the JSONL are relative to the JSONL's own directory.
    base_dir = os.path.dirname(metadata_jsonl_path)

    image_files = []
    with open(metadata_jsonl_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                data = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"警告: 跳过无效的 JSON 行: {e}")
                continue
            # Valid JSON that is not an object (list, string, number) has no
            # ``file_name`` field; skip it instead of crashing on ``.get``.
            if not isinstance(data, dict):
                print(f"警告: 跳过非对象的 JSON 行: {line[:80]}")
                continue
            file_name = data.get('file_name')
            if isinstance(file_name, str) and file_name:
                image_files.append(os.path.join(base_dir, file_name))
    if not image_files:
        raise ValueError(f"在 {metadata_jsonl_path} 中未找到任何有效的图片路径")
    print(f"从 metadata.jsonl 读取到 {len(image_files)} 张图片路径")

    samples = []
    for img_path in tqdm(image_files, desc="读取图片并转换为numpy数组"):
        # Best effort: one unreadable image must not abort the whole export.
        try:
            with Image.open(img_path) as img:
                # Force 3-channel RGB so every array has the same layout.
                if img.mode != 'RGB':
                    img = img.convert('RGB')
                img = img.resize((512, 512), Image.LANCZOS)
                sample_np = np.asarray(img, dtype=np.uint8)
                if sample_np.ndim != 3 or sample_np.shape[2] != 3:
                    print(f"警告: 跳过非3通道图片 {img_path}, 形状: {sample_np.shape}")
                    continue
                samples.append(sample_np)
        except Exception as e:
            print(f"警告: 无法读取图片 {img_path}: {e}")
            continue
    if not samples:
        raise ValueError("没有成功读取任何有效的图片文件")

    samples = np.stack(samples)
    print(f"成功读取 {len(samples)} 张图片，形状: {samples.shape}")

    if output_path is None:
        base_name = os.path.splitext(os.path.basename(metadata_jsonl_path))[0]
        output_path = os.path.join(base_dir, f"{base_name}.npz")
    elif not str(output_path).endswith(".npz"):
        # np.savez appends ".npz" itself; mirror that so the reported and
        # returned path is the file that really exists afterwards.
        output_path = f"{output_path}.npz"
    np.savez(output_path, arr_0=samples)
    print(f"已保存 .npz 文件到 {output_path} [形状={samples.shape}]")
    return output_path
def main():
    """Command-line entry point: convert images into a single ``.npz`` file.

    Two input modes are supported:

    * ``--metadata_jsonl``: image paths are read from a JSONL file and the
      archive is named after that file.
    * ``--image_folder`` (default mode): every PNG/JPG in the folder is
      packed and the archive is named after the folder.

    Without ``--output-dir`` the archive is written next to the JSONL file,
    or into the parent directory of the image folder.

    Returns:
        int: ``0`` on success, ``1`` if the conversion raised an exception
        (the error message is printed).
    """
    parser = argparse.ArgumentParser(
        description="将文件夹下所有PNG或JPG文件转换为NPZ格式",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
使用示例:
  python pic_npz.py --image_folder /path/to/image/folder
  python pic_npz.py --image_folder /path/to/image/folder --output-dir /custom/output/path
  python pic_npz.py --metadata_jsonl /path/to/metadata.jsonl
        """
    )
    parser.add_argument(
        "--image_folder",
        type=str,
        default="/gemini/space/gzy_new/models/Sida/sd3_rectified_samples",
        help="包含PNG或JPG图片文件的文件夹路径"
    )
    # Must be registered: the dispatch below reads ``args.metadata_jsonl``.
    # It defaults to None so the image-folder mode stays the default.
    parser.add_argument(
        "--metadata_jsonl",
        type=str,
        default=None,
        help="metadata.jsonl 文件路径，用于从 JSONL 文件读取图片路径"
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default=None,
        help="自定义输出目录（默认为输入文件夹的父级目录或 metadata.jsonl 所在目录）"
    )
    args = parser.parse_args()
    # Top-level CLI boundary: report any failure and turn it into an exit code.
    try:
        if args.output_dir:
            os.makedirs(args.output_dir, exist_ok=True)
        if args.metadata_jsonl:
            # An explicitly requested JSONL file is always honoured; a wrong
            # path is reported instead of silently falling back to the folder.
            metadata_path = os.path.abspath(args.metadata_jsonl)
            base_name = os.path.splitext(os.path.basename(metadata_path))[0]
            target_dir = args.output_dir or os.path.dirname(metadata_path)
            output_path = os.path.join(target_dir, f"{base_name}.npz")
            npz_path = create_npz_from_metadata(metadata_path, output_path)
        else:
            image_folder_path = os.path.abspath(args.image_folder)
            folder_name = os.path.basename(image_folder_path)
            # Default destination is the parent directory of the image folder.
            target_dir = args.output_dir or os.path.dirname(image_folder_path)
            output_path = os.path.join(target_dir, f"{folder_name}.npz")
            npz_path = create_npz_from_image_folder_custom(image_folder_path, output_path)
        print(f"转换完成！NPZ文件已保存至: {npz_path}")
    except Exception as e:
        print(f"错误: {e}")
        return 1
    return 0
def create_npz_from_image_folder_custom(image_folder_path, output_path):
    """Pack every PNG/JPG image of a folder into one ``.npz`` archive.

    Files directly inside ``image_folder_path`` whose suffix is ``.png``,
    ``.jpg`` or ``.jpeg`` (any letter case) are processed in sorted path
    order. Each readable image is converted to RGB, resized to 512x512 with
    Lanczos resampling and collected as a ``uint8`` array. The stacked result
    is stored in the archive under the key ``arr_0``.

    Args:
        image_folder_path (str): Folder that contains the image files.
        output_path (str): Full path of the archive to write. A missing
            ``.npz`` suffix is appended so the returned path matches the file
            that ``np.savez`` actually writes.

    Returns:
        str: Path of the written ``.npz`` file.

    Raises:
        ValueError: If the folder does not exist, contains no PNG/JPG file,
            or none of the files can be read as an image.
    """
    if not os.path.isdir(image_folder_path):
        raise ValueError(f"文件夹路径不存在: {image_folder_path}")

    # Match on the lower-cased suffix instead of one glob pattern per case:
    # each file is listed exactly once even on case-insensitive filesystems,
    # mixed-case suffixes are found, and glob metacharacters in the folder
    # name cannot break matching. Dot-files are skipped, as glob's ``*`` did.
    supported_extensions = ('.png', '.jpg', '.jpeg')
    image_files = sorted(
        os.path.join(image_folder_path, name)
        for name in os.listdir(image_folder_path)
        if not name.startswith('.') and name.lower().endswith(supported_extensions)
    )
    if not image_files:
        raise ValueError(f"在文件夹 {image_folder_path} 中未找到任何PNG或JPG图片文件")
    print(f"找到 {len(image_files)} 张图片文件")

    samples = []
    for img_path in tqdm(image_files, desc="读取图片并转换为numpy数组"):
        # Best effort: one unreadable image must not abort the whole export.
        try:
            with Image.open(img_path) as img:
                # Force 3-channel RGB so every array has the same layout.
                if img.mode != 'RGB':
                    img = img.convert('RGB')
                img = img.resize((512, 512), Image.LANCZOS)
                sample_np = np.asarray(img, dtype=np.uint8)
                if sample_np.ndim != 3 or sample_np.shape[2] != 3:
                    print(f"警告: 跳过非3通道图片 {img_path}, 形状: {sample_np.shape}")
                    continue
                samples.append(sample_np)
        except Exception as e:
            print(f"警告: 无法读取图片 {img_path}: {e}")
            continue
    if not samples:
        raise ValueError("没有成功读取任何有效的图片文件")

    samples = np.stack(samples)
    print(f"成功读取 {len(samples)} 张图片，形状: {samples.shape}")

    if not str(output_path).endswith(".npz"):
        # np.savez appends ".npz" itself; mirror that so the reported and
        # returned path is the file that really exists afterwards.
        output_path = f"{output_path}.npz"
    np.savez(output_path, arr_0=samples)
    print(f"已保存 .npz 文件到 {output_path} [形状={samples.shape}]")
    return output_path
if __name__ == "__main__":
    # ``exit`` is an interactive helper injected by the ``site`` module and
    # may be missing (e.g. ``python -S``); SystemExit is always available.
    raise SystemExit(main())

pic_npz.py ADDED Viewed

	@@ -0,0 +1,157 @@

+#!/usr/bin/env python3
+"""
+将文件夹下所有PNG或JPG文件读取并生成对应NPZ文件
+基于 sample_ddp_new.py 中的 create_npz_from_sample_folder 函数改进
+支持自动检测图片数量，支持PNG和JPG格式，输出到父级目录
+"""
+import os
+import argparse
+import numpy as np
+from PIL import Image
+from tqdm import tqdm
+import glob
def main():
    """Command-line entry point: pack the images of a folder into one ``.npz`` file.

    With ``--output-dir`` the archive is written to
    ``<output-dir>/<folder name>.npz`` (the directory is created if needed).
    Otherwise the work is delegated to ``create_npz_from_image_folder``.

    Returns:
        int: ``0`` on success, ``1`` if the conversion raised an exception
        (the error message is printed).
    """
    parser = argparse.ArgumentParser(
        description="将文件夹下所有PNG或JPG文件转换为NPZ格式",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        # The folder is passed via --image_folder; the parser defines no
        # positional argument, so the examples must use the option.
        epilog="""
使用示例:
  python pic_npz.py --image_folder /path/to/image/folder
  python pic_npz.py --image_folder /path/to/image/folder --output-dir /custom/output/path
        """
    )
    parser.add_argument(
        "--image_folder",
        type=str,
        default="/gemini/space/gzy_new/models/Sida/sd3_rectified_samples_new_batch_2",
        help="包含PNG或JPG图片文件的文件夹路径"
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default=None,
        help="自定义输出目录（默认为输入文件夹的父级目录）"
    )
    args = parser.parse_args()
    # Top-level CLI boundary: report any failure and turn it into an exit code.
    try:
        image_folder_path = os.path.abspath(args.image_folder)
        if args.output_dir:
            folder_name = os.path.basename(image_folder_path.rstrip('/'))
            custom_output_path = os.path.join(args.output_dir, f"{folder_name}.npz")
            os.makedirs(args.output_dir, exist_ok=True)
            npz_path = create_npz_from_image_folder_custom(image_folder_path, custom_output_path)
        else:
            npz_path = create_npz_from_image_folder(image_folder_path)
        print(f"转换完成！NPZ文件已保存至: {npz_path}")
    except Exception as e:
        print(f"错误: {e}")
        return 1
    return 0
def create_npz_from_image_folder_custom(image_folder_path, output_path):
    """Pack every PNG/JPG image of a folder into one ``.npz`` archive.

    Files directly inside ``image_folder_path`` whose suffix is ``.png``,
    ``.jpg`` or ``.jpeg`` (any letter case) are processed in sorted path
    order. Each readable image is converted to RGB, resized to 512x512 with
    Lanczos resampling and collected as a ``uint8`` array. The stacked result
    is stored in the archive under the key ``arr_0``.

    Args:
        image_folder_path (str): Folder that contains the image files.
        output_path (str): Full path of the archive to write. A missing
            ``.npz`` suffix is appended so the returned path matches the file
            that ``np.savez`` actually writes.

    Returns:
        str: Path of the written ``.npz`` file.

    Raises:
        ValueError: If the folder does not exist, contains no PNG/JPG file,
            or none of the files can be read as an image.
    """
    if not os.path.isdir(image_folder_path):
        raise ValueError(f"文件夹路径不存在: {image_folder_path}")

    # Match on the lower-cased suffix instead of one glob pattern per case:
    # each file is listed exactly once even on case-insensitive filesystems,
    # mixed-case suffixes are found, and glob metacharacters in the folder
    # name cannot break matching. Dot-files are skipped, as glob's ``*`` did.
    supported_extensions = ('.png', '.jpg', '.jpeg')
    image_files = sorted(
        os.path.join(image_folder_path, name)
        for name in os.listdir(image_folder_path)
        if not name.startswith('.') and name.lower().endswith(supported_extensions)
    )
    if not image_files:
        raise ValueError(f"在文件夹 {image_folder_path} 中未找到任何PNG或JPG图片文件")
    print(f"找到 {len(image_files)} 张图片文件")

    samples = []
    for img_path in tqdm(image_files, desc="读取图片并转换为numpy数组"):
        # Best effort: one unreadable image must not abort the whole export.
        try:
            with Image.open(img_path) as img:
                # Force 3-channel RGB so every array has the same layout.
                if img.mode != 'RGB':
                    img = img.convert('RGB')
                img = img.resize((512, 512), Image.LANCZOS)
                sample_np = np.asarray(img, dtype=np.uint8)
                if sample_np.ndim != 3 or sample_np.shape[2] != 3:
                    print(f"警告: 跳过非3通道图片 {img_path}, 形状: {sample_np.shape}")
                    continue
                samples.append(sample_np)
        except Exception as e:
            print(f"警告: 无法读取图片 {img_path}: {e}")
            continue
    if not samples:
        raise ValueError("没有成功读取任何有效的图片文件")

    samples = np.stack(samples)
    print(f"成功读取 {len(samples)} 张图片，形状: {samples.shape}")

    if not str(output_path).endswith(".npz"):
        # np.savez appends ".npz" itself; mirror that so the reported and
        # returned path is the file that really exists afterwards.
        output_path = f"{output_path}.npz"
    np.savez(output_path, arr_0=samples)
    print(f"已保存 .npz 文件到 {output_path} [形状={samples.shape}]")
    return output_path
def create_npz_from_image_folder(image_folder_path):
    """Pack an image folder into ``<folder name>.npz`` inside its parent directory.

    The destination is derived from the absolute form of
    ``image_folder_path``; the conversion itself is delegated to
    ``create_npz_from_image_folder_custom``, whose return value is passed
    through unchanged.

    Args:
        image_folder_path (str): Folder that contains the image files.

    Returns:
        Whatever ``create_npz_from_image_folder_custom`` returns for the
        derived output path.
    """
    absolute_folder = os.path.abspath(image_folder_path)
    destination = os.path.join(
        os.path.dirname(absolute_folder),
        os.path.basename(absolute_folder.rstrip("/")) + ".npz",
    )
    return create_npz_from_image_folder_custom(image_folder_path, destination)
if __name__ == "__main__":
    # ``exit`` is an interactive helper injected by the ``site`` module and
    # may be missing (e.g. ``python -S``); SystemExit is always available.
    raise SystemExit(main())

pipeline_stable_diffusion_3.py ADDED Viewed

	@@ -0,0 +1,1378 @@

+# Copyright 2025 Stability AI, The HuggingFace Team and The InstantX Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Union
+import torch
+from transformers import (
+    CLIPTextModelWithProjection,
+    CLIPTokenizer,
+    SiglipImageProcessor,
+    SiglipVisionModel,
+    T5EncoderModel,
+    T5TokenizerFast,
+)
+from ...callbacks import MultiPipelineCallbacks, PipelineCallback
+from ...image_processor import PipelineImageInput, VaeImageProcessor
+from ...loaders import FromSingleFileMixin, SD3IPAdapterMixin, SD3LoraLoaderMixin
+from ...models.autoencoders import AutoencoderKL
+from ...models.transformers import SD3Transformer2DModel
+from ...schedulers import FlowMatchEulerDiscreteScheduler
+from ...utils import (
+    USE_PEFT_BACKEND,
+    is_torch_xla_available,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline
+from .pipeline_output import StableDiffusion3PipelineOutput
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+#logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import StableDiffusion3Pipeline
+        >>> pipe = StableDiffusion3Pipeline.from_pretrained(
+        ...     "stabilityai/stable-diffusion-3-medium-diffusers", torch_dtype=torch.float16
+        ... )
+        >>> pipe.to("cuda")
+        >>> prompt = "A cat holding a sign that says hello world"
+        >>> image = pipe(prompt).images[0]
+        >>> image.save("sd3.png")
+        ```
+"""
+# Copied from diffusers.pipelines.flux.pipeline_flux.calculate_shift
def calculate_shift(
    image_seq_len,
    base_seq_len: int = 256,
    max_seq_len: int = 4096,
    base_shift: float = 0.5,
    max_shift: float = 1.15,
):
    """Linearly map an image sequence length to a shift value.

    The mapping is the straight line through the points
    ``(base_seq_len, base_shift)`` and ``(max_seq_len, max_shift)``; lengths
    outside that interval are extrapolated on the same line.

    Args:
        image_seq_len: Sequence length to evaluate the line at.
        base_seq_len: Sequence length that maps to ``base_shift``.
        max_seq_len: Sequence length that maps to ``max_shift``.
        base_shift: Shift value at ``base_seq_len``.
        max_shift: Shift value at ``max_seq_len``.

    Returns:
        The interpolated shift for ``image_seq_len``.
    """
    slope = (max_shift - base_shift) / (max_seq_len - base_seq_len)
    intercept = base_shift - slope * base_seq_len
    return image_seq_len * slope + intercept
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
def retrieve_timesteps(
    scheduler,
    num_inference_steps: Optional[int] = None,
    device: Optional[Union[str, torch.device]] = None,
    timesteps: Optional[List[int]] = None,
    sigmas: Optional[List[float]] = None,
    **kwargs,
):
    r"""
    Configure the scheduler through its `set_timesteps` method and return the resulting timestep schedule.

    Exactly one of three modes is used: a custom `timesteps` list, a custom `sigmas` list, or a plain
    `num_inference_steps` count. Extra keyword arguments are forwarded to `scheduler.set_timesteps`.

    Args:
        scheduler (`SchedulerMixin`):
            The scheduler whose `set_timesteps` is called and whose `timesteps` attribute is read afterwards.
        num_inference_steps (`int`):
            Number of steps, passed positionally to `set_timesteps` when neither `timesteps` nor `sigmas` is
            given.
        device (`str` or `torch.device`, *optional*):
            Forwarded to `set_timesteps` as the `device` keyword argument.
        timesteps (`List[int]`, *optional*):
            Custom timesteps, forwarded as the `timesteps` keyword argument. Cannot be combined with `sigmas`.
        sigmas (`List[float]`, *optional*):
            Custom sigmas, forwarded as the `sigmas` keyword argument. Cannot be combined with `timesteps`.

    Returns:
        `Tuple[torch.Tensor, int]`: The scheduler's `timesteps` after configuration, and the number of
        inference steps (the length of that schedule when a custom list was supplied, otherwise
        `num_inference_steps` as passed in).

    Raises:
        `ValueError`: If both `timesteps` and `sigmas` are given, or if the scheduler's `set_timesteps` has no
        parameter for the custom list that was supplied.
    """
    if timesteps is not None and sigmas is not None:
        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")

    # Default mode: let the scheduler derive its schedule from the step count.
    if timesteps is None and sigmas is None:
        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
        return scheduler.timesteps, num_inference_steps

    # Custom schedules are only valid if `set_timesteps` declares the matching parameter.
    accepted_params = inspect.signature(scheduler.set_timesteps).parameters
    if timesteps is not None:
        if "timesteps" not in accepted_params:
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" timestep schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
    else:
        if "sigmas" not in accepted_params:
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" sigmas schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)

    schedule = scheduler.timesteps
    return schedule, len(schedule)
+class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingleFileMixin, SD3IPAdapterMixin):
+    r"""
+    Args:
+        transformer ([`SD3Transformer2DModel`]):
+            Conditional Transformer (MMDiT) architecture to denoise the encoded image latents.
+        scheduler ([`FlowMatchEulerDiscreteScheduler`]):
+            A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModelWithProjection`]):
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection),
+            specifically the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant,
+            with an additional added projection layer that is initialized with a diagonal matrix with the `hidden_size`
+            as its dimension.
+        text_encoder_2 ([`CLIPTextModelWithProjection`]):
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection),
+            specifically the
+            [laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)
+            variant.
+        text_encoder_3 ([`T5EncoderModel`]):
+            Frozen text-encoder. Stable Diffusion 3 uses
+            [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel), specifically the
+            [t5-v1_1-xxl](https://huggingface.co/google/t5-v1_1-xxl) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        tokenizer_2 (`CLIPTokenizer`):
+            Second Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        tokenizer_3 (`T5TokenizerFast`):
+            Tokenizer of class
+            [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer).
+        image_encoder (`SiglipVisionModel`, *optional*):
+            Pre-trained Vision Model for IP Adapter.
+        feature_extractor (`SiglipImageProcessor`, *optional*):
+            Image processor for IP Adapter.
+        model (`SD3WithRectifiedNoise`, *optional*):
+            Optional SD3WithRectifiedNoise model for enhanced noise prediction. If provided, will be used instead of
+            the default transformer for denoising.
+    """
+    # Presumably the component order used by the base pipeline's CPU-offload logic (defined outside this file).
+    # NOTE(review): the optional `model` component accepted by `__init__` is not part of this sequence — confirm
+    # whether it should be offloaded as well.
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->text_encoder_3->image_encoder->transformer->vae"
+    # Presumably the components that are allowed to be `None`.
+    # NOTE(review): `model` also defaults to `None` in `__init__` but is not listed here — confirm this is intended.
+    _optional_components = ["image_encoder", "feature_extractor"]
+    # Names that `check_inputs` accepts in `callback_on_step_end_tensor_inputs`.
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "pooled_prompt_embeds"]
+    def __init__(
+        self,
+        transformer: SD3Transformer2DModel,
+        scheduler: FlowMatchEulerDiscreteScheduler,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModelWithProjection,
+        tokenizer: CLIPTokenizer,
+        text_encoder_2: CLIPTextModelWithProjection,
+        tokenizer_2: CLIPTokenizer,
+        text_encoder_3: T5EncoderModel,
+        tokenizer_3: T5TokenizerFast,
+        image_encoder: SiglipVisionModel = None,
+        feature_extractor: SiglipImageProcessor = None,
+        model = None,  # 添加 model 参数
+    ):
+        super().__init__()
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            text_encoder_2=text_encoder_2,
+            text_encoder_3=text_encoder_3,
+            tokenizer=tokenizer,
+            tokenizer_2=tokenizer_2,
+            tokenizer_3=tokenizer_3,
+            transformer=transformer,
+            scheduler=scheduler,
+            image_encoder=image_encoder,
+            feature_extractor=feature_extractor,
+            model=model,  # 添加 model 参数到 register_modules
+        )
+        #print(f"VAE is None: {getattr(self, 'vae', None) is None}")
+        #if getattr(self, 'vae', None) is not None:
+        #    print(f"VAE config block_out_channels: {self.vae.config.block_out_channels}")
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
+        #print(f"VAE scale factor: {self.vae_scale_factor}")
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.tokenizer_max_length = (
+            self.tokenizer.model_max_length if hasattr(self, "tokenizer") and self.tokenizer is not None else 77
+        )
+        self.default_sample_size = (
+            self.transformer.config.sample_size
+            if hasattr(self, "transformer") and self.transformer is not None
+            else 64
+        )
+        self.patch_size = (
+            self.transformer.config.patch_size if hasattr(self, "transformer") and self.transformer is not None else 2
+        )
+        # 添加对 SD3WithRectifiedNoise 模型的支持
+        self.model = model
+    def _get_t5_prompt_embeds(
+        self,
+        prompt: Union[str, List[str]] = None,
+        num_images_per_prompt: int = 1,
+        max_sequence_length: int = 256,
+        device: Optional[torch.device] = None,
+        dtype: Optional[torch.dtype] = None,
+    ):
+        device = device or self._execution_device
+        dtype = dtype or self.text_encoder_3.dtype
+        #max_sequence_length=77
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        batch_size = len(prompt)
+        # print(f"T5处理 - 输入提示: {prompt}")
+        # print(f"T5处理 - batch_size: {batch_size}")
+        # print(f"T5处理 - num_images_per_prompt: {num_images_per_prompt}")
+        # print(f"T5处理 - max_sequence_length: {max_sequence_length}")
+        # print(f"T5处理 - device: {device}")
+        # print(f"T5处理 - dtype: {dtype}")
+        if self.text_encoder_3 is None:
+            #print("T5处理 - text_encoder_3为None，返回零张量")
+            return torch.zeros(
+                (
+                    batch_size * num_images_per_prompt,
+                    self.tokenizer_max_length,
+                    self.transformer.config.joint_attention_dim,
+                ),
+                device=device,
+                dtype=dtype,
+            )
+        text_inputs = self.tokenizer_3(
+            prompt,
+            padding="max_length",
+            max_length=max_sequence_length,
+            truncation=True,
+           # add_special_tokens=True,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+        # if torch.isnan(text_input_ids).any():
+        #     print("T5处理 - text_input_ids输入提示包含NaN值")
+        # else:
+        #     print("T5处理 - text_input_ids",text_input_ids)
+        # print(f"T5处理 - text_input_ids形状: {text_input_ids.shape}")
+        # print(f"T5处理 - text_input_ids范围: [{text_input_ids.min().item()}, {text_input_ids.max().item()}]")
+        # print(f"T5处理 - tokenizer_3.vocab_size: {self.tokenizer_3.vocab_size}")
+        # # 检查输入token IDs是否包含非法值
+        # if torch.any(text_input_ids < 0):
+        #     print(f"警告：发现负数token ID，最小值: {text_input_ids.min().item()}")
+        # if torch.any(text_input_ids >= self.tokenizer_3.vocab_size):
+        #     print(f"警告：发现超过词汇表大小的token ID，最大值: {text_input_ids.max().item()}, vocab_size: {self.tokenizer_3.vocab_size}")
+        untruncated_ids = self.tokenizer_3(prompt, padding="longest", return_tensors="pt").input_ids
+        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer_3.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because `max_sequence_length` is set to "
+                f" {max_sequence_length} tokens: {removed_text}"
+            )
+        # 将输入移动到设备并确保数据类型正确
+        text_input_ids = text_input_ids.to(device)#, dtype=torch.long)
+        # print(f"T5处理 - text_input_ids形状: ",text_input_ids)
+        # print(f"T5处理 - text_input_ids设备: {text_input_ids.device}, dtype: {text_input_ids.dtype}")
+        # 检查text_encoder_3的状态
+        # print(f"T5处理 - text_encoder_3设备: {next(self.text_encoder_3.parameters()).device}")
+        # print(f"T5处理 - text_encoder_3.dtype: {self.text_encoder_3.dtype}")
+        with torch.autocast(device.type if isinstance(device, torch.device) else "cuda", enabled=False):
+            prompt_embeds = self.text_encoder_3(text_input_ids.to(device))[0]
+        #prompt_embeds = self.text_encoder_3(text_input_ids)[0]
+        # print(f"T5处理 - T5编码器输出形状: {prompt_embeds.shape}")
+        # print(f"T5处理 - T5编码器输出设备: {prompt_embeds.device}")
+        # print(f"T5处理 - T5编码器输出dtype: {prompt_embeds.dtype}")
+        # # 检查T5编码器输出是否包含NaN或inf
+        # has_nan = torch.isnan(prompt_embeds).any()
+        # has_inf = torch.isinf(prompt_embeds).any()
+        # if has_nan or has_inf:
+        #     print(f"警告：T5编码器输出包含NaN: {has_nan} 或inf: {has_inf}")
+        #     print(f"T5编码器输出统计 - min: {prompt_embeds.min().item()}, max: {prompt_embeds.max().item()}, mean: {prompt_embeds.mean().item()}")
+        #     if has_nan:
+        #         nan_locations = torch.where(torch.isnan(prompt_embeds))
+        #         print(f"NaN位置 - 前10个: {[(nan_locations[i][:10].tolist()) for i in range(len(nan_locations))]}")
+        dtype = self.text_encoder_3.dtype
+        # 强制在无 autocast 的上下文中运行 T5 以避免 fp16 溢出为 NaN
+        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
+        #print(f"T5处理 - 转换后prompt_embeds dtype: {prompt_embeds.dtype}, device: {prompt_embeds.device}")
+        _, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+        # print(f"T5处理 - 最终输出形状: {prompt_embeds.shape}")
+        # # 检查最终输出是否包含NaN
+        # if torch.isnan(prompt_embeds).any():
+        #     print(f"警告：T5最终输出包含NaN，位置: {torch.where(torch.isnan(prompt_embeds))}")
+        #     print(f"T5最终输出统计 - min: {prompt_embeds.min().item()}, max: {prompt_embeds.max().item()}, mean: {prompt_embeds.mean().item()}")
+        # print("最终输出",prompt_embeds)
+        return prompt_embeds
+    def _get_clip_prompt_embeds(
+        self,
+        prompt: Union[str, List[str]],
+        num_images_per_prompt: int = 1,
+        device: Optional[torch.device] = None,
+        clip_skip: Optional[int] = None,
+        clip_model_index: int = 0,
+    ):
+        device = device or self._execution_device
+        clip_tokenizers = [self.tokenizer, self.tokenizer_2]
+        clip_text_encoders = [self.text_encoder, self.text_encoder_2]
+        tokenizer = clip_tokenizers[clip_model_index]
+        text_encoder = clip_text_encoders[clip_model_index]
+        # print(f"CLIP处理 - clip_model_index: {clip_model_index}")
+        # print(f"CLIP处理 - 使用的tokenizer: {type(tokenizer)}")
+        # print(f"CLIP处理 - 使用的text_encoder: {type(text_encoder)}")
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        batch_size = len(prompt)
+        # print(f"CLIP处理 - 输入提示: {prompt}")
+        # print(f"CLIP处理 - batch_size: {batch_size}")
+        text_inputs = tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+        # print(f"CLIP处理 - text_input_ids形状: {text_input_ids.shape}")
+        # print(f"CLIP处理 - text_input_ids范围: [{text_input_ids.min().item()}, {text_input_ids.max().item()}]")
+        untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = tokenizer.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer_max_length} tokens: {removed_text}"
+            )
+        prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
+        pooled_prompt_embeds = prompt_embeds[0]
+        # print(f"CLIP处理 - pooled_prompt_embeds形状: {pooled_prompt_embeds.shape}")
+        # print(f"CLIP处理 - pooled_prompt_embeds设备: {pooled_prompt_embeds.device}")
+        # print(f"CLIP处理 - pooled_prompt_embeds dtype: {pooled_prompt_embeds.dtype}")
+        # 检查CLIP编码器输出是否包含NaN或inf
+        # has_nan = torch.isnan(pooled_prompt_embeds).any()
+        # has_inf = torch.isinf(pooled_prompt_embeds).any()
+        # if has_nan or has_inf:
+        #     print(f"警告：CLIP编码器pooled输出包含NaN: {has_nan} 或inf: {has_inf}")
+        #     print(f"CLIP编码器pooled输出统计 - min: {pooled_prompt_embeds.min().item()}, max: {pooled_prompt_embeds.max().item()}, mean: {pooled_prompt_embeds.mean().item()}")
+        if clip_skip is None:
+            prompt_embeds = prompt_embeds.hidden_states[-2]
+        else:
+            prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)]
+        # 检查CLIP编码器embeds输出是否包含NaN或inf
+        # has_nan = torch.isnan(prompt_embeds).any()
+        # has_inf = torch.isinf(prompt_embeds).any()
+        # if has_nan or has_inf:
+        #     print(f"警告：CLIP编码器embeds输出包含NaN: {has_nan} 或inf: {has_inf}")
+        #     print(f"CLIP编码器embeds输出统计 - min: {prompt_embeds.min().item()}, max: {prompt_embeds.max().item()}, mean: {prompt_embeds.mean().item()}")
+        prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
+        #print(f"CLIP处理 - 转换后prompt_embeds dtype: {prompt_embeds.dtype}, device: {prompt_embeds.device}")
+        _, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+        # print(f"CLIP处理 - 最终prompt_embeds形状: {prompt_embeds.shape}")
+        # # 检查CLIP编码器embeds最终输出是否包含NaN
+        # if torch.isnan(prompt_embeds).any():
+        #     print(f"警告：CLIP编码器embeds最终输出包含NaN，位置: {torch.where(torch.isnan(prompt_embeds))}")
+        #     print(f"CLIP编码器embeds最终输出统计 - min: {prompt_embeds.min().item()}, max: {prompt_embeds.max().item()}, mean: {prompt_embeds.mean().item()}")
+        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        pooled_prompt_embeds = pooled_prompt_embeds.view(batch_size * num_images_per_prompt, -1)
+        # print(f"CLIP处理 - 最终pooled_prompt_embeds形状: {pooled_prompt_embeds.shape}")
+        # # 检查CLIP编码器pooled最终输出是否包含NaN
+        # if torch.isnan(pooled_prompt_embeds).any():
+        #     print(f"警告：CLIP编码器pooled最终输出包含NaN，位置: {torch.where(torch.isnan(pooled_prompt_embeds))}")
+        #     print(f"CLIP编码器pooled最终输出统计 - min: {pooled_prompt_embeds.min().item()}, max: {pooled_prompt_embeds.max().item()}, mean: {pooled_prompt_embeds.mean().item()}")
+        return prompt_embeds, pooled_prompt_embeds
+    def encode_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        prompt_2: Union[str, List[str]],
+        prompt_3: Union[str, List[str]],
+        device: Optional[torch.device] = None,
+        num_images_per_prompt: int = 1,
+        do_classifier_free_guidance: bool = True,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        negative_prompt_2: Optional[Union[str, List[str]]] = None,
+        negative_prompt_3: Optional[Union[str, List[str]]] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        clip_skip: Optional[int] = None,
+        max_sequence_length: int = 256,
+        lora_scale: Optional[float] = None,
+    ):
+        r"""
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in all text-encoders
+            prompt_3 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_3` and `text_encoder_3`. If not defined, `prompt` is
+                used in all text-encoders
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in all the text-encoders.
+            negative_prompt_3 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_3` and
+                `text_encoder_3`. If not defined, `negative_prompt` is used in all the text-encoders.
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            lora_scale (`float`, *optional*):
+                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+        """
+        device = device or self._execution_device
+        # print(f"encode_prompt - 开始处理提示编码")
+        # print(f"encode_prompt - device: {device}")
+        # print(f"encode_prompt - num_images_per_prompt: {num_images_per_prompt}")
+        # print(f"encode_prompt - do_classifier_free_guidance: {do_classifier_free_guidance}")
+        # print(f"encode_prompt - max_sequence_length: {max_sequence_length}")
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, SD3LoraLoaderMixin):
+            self._lora_scale = lora_scale
+            # dynamically adjust the LoRA scale
+            if self.text_encoder is not None and USE_PEFT_BACKEND:
+                scale_lora_layers(self.text_encoder, lora_scale)
+            if self.text_encoder_2 is not None and USE_PEFT_BACKEND:
+                scale_lora_layers(self.text_encoder_2, lora_scale)
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        if prompt is not None:
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+       # print(f"encode_prompt - batch_size: {batch_size}")
+        if prompt_embeds is None:
+            prompt_2 = prompt_2 or prompt
+            prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
+            prompt_3 = prompt_3 or prompt
+            prompt_3 = [prompt_3] if isinstance(prompt_3, str) else prompt_3
+            prompt_embed, pooled_prompt_embed = self._get_clip_prompt_embeds(
+                prompt=prompt,
+                device=device,
+                num_images_per_prompt=num_images_per_prompt,
+                clip_skip=clip_skip,
+                clip_model_index=0,
+            )
+            prompt_2_embed, pooled_prompt_2_embed = self._get_clip_prompt_embeds(
+                prompt=prompt_2,
+                device=device,
+                num_images_per_prompt=num_images_per_prompt,
+                clip_skip=clip_skip,
+                clip_model_index=1,
+            )
+            clip_prompt_embeds = torch.cat([prompt_embed, prompt_2_embed], dim=-1)
+            t5_prompt_embed = self._get_t5_prompt_embeds(
+                prompt=prompt_3,
+                num_images_per_prompt=num_images_per_prompt,
+                max_sequence_length=max_sequence_length,
+                device=device,
+            )
+            clip_prompt_embeds = torch.nn.functional.pad(
+                clip_prompt_embeds, (0, t5_prompt_embed.shape[-1] - clip_prompt_embeds.shape[-1])
+            )
+            prompt_embeds = torch.cat([clip_prompt_embeds, t5_prompt_embed], dim=-2)
+            pooled_prompt_embeds = torch.cat([pooled_prompt_embed, pooled_prompt_2_embed], dim=-1)
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            negative_prompt = negative_prompt or ""
+            negative_prompt_2 = negative_prompt_2 or negative_prompt
+            negative_prompt_3 = negative_prompt_3 or negative_prompt
+            # normalize str to list
+            negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
+            negative_prompt_2 = (
+                batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2
+            )
+            negative_prompt_3 = (
+                batch_size * [negative_prompt_3] if isinstance(negative_prompt_3, str) else negative_prompt_3
+            )
+            if prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            negative_prompt_embed, negative_pooled_prompt_embed = self._get_clip_prompt_embeds(
+                negative_prompt,
+                device=device,
+                num_images_per_prompt=num_images_per_prompt,
+                clip_skip=None,
+                clip_model_index=0,
+            )
+            negative_prompt_2_embed, negative_pooled_prompt_2_embed = self._get_clip_prompt_embeds(
+                negative_prompt_2,
+                device=device,
+                num_images_per_prompt=num_images_per_prompt,
+                clip_skip=None,
+                clip_model_index=1,
+            )
+            negative_clip_prompt_embeds = torch.cat([negative_prompt_embed, negative_prompt_2_embed], dim=-1)
+            t5_negative_prompt_embed = self._get_t5_prompt_embeds(
+                prompt=negative_prompt_3,
+                num_images_per_prompt=num_images_per_prompt,
+                max_sequence_length=max_sequence_length,
+                device=device,
+            )
+            negative_clip_prompt_embeds = torch.nn.functional.pad(
+                negative_clip_prompt_embeds,
+                (0, t5_negative_prompt_embed.shape[-1] - negative_clip_prompt_embeds.shape[-1]),
+            )
+            negative_prompt_embeds = torch.cat([negative_clip_prompt_embeds, t5_negative_prompt_embed], dim=-2)
+            negative_pooled_prompt_embeds = torch.cat(
+                [negative_pooled_prompt_embed, negative_pooled_prompt_2_embed], dim=-1
+            )
+        if self.text_encoder is not None:
+            if isinstance(self, SD3LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder_2 is not None:
+            if isinstance(self, SD3LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder_2, lora_scale)
+        return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
+    def check_inputs(
+        self,
+        prompt,
+        prompt_2,
+        prompt_3,
+        height,
+        width,
+        negative_prompt=None,
+        negative_prompt_2=None,
+        negative_prompt_3=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        pooled_prompt_embeds=None,
+        negative_pooled_prompt_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+        max_sequence_length=None,
+    ):
+        if (
+            height % (self.vae_scale_factor * self.patch_size) != 0
+            or width % (self.vae_scale_factor * self.patch_size) != 0
+        ):
+            raise ValueError(
+                f"`height` and `width` have to be divisible by {self.vae_scale_factor * self.patch_size} but are {height} and {width}."
+                f"You can use height {height - height % (self.vae_scale_factor * self.patch_size)} and width {width - width % (self.vae_scale_factor * self.patch_size)}."
+            )
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt_2 is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt_3 is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt_3`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+        elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
+            raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
+        elif prompt_3 is not None and (not isinstance(prompt_3, str) and not isinstance(prompt_3, list)):
+            raise ValueError(f"`prompt_3` has to be of type `str` or `list` but is {type(prompt_3)}")
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+        elif negative_prompt_2 is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+        elif negative_prompt_3 is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt_3`: {negative_prompt_3} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+        if prompt_embeds is not None and pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
+            )
+        if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
+            )
+        if max_sequence_length is not None and max_sequence_length > 512:
+            raise ValueError(f"`max_sequence_length` cannot be greater than 512 but is {max_sequence_length}")
+    def prepare_latents(
+        self,
+        batch_size,
+        num_channels_latents,
+        height,
+        width,
+        dtype,
+        device,
+        generator,
+        latents=None,
+    ):
+        """Return the starting latents for the denoising loop.
+        Args:
+            batch_size: First dimension of the sampled latents; also the required length of `generator` when it
+                is a list.
+            num_channels_latents: Second dimension of the sampled latents.
+            height: Converted with `int()` and floor-divided by `self.vae_scale_factor` to get the third dimension.
+            width: Converted with `int()` and floor-divided by `self.vae_scale_factor` to get the fourth dimension.
+            dtype: Dtype of the returned tensor.
+            device: Device of the returned tensor.
+            generator: Forwarded to `randn_tensor`; a list must contain exactly `batch_size` entries.
+            latents: Optional pre-made latents. When given they are only moved/cast to `device`/`dtype`; their
+                shape is not checked and `generator` is ignored.
+        Returns:
+            The supplied `latents` on `device`/`dtype`, or a new tensor produced by `randn_tensor`.
+        Raises:
+            ValueError: If `generator` is a list whose length differs from `batch_size`.
+        """
+        # Caller-provided latents short-circuit everything else, including the generator check.
+        if latents is not None:
+            return latents.to(device=device, dtype=dtype)
+        latent_height = int(height) // self.vae_scale_factor
+        latent_width = int(width) // self.vae_scale_factor
+        shape = (batch_size, num_channels_latents, latent_height, latent_width)
+        if isinstance(generator, list):
+            if len(generator) != batch_size:
+                raise ValueError(
+                    f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                    f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+                )
+        return randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+    @property
+    def guidance_scale(self):
+        """Guidance scale stored on the pipeline by `__call__` from its `guidance_scale` argument."""
+        return self._guidance_scale
+    @property
+    def skip_guidance_layers(self):
+        """Value of the private `_skip_guidance_layers` attribute.
+        NOTE(review): no assignment to `_skip_guidance_layers` is visible near `__call__`'s other state setup;
+        confirm it is set elsewhere before this property is read, otherwise the access raises `AttributeError`.
+        """
+        return self._skip_guidance_layers
+    @property
+    def clip_skip(self):
+        """`clip_skip` value stored by `__call__`, which later forwards it to `encode_prompt`."""
+        return self._clip_skip
+    # `guidance_scale` here is defined analogously to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://huggingface.co/papers/2205.11487 . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        """Whether classifier-free guidance is active, i.e. the stored guidance scale is strictly above 1."""
+        current_scale = self._guidance_scale
+        return current_scale > 1
+    @property
+    def joint_attention_kwargs(self):
+        """Attention kwargs stored by `__call__`; may be `None`.
+        `__call__` adds an `ip_adapter_image_embeds` entry to it when IP-Adapter embeddings are prepared.
+        """
+        return self._joint_attention_kwargs
+    @property
+    def num_timesteps(self):
+        """Length of the timestep schedule recorded by `__call__` (`len(timesteps)`)."""
+        return self._num_timesteps
+    @property
+    def interrupt(self):
+        """Interrupt flag for the denoising loop.
+        `__call__` resets it to `False` before running; while it is truthy the loop skips each remaining
+        iteration with `continue`.
+        """
+        return self._interrupt
+    # Adapted from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_image
+    def encode_image(self, image: PipelineImageInput, device: torch.device) -> torch.Tensor:
+        """Run `self.image_encoder` on an image and return its second-to-last hidden state.
+        Args:
+            image (`PipelineImageInput`):
+                Image to encode. Anything that is not already a `torch.Tensor` is first passed through
+                `self.feature_extractor` and its `pixel_values` are used.
+            device: (`torch.device`):
+                Device the pixel values are moved to before encoding.
+        Returns:
+            `torch.Tensor`: `hidden_states[-2]` of the image encoder output.
+        """
+        if isinstance(image, torch.Tensor):
+            pixel_values = image
+        else:
+            pixel_values = self.feature_extractor(image, return_tensors="pt").pixel_values
+        # Cast to the pipeline dtype (`self.dtype`) on the target device before encoding.
+        pixel_values = pixel_values.to(device=device, dtype=self.dtype)
+        encoder_output = self.image_encoder(pixel_values, output_hidden_states=True)
+        return encoder_output.hidden_states[-2]
+    # Adapted from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.prepare_ip_adapter_image_embeds
+    def prepare_ip_adapter_image_embeds(
+        self,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[torch.Tensor] = None,
+        device: Optional[torch.device] = None,
+        num_images_per_prompt: int = 1,
+        do_classifier_free_guidance: bool = True,
+    ) -> torch.Tensor:
+        """Prepares image embeddings for use in the IP-Adapter.
+        Either `ip_adapter_image` or `ip_adapter_image_embeds` must be passed. When both are given,
+        `ip_adapter_image_embeds` takes precedence and `ip_adapter_image` is not encoded.
+        Args:
+            ip_adapter_image (`PipelineImageInput`, *optional*):
+                The input image to extract features from for IP-Adapter (encoded with `self.encode_image`).
+            ip_adapter_image_embeds (`torch.Tensor`, *optional*):
+                Precomputed image embeddings. With classifier free guidance enabled the tensor is split in two
+                along dim 0 with `chunk(2)`: the first half is used as the negative embeddings, the second half as
+                the positive ones.
+            device: (`torch.device`, *optional*):
+                Torch device. Falls back to `self._execution_device` when not given.
+            num_images_per_prompt (`int`, defaults to 1):
+                Number of times the embeddings are repeated along dim 0.
+            do_classifier_free_guidance (`bool`, defaults to True):
+                Whether to use classifier free guidance or not.
+        Returns:
+            `torch.Tensor`: The repeated image embeddings on `device`. With classifier free guidance the negative
+            embeddings are concatenated in front of the positive ones along dim 0.
+        Raises:
+            ValueError: If neither `ip_adapter_image` nor `ip_adapter_image_embeds` is provided.
+        """
+        device = device or self._execution_device
+        if ip_adapter_image_embeds is not None:
+            if do_classifier_free_guidance:
+                single_negative_image_embeds, single_image_embeds = ip_adapter_image_embeds.chunk(2)
+            else:
+                single_image_embeds = ip_adapter_image_embeds
+        elif ip_adapter_image is not None:
+            single_image_embeds = self.encode_image(ip_adapter_image, device)
+            if do_classifier_free_guidance:
+                # No negative image is available, so the unconditional branch uses all-zero embeddings.
+                single_negative_image_embeds = torch.zeros_like(single_image_embeds)
+        else:
+            # The original message named `ip_adapter_image_embeds` twice; name both accepted inputs.
+            raise ValueError("Neither `ip_adapter_image` nor `ip_adapter_image_embeds` were provided.")
+        image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+        if do_classifier_free_guidance:
+            negative_image_embeds = torch.cat([single_negative_image_embeds] * num_images_per_prompt, dim=0)
+            # Order matches the text embeddings in `__call__`: negative first, then positive.
+            image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0)
+        return image_embeds.to(device=device)
+    def enable_sequential_cpu_offload(self, *args, **kwargs):
+        """Warn about a possible `image_encoder` offload problem, then delegate to the parent implementation.
+        The warning is logged only when an image encoder is present and "image_encoder" is not listed in
+        `self._exclude_from_cpu_offload`. All arguments are forwarded unchanged to
+        `super().enable_sequential_cpu_offload`.
+        """
+        has_image_encoder = self.image_encoder is not None
+        # The exclusion list is consulted only when an encoder exists (same short-circuit as a single `and`).
+        if has_image_encoder and "image_encoder" not in self._exclude_from_cpu_offload:
+            warning_text = (
+                "`pipe.enable_sequential_cpu_offload()` might fail for `image_encoder` if it uses "
+                "`torch.nn.MultiheadAttention`. You can exclude `image_encoder` from CPU offloading by calling "
+                "`pipe._exclude_from_cpu_offload.append('image_encoder')` before `pipe.enable_sequential_cpu_offload()`."
+            )
+            logger.warning(warning_text)
+        super().enable_sequential_cpu_offload(*args, **kwargs)
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        prompt_2: Optional[Union[str, List[str]]] = None,
+        prompt_3: Optional[Union[str, List[str]]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 28,
+        sigmas: Optional[List[float]] = None,
+        guidance_scale: float = 7.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        negative_prompt_2: Optional[Union[str, List[str]]] = None,
+        negative_prompt_3: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        max_sequence_length: int = 256,
+        skip_guidance_layers: List[int] = None,
+        skip_layer_guidance_scale: float = 2.8,
+        skip_layer_guidance_stop: float = 0.2,
+        skip_layer_guidance_start: float = 0.01,
+        mu: Optional[float] = None,
+        model: Optional[Any] = None,  # 添加 model 参数，默认为 None
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
+                instead.
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt`
+                will be used instead.
+            prompt_3 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to `tokenizer_3` and `text_encoder_3`. If not defined, `prompt`
+                will be used instead.
+            height (`int`, *optional*, defaults to self.default_sample_size * self.vae_scale_factor):
+                The height in pixels of the generated image. This is set to 1024 by default for the best results.
+            width (`int`, *optional*, defaults to self.default_sample_size * self.vae_scale_factor):
+                The width in pixels of the generated image. This is set to 1024 by default for the best results.
+            num_inference_steps (`int`, *optional*, defaults to 28):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            sigmas (`List[float]`, *optional*):
+                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+                will be used.
+            guidance_scale (`float`, *optional*, defaults to 7.0):
+                Guidance scale as defined in [Classifier-Free Diffusion
+                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
+                of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
+                `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
+                the text `prompt`, usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                not greater than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used instead
+            negative_prompt_3 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_3` and
+                `text_encoder_3`. If not defined, `negative_prompt` is used instead
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will be generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            ip_adapter_image (`PipelineImageInput`, *optional*):
+                Optional image input to work with IP Adapters.
+            ip_adapter_image_embeds (`torch.Tensor`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. Should be a tensor of shape `(batch_size, num_images,
+                emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to
+                `True`. If not provided, embeddings are computed from the `ip_adapter_image` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion_3.StableDiffusion3PipelineOutput`] instead of
+                a plain tuple.
+            joint_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+            max_sequence_length (`int` defaults to 256): Maximum sequence length to use with the `prompt`.
+            skip_guidance_layers (`List[int]`, *optional*):
+                A list of integers that specify layers to skip during guidance. If not provided, all layers will be
+                used for guidance. If provided, the guidance will only be applied to the layers specified in the list.
+                Recommended value by StabilityAI for Stable Diffusion 3.5 Medium is [7, 8, 9].
+            skip_layer_guidance_scale (`float`, *optional*, defaults to 2.8): The scale of the guidance for the layers specified in
+                `skip_guidance_layers`. The guidance will be applied to the layers specified in `skip_guidance_layers`
+                with a scale of `skip_layer_guidance_scale`. The guidance will be applied to the rest of the layers
+                with a scale of `1`.
+            skip_layer_guidance_stop (`float`, *optional*, defaults to 0.2): The fraction of the denoising steps at
+                which the guidance for the layers specified in `skip_guidance_layers` will stop. The guidance will be
+                applied to the layers specified in `skip_guidance_layers` until the fraction specified in
+                `skip_layer_guidance_stop`. Recommended value by StabilityAI for Stable Diffusion 3.5 Medium is 0.2.
+            skip_layer_guidance_start (`float`, *optional*, defaults to 0.01): The fraction of the denoising steps at
+                which the guidance for the layers specified in `skip_guidance_layers` will start. The guidance will be
+                applied to the layers specified in `skip_guidance_layers` from the fraction specified in
+                `skip_layer_guidance_start`. Recommended value by StabilityAI for Stable Diffusion 3.5 Medium is 0.01.
+            mu (`float`, *optional*): `mu` value used for `dynamic_shifting`.
+            model (`SD3WithRectifiedNoise`, *optional*):
+                Optional SD3WithRectifiedNoise model for enhanced noise prediction. If provided, will be used instead of
+                the default transformer for denoising. The model should be an instance of SD3WithRectifiedNoise class.
+        Examples:
+        Returns:
+            [`~pipelines.stable_diffusion_3.StableDiffusion3PipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion_3.StableDiffusion3PipelineOutput`] if `return_dict` is True, otherwise a
+            `tuple`. When returning a tuple, the first element is a list with the generated images.
+        """
+        height = height or self.default_sample_size * self.vae_scale_factor
+        width = width or self.default_sample_size * self.vae_scale_factor
+        #height=512
+        #width=512
+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            prompt_2,
+            prompt_3,
+            height,
+            width,
+            negative_prompt=negative_prompt,
+            negative_prompt_2=negative_prompt_2,
+            negative_prompt_3=negative_prompt_3,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
+            max_sequence_length=max_sequence_length,
+        )
+        self._guidance_scale = guidance_scale
+        self._skip_layer_guidance_scale = skip_layer_guidance_scale
+        self._clip_skip = clip_skip
+        self._joint_attention_kwargs = joint_attention_kwargs
+        self._interrupt = False
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+        device = self._execution_device
+        lora_scale = (
+            self.joint_attention_kwargs.get("scale", None) if self.joint_attention_kwargs is not None else None
+        )
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = self.encode_prompt(
+            prompt=prompt,
+            prompt_2=prompt_2,
+            prompt_3=prompt_3,
+            negative_prompt=negative_prompt,
+            negative_prompt_2=negative_prompt_2,
+            negative_prompt_3=negative_prompt_3,
+            do_classifier_free_guidance=self.do_classifier_free_guidance,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            device=device,
+            clip_skip=self.clip_skip,
+            num_images_per_prompt=num_images_per_prompt,
+            max_sequence_length=max_sequence_length,
+            lora_scale=lora_scale,
+        )
+        if self.do_classifier_free_guidance:
+            if skip_guidance_layers is not None:
+                original_prompt_embeds = prompt_embeds
+                original_pooled_prompt_embeds = pooled_prompt_embeds
+           # print("检测negative_prompt_embeds",negative_prompt_embeds)
+            #print("检测pooled_prompt_embeds",prompt_embeds)
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+            pooled_prompt_embeds = torch.cat([negative_pooled_prompt_embeds, pooled_prompt_embeds], dim=0)
+        # 4. Prepare latent variables
+        num_channels_latents = self.transformer.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+        # 5. Prepare timesteps
+        scheduler_kwargs = {}
+        if self.scheduler.config.get("use_dynamic_shifting", None) and mu is None:
+            _, _, height, width = latents.shape
+            image_seq_len = (height // self.transformer.config.patch_size) * (
+                width // self.transformer.config.patch_size
+            )
+            mu = calculate_shift(
+                image_seq_len,
+                self.scheduler.config.get("base_image_seq_len", 256),
+                self.scheduler.config.get("max_image_seq_len", 4096),
+                self.scheduler.config.get("base_shift", 0.5),
+                self.scheduler.config.get("max_shift", 1.16),
+            )
+            scheduler_kwargs["mu"] = mu
+        elif mu is not None:
+            scheduler_kwargs["mu"] = mu
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler,
+            num_inference_steps,
+            device,
+            sigmas=sigmas,
+            **scheduler_kwargs,
+        )
+        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+        self._num_timesteps = len(timesteps)
+        # 6. Prepare image embeddings
+        if (ip_adapter_image is not None and self.is_ip_adapter_active) or ip_adapter_image_embeds is not None:
+            ip_adapter_image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image,
+                ip_adapter_image_embeds,
+                device,
+                batch_size * num_images_per_prompt,
+                self.do_classifier_free_guidance,
+            )
+            if self.joint_attention_kwargs is None:
+                self._joint_attention_kwargs = {"ip_adapter_image_embeds": ip_adapter_image_embeds}
+            else:
+                self._joint_attention_kwargs.update(ip_adapter_image_embeds=ip_adapter_image_embeds)
+        # 7. Denoising loop
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+                # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+                timestep = t.expand(latent_model_input.shape[0])
+                # Check for NaN in latents before transformer
+                if torch.isnan(latents).any():
+                    # print(f"NaN detected in latents at step {i}")
+                    # print(f"NaN locations: {torch.where(torch.isnan(latents))}")
+                    break
+                # Prefer the `model` argument, then a `model` attribute on the pipeline, and finally fall back to the default transformer
+                effective_model = model or getattr(self, 'model', None) or self.transformer
+                if hasattr(effective_model, '__call__') and callable(effective_model):
+                    # 使用有效的模型进行预测
+                    # 检查模型是否支持 skip_layers 参数
+                    if hasattr(effective_model, 'forward') and 'skip_layers' in inspect.signature(effective_model.forward).parameters:
+                        noise_pred_output = effective_model(
+                            hidden_states=latent_model_input,
+                            timestep=timestep,
+                            encoder_hidden_states=prompt_embeds,
+                            pooled_projections=pooled_prompt_embeds,
+                            #joint_attention_kwargs=self.joint_attention_kwargs,
+                            return_dict=False,
+                            #skip_layers=skip_guidance_layers if skip_guidance_layers is not None else None,
+                        )
+                        #print(f"effective_model type: {type(effective_model)}")
+                        #print(f"noise_pred_output: {noise_pred_output}")
+                    else:
+                        # SD3WithRectifiedNoise 不支持 skip_layers 参数
+                        noise_pred_output = effective_model(
+                            hidden_states=latent_model_input,
+                            timestep=timestep,
+                            encoder_hidden_states=prompt_embeds,
+                            pooled_projections=pooled_prompt_embeds,
+                            joint_attention_kwargs=self.joint_attention_kwargs,
+                            return_dict=False,
+                        )
+                    # Normalize the output of an SD3WithRectifiedNoise model.
+                    # SD3WithRectifiedNoise returns a (final_output, mean_out, var_out) tuple; only the first element is needed.
+                    # If a dict is returned instead, use its "sample" key.
+                    if isinstance(noise_pred_output, dict):
+                        noise_pred = noise_pred_output["sample"]
+                    elif isinstance(noise_pred_output, tuple):
+                        # 对于 SD3WithRectifiedNoise，取第一个输出作为主要预测结果
+                        noise_pred = noise_pred_output[0]
+                    else:
+                        noise_pred = noise_pred_output
+                else:
+                    # 使用默认的 transformer 进行预测
+                    noise_pred = self.transformer(
+                        hidden_states=latent_model_input,
+                        timestep=timestep,
+                        encoder_hidden_states=prompt_embeds,
+                        pooled_projections=pooled_prompt_embeds,
+                        joint_attention_kwargs=self.joint_attention_kwargs,
+                        return_dict=False,
+                    )[0]
+                # Check for NaN in noise prediction
+                if torch.isnan(noise_pred).any():
+                    # print(f"NaN detected in noise_pred at step {i}")
+                    # print(f"NaN locations: {torch.where(torch.isnan(noise_pred))}")
+                    # print(f"noise_pred stats - min: {noise_pred.min().item()}, max: {noise_pred.max().item()}, mean: {noise_pred.mean().item()}")
+                    break
+                # perform guidance
+                if self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
+                    # Check for NaN after guidance
+                    if torch.isnan(noise_pred).any():
+                        # print(f"NaN detected in noise_pred after guidance at step {i}")
+                        # print(f"noise_pred_uncond stats - min: {noise_pred_uncond.min().item()}, max: {noise_pred_uncond.max().item()}, mean: {noise_pred_uncond.mean().item()}")
+                        # print(f"noise_pred_text stats - min: {noise_pred_text.min().item()}, max: {noise_pred_text.max().item()}, mean: {noise_pred_text.mean().item()}")
+                        # print(f"guidance_scale: {self.guidance_scale}")
+                        break
+                    should_skip_layers = (
+                        True
+                        if i > num_inference_steps * skip_layer_guidance_start
+                        and i < num_inference_steps * skip_layer_guidance_stop
+                        else False
+                    )
+                    if skip_guidance_layers is not None and should_skip_layers:
+                        timestep = t.expand(latents.shape[0])
+                        latent_model_input = latents
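+                        # The transformer call for the skip_guidance_layers pass is adapted to support SD3WithRectifiedNoise models.
+                        # Prefer the `model` argument, then a `model` attribute on the pipeline, and finally fall back to the default transformer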
+                        effective_model = model or getattr(self, 'model', None) or self.transformer
+                        if hasattr(effective_model, '__call__') and callable(effective_model):
+                            # 使用有效的模型进行预测
+                            # 检查模型是否支持 skip_layers 参数
+                            if hasattr(effective_model, 'forward') and 'skip_layers' in inspect.signature(effective_model.forward).parameters:
+                                noise_pred_skip_output = effective_model(
+                                    hidden_states=latent_model_input,
+                                    timestep=timestep,
+                                    encoder_hidden_states=original_prompt_embeds,
+                                    pooled_projections=original_pooled_prompt_embeds,
+                                    joint_attention_kwargs=self.joint_attention_kwargs,
+                                    return_dict=False,
+                                    skip_layers=skip_guidance_layers,
+                                )
+                            else:
+                                # SD3WithRectifiedNoise 不支持 skip_layers 参数
+                                noise_pred_skip_output = effective_model(
+                                    hidden_states=latent_model_input,
+                                    timestep=timestep,
+                                    encoder_hidden_states=original_prompt_embeds,
+                                    pooled_projections=original_pooled_prompt_embeds,
+                                    joint_attention_kwargs=self.joint_attention_kwargs,
+                                    return_dict=False,
+                                )
+                            # 正确处理 SD3WithRectifiedNoise 模型的输出
+                            # SD3WithRectifiedNoise 返回 (final_output, mean_out, var_out) 元组，我们只需要第一个元素
+                            # 如果返回的是字典，则使用 "sample" 键
+                            if isinstance(noise_pred_skip_output, dict):
+                                noise_pred_skip_layers = noise_pred_skip_output["sample"]
+                            elif isinstance(noise_pred_skip_output, tuple):
+                                # 对于 SD3WithRectifiedNoise，取第一个输出作为主要预测结果
+                                noise_pred_skip_layers = noise_pred_skip_output[0]
+                            else:
+                                noise_pred_skip_layers = noise_pred_skip_output
+                        else:
+                            # 使用默认的 transformer 进行预测
+                            noise_pred_skip_layers = self.transformer(
+                                hidden_states=latent_model_input,
+                                timestep=timestep,
+                                encoder_hidden_states=original_prompt_embeds,
+                                pooled_projections=original_pooled_prompt_embeds,
+                                joint_attention_kwargs=self.joint_attention_kwargs,
+                                return_dict=False,
+                                skip_layers=skip_guidance_layers,
+                            )[0]
+                        # Check for NaN in skip layers noise prediction
+                        if torch.isnan(noise_pred_skip_layers).any():
+                            # print(f"NaN detected in noise_pred_skip_layers at step {i}")
+                            # print(f"NaN locations: {torch.where(torch.isnan(noise_pred_skip_layers))}")
+                            break
+                        noise_pred = (
+                            noise_pred + (noise_pred_text - noise_pred_skip_layers) * self._skip_layer_guidance_scale
+                        )
+                        # Check for NaN after skip layer guidance
+                        if torch.isnan(noise_pred).any():
+                            # print(f"NaN detected in noise_pred after skip layer guidance at step {i}")
+                            break
+                # compute the previous noisy sample x_t -> x_t-1
+                latents_dtype = latents.dtype
+                latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
+                # Check for NaN in latents after scheduler step
+                if torch.isnan(latents).any():
+                    # print(f"NaN detected in latents after scheduler step at step {i}")
+                    # print(f"noise_pred stats - min: {noise_pred.min().item()}, max: {noise_pred.max().item()}, mean: {noise_pred.mean().item()}")
+                    break
+                # Print intermediate results
+                # print(f"Step {i+1}/{num_inference_steps}, Timestep: {t.item():.2f}, Latents mean: {latents.mean().item():.6f}, Latents std: {latents.std().item():.6f}")
+                if latents.dtype != latents_dtype:
+                    if torch.backends.mps.is_available():
+                        # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                        latents = latents.to(latents_dtype)
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    pooled_prompt_embeds = callback_outputs.pop("pooled_prompt_embeds", pooled_prompt_embeds)
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                if XLA_AVAILABLE:
+                    xm.mark_step()
+        if output_type == "latent":
+            image = latents
+        else:
+            latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
+            image = self.vae.decode(latents, return_dict=False)[0]
+            image = self.image_processor.postprocess(image, output_type=output_type)
+        # Offload all models
+        self.maybe_free_model_hooks()
+        if not return_dict:
+            return (image,)
+        return StableDiffusion3PipelineOutput(images=image)

rectified-noise-batch-2/checkpoint-100000/sit_weights/sit_config.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "num_sit_layers": 1,
+  "hidden_size": 4096,
+  "input_dim": 16,
+  "num_attention_heads": 16,
+  "intermediate_size": 16384,
+  "model_type": "rectified_noise",
+  "architecture": "SIT",
+  "version": "1.0"
+}

rectified-noise-batch-2/checkpoint-120000/sit_weights/sit_config.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "num_sit_layers": 1,
+  "hidden_size": 4096,
+  "input_dim": 16,
+  "num_attention_heads": 16,
+  "intermediate_size": 16384,
+  "model_type": "rectified_noise",
+  "architecture": "SIT",
+  "version": "1.0"
+}

rectified-noise-batch-2/checkpoint-140000/sit_weights/sit_config.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "num_sit_layers": 1,
+  "hidden_size": 4096,
+  "input_dim": 16,
+  "num_attention_heads": 16,
+  "intermediate_size": 16384,
+  "model_type": "rectified_noise",
+  "architecture": "SIT",
+  "version": "1.0"
+}

rectified-noise-batch-2/checkpoint-160000/sit_weights/sit_config.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "num_sit_layers": 1,
+  "hidden_size": 4096,
+  "input_dim": 16,
+  "num_attention_heads": 16,
+  "intermediate_size": 16384,
+  "model_type": "rectified_noise",
+  "architecture": "SIT",
+  "version": "1.0"
+}

rectified-noise-batch-2/checkpoint-180000/sit_weights/sit_config.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "num_sit_layers": 1,
+  "hidden_size": 4096,
+  "input_dim": 16,
+  "num_attention_heads": 16,
+  "intermediate_size": 16384,
+  "model_type": "rectified_noise",
+  "architecture": "SIT",
+  "version": "1.0"
+}

rectified-noise-batch-2/checkpoint-200000/sit_weights/sit_config.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "num_sit_layers": 1,
+  "hidden_size": 4096,
+  "input_dim": 16,
+  "num_attention_heads": 16,
+  "intermediate_size": 16384,
+  "model_type": "rectified_noise",
+  "architecture": "SIT",
+  "version": "1.0"
+}

run_sd3_lora_rn_pair_sampling.sh ADDED Viewed

	@@ -0,0 +1,50 @@

+#!/bin/bash
+export CUDA_VISIBLE_DEVICES="0,1,2,3"
+PRETRAINED_MODEL="/gemini/space/hsd/project/pretrained_model/huggingface/hub/models--stabilityai--stable-diffusion-3-medium-diffusers/snapshots/ea42f8cef0f178587cf766dc8129abd379c90671"
+LORA_PATH="/gemini/space/gzy_new/models/Sida/sd3-lora-finetuned-batch-4/checkpoint-500000"
+RECTIFIED_WEIGHTS="/gemini/space/gzy_new/models/Sida/rectified-noise-batch-2/checkpoint-220000/sit_weights"
+CAPTIONS_JSONL="/gemini/space/hsd/project/dataset/cc3m-wds/validation/metadata.jsonl"
+SAMPLE_DIR="./sd3_lora_rn_pair_samples"
+NUM_INFERENCE_STEPS=40
+GUIDANCE_SCALE=7.0
+HEIGHT=512
+WIDTH=512
+PER_PROC_BATCH_SIZE=1
+IMAGES_PER_CAPTION=1
+MAX_SAMPLES=500
+GLOBAL_SEED=42
+MIXED_PRECISION="fp16"
+NUM_SIT_LAYERS=1
+ARGS=(
+  --pretrained_model_name_or_path "$PRETRAINED_MODEL"
+  --captions_jsonl "$CAPTIONS_JSONL"
+  --sample_dir "$SAMPLE_DIR"
+  --num_inference_steps $NUM_INFERENCE_STEPS
+  --guidance_scale $GUIDANCE_SCALE
+  --height $HEIGHT
+  --width $WIDTH
+  --per_proc_batch_size $PER_PROC_BATCH_SIZE
+  --images_per_caption $IMAGES_PER_CAPTION
+  --max_samples $MAX_SAMPLES
+  --global_seed $GLOBAL_SEED
+  --num_sit_layers $NUM_SIT_LAYERS
+  --mixed_precision $MIXED_PRECISION
+  --rectified_weights "$RECTIFIED_WEIGHTS"
+)
+if [ -n "$LORA_PATH" ]; then
+  ARGS+=(--lora_path "$LORA_PATH")
+fi
+torchrun --nproc_per_node=4 --master_port=25923 sample_sd3_lora_rn_pair_ddp.py "${ARGS[@]}" --stage lora
+torchrun --nproc_per_node=4 --master_port=25924 sample_sd3_lora_rn_pair_ddp.py "${ARGS[@]}" --stage rn
+torchrun --nproc_per_node=4 --master_port=25925 sample_sd3_lora_rn_pair_ddp.py "${ARGS[@]}" --stage pair
+echo "Sampling done. Output at: $SAMPLE_DIR"
+# nohup bash run_sd3_lora_rn_pair_sampling.sh > run_sd3_lora_rn_pair_sampling.log 2>&1 &

run_sd3_lora_sampling.log ADDED Viewed

The diff for this file is too large to render. See raw diff

run_sd3_lora_sampling.sh ADDED Viewed

	@@ -0,0 +1,94 @@

+#!/bin/bash
+# SD3 LoRA模型采样脚本
+# 使用JSONL文件进行采样的示例脚本
+# 使用方法: ./run_sd3_lora_sampling.sh
+# 设置GPU设备
+export CUDA_VISIBLE_DEVICES="0,1,2,3"  # 使用4个GPU（0,1,2,3）
+# 内存优化设置
+export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
+# 模型和LoRA路径配置
+PRETRAINED_MODEL="/gemini/space/hsd/project/pretrained_model/huggingface/hub/models--stabilityai--stable-diffusion-3-medium-diffusers/snapshots/ea42f8cef0f178587cf766dc8129abd379c90671"
+# LoRA checkpoint路径 - 使用accelerator checkpoint目录
+LORA_CHECKPOINT_PATH="/gemini/space/gzy_new/models/Sida/sd3-lora-finetuned-batch-4/checkpoint-500000"
+# LoRA rank（必须与训练时一致）
+LORA_RANK=32
+# 采样参数配置
+NUM_INFERENCE_STEPS=40
+GUIDANCE_SCALE=7.0
+HEIGHT=512
+WIDTH=512
+PER_PROC_BATCH_SIZE=1  # 每个GPU的批大小，建议从1开始（SD3模型很大，保持为1以避免内存溢出）
+MAX_SAMPLES=30000  # 最大采样数量限制
+# 提示词配置
+#NEGATIVE_PROMPT="blurry, low quality, distorted, ugly, bad anatomy"
+# Caption文件配置
+CAPTIONS_JSONL="/gemini/space/hsd/project/dataset/cc3m-wds/validation/metadata.jsonl"  # JSONL文件路径
+IMAGES_PER_CAPTION=3  # 每个caption生成几张图片
+# 输出配置
+SAMPLE_DIR="./sd3_lora_samples_3w"
+GLOBAL_SEED=42
+echo "开始SD3 LoRA采样（从checkpoint加载）..."
+echo "模型: $PRETRAINED_MODEL"
+echo "LoRA Checkpoint路径: $LORA_CHECKPOINT_PATH"
+echo "LoRA Rank: $LORA_RANK"
+echo "Caption文件: $CAPTIONS_JSONL"
+echo "每个caption生成图片数: $IMAGES_PER_CAPTION"
+echo "图像尺寸: ${HEIGHT}x${WIDTH}"
+echo "引导尺度: $GUIDANCE_SCALE"
+echo "推理步数: $NUM_INFERENCE_STEPS"
+# 检查必要文件
+if [ ! -f "$CAPTIONS_JSONL" ]; then
+    echo "错误: Caption文件 $CAPTIONS_JSONL 不存在"
+    exit 1
+fi
+if [ ! -d "$LORA_CHECKPOINT_PATH" ]; then
+    echo "错误: LoRA checkpoint目录 $LORA_CHECKPOINT_PATH 不存在"
+    exit 1
+fi
+# 构建命令参数数组
+CMD_ARGS=(
+    "--pretrained_model_name_or_path=$PRETRAINED_MODEL"
+    "--lora_checkpoint_path=$LORA_CHECKPOINT_PATH"
+    "--lora_rank=$LORA_RANK"
+    "--num_inference_steps=$NUM_INFERENCE_STEPS"
+    "--guidance_scale=$GUIDANCE_SCALE"
+    "--height=$HEIGHT"
+    "--width=$WIDTH"
+    "--per_proc_batch_size=$PER_PROC_BATCH_SIZE"
+    "--captions_jsonl=$CAPTIONS_JSONL"
+    "--images_per_caption=$IMAGES_PER_CAPTION"
+    "--sample_dir=$SAMPLE_DIR"
+    "--global_seed=$GLOBAL_SEED"
+    #"--max_samples=$MAX_SAMPLES"
+    "--mixed_precision=fp16"  # 使用 fp16 以减少内存占用
+    # 注意：在多GPU环境下，CPU offload会被代码自动禁用（不支持分布式）
+    # 代码会自动检测world_size > 1并禁用CPU offload
+    "--enable_cpu_offload"
+)
+# # 添加负面提示词参数（如果存在）
+# if [ ! -z "$NEGATIVE_PROMPT" ]; then
+#     CMD_ARGS+=("--negative_prompt" "$NEGATIVE_PROMPT")
+# fi
+# 运行分布式采样
+torchrun --nproc_per_node=4 --master_port=25900 sample_sd3_lora_checkpoint_ddp.py "${CMD_ARGS[@]}"
+echo "采样完成!"
+echo "结果保存在: $SAMPLE_DIR"
+echo "Caption信息保存在: $SAMPLE_DIR/*/captions.txt"
+echo "NPZ文件已生成用于FID评估"
+# nohup bash run_sd3_lora_sampling.sh > run_sd3_lora_sampling.log 2>&1 &

run_sd3_rectified_sampling.sh ADDED Viewed

	@@ -0,0 +1,55 @@

+#!/bin/bash
+# Distributed sampling with explicitly configured LoRA and Rectified (SIT) weights.
+# Abort on the first failing command or unset variable, so a failed torchrun
+# run is not reported as "Sampling done".
+set -euo pipefail
+export CUDA_VISIBLE_DEVICES="0,1,2,3"
+PRETRAINED_MODEL="/gemini/space/hsd/project/pretrained_model/huggingface/hub/models--stabilityai--stable-diffusion-3-medium-diffusers/snapshots/ea42f8cef0f178587cf766dc8129abd379c90671"
+# NOTE(review): LOCAL_PIPELINE_PATH is neither exported nor passed to the
+# sampler below, so it currently has no effect - confirm whether it is needed.
+LOCAL_PIPELINE_PATH="/gemini/space/gzy_new/models/Sida/pipeline_stable_diffusion_3.py"
+LORA_PATH="/gemini/space/gzy_new/models/Sida/sd3-lora-finetuned-batch-4/checkpoint-500000"
+RECTIFIED_WEIGHTS="/gemini/space/gzy_new/models/Sida/rectified-noise-batch-2/checkpoint-220000/sit_weights"
+CAPTIONS_JSONL="/gemini/space/hsd/project/dataset/cc3m-wds/validation/metadata.jsonl"
+SAMPLE_DIR="./sd3_rectified_samples_batch2_220000"
+NUM_INFERENCE_STEPS=40
+GUIDANCE_SCALE=7.0
+HEIGHT=512
+WIDTH=512
+PER_PROC_BATCH_SIZE=32
+IMAGES_PER_CAPTION=3
+MAX_SAMPLES=30000
+GLOBAL_SEED=42
+MIXED_PRECISION="fp16"  # no / fp16 / bf16
+NUM_SIT_LAYERS=1        # must match the value used for training
+ARGS=(
+  --pretrained_model_name_or_path "$PRETRAINED_MODEL"
+  --captions_jsonl "$CAPTIONS_JSONL"
+  --sample_dir "$SAMPLE_DIR"
+  --num_inference_steps "$NUM_INFERENCE_STEPS"
+  --guidance_scale "$GUIDANCE_SCALE"
+  --height "$HEIGHT"
+  --width "$WIDTH"
+  --per_proc_batch_size "$PER_PROC_BATCH_SIZE"
+  --images_per_caption "$IMAGES_PER_CAPTION"
+  --max_samples "$MAX_SAMPLES"
+  --global_seed "$GLOBAL_SEED"
+  --num_sit_layers "$NUM_SIT_LAYERS"
+  --mixed_precision "$MIXED_PRECISION"
+)
+# Both weight flags are optional: only forward them when a path is configured
+if [ -n "$LORA_PATH" ]; then
+  ARGS+=(--lora_path "$LORA_PATH")
+fi
+if [ -n "$RECTIFIED_WEIGHTS" ]; then
+  ARGS+=(--rectified_weights "$RECTIFIED_WEIGHTS")
+fi
+torchrun --nproc_per_node=4 --master_port=25913 sample_sd3_rectified_ddp.py "${ARGS[@]}"
+echo "Sampling done. Output at: $SAMPLE_DIR"
+# Background usage:
+# nohup bash run_sd3_rectified_sampling.sh > run_sd3_rectified_sampling.log 2>&1 &

run_sd3_rectified_sampling_old.sh ADDED Viewed

	@@ -0,0 +1,72 @@

+#!/bin/bash
+set -euo pipefail
+# Distributed sampling with explicitly configured LoRA and Rectified (SIT) weights.
+# Tries a 4-GPU torchrun launch first and falls back to a single process if it fails.
+export CUDA_VISIBLE_DEVICES="0,1,2,3"
+export NCCL_DEBUG=INFO
+export NCCL_DEBUG_SUBSYS=ALL
+export NCCL_IB_DISABLE=1
+export NCCL_P2P_LEVEL=SYS
+PRETRAINED_MODEL="/gemini/space/hsd/project/pretrained_model/huggingface/hub/models--stabilityai--stable-diffusion-3-medium-diffusers/snapshots/ea42f8cef0f178587cf766dc8129abd379c90671"
+#"/gemini/space/zhaozy/zhy/hsd/project/pretrained_model/models--stabilityai--stable-diffusion-3-medium-diffusers"
+LORA_PATH="/gemini/space/gzy_new/models/Sida/sd3-lora-finetuned-batch-4/checkpoint-500000"                # may be empty
+RECTIFIED_WEIGHTS="/gemini/space/gzy_new/models/Sida/rectified-noise-batch-2/checkpoint-120000"  # may be empty (when Rectified is not used)
+CAPTIONS_JSONL="/gemini/space/hsd/project/dataset/cc3m-wds/validation/metadata.jsonl"
+SAMPLE_DIR="./sd3_rectified_samples_new_batch_2"
+NUM_INFERENCE_STEPS=40
+GUIDANCE_SCALE=7.0
+HEIGHT=512
+WIDTH=512
+PER_PROC_BATCH_SIZE=32
+IMAGES_PER_CAPTION=3
+MAX_SAMPLES=30000
+GLOBAL_SEED=42
+MIXED_PRECISION="fp16"  # no / fp16 / bf16
+NUM_SIT_LAYERS=1      # must match the value used for training
+ARGS=(
+  --pretrained_model_name_or_path "$PRETRAINED_MODEL"
+  --captions_jsonl "$CAPTIONS_JSONL"
+  --sample_dir "$SAMPLE_DIR"
+  --num_inference_steps "$NUM_INFERENCE_STEPS"
+  --guidance_scale "$GUIDANCE_SCALE"
+  --height "$HEIGHT"
+  --width "$WIDTH"
+  --per_proc_batch_size "$PER_PROC_BATCH_SIZE"
+  --images_per_caption "$IMAGES_PER_CAPTION"
+  --max_samples "$MAX_SAMPLES"
+  --global_seed "$GLOBAL_SEED"
+  --num_sit_layers "$NUM_SIT_LAYERS"
+  --mixed_precision "$MIXED_PRECISION"
+)
+if [ -n "$LORA_PATH" ]; then
+  ARGS+=(--lora_path "$LORA_PATH")
+fi
+if [ -n "$RECTIFIED_WEIGHTS" ]; then
+  ARGS+=(--rectified_weights "$RECTIFIED_WEIGHTS")
+fi
+echo "[run_sd3_rectified_sampling.sh] start torchrun: $(date)"
+# Try 4-GPU mode first; fall back to single-GPU mode if it fails.
+# The exit status is captured with "cmd || ret=$?": inside "if ! cmd; then",
+# $? holds the status of the negated condition (always 0), not torchrun's.
+# The "||" form also keeps "set -e" from aborting before the fallback runs.
+ret=0
+torchrun --nproc_per_node=4 --master_port=25913 sample_sd3_rectified_ddp.py "${ARGS[@]}" || ret=$?
+if [ "$ret" -ne 0 ]; then
+  echo "[run_sd3_rectified_sampling.sh] 4卡运行失败(退出码 ${ret})，尝试单卡模式"
+  ret2=0
+  torchrun --nproc_per_node=1 --master_port=25913 sample_sd3_rectified_ddp.py "${ARGS[@]}" || ret2=$?
+  if [ "$ret2" -ne 0 ]; then
+    echo "[run_sd3_rectified_sampling.sh] 单卡运行也失败(退出码 ${ret2})，请查看具体错误信息。"
+    # Propagate the real failure code instead of exiting 0.
+    exit "$ret2"
+  fi
+  echo "[run_sd3_rectified_sampling.sh] 单卡运行成功，建议降低 per_proc_batch_size 或使用单卡配置继续。"
+fi
+echo "Sampling done. Output at: $SAMPLE_DIR"
+# Background usage:
+# nohup bash run_sd3_rectified_sampling.sh > run_sd3_rectified_sampling.log 2>&1 &

sample_sd3_lora_checkpoint_ddp.py ADDED Viewed

	@@ -0,0 +1,818 @@

+#!/usr/bin/env python
+# coding=utf-8
+"""
+SD3 LoRA分布式采样脚本 - 从accelerator checkpoint加载LoRA权重
+使用微调后的LoRA权重，基于JSONL文件中的caption生成图像样本，并保存为npz格式用于评估
+"""
+import torch
+import torch.distributed as dist
+from tqdm import tqdm
+import os
+from PIL import Image
+import numpy as np
+import math
+import argparse
+import sys
+import json
+import random
+from pathlib import Path
+from diffusers import (
+    StableDiffusion3Pipeline,
+    AutoencoderKL,
+    FlowMatchEulerDiscreteScheduler,
+    SD3Transformer2DModel,
+)
+from transformers import CLIPTokenizer, T5TokenizerFast
+from accelerate import Accelerator
+from peft import LoraConfig, PeftModel
+from peft.utils import get_peft_model_state_dict
+from safetensors.torch import load_file, save_file
+def create_npz_from_sample_folder(sample_dir, num_samples):
+    """
+    从样本文件夹构建单个.npz文件，保持与sample_ddp_new相同的格式
+    """
+    samples = []
+    actual_files = []
+    # 收集所有PNG文件
+    for filename in sorted(os.listdir(sample_dir)):
+        if filename.endswith('.png'):
+            actual_files.append(filename)
+    # 按照数量限制处理
+    for i in tqdm(range(min(num_samples, len(actual_files))), desc="Building .npz file from samples"):
+        if i < len(actual_files):
+            sample_path = os.path.join(sample_dir, actual_files[i])
+            sample_pil = Image.open(sample_path)
+            sample_np = np.asarray(sample_pil).astype(np.uint8)
+            samples.append(sample_np)
+        else:
+            # 如果不够，创建空白图像
+            sample_np = np.zeros((512, 512, 3), dtype=np.uint8)
+            samples.append(sample_np)
+    if samples:
+        samples = np.stack(samples)
+        npz_path = f"{sample_dir}.npz"
+        np.savez(npz_path, arr_0=samples)
+        print(f"Saved .npz file to {npz_path} [shape={samples.shape}].")
+        return npz_path
+    else:
+        print("No samples found to create npz file.")
+        return None
+def extract_lora_from_checkpoint(checkpoint_path, output_lora_path, rank=64, rank0_only=True):
+    """
+    从accelerator checkpoint中提取LoRA权重并保存为标准格式
+    Args:
+        checkpoint_path: checkpoint目录路径
+        output_lora_path: 输出LoRA权重保存路径
+        rank: LoRA rank
+        rank0_only: 是否只在rank 0上执行
+    """
+    model_file = os.path.join(checkpoint_path, "model.safetensors")
+    if not os.path.exists(model_file):
+        if rank0_only:
+            print(f"Model file not found: {model_file}")
+        return False
+    try:
+        # 加载checkpoint state dict
+        state_dict = load_file(model_file)
+        if rank0_only:
+            print(f"Loaded checkpoint with {len(state_dict)} keys")
+        # 提取LoRA权重
+        # Accelerator保存的格式可能是: "transformer.lora_A.weight" 或 "model.transformer.lora_A.weight"
+        # 需要转换为diffusers格式: "transformer.lora_A.weight"
+        lora_state_dict = {}
+        # 查找所有LoRA相关的键
+        lora_keys = []
+        for key in state_dict.keys():
+            # 检查是否是LoRA权重（lora_A, lora_B, lora_embedding等）
+            if 'lora_A' in key or 'lora_B' in key or 'lora_embedding' in key:
+                lora_keys.append(key)
+        if rank0_only:
+            print(f"Found {len(lora_keys)} LoRA keys")
+            if lora_keys:
+                print(f"Sample LoRA keys: {lora_keys[:5]}")
+        if not lora_keys:
+            if rank0_only:
+                print("Warning: No LoRA keys found in checkpoint. Trying alternative extraction method...")
+            # 尝试另一种方法：检查是否有完整的transformer权重
+            # 如果是全量微调，我们需要计算LoRA权重 = 微调权重 - 基础权重
+            # 但这需要基础模型，所以这里我们假设checkpoint中已经包含了LoRA权重
+            # 或者checkpoint保存的是合并后的权重
+            # 检查是否有transformer的完整权重
+            transformer_keys = [k for k in state_dict.keys() if 'transformer' in k.lower() and 'lora' not in k.lower()]
+            if transformer_keys:
+                if rank0_only:
+                    print(f"Found {len(transformer_keys)} transformer keys (full fine-tuning checkpoint)")
+                    print("This checkpoint appears to contain full model weights, not LoRA weights.")
+                    print("You may need to use a different loading method.")
+                return False
+        # 转换键名格式：从accelerator格式转换为diffusers格式
+        for key in lora_keys:
+            # 移除可能的"model."前缀
+            new_key = key
+            if new_key.startswith("model."):
+                new_key = new_key[6:]  # 移除"model."前缀
+            # 确保键名符合diffusers格式
+            # diffusers格式通常是: "transformer.lora_A.weight" 或 "transformer.transformer_blocks.X.attn.to_q.lora_A.weight"
+            lora_state_dict[new_key] = state_dict[key]
+        if not lora_state_dict:
+            if rank0_only:
+                print("Error: Failed to extract LoRA weights from checkpoint")
+            return False
+        # 保存LoRA权重
+        if rank0_only:
+            os.makedirs(output_lora_path, exist_ok=True)
+            lora_file = os.path.join(output_lora_path, "pytorch_lora_weights.safetensors")
+            save_file(lora_state_dict, lora_file)
+            print(f"Saved LoRA weights to {lora_file} ({len(lora_state_dict)} keys)")
+        return True
+    except Exception as e:
+        if rank0_only:
+            print(f"Error extracting LoRA from checkpoint: {e}")
+            import traceback
+            traceback.print_exc()
+        return False
+def load_lora_from_checkpoint_direct(pipeline, checkpoint_path, rank=64, rank0_print=True):
+    """
+    直接从checkpoint加载LoRA权重到pipeline
+    这个方法尝试直接从checkpoint中加载LoRA权重，而不需要先提取
+    """
+    model_file = os.path.join(checkpoint_path, "model.safetensors")
+    if not os.path.exists(model_file):
+        if rank0_print:
+            print(f"Model file not found: {model_file}")
+        return False
+    try:
+        # 加载checkpoint state dict
+        state_dict = load_file(model_file)
+        if rank0_print:
+            print(f"Loaded checkpoint with {len(state_dict)} keys")
+            # 显示前10个键名以便调试
+            sample_keys = list(state_dict.keys())[:10]
+            print(f"Sample keys: {sample_keys}")
+        # 查找LoRA权重
+        lora_keys = [k for k in state_dict.keys() if 'lora_A' in k or 'lora_B' in k or 'lora_embedding' in k]
+        if not lora_keys:
+            if rank0_print:
+                print("No LoRA keys found in checkpoint.")
+                print("This checkpoint might contain merged weights or use a different format.")
+                print("Checking checkpoint structure...")
+            # 检查是否是全量微调的checkpoint（包含完整transformer权重）
+            transformer_keys = [k for k in state_dict.keys() if 'transformer' in k.lower() and 'lora' not in k.lower()]
+            if transformer_keys:
+                if rank0_print:
+                    print(f"Found {len(transformer_keys)} transformer keys")
+                    print("This appears to be a full fine-tuning checkpoint with merged weights.")
+                    print("Attempting to use Accelerator to load the checkpoint...")
+                # 尝试使用Accelerator加载checkpoint
+                try:
+                    # 配置LoRA适配器
+                    transformer_lora_config = LoraConfig(
+                        r=rank,
+                        lora_alpha=rank,
+                        init_lora_weights="gaussian",
+                        target_modules=["attn.to_k", "attn.to_q", "attn.to_v", "attn.to_out.0"],
+                    )
+                    # 为transformer添加LoRA适配器
+                    pipeline.transformer.add_adapter(transformer_lora_config)
+                    # 使用Accelerator加载checkpoint
+                    accelerator = Accelerator()
+                    # 准备模型
+                    transformer_prepared = accelerator.prepare(pipeline.transformer)
+                    # 加载状态
+                    accelerator.load_state(checkpoint_path)
+                    # 提取模型
+                    pipeline.transformer = accelerator.unwrap_model(transformer_prepared)
+                    if rank0_print:
+                        print("Successfully loaded checkpoint using Accelerator")
+                    return True
+                except Exception as e:
+                    if rank0_print:
+                        print(f"Failed to load using Accelerator: {e}")
+                    return False
+            else:
+                if rank0_print:
+                    print("Could not identify checkpoint format. Please check the checkpoint structure.")
+                return False
+        if rank0_print:
+            print(f"Found {len(lora_keys)} LoRA keys")
+            print(f"Sample LoRA keys: {lora_keys[:5]}")
+        # 配置LoRA适配器
+        transformer_lora_config = LoraConfig(
+            r=rank,
+            lora_alpha=rank,
+            init_lora_weights="gaussian",
+            target_modules=["attn.to_k", "attn.to_q", "attn.to_v", "attn.to_out.0"],
+        )
+        # 为transformer添加LoRA适配器
+        pipeline.transformer.add_adapter(transformer_lora_config)
+        if rank0_print:
+            print("LoRA adapter configured")
+        # 提取并转换LoRA权重
+        lora_state_dict = {}
+        for key in lora_keys:
+            # 移除可能的"model."或"transformer."前缀（取决于accelerator保存格式）
+            new_key = key
+            # 移除常见的accelerator前缀
+            prefixes_to_remove = ["model.", "module.", "transformer."]
+            for prefix in prefixes_to_remove:
+                if new_key.startswith(prefix):
+                    new_key = new_key[len(prefix):]
+                    break
+            # 确保键名符合PEFT格式
+            # PEFT格式通常是: "transformer_blocks.X.attn.to_q.lora_A.weight"
+            # 或者: "lora_A.weight" (如果已经包含完整路径)
+            lora_state_dict[new_key] = state_dict[key]
+        if rank0_print:
+            print(f"Extracted {len(lora_state_dict)} LoRA weights")
+            print(f"Sample extracted keys: {list(lora_state_dict.keys())[:5]}")
+        # 加载LoRA权重到模型
+        # 使用PEFT的load_state_dict方法
+        missing_keys, unexpected_keys = pipeline.transformer.load_state_dict(lora_state_dict, strict=False)
+        if rank0_print:
+            if missing_keys:
+                print(f"Missing keys: {len(missing_keys)}")
+                if len(missing_keys) <= 10:
+                    for k in missing_keys:
+                        print(f"  - {k}")
+                else:
+                    print(f"  (showing first 10 of {len(missing_keys)} missing keys)")
+                    for k in list(missing_keys)[:10]:
+                        print(f"  - {k}")
+            if unexpected_keys:
+                print(f"Unexpected keys: {len(unexpected_keys)}")
+                if len(unexpected_keys) <= 10:
+                    for k in unexpected_keys:
+                        print(f"  - {k}")
+                else:
+                    print(f"  (showing first 10 of {len(unexpected_keys)} unexpected keys)")
+                    for k in list(unexpected_keys)[:10]:
+                        print(f"  - {k}")
+        # 检查是否有peft_config
+        if hasattr(pipeline.transformer, 'peft_config'):
+            if rank0_print:
+                print(f"LoRA config found: {list(pipeline.transformer.peft_config.keys())}")
+        else:
+            if rank0_print:
+                print("Warning: No peft_config found after loading LoRA")
+        # 验证LoRA是否真的被加载
+        if rank0_print:
+            # 检查一个LoRA层的权重是否非零
+            has_lora_weights = False
+            for name, param in pipeline.transformer.named_parameters():
+                if 'lora' in name.lower() and param.requires_grad:
+                    if param.abs().max().item() > 1e-6:
+                        has_lora_weights = True
+                        if rank0_print:
+                            print(f"Verified LoRA weights loaded (found non-zero LoRA param: {name})")
+                        break
+            if not has_lora_weights:
+                print("Warning: LoRA weights may not have been loaded correctly (all LoRA params are zero or not found)")
+        if rank0_print:
+            print("LoRA weights loaded successfully")
+        return True
+    except Exception as e:
+        if rank0_print:
+            print(f"Error loading LoRA from checkpoint: {e}")
+            import traceback
+            traceback.print_exc()
+        return False
+def load_captions_from_jsonl(jsonl_path):
+    """
+    从JSONL文件加载caption列表
+    """
+    captions = []
+    try:
+        with open(jsonl_path, 'r', encoding='utf-8') as f:
+            for line_num, line in enumerate(f, 1):
+                line = line.strip()
+                if not line:
+                    continue
+                try:
+                    data = json.loads(line)
+                    # 支持多种字段名
+                    caption = None
+                    for field in ['caption', 'text', 'prompt', 'description']:
+                        if field in data and isinstance(data[field], str):
+                            caption = data[field].strip()
+                            break
+                    if caption:
+                        captions.append(caption)
+                    else:
+                        # 如果没有找到标准字段，取第一个字符串值
+                        for value in data.values():
+                            if isinstance(value, str) and value.strip():
+                                captions.append(value.strip())
+                                break
+                except json.JSONDecodeError as e:
+                    print(f"Warning: Invalid JSON on line {line_num}: {e}")
+                    continue
+    except FileNotFoundError:
+        print(f"Error: JSONL file {jsonl_path} not found")
+        return []
+    except Exception as e:
+        print(f"Error loading JSONL file {jsonl_path}: {e}")
+        return []
+    print(f"Loaded {len(captions)} captions from {jsonl_path}")
+    return captions
+def main(args):
+    """
+    运行 SD3 LoRA 采样
+    """
+    assert torch.cuda.is_available(), "DDP采样需要至少一个GPU"
+    torch.set_grad_enabled(False)
+    # 设置 DDP
+    dist.init_process_group("nccl")
+    rank = dist.get_rank()
+    world_size = dist.get_world_size()
+    device = torch.device(f"cuda:{rank}")
+    seed = args.global_seed * world_size + rank
+    torch.manual_seed(seed)
+    torch.cuda.set_device(device)
+    print(f"Starting rank={rank}, device={device}, seed={seed}, world_size={world_size}, visible_devices={torch.cuda.device_count()}.")
+    # 加载captions
+    captions = []
+    if args.captions_jsonl:
+        if rank == 0:
+            print(f"Loading captions from {args.captions_jsonl}")
+        captions = load_captions_from_jsonl(args.captions_jsonl)
+        if not captions:
+            if rank == 0:
+                print("Warning: No captions loaded, using default caption")
+            captions = ["a beautiful high quality image"]
+    else:
+        # 使用默认caption
+        captions = ["a beautiful high quality image"]
+    # 计算总的图片数量
+    total_images_needed = len(captions) * args.images_per_caption
+    # 应用最大样本数限制
+    total_images_needed = min(total_images_needed, args.max_samples)
+    if rank == 0:
+        print(f"Will generate {args.images_per_caption} images for each of {len(captions)} captions")
+        print(f"Total images requested: {len(captions) * args.images_per_caption}")
+        print(f"Max samples limit: {args.max_samples}")
+        print(f"Total images to generate: {total_images_needed}")
+    # 设置数据类型 - 使用混合精度以减少内存占用
+    if args.mixed_precision == "fp16":
+        dtype = torch.float16
+    elif args.mixed_precision == "bf16":
+        dtype = torch.bfloat16
+    else:
+        dtype = torch.float32
+    # 加载基础模型
+    if rank == 0:
+        print(f"Loading SD3 pipeline from {args.pretrained_model_name_or_path}")
+    pipeline = StableDiffusion3Pipeline.from_pretrained(
+        args.pretrained_model_name_or_path,
+        revision=args.revision,
+        variant=args.variant,
+        torch_dtype=dtype,
+    )
+    # 从checkpoint加载LoRA权重
+    lora_loaded = False
+    lora_source = "baseline"
+    if args.lora_checkpoint_path:
+        if rank == 0:
+            print(f"Loading LoRA weights from checkpoint: {args.lora_checkpoint_path}")
+        # 方法1: 直接从checkpoint加载
+        lora_loaded = load_lora_from_checkpoint_direct(
+            pipeline,
+            args.lora_checkpoint_path,
+            rank=args.lora_rank,
+            rank0_print=(rank == 0)
+        )
+        if lora_loaded:
+            lora_source = os.path.basename(args.lora_checkpoint_path.rstrip('/'))
+            if rank == 0:
+                print("Successfully loaded LoRA weights from checkpoint")
+        else:
+            if rank == 0:
+                print("Failed to load LoRA weights directly from checkpoint")
+                print("Trying alternative method: extracting LoRA weights first...")
+            # 方法2: 先提取LoRA权重，再加载
+            temp_lora_path = os.path.join(args.lora_checkpoint_path, "extracted_lora")
+            if rank == 0:
+                extract_success = extract_lora_from_checkpoint(
+                    args.lora_checkpoint_path,
+                    temp_lora_path,
+                    rank=args.lora_rank,
+                    rank0_only=True
+                )
+            else:
+                extract_success = False
+            dist.barrier()  # 等待rank 0完成提取
+            if extract_success and os.path.exists(os.path.join(temp_lora_path, "pytorch_lora_weights.safetensors")):
+                if rank == 0:
+                    print(f"Loading extracted LoRA weights from {temp_lora_path}")
+                try:
+                    pipeline.load_lora_weights(temp_lora_path)
+                    lora_loaded = True
+                    lora_source = f"{os.path.basename(args.lora_checkpoint_path.rstrip('/'))}_extracted"
+                    if rank == 0:
+                        print("Successfully loaded extracted LoRA weights")
+                except Exception as e:
+                    if rank == 0:
+                        print(f"Failed to load extracted LoRA weights: {e}")
+    if not lora_loaded:
+        if rank == 0:
+            print("Warning: No LoRA weights loaded. Using baseline model.")
+    # 启用内存优化选项（必须在移动到设备之前）
+    # 注意：在分布式环境下，CPU offload 不支持多GPU，会导致所有进程挤在一张卡上
+    # 因此禁用 CPU offload，改用其他内存优化方法
+    if args.enable_cpu_offload and world_size > 1:
+        if rank == 0:
+            print(f"Warning: CPU offload is disabled in multi-GPU mode (world_size={world_size})")
+            print("Using device-specific placement instead")
+        args.enable_cpu_offload = False
+    if args.enable_cpu_offload:
+        if rank == 0:
+            print("Enabling CPU offload to save memory (single GPU mode)")
+        # CPU offload 会自动管理设备，不需要先 to(device)
+        pipeline.enable_model_cpu_offload()
+    else:
+        # 在分布式环境下，明确将pipeline移动到对应的设备
+        if rank == 0:
+            print(f"Moving pipeline to device {device} (multi-GPU mode)")
+        pipeline = pipeline.to(device)
+        if rank == 0:
+            print("Enabling memory optimization options")
+        # 检查并启用可用的内存优化方法
+        # 注意：所有进程都需要执行这些操作，不仅仅是 rank 0
+        if hasattr(pipeline, 'enable_attention_slicing'):
+            try:
+                pipeline.enable_attention_slicing()
+                if rank == 0:
+                    print("  - Attention slicing enabled")
+            except Exception as e:
+                if rank == 0:
+                    print(f"  - Warning: Failed to enable attention slicing: {e}")
+        else:
+            if rank == 0:
+                print("  - Attention slicing not available for this pipeline")
+        # SD3 pipeline 可能不支持 enable_vae_slicing，需要检查
+        # 使用 getattr 来安全地检查方法是否存在，避免触发 __getattr__ 异常
+        enable_vae_slicing_method = getattr(pipeline, 'enable_vae_slicing', None)
+        if enable_vae_slicing_method is not None and callable(enable_vae_slicing_method):
+            try:
+                enable_vae_slicing_method()
+                if rank == 0:
+                    print("  - VAE slicing enabled")
+            except Exception as e:
+                if rank == 0:
+                    print(f"  - Warning: Failed to enable VAE slicing: {e}")
+        else:
+            if rank == 0:
+                print("  - VAE slicing not available for this pipeline (SD3 may not support this)")
+    # 验证设备分配
+    if rank == 0:
+        print(f"Pipeline device verification:")
+        print(f"  - Transformer device: {next(pipeline.transformer.parameters()).device}")
+        print(f"  - VAE device: {next(pipeline.vae.parameters()).device}")
+        if hasattr(pipeline, 'text_encoder') and pipeline.text_encoder is not None:
+            print(f"  - Text encoder device: {next(pipeline.text_encoder.parameters()).device}")
+    dist.barrier()  # 等待所有进程完成设备分配
+    # 禁用进度条
+    pipeline.set_progress_bar_config(disable=True)
+    # 创建保存目录
+    folder_name = f"checkpoint-{lora_source}-rank{args.lora_rank}-guidance-{args.guidance_scale}-steps-{args.num_inference_steps}-size-{args.height}x{args.width}"
+    sample_folder_dir = os.path.join(args.sample_dir, folder_name)
+    if rank == 0:
+        os.makedirs(sample_folder_dir, exist_ok=True)
+        print(f"Saving .png samples at {sample_folder_dir}")
+        # 清空caption文件
+        caption_file = os.path.join(sample_folder_dir, "captions.txt")
+        if os.path.exists(caption_file):
+            os.remove(caption_file)
+    dist.barrier()
+    # 计算采样参数
+    n = args.per_proc_batch_size
+    global_batch_size = n * dist.get_world_size()
+    # 检查已存在的样本数量
+    existing_samples = 0
+    if os.path.exists(sample_folder_dir):
+        existing_samples = len([
+            name for name in os.listdir(sample_folder_dir)
+            if os.path.isfile(os.path.join(sample_folder_dir, name)) and name.endswith(".png")
+        ])
+    total_samples = int(math.ceil(total_images_needed / global_batch_size) * global_batch_size)
+    if rank == 0:
+        print(f"Total number of images that will be sampled: {total_samples}")
+        print(f"Existing samples: {existing_samples}")
+    assert total_samples % dist.get_world_size() == 0, "total_samples must be divisible by world_size"
+    samples_needed_this_gpu = int(total_samples // dist.get_world_size())
+    assert samples_needed_this_gpu % n == 0, "samples_needed_this_gpu must be divisible by the per-GPU batch size"
+    iterations = int(samples_needed_this_gpu // n)
+    done_iterations = int(int(existing_samples // dist.get_world_size()) // n)
+    pbar = range(done_iterations, iterations)
+    pbar = tqdm(pbar) if rank == 0 else pbar
+    # 生成caption和image的映射列表
+    caption_image_pairs = []
+    for i, caption in enumerate(captions):
+        for j in range(args.images_per_caption):
+            caption_image_pairs.append((caption, i, j))  # (caption, caption_idx, image_idx)
+    total_generated = existing_samples
+    # 采样循环
+    for i in pbar:
+        # 获取这个batch对应的caption
+        batch_prompts = []
+        batch_caption_info = []
+        for j in range(n):
+            global_index = i * global_batch_size + j * dist.get_world_size() + rank
+            if global_index < len(caption_image_pairs):
+                caption, caption_idx, image_idx = caption_image_pairs[global_index]
+                batch_prompts.append(caption)
+                batch_caption_info.append((caption, caption_idx, image_idx))
+            else:
+                # 如果超出范围，使用最后一个caption
+                if caption_image_pairs:
+                    caption, caption_idx, image_idx = caption_image_pairs[-1]
+                    batch_prompts.append(caption)
+                    batch_caption_info.append((caption, caption_idx, image_idx))
+                else:
+                    batch_prompts.append("a beautiful high quality image")
+                    batch_caption_info.append(("a beautiful high quality image", 0, 0))
+        # 生成图像 - 为每个图像使用不同的随机种子
+        # 确保使用正确的设备进行autocast
+        device_str = str(device)  # 使用明确的设备字符串，如 "cuda:0", "cuda:1" 等
+        with torch.autocast(device_str, dtype=dtype):
+            # 为每个prompt生成独立的图像（使用不同的generator）
+            images = []
+            for k, prompt in enumerate(batch_prompts):
+                # 为每个图像创建独立的随机种子
+                image_seed = seed + i * 10000 + k * 1000 + rank
+                generator = torch.Generator(device=device).manual_seed(image_seed)
+                # 调试信息（仅在第一个batch的第一个图像时打印）
+                if i == done_iterations and k == 0 and rank < 2:  # 只打印前两个rank
+                    print(f"[Rank {rank}] Generating image on device {device}, generator device: {generator.device}")
+                image = pipeline(
+                    prompt=prompt,
+                    negative_prompt=args.negative_prompt if args.negative_prompt else None,
+                    height=args.height,
+                    width=args.width,
+                    num_inference_steps=args.num_inference_steps,
+                    guidance_scale=args.guidance_scale,
+                    generator=generator,
+                    num_images_per_prompt=1,
+                ).images[0]
+                images.append(image)
+        # 清理 GPU 缓存以释放内存
+        if k == len(batch_prompts) - 1:  # 每个 batch 的最后一张图片后清理
+            torch.cuda.empty_cache()
+        # 保存图像
+        for j, (image, (caption, caption_idx, image_idx)) in enumerate(zip(images, batch_caption_info)):
+            global_index = i * global_batch_size + j * dist.get_world_size() + rank
+            if global_index < len(caption_image_pairs):
+                # 保存图片，文件名包含caption索引和图片索引
+                filename = f"{global_index:06d}_cap{caption_idx:04d}_img{image_idx:02d}.png"
+                image_path = os.path.join(sample_folder_dir, filename)
+                image.save(image_path)
+                # 保存caption信息到文本文件（只在rank 0上操作）
+                if rank == 0:
+                    caption_file = os.path.join(sample_folder_dir, "captions.txt")
+                    with open(caption_file, "a", encoding="utf-8") as f:
+                        f.write(f"{filename}\t{caption}\n")
+        total_generated += global_batch_size
+        # 每个迭代后清理 GPU 缓存
+        torch.cuda.empty_cache()
+        dist.barrier()
+    # 确保所有进程都完成采样
+    dist.barrier()
+    # 创建npz文件
+    if rank == 0:
+        # 重新计算实际生成的图片数量
+        actual_num_samples = len([name for name in os.listdir(sample_folder_dir) if name.endswith(".png")])
+        print(f"Actually generated {actual_num_samples} images")
+        # 使用实际的图片数量或用户指定的数量，取较小值
+        npz_samples = min(actual_num_samples, total_images_needed, args.max_samples)
+        create_npz_from_sample_folder(sample_folder_dir, npz_samples)
+        print("Done.")
+    dist.barrier()
+    dist.destroy_process_group()
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="SD3 LoRA分布式采样脚本 - 从checkpoint加载")
+    # 模型和路径参数
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default="stabilityai/stable-diffusion-3-medium-diffusers",
+        help="预训练模型路径或HuggingFace模型ID"
+    )
+    parser.add_argument(
+        "--lora_checkpoint_path",
+        type=str,
+        required=True,
+        help="LoRA checkpoint目录路径（包含model.safetensors的目录）"
+    )
+    parser.add_argument(
+        "--lora_rank",
+        type=int,
+        default=64,
+        help="LoRA rank（必须与训练时一致）"
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        help="模型修订版本"
+    )
+    parser.add_argument(
+        "--variant",
+        type=str,
+        default=None,
+        help="模型变体，如fp16"
+    )
+    # 采样参数
+    parser.add_argument(
+        "--num_inference_steps",
+        type=int,
+        default=28,
+        help="推理步数"
+    )
+    parser.add_argument(
+        "--guidance_scale",
+        type=float,
+        default=7.0,
+        help="引导尺度"
+    )
+    parser.add_argument(
+        "--height",
+        type=int,
+        default=1024,
+        help="生成图像高度"
+    )
+    parser.add_argument(
+        "--width",
+        type=int,
+        default=1024,
+        help="生成图像宽度"
+    )
+    parser.add_argument(
+        "--negative_prompt",
+        type=str,
+        default="",
+        help="负面提示词"
+    )
+    # 批处理和数据集参数
+    parser.add_argument(
+        "--per_proc_batch_size",
+        type=int,
+        default=1,
+        help="每个进程的批处理大小"
+    )
+    parser.add_argument(
+        "--sample_dir",
+        type=str,
+        default="sd3_lora_samples",
+        help="样本保存目录"
+    )
+    # Caption相关参数
+    parser.add_argument(
+        "--captions_jsonl",
+        type=str,
+        required=True,
+        help="包含caption列表的JSONL文件路径"
+    )
+    parser.add_argument(
+        "--images_per_caption",
+        type=int,
+        default=1,
+        help="每个caption生成的图像数量"
+    )
+    parser.add_argument(
+        "--max_samples",
+        type=int,
+        default=30000,
+        help="最大样本生成数量"
+    )
+    # 其他参数
+    parser.add_argument(
+        "--global_seed",
+        type=int,
+        default=42,
+        help="全局随机种子"
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default="fp16",
+        choices=["no", "fp16", "bf16"],
+        help="混合精度类型"
+    )
+    parser.add_argument(
+        "--enable_cpu_offload",
+        action="store_true",
+        help="启用CPU offload以节省显存"
+    )
+    args = parser.parse_args()
+    main(args)

sample_sd3_lora_ddp.py ADDED Viewed

	@@ -0,0 +1,675 @@

+#!/usr/bin/env python
+# coding=utf-8
+"""
+SD3 LoRA分布式采样脚本
+使用微调后的LoRA权重，基于JSONL文件中的caption生成图像样本，并保存为npz格式用于评估
+"""
+import torch
+import torch.distributed as dist
+from tqdm import tqdm
+import os
+from PIL import Image
+import numpy as np
+import math
+import argparse
+import sys
+import json
+import random
+from pathlib import Path
+from diffusers import (
+    StableDiffusion3Pipeline,
+    AutoencoderKL,
+    FlowMatchEulerDiscreteScheduler,
+    SD3Transformer2DModel,
+)
+from transformers import CLIPTokenizer, T5TokenizerFast
+from accelerate import Accelerator
+from peft import LoraConfig
+from peft.utils import get_peft_model_state_dict
+def create_npz_from_sample_folder(sample_dir, num_samples):
+    """
+    从样本文件夹构建单个.npz文件，保持与sample_ddp_new相同的格式
+    """
+    samples = []
+    actual_files = []
+    # 收集所有PNG文件
+    for filename in sorted(os.listdir(sample_dir)):
+        if filename.endswith('.png'):
+            actual_files.append(filename)
+    # 按照数量限制处理
+    for i in tqdm(range(min(num_samples, len(actual_files))), desc="Building .npz file from samples"):
+        if i < len(actual_files):
+            sample_path = os.path.join(sample_dir, actual_files[i])
+            sample_pil = Image.open(sample_path)
+            sample_np = np.asarray(sample_pil).astype(np.uint8)
+            samples.append(sample_np)
+        else:
+            # 如果不够，创建空白图像
+            sample_np = np.zeros((512, 512, 3), dtype=np.uint8)
+            samples.append(sample_np)
+    if samples:
+        samples = np.stack(samples)
+        npz_path = f"{sample_dir}.npz"
+        np.savez(npz_path, arr_0=samples)
+        print(f"Saved .npz file to {npz_path} [shape={samples.shape}].")
+        return npz_path
+    else:
+        print("No samples found to create npz file.")
+        return None
+def find_latest_checkpoint(output_dir):
+    """
+    查找最新的检查点目录
+    """
+    checkpoint_dirs = []
+    if os.path.exists(output_dir):
+        for item in os.listdir(output_dir):
+            if item.startswith("checkpoint-") and os.path.isdir(os.path.join(output_dir, item)):
+                try:
+                    step = int(item.split("-")[1])
+                    checkpoint_dirs.append((step, item))
+                except (ValueError, IndexError):
+                    continue
+    if checkpoint_dirs:
+        # 按步数排序，返回最新的
+        checkpoint_dirs.sort(key=lambda x: x[0])
+        latest_step, latest_dir = checkpoint_dirs[-1]
+        latest_path = os.path.join(output_dir, latest_dir)
+        return latest_path, latest_step
+    return None, None
+def check_lora_weights_exist(lora_path):
+    """
+    检查LoRA权重文件是否存在
+    """
+    if not lora_path:
+        return False
+    # 检查是否是目录
+    if os.path.isdir(lora_path):
+        # 检查目录中是否有pytorch_lora_weights.safetensors文件
+        weight_file = os.path.join(lora_path, "pytorch_lora_weights.safetensors")
+        if os.path.exists(weight_file):
+            return True
+        # 检查是否有其他.safetensors文件
+        for file in os.listdir(lora_path):
+            if file.endswith(".safetensors") and "lora" in file.lower():
+                return True
+        return False
+    # 检查是否是文件
+    elif os.path.isfile(lora_path):
+        return lora_path.endswith(".safetensors")
+    return False
+def check_full_finetune_checkpoint(checkpoint_path):
+    """
+    检查是否是全量微调的checkpoint（包含model.safetensors）
+    """
+    if not checkpoint_path or not os.path.isdir(checkpoint_path):
+        return False
+    # 检查是否有model.safetensors文件（全量微调的标志）
+    model_file = os.path.join(checkpoint_path, "model.safetensors")
+    return os.path.exists(model_file)
+def load_lora_from_checkpoint(pipeline, checkpoint_path, rank=0):
+    """
+    从检查点加载LoRA权重
+    """
+    if rank == 0:
+        print(f"Loading LoRA weights from checkpoint: {checkpoint_path}")
+    # 直接从检查点目录加载state dict
+    try:
+        # 使用accelerator来加载检查点
+        accelerator = Accelerator()
+        # 先配置LoRA
+        transformer_lora_config = LoraConfig(
+            r=64,  # 假设使用rank=64，可以根据需要调整
+            lora_alpha=64,
+            init_lora_weights="gaussian",
+            target_modules=["attn.to_k", "attn.to_q", "attn.to_v", "attn.to_out.0"],
+        )
+        # 为transformer添加LoRA
+        pipeline.transformer.add_adapter(transformer_lora_config)
+        # 加载检查点状态
+        accelerator.load_state(checkpoint_path)
+        if rank == 0:
+            print(f"Successfully loaded LoRA weights from checkpoint {checkpoint_path}")
+        return True
+    except Exception as e:
+        if rank == 0:
+            print(f"Error loading LoRA from checkpoint {checkpoint_path}: {e}")
+            print("Falling back to baseline model without LoRA")
+        return False
+def load_captions_from_jsonl(jsonl_path):
+    """
+    从JSONL文件加载caption列表
+    """
+    captions = []
+    try:
+        with open(jsonl_path, 'r', encoding='utf-8') as f:
+            for line_num, line in enumerate(f, 1):
+                line = line.strip()
+                if not line:
+                    continue
+                try:
+                    data = json.loads(line)
+                    # 支持多种字段名
+                    caption = None
+                    for field in ['caption', 'text', 'prompt', 'description']:
+                        if field in data and isinstance(data[field], str):
+                            caption = data[field].strip()
+                            break
+                    if caption:
+                        captions.append(caption)
+                    else:
+                        # 如果没有找到标准字段，取第一个字符串值
+                        for value in data.values():
+                            if isinstance(value, str) and value.strip():
+                                captions.append(value.strip())
+                                break
+                except json.JSONDecodeError as e:
+                    print(f"Warning: Invalid JSON on line {line_num}: {e}")
+                    continue
+    except FileNotFoundError:
+        print(f"Error: JSONL file {jsonl_path} not found")
+        return []
+    except Exception as e:
+        print(f"Error loading JSONL file {jsonl_path}: {e}")
+        return []
+    print(f"Loaded {len(captions)} captions from {jsonl_path}")
+    return captions
+def main(args):
+    """
+    运行 SD3 LoRA 采样
+    """
+    assert torch.cuda.is_available(), "DDP采样需要至少一个GPU"
+    torch.set_grad_enabled(False)
+    # 设置 DDP
+    dist.init_process_group("nccl")
+    rank = dist.get_rank()
+    device = rank % torch.cuda.device_count()
+    seed = args.global_seed * dist.get_world_size() + rank
+    torch.manual_seed(seed)
+    torch.cuda.set_device(device)
+    print(f"Starting rank={rank}, seed={seed}, world_size={dist.get_world_size()}.")
+    # 加载captions
+    captions = []
+    if args.captions_jsonl:
+        if rank == 0:
+            print(f"Loading captions from {args.captions_jsonl}")
+        captions = load_captions_from_jsonl(args.captions_jsonl)
+        if not captions:
+            if rank == 0:
+                print("Warning: No captions loaded, using default caption")
+            captions = ["a beautiful high quality image"]
+    else:
+        # 使用默认caption
+        captions = ["a beautiful high quality image"]
+    # 计算总的图片数量
+    total_images_needed = len(captions) * args.images_per_caption
+    # 应用最大样本数限制
+    total_images_needed = min(total_images_needed, args.max_samples)
+    if rank == 0:
+        print(f"Will generate {args.images_per_caption} images for each of {len(captions)} captions")
+        print(f"Total images requested: {len(captions) * args.images_per_caption}")
+        print(f"Max samples limit: {args.max_samples}")
+        print(f"Total images to generate: {total_images_needed}")
+    # 设置数据类型 - 使用混合精度以减少内存占用
+    if args.mixed_precision == "fp16":
+        dtype = torch.float16
+    elif args.mixed_precision == "bf16":
+        dtype = torch.bfloat16
+    else:
+        dtype = torch.float32
+    # 检查是否是全量微调的checkpoint
+    is_full_finetune = False
+    if args.lora_path and check_full_finetune_checkpoint(args.lora_path):
+        # 全量微调：直接从checkpoint加载
+        if rank == 0:
+            print(f"Detected full fine-tuning checkpoint, loading from: {args.lora_path}")
+        try:
+            pipeline = StableDiffusion3Pipeline.from_pretrained(
+                args.lora_path,
+                revision=args.revision,
+                variant=args.variant,
+                torch_dtype=dtype,
+            )
+            is_full_finetune = True
+            lora_source = os.path.basename(args.lora_path.rstrip('/'))
+            if rank == 0:
+                print("Successfully loaded full fine-tuned model from checkpoint")
+        except Exception as e:
+            if rank == 0:
+                print(f"Failed to load full fine-tuned model: {e}")
+                print("Falling back to baseline model + LoRA loading")
+            is_full_finetune = False
+    # 如果不是全量微调，加载基础模型
+    if not is_full_finetune:
+        if rank == 0:
+            print(f"Loading SD3 pipeline from {args.pretrained_model_name_or_path}")
+        pipeline = StableDiffusion3Pipeline.from_pretrained(
+            args.pretrained_model_name_or_path,
+            revision=args.revision,
+            variant=args.variant,
+            torch_dtype=dtype,
+        )
+    # 检查和加载 LoRA 权重（仅当不是全量微调时）
+    lora_loaded = False
+    lora_source = "baseline" if not is_full_finetune else lora_source
+    if not is_full_finetune and args.lora_path:
+        # 检查指定的LoRA路径是否存在权重文件
+        if check_lora_weights_exist(args.lora_path):
+            if rank == 0:
+                print(f"Loading LoRA weights from specified path: {args.lora_path}")
+            try:
+                pipeline.load_lora_weights(args.lora_path)
+                lora_loaded = True
+                lora_source = os.path.basename(args.lora_path.rstrip('/'))
+                if rank == 0:
+                    print("Successfully loaded LoRA weights from specified path")
+            except Exception as e:
+                if rank == 0:
+                    print(f"Failed to load LoRA from specified path: {e}")
+        else:
+            if rank == 0:
+                print(f"No LoRA weights found at specified path: {args.lora_path}")
+    # 如果没有成功加载LoRA权重，尝试从当前目录或检查点加载（仅当不是全量微调时）
+    if not is_full_finetune and not lora_loaded:
+        # 首先检查当前工作目录是否有权重文件
+        current_dir = os.getcwd()
+        if check_lora_weights_exist(current_dir):
+            if rank == 0:
+                print(f"Found LoRA weights in current directory: {current_dir}")
+            try:
+                pipeline.load_lora_weights(current_dir)
+                lora_loaded = True
+                lora_source = "current_dir"
+                if rank == 0:
+                    print("Successfully loaded LoRA weights from current directory")
+            except Exception as e:
+                if rank == 0:
+                    print(f"Failed to load LoRA from current directory: {e}")
+        # 如果当前目录也没有，检查是否有检查点目录
+        if not lora_loaded:
+            # 检查常见的输出目录
+            possible_output_dirs = [
+                "sd3-lora-finetuned",
+                "sd3-lora-finetuned-last",
+                "output",
+                "checkpoints"
+            ]
+            checkpoint_found = False
+            for output_dir in possible_output_dirs:
+                if os.path.exists(output_dir):
+                    # 首先检查输出目录是否直接包含权重文件
+                    if check_lora_weights_exist(output_dir):
+                        if rank == 0:
+                            print(f"Found LoRA weights in output directory: {output_dir}")
+                        try:
+                            pipeline.load_lora_weights(output_dir)
+                            lora_loaded = True
+                            lora_source = output_dir
+                            if rank == 0:
+                                print(f"Successfully loaded LoRA weights from {output_dir}")
+                            break
+                        except Exception as e:
+                            if rank == 0:
+                                print(f"Failed to load LoRA from {output_dir}: {e}")
+                    # 如果输出目录没有直接的权重文件，查找最新的检查点
+                    if not lora_loaded:
+                        latest_checkpoint, latest_step = find_latest_checkpoint(output_dir)
+                        if latest_checkpoint:
+                            if rank == 0:
+                                print(f"Found latest checkpoint: {latest_checkpoint} (step {latest_step})")
+                            # 尝试从检查点加载LoRA权重
+                            if load_lora_from_checkpoint(pipeline, latest_checkpoint, rank):
+                                lora_loaded = True
+                                lora_source = f"checkpoint-{latest_step}"
+                                checkpoint_found = True
+                                break
+            if not checkpoint_found and not lora_loaded:
+                if rank == 0:
+                    print("No LoRA weights or checkpoints found. Using baseline model.")
+    # 启用内存优化选项（必须在移动到设备之前）
+    if args.enable_cpu_offload:
+        if rank == 0:
+            print("Enabling CPU offload to save memory")
+        # CPU offload 会自动管理设备，不需要先 to(device)
+        pipeline.enable_model_cpu_offload()
+    else:
+        # 如果不使用 CPU offload，先移动到设备，然后启用其他优化
+        pipeline = pipeline.to(device)
+        if rank == 0:
+            print("Enabling memory optimization options")
+        # 检查并启用可用的内存优化方法
+        # 注意：所有进程都需要执行这些操作，不仅仅是 rank 0
+        if hasattr(pipeline, 'enable_attention_slicing'):
+            try:
+                pipeline.enable_attention_slicing()
+                if rank == 0:
+                    print("  - Attention slicing enabled")
+            except Exception as e:
+                if rank == 0:
+                    print(f"  - Warning: Failed to enable attention slicing: {e}")
+        else:
+            if rank == 0:
+                print("  - Attention slicing not available for this pipeline")
+        # SD3 pipeline 可能不支持 enable_vae_slicing，需要检查
+        # 使用 getattr 来安全地检查方法是否存在，避免触发 __getattr__ 异常
+        enable_vae_slicing_method = getattr(pipeline, 'enable_vae_slicing', None)
+        if enable_vae_slicing_method is not None and callable(enable_vae_slicing_method):
+            try:
+                enable_vae_slicing_method()
+                if rank == 0:
+                    print("  - VAE slicing enabled")
+            except Exception as e:
+                if rank == 0:
+                    print(f"  - Warning: Failed to enable VAE slicing: {e}")
+        else:
+            if rank == 0:
+                print("  - VAE slicing not available for this pipeline (SD3 may not support this)")
+    # 禁用进度条
+    pipeline.set_progress_bar_config(disable=True)
+    # 创建保存目录
+    folder_name = f"batch32-rank64-last-sd3-{lora_source}-guidance-{args.guidance_scale}-steps-{args.num_inference_steps}-size-{args.height}x{args.width}"
+    sample_folder_dir = os.path.join(args.sample_dir, folder_name)
+    if rank == 0:
+        os.makedirs(sample_folder_dir, exist_ok=True)
+        print(f"Saving .png samples at {sample_folder_dir}")
+        # 清空caption文件
+        caption_file = os.path.join(sample_folder_dir, "captions.txt")
+        if os.path.exists(caption_file):
+            os.remove(caption_file)
+    dist.barrier()
+    # 计算采样参数
+    n = args.per_proc_batch_size
+    global_batch_size = n * dist.get_world_size()
+    # 检查已存在的样本数量
+    existing_samples = 0
+    if os.path.exists(sample_folder_dir):
+        existing_samples = len([
+            name for name in os.listdir(sample_folder_dir)
+            if os.path.isfile(os.path.join(sample_folder_dir, name)) and name.endswith(".png")
+        ])
+    total_samples = int(math.ceil(total_images_needed / global_batch_size) * global_batch_size)
+    if rank == 0:
+        print(f"Total number of images that will be sampled: {total_samples}")
+        print(f"Existing samples: {existing_samples}")
+    assert total_samples % dist.get_world_size() == 0, "total_samples must be divisible by world_size"
+    samples_needed_this_gpu = int(total_samples // dist.get_world_size())
+    assert samples_needed_this_gpu % n == 0, "samples_needed_this_gpu must be divisible by the per-GPU batch size"
+    iterations = int(samples_needed_this_gpu // n)
+    done_iterations = int(int(existing_samples // dist.get_world_size()) // n)
+    pbar = range(done_iterations, iterations)
+    pbar = tqdm(pbar) if rank == 0 else pbar
+    # 生成caption和image的映射列表
+    caption_image_pairs = []
+    for i, caption in enumerate(captions):
+        for j in range(args.images_per_caption):
+            caption_image_pairs.append((caption, i, j))  # (caption, caption_idx, image_idx)
+    total_generated = existing_samples
+    # 采样循环
+    for i in pbar:
+        # 获取这个batch对应的caption
+        batch_prompts = []
+        batch_caption_info = []
+        for j in range(n):
+            global_index = i * global_batch_size + j * dist.get_world_size() + rank
+            if global_index < len(caption_image_pairs):
+                caption, caption_idx, image_idx = caption_image_pairs[global_index]
+                batch_prompts.append(caption)
+                batch_caption_info.append((caption, caption_idx, image_idx))
+            else:
+                # 如果超出范围，使用最后一个caption
+                if caption_image_pairs:
+                    caption, caption_idx, image_idx = caption_image_pairs[-1]
+                    batch_prompts.append(caption)
+                    batch_caption_info.append((caption, caption_idx, image_idx))
+                else:
+                    batch_prompts.append("a beautiful high quality image")
+                    batch_caption_info.append(("a beautiful high quality image", 0, 0))
+        # 生成图像 - 为每个图像使用不同的随机种子
+        device_str = "cuda" if torch.cuda.is_available() else "cpu"
+        with torch.autocast(device_str, dtype=dtype):
+            # 为每个prompt生成独立的图像（使用不同的generator）
+            images = []
+            for k, prompt in enumerate(batch_prompts):
+                # 为每个图像创建独立的随机种子
+                image_seed = seed + i * 10000 + k * 1000 + rank
+                generator = torch.Generator(device=device).manual_seed(image_seed)
+                image = pipeline(
+                    prompt=prompt,
+                    negative_prompt=args.negative_prompt if args.negative_prompt else None,
+                    height=args.height,
+                    width=args.width,
+                    num_inference_steps=args.num_inference_steps,
+                    guidance_scale=args.guidance_scale,
+                    generator=generator,
+                    num_images_per_prompt=1,
+                ).images[0]
+                images.append(image)
+        # 清理 GPU 缓存以释放内存
+        if k == len(batch_prompts) - 1:  # 每个 batch 的最后一张图片后清理
+            torch.cuda.empty_cache()
+        # 保存图像
+        for j, (image, (caption, caption_idx, image_idx)) in enumerate(zip(images, batch_caption_info)):
+            global_index = i * global_batch_size + j * dist.get_world_size() + rank
+            if global_index < len(caption_image_pairs):
+                # 保存图片，文件名包含caption索引和图片索引
+                filename = f"{global_index:06d}_cap{caption_idx:04d}_img{image_idx:02d}.png"
+                image_path = os.path.join(sample_folder_dir, filename)
+                image.save(image_path)
+                # 保存caption信息到文本文件（只在rank 0上操作）
+                if rank == 0:
+                    caption_file = os.path.join(sample_folder_dir, "captions.txt")
+                    with open(caption_file, "a", encoding="utf-8") as f:
+                        f.write(f"{filename}\t{caption}\n")
+        total_generated += global_batch_size
+        # 每个迭代后清理 GPU 缓存
+        torch.cuda.empty_cache()
+        dist.barrier()
+    # 确保所有进程都完成采样
+    dist.barrier()
+    # 创建npz文件
+    if rank == 0:
+        # 重新计算实际生成的图片数量
+        actual_num_samples = len([name for name in os.listdir(sample_folder_dir) if name.endswith(".png")])
+        print(f"Actually generated {actual_num_samples} images")
+        # 使用实际的图片数量或用户指定的数量，取较小值
+        npz_samples = min(actual_num_samples, total_images_needed, args.max_samples)
+        create_npz_from_sample_folder(sample_folder_dir, npz_samples)
+        print("Done.")
+    dist.barrier()
+    dist.destroy_process_group()
if __name__ == "__main__":
    # Command-line entry point of the SD3 LoRA distributed sampling script.
    # Options are declared as (flag, keyword-arguments) pairs and registered in
    # the order listed, which is also the order shown by --help.
    cli_options = [
        # Model and path options.
        (
            "--pretrained_model_name_or_path",
            dict(
                type=str,
                default="stabilityai/stable-diffusion-3-medium-diffusers",
                help="预训练模型路径或HuggingFace模型ID",
            ),
        ),
        ("--lora_path", dict(type=str, default=None, help="LoRA权重文件路径")),
        ("--revision", dict(type=str, default=None, help="模型修订版本")),
        ("--variant", dict(type=str, default=None, help="模型变体，如fp16")),
        # Sampling options.
        ("--num_inference_steps", dict(type=int, default=28, help="推理步数")),
        ("--guidance_scale", dict(type=float, default=7.0, help="引导尺度")),
        ("--height", dict(type=int, default=1024, help="生成图像高度")),
        ("--width", dict(type=int, default=1024, help="生成图像宽度")),
        ("--negative_prompt", dict(type=str, default="", help="负面提示词")),
        # Batching and output options.
        ("--per_proc_batch_size", dict(type=int, default=1, help="每个进程的批处理大小")),
        ("--sample_dir", dict(type=str, default="sd3_lora_samples", help="样本保存目录")),
        # Caption options.
        ("--captions_jsonl", dict(type=str, required=True, help="包含caption列表的JSONL文件路径")),
        ("--images_per_caption", dict(type=int, default=1, help="每个caption生成的图像数量")),
        ("--max_samples", dict(type=int, default=30000, help="最大样本生成数量")),
        # Miscellaneous options.
        ("--global_seed", dict(type=int, default=42, help="全局随机种子")),
        (
            "--mixed_precision",
            dict(type=str, default="fp16", choices=["no", "fp16", "bf16"], help="混合精度类型"),
        ),
        ("--enable_cpu_offload", dict(action="store_true", help="启用CPU offload以节省显存")),
    ]
    parser = argparse.ArgumentParser(description="SD3 LoRA分布式采样脚本")
    for flag, options in cli_options:
        parser.add_argument(flag, **options)
    args = parser.parse_args()
    main(args)

sample_sd3_lora_rn_pair_ddp.py ADDED Viewed

	@@ -0,0 +1,417 @@

+#!/usr/bin/env python
+# coding=utf-8
+"""
+DDP对照采样：同一文本+同一初始噪声，分别生成 LoRA 与 RN 两类图像，并输出 pair 拼接图与 metadata。
+"""
+import argparse
+import importlib.util
+import json
+import math
+import os
+import sys
+from pathlib import Path
+import torch
+import torch.distributed as dist
+from PIL import Image
+from tqdm import tqdm
+from diffusers import StableDiffusion3Pipeline as DiffusersStableDiffusion3Pipeline
def dynamic_import_training_classes(project_root: str):
    """Import the rectified-noise classes from the ``train_rectified_noise`` module.

    ``project_root`` is inserted at the front of ``sys.path`` so that directory
    is searched first when the module is imported.

    Returns:
        The pair ``(RectifiedNoiseModule, SD3WithRectifiedNoise)`` taken from
        the imported module.
    """
    sys.path.insert(0, project_root)
    training_module = importlib.import_module("train_rectified_noise")
    noise_module_cls = training_module.RectifiedNoiseModule
    wrapped_model_cls = training_module.SD3WithRectifiedNoise
    return noise_module_cls, wrapped_model_cls
def load_local_pipeline_class(local_pipeline_path: str):
    """Load ``StableDiffusion3Pipeline`` from a local source file.

    The file is imported under a module name placed inside the
    ``diffusers.pipelines.stable_diffusion_3`` package so that relative imports
    written in the file resolve against that package.

    The module is registered in ``sys.modules`` before it is executed, as the
    importlib "import a source file directly" recipe does. Code that looks the
    module up by name while it is being executed (for example ``dataclasses``
    resolving string annotations) needs that entry. The entry is removed again
    if loading fails.

    Args:
        local_pipeline_path: Path of the Python file defining the pipeline class.

    Returns:
        The ``StableDiffusion3Pipeline`` attribute of the loaded module.

    Raises:
        ImportError: If no import spec can be built for the path, or the module
            has no ``StableDiffusion3Pipeline`` attribute.
    """
    module_name = "diffusers.pipelines.stable_diffusion_3.local_pipeline_stable_diffusion_3"
    spec = importlib.util.spec_from_file_location(module_name, local_pipeline_path)
    if spec is None or spec.loader is None:
        raise ImportError(f"Failed to build import spec from: {local_pipeline_path}")
    module = importlib.util.module_from_spec(spec)
    sys.modules[module_name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        # Do not leave a half-initialised module importable by name.
        sys.modules.pop(module_name, None)
        raise
    if not hasattr(module, "StableDiffusion3Pipeline"):
        sys.modules.pop(module_name, None)
        raise ImportError("Local pipeline file has no StableDiffusion3Pipeline symbol.")
    return module.StableDiffusion3Pipeline
def load_captions_from_jsonl(jsonl_path):
    """Read captions from a JSONL file.

    Each non-blank line is parsed as JSON. For a JSON object, the first of the
    fields ``caption``, ``text``, ``prompt``, ``description`` that holds a
    non-blank string supplies the caption (whitespace-stripped). Lines that are
    not valid JSON, are not JSON objects, or carry no usable field are skipped.

    Args:
        jsonl_path: Path of the UTF-8 encoded JSONL file.

    Returns:
        The list of captions in file order, or
        ``["a beautiful high quality image"]`` when no caption was found.
    """
    caption_fields = ("caption", "text", "prompt", "description")
    captions = []
    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                # Malformed line: skip it, but let unrelated errors surface.
                continue
            if not isinstance(data, dict):
                continue
            for field in caption_fields:
                value = data.get(field)
                # An empty field must not mask a later field that has text.
                if isinstance(value, str) and value.strip():
                    captions.append(value.strip())
                    break
    return captions if captions else ["a beautiful high quality image"]
def load_sit_weights(rectified_module, weights_path: str):
    """Load rectified-noise (SIT) weights into ``rectified_module``.

    When ``weights_path`` is a directory, the directory itself and then its
    ``sit_weights`` sub-directory are searched, in this order, for
    ``pytorch_sit_weights.safetensors``, ``pytorch_sit_weights.bin``,
    ``pytorch_sit_weights.pt``, ``sit_weights.pt`` and ``sit.pt``; the first
    existing file is loaded. Otherwise ``weights_path`` is loaded as a file.

    The state dict is applied with ``strict=False``. Because that silently
    tolerates key mismatches, a warning is emitted whenever keys are missing
    or unexpected, so a checkpoint that does not match the module is noticed.

    Args:
        rectified_module: Module exposing ``load_state_dict``.
        weights_path: Directory or file holding the weights.

    Returns:
        ``True`` when a weight file was loaded, ``False`` when a directory was
        given and none of the candidate files exists.
    """
    import warnings

    def _load_file_into_module(path):
        if path.endswith(".safetensors"):
            from safetensors.torch import load_file
            state = load_file(path)
        else:
            # NOTE(security): torch.load is pickle-based; depending on the
            # installed torch version it may execute code embedded in the
            # file. Only load checkpoints from trusted sources.
            state = torch.load(path, map_location="cpu")
        result = rectified_module.load_state_dict(state, strict=False)
        missing = getattr(result, "missing_keys", None)
        unexpected = getattr(result, "unexpected_keys", None)
        if missing or unexpected:
            warnings.warn(
                f"Rectified weights from {path} did not match the module exactly: "
                f"{len(missing or ())} missing keys, {len(unexpected or ())} unexpected keys"
            )
        return True

    if not os.path.isdir(weights_path):
        return _load_file_into_module(weights_path)

    candidate_names = (
        "pytorch_sit_weights.safetensors",
        "pytorch_sit_weights.bin",
        "pytorch_sit_weights.pt",
        "sit_weights.pt",
        "sit.pt",
    )
    for directory in (weights_path, os.path.join(weights_path, "sit_weights")):
        for name in candidate_names:
            candidate = os.path.join(directory, name)
            if os.path.exists(candidate):
                return _load_file_into_module(candidate)
    return False
def save_jsonl_line(path, obj):
    """Append ``obj`` to the UTF-8 file at ``path`` as a single JSON line.

    Non-ASCII characters are written as-is rather than escaped.
    """
    with open(path, mode="a", encoding="utf-8") as sink:
        serialized = json.dumps(obj, ensure_ascii=False)
        sink.write(f"{serialized}\n")
def load_jsonl(path):
    """Parse a UTF-8 JSONL file into a list, one parsed value per non-blank line.

    Returns an empty list when ``path`` does not exist. A line that is not
    valid JSON makes ``json.loads`` raise.
    """
    if not os.path.exists(path):
        return []
    with open(path, "r", encoding="utf-8") as source:
        stripped_lines = (raw.strip() for raw in source)
        return [json.loads(text) for text in stripped_lines if text]
def merge_rank_metadata(out_path, rank_paths):
    """Merge per-rank JSONL metadata files into one file sorted by ``file_name``.

    Rows from every path in ``rank_paths`` are concatenated (missing files
    contribute nothing), ordered by their ``file_name`` value (rows without
    one sort as the empty string), and written to ``out_path``, replacing any
    previous content.
    """
    combined = [row for rank_path in rank_paths for row in load_jsonl(rank_path)]
    ordered = sorted(combined, key=lambda row: row.get("file_name", ""))
    with open(out_path, "w", encoding="utf-8") as sink:
        sink.writelines(json.dumps(row, ensure_ascii=False) + "\n" for row in ordered)
def build_rn_model(base_pipeline, rectified_weights, num_sit_layers, device):
    """Wrap the pipeline's transformer together with a rectified-noise module.

    The module's hidden size is the transformer config's
    ``joint_attention_dim`` when set, else its ``inner_dim`` when set, else
    4096. The remaining constructor arguments are read from the transformer
    config with fallbacks (``hidden_size`` -> 1536, ``num_attention_heads``
    -> 32, ``in_channels`` -> 16).

    Args:
        base_pipeline: Pipeline whose ``transformer`` is wrapped.
        rectified_weights: File or directory passed to ``load_sit_weights``.
        num_sit_layers: Forwarded to ``RectifiedNoiseModule``.
        device: Device the assembled model is moved to.

    Returns:
        The ``SD3WithRectifiedNoise`` instance, on ``device`` and in eval mode.

    Raises:
        RuntimeError: If ``load_sit_weights`` reports that nothing was loaded.
    """
    project_dir = str(Path(__file__).parent)
    RectifiedNoiseModule, SD3WithRectifiedNoise = dynamic_import_training_classes(project_dir)
    transformer = base_pipeline.transformer
    config = transformer.config

    # First config attribute that is present and not None wins; 4096 otherwise.
    sit_hidden_size = 4096
    for attr_name in ("joint_attention_dim", "inner_dim"):
        configured = getattr(config, attr_name, None)
        if configured is not None:
            sit_hidden_size = configured
            break

    transformer_hidden_size = getattr(config, "hidden_size", 1536)
    num_attention_heads = getattr(config, "num_attention_heads", 32)
    input_dim = getattr(config, "in_channels", 16)
    rectified_module = RectifiedNoiseModule(
        hidden_size=sit_hidden_size,
        num_sit_layers=num_sit_layers,
        num_attention_heads=num_attention_heads,
        input_dim=input_dim,
        transformer_hidden_size=transformer_hidden_size,
    )
    if not load_sit_weights(rectified_module, rectified_weights):
        raise RuntimeError(f"Failed to load rectified weights from: {rectified_weights}")
    rn_model = SD3WithRectifiedNoise(transformer, rectified_module).to(device)
    rn_model.eval()
    return rn_model
def create_npz_from_dir(sample_dir, max_samples):
    """Pack the numbered PNG images of ``sample_dir`` into ``{sample_dir}.npz``.

    Only files named ``<digits>.png`` are considered. They are taken in
    lexicographic file-name order and truncated with ``files[:max_samples]``.
    The images are stacked into one ``uint8`` array stored under the key
    ``arr_0``.

    Args:
        sample_dir: Directory containing the PNG files.
        max_samples: Upper bound on the number of images packed.

    Returns:
        The path of the written ``.npz`` file, or ``None`` when no matching
        image exists.
    """
    import numpy as np

    files = sorted([x for x in os.listdir(sample_dir) if x.endswith(".png") and x[:-4].isdigit()])
    files = files[:max_samples]
    if not files:
        return None
    arr = []
    for fn in tqdm(files, desc=f"npz:{os.path.basename(sample_dir)}"):
        # Close each image file promptly instead of relying on garbage collection.
        with Image.open(os.path.join(sample_dir, fn)) as img:
            arr.append(np.asarray(img).astype(np.uint8))
    arr = np.stack(arr)
    out = f"{sample_dir}.npz"
    np.savez(out, arr_0=arr)
    return out
def set_pipeline_modules_eval(pipe):
    """Call ``.eval()`` on the known sub-modules of a pipeline.

    A diffusers pipeline object has no ``.eval()`` of its own, so each of the
    listed attributes is looked up on ``pipe`` and switched individually.
    Attributes that are absent, ``None``, or lack an ``eval`` attribute are
    skipped.
    """
    component_names = (
        "transformer",
        "vae",
        "text_encoder",
        "text_encoder_2",
        "text_encoder_3",
        "image_encoder",
        "model",
    )
    for component_name in component_names:
        component = getattr(pipe, component_name, None)
        if component is None or not hasattr(component, "eval"):
            continue
        component.eval()
def _remove_if_exists(path):
    """Delete ``path`` (a ``pathlib.Path``) if present so a per-rank file starts empty."""
    if path.exists():
        path.unlink()


def _seeded_latents(pipe, args, image_seed, device, dtype):
    """Draw the initial latent noise for one image from a generator seeded with ``image_seed``.

    Both sampling stages use this helper, so the same seed, device and dtype
    go through the same code path when the noise is drawn.
    """
    generator = torch.Generator(device=device).manual_seed(image_seed)
    latent_h = args.height // pipe.vae_scale_factor
    latent_w = args.width // pipe.vae_scale_factor
    return torch.randn(
        (1, pipe.transformer.config.in_channels, latent_h, latent_w),
        device=device,
        dtype=dtype,
        generator=generator,
    )


def _sample_image(pipe, args, prompt, latents, dtype):
    """Run ``pipe`` once for ``prompt`` from the given latents and return the first image."""
    with torch.autocast("cuda", dtype=dtype):
        return pipe(
            prompt=prompt,
            height=args.height,
            width=args.width,
            num_inference_steps=args.num_inference_steps,
            guidance_scale=args.guidance_scale,
            latents=latents,
            num_images_per_prompt=1,
        ).images[0]


def main(args):
    """Run one stage of the distributed LoRA-vs-RN comparison sampling.

    Every process joins an NCCL process group and uses CUDA device
    ``rank % device_count``. Outputs live under ``args.sample_dir`` in the
    sub-directories ``lora``, ``rn`` and ``pair``; the root ``metadata.jsonl``
    links them by ``file_name``.

    Stages (``args.stage``):

    * ``lora``: sample one image per global index with the diffusers SD3
      pipeline (plus LoRA weights when ``args.lora_path`` is set), record the
      caption and seed per image, merge the per-rank metadata on rank 0 and
      pack ``lora/`` into an ``.npz`` file.
    * ``rn``: re-sample every record of the root metadata with the local
      pipeline whose ``model`` attribute is the rectified-noise model, using
      the recorded caption and seed.
    * ``pair``: paste the ``lora`` and ``rn`` images of each record side by
      side and add ``rn_file`` / ``pair_file`` entries to the root metadata.

    Raises:
        RuntimeError: If CUDA is unavailable, or the root metadata is missing
            or empty in the ``rn`` / ``pair`` stages.
        ValueError: If ``args.stage`` is not one of the three stages.
    """
    # Explicit check instead of ``assert`` so it also runs under ``python -O``.
    if not torch.cuda.is_available():
        raise RuntimeError("Need GPU")
    dist.init_process_group("nccl")
    rank = dist.get_rank()
    world = dist.get_world_size()
    device = rank % torch.cuda.device_count()
    torch.cuda.set_device(device)
    seed = args.global_seed * world + rank
    torch.manual_seed(seed)
    dtype = torch.float16 if args.mixed_precision == "fp16" else (torch.bfloat16 if args.mixed_precision == "bf16" else torch.float32)
    root = Path(args.sample_dir)
    lora_dir = root / "lora"
    rn_dir = root / "rn"
    pair_dir = root / "pair"
    metadata_path = root / "metadata.jsonl"
    lora_meta = lora_dir / "metadata.jsonl"
    rn_meta = rn_dir / "metadata.jsonl"
    pair_meta = pair_dir / "metadata.jsonl"
    if rank == 0:
        for directory in (lora_dir, rn_dir, pair_dir):
            directory.mkdir(parents=True, exist_ok=True)
    dist.barrier()
    if args.stage == "lora":
        pipe_lora = DiffusersStableDiffusion3Pipeline.from_pretrained(
            args.pretrained_model_name_or_path,
            revision=args.revision,
            variant=args.variant,
            torch_dtype=dtype,
        ).to(device)
        if args.lora_path:
            pipe_lora.load_lora_weights(args.lora_path)
        pipe_lora.set_progress_bar_config(disable=True)
        set_pipeline_modules_eval(pipe_lora)
        captions = load_captions_from_jsonl(args.captions_jsonl)
        total_needed = min(len(captions) * args.images_per_caption, args.max_samples)
        n = args.per_proc_batch_size
        global_batch = n * world
        # Round up to whole global batches so every rank runs the same number
        # of iterations; surplus indices are skipped inside the loop.
        total_samples = int(math.ceil(total_needed / global_batch) * global_batch)
        iters = total_samples // global_batch
        pbar = tqdm(range(iters)) if rank == 0 else range(iters)
        rank_meta_path = root / f"metadata.rank{rank}.jsonl"
        _remove_if_exists(rank_meta_path)
        rank_lora_meta_path = lora_dir / f"metadata.rank{rank}.jsonl"
        _remove_if_exists(rank_lora_meta_path)
        for it in pbar:
            for k in range(n):
                global_idx = it * global_batch + k * world + rank
                if global_idx >= total_needed:
                    continue
                cap_idx = global_idx // args.images_per_caption
                prompt = captions[cap_idx]
                # NOTE(review): (it, k=10) and (it + 1, k=0) yield the same
                # offset, so seeds repeat when per_proc_batch_size > 10.
                image_seed = seed + it * 10000 + k * 1000
                latents = _seeded_latents(pipe_lora, args, image_seed, device, dtype)
                img_lora = _sample_image(pipe_lora, args, prompt, latents, dtype)
                fn = f"{global_idx:07d}.png"
                img_lora.save(lora_dir / fn)
                save_jsonl_line(str(rank_meta_path), {"file_name": fn, "caption": prompt, "seed": int(image_seed), "lora_file": f"lora/{fn}"})
                save_jsonl_line(str(rank_lora_meta_path), {"file_name": fn, "caption": prompt, "seed": int(image_seed)})
            dist.barrier()
        dist.barrier()
        if rank == 0:
            merge_rank_metadata(str(metadata_path), [str(root / f"metadata.rank{r}.jsonl") for r in range(world)])
            merge_rank_metadata(str(lora_meta), [str(lora_dir / f"metadata.rank{r}.jsonl") for r in range(world)])
            records = load_jsonl(str(metadata_path))
            create_npz_from_dir(str(lora_dir), len(records))
    elif args.stage == "rn":
        records = load_jsonl(str(metadata_path))
        if not records:
            raise RuntimeError(f"metadata not found or empty: {metadata_path}. Run --stage lora first.")
        total_needed = min(len(records), args.max_samples)
        LocalStableDiffusion3Pipeline = load_local_pipeline_class(args.local_pipeline_path)
        pipe_rn = LocalStableDiffusion3Pipeline.from_pretrained(
            args.pretrained_model_name_or_path,
            revision=args.revision,
            variant=args.variant,
            torch_dtype=dtype,
        ).to(device)
        if args.lora_path:
            pipe_rn.load_lora_weights(args.lora_path)
        pipe_rn.model = build_rn_model(pipe_rn, args.rectified_weights, args.num_sit_layers, device)
        pipe_rn.set_progress_bar_config(disable=True)
        set_pipeline_modules_eval(pipe_rn)
        rank_rn_meta_path = rn_dir / f"metadata.rank{rank}.jsonl"
        _remove_if_exists(rank_rn_meta_path)
        # Round-robin assignment of records to ranks.
        assigned = [r for i, r in enumerate(records[:total_needed]) if i % world == rank]
        pbar = tqdm(assigned) if rank == 0 else assigned
        for rec in pbar:
            fn = rec["file_name"]
            prompt = rec["caption"]
            image_seed = int(rec["seed"])
            latents = _seeded_latents(pipe_rn, args, image_seed, device, dtype)
            img_rn = _sample_image(pipe_rn, args, prompt, latents, dtype)
            img_rn.save(rn_dir / fn)
            save_jsonl_line(str(rank_rn_meta_path), {"file_name": fn, "caption": prompt, "seed": image_seed})
        dist.barrier()
        if rank == 0:
            merge_rank_metadata(str(rn_meta), [str(rn_dir / f"metadata.rank{r}.jsonl") for r in range(world)])
            create_npz_from_dir(str(rn_dir), total_needed)
    elif args.stage == "pair":
        records = load_jsonl(str(metadata_path))
        if not records:
            raise RuntimeError(f"metadata not found: {metadata_path}")
        total_needed = min(len(records), args.max_samples)
        rank_pair_meta_path = pair_dir / f"metadata.rank{rank}.jsonl"
        _remove_if_exists(rank_pair_meta_path)
        assigned = [r for i, r in enumerate(records[:total_needed]) if i % world == rank]
        for rec in assigned:
            fn = rec["file_name"]
            lora_img_path = lora_dir / fn
            rn_img_path = rn_dir / fn
            if not lora_img_path.exists() or not rn_img_path.exists():
                continue
            # ``convert`` returns an independent image, so the files can be
            # closed as soon as the conversion is done.
            with Image.open(lora_img_path) as lora_src:
                img_lora = lora_src.convert("RGB")
            with Image.open(rn_img_path) as rn_src:
                img_rn = rn_src.convert("RGB")
            pair = Image.new("RGB", (img_lora.width + img_rn.width, max(img_lora.height, img_rn.height)))
            pair.paste(img_lora, (0, 0))
            pair.paste(img_rn, (img_lora.width, 0))
            pair.save(pair_dir / fn)
            save_jsonl_line(
                str(rank_pair_meta_path),
                {"file_name": fn, "caption": rec["caption"], "seed": int(rec["seed"]), "pair_file": f"pair/{fn}"},
            )
        dist.barrier()
        if rank == 0:
            merge_rank_metadata(str(pair_meta), [str(pair_dir / f"metadata.rank{r}.jsonl") for r in range(world)])
            # Update the root metadata with the rn/pair paths. Every record is
            # written back, including those beyond --max_samples, so running
            # this stage with a smaller limit does not drop earlier records.
            rn_set = {r["file_name"] for r in load_jsonl(str(rn_meta))}
            pair_set = {r["file_name"] for r in load_jsonl(str(pair_meta))}
            merged = []
            for r in records:
                fn = r["file_name"]
                out = dict(r)
                if fn in rn_set:
                    out["rn_file"] = f"rn/{fn}"
                if fn in pair_set:
                    out["pair_file"] = f"pair/{fn}"
                merged.append(out)
            with open(metadata_path, "w", encoding="utf-8") as f:
                for r in merged:
                    f.write(json.dumps(r, ensure_ascii=False) + "\n")
    else:
        raise ValueError(f"Unknown stage: {args.stage}")
    dist.barrier()
    if rank == 0:
        print(f"Stage {args.stage} done. Output root: {root}")
    dist.barrier()
    dist.destroy_process_group()
if __name__ == "__main__":
    # Command-line entry point of the LoRA-vs-RN comparison sampler.
    parser = argparse.ArgumentParser(description="DDP compare sampling: LoRA vs RN with same latent/prompt.")
    # The three options below are only read by some stages, so they are
    # validated per stage after parsing instead of being required globally.
    parser.add_argument("--pretrained_model_name_or_path", type=str, default=None)
    parser.add_argument(
        "--local_pipeline_path",
        type=str,
        default=str(Path(__file__).parent / "pipeline_stable_diffusion_3.py"),
        help="RN 分支使用的本地 pipeline 文件路径",
    )
    parser.add_argument("--revision", type=str, default=None)
    parser.add_argument("--variant", type=str, default=None)
    parser.add_argument("--lora_path", type=str, default=None)
    parser.add_argument("--rectified_weights", type=str, default=None)
    parser.add_argument("--num_sit_layers", type=int, default=1)
    parser.add_argument("--captions_jsonl", type=str, default=None)
    parser.add_argument("--sample_dir", type=str, default="./sd3_lora_rn_compare")
    parser.add_argument("--num_inference_steps", type=int, default=40)
    parser.add_argument("--guidance_scale", type=float, default=7.0)
    parser.add_argument("--height", type=int, default=512)
    parser.add_argument("--width", type=int, default=512)
    parser.add_argument("--per_proc_batch_size", type=int, default=4)
    parser.add_argument("--images_per_caption", type=int, default=1)
    parser.add_argument("--max_samples", type=int, default=10000)
    parser.add_argument("--global_seed", type=int, default=42)
    parser.add_argument("--mixed_precision", type=str, default="fp16", choices=["no", "fp16", "bf16"])
    parser.add_argument("--stage", type=str, default="lora", choices=["lora", "rn", "pair"])
    args = parser.parse_args()

    # Options each stage actually reads: the lora stage loads the base model
    # and the captions, the rn stage loads the base model and the rectified
    # weights, the pair stage only works on files already on disk.
    required_by_stage = {
        "lora": ("pretrained_model_name_or_path", "captions_jsonl"),
        "rn": ("pretrained_model_name_or_path", "rectified_weights"),
        "pair": (),
    }
    missing_options = [
        f"--{option}" for option in required_by_stage[args.stage] if getattr(args, option) is None
    ]
    if missing_options:
        parser.error(
            f"the following arguments are required for --stage {args.stage}: {', '.join(missing_options)}"
        )
    main(args)

sample_sd3_rectified_ddp.py ADDED Viewed

	@@ -0,0 +1,1316 @@

+#!/usr/bin/env python
+# coding=utf-8
+"""
+分布式采样脚本：支持指定 LoRA 权重与 Rectified Noise(SIT) 权重
+依据 train_rectified_noise.py 的模型结构，加载并组装 SD3WithRectifiedNoise 进行采样。
+"""
+import os
+import sys
+import json
+import math
+import argparse
+from pathlib import Path
+import torch
+import torch.distributed as dist
+from tqdm import tqdm
+import numpy as np
+from PIL import Image
+from accelerate import Accelerator
+from diffusers import StableDiffusion3Pipeline
+from peft import LoraConfig, get_peft_model_state_dict
+from peft.utils import set_peft_model_state_dict
def dynamic_import_training_classes(project_root: str):
    """Import ``RectifiedNoiseModule`` and ``SD3WithRectifiedNoise`` from ``train_rectified_noise``.

    ``project_root`` is inserted at the front of ``sys.path`` so that directory
    is searched first when the module is imported.

    Returns:
        The pair ``(RectifiedNoiseModule, SD3WithRectifiedNoise)``.

    Raises:
        ImportError: If the module cannot be imported or lacks one of the two
            classes. The original exception is attached as ``__cause__``.
    """
    sys.path.insert(0, project_root)
    try:
        import train_rectified_noise as trn
        return trn.RectifiedNoiseModule, trn.SD3WithRectifiedNoise
    except Exception as e:
        # Chain explicitly so the traceback presents the real failure as the cause.
        raise ImportError(f"无法从 train_rectified_noise.py 导入类: {e}") from e
def create_npz_from_sample_folder(sample_dir, num_samples):
    """Build a single ``.npz`` file from the PNG images in ``sample_dir``.

    The ``.png`` files are taken in sorted file-name order, at most
    ``num_samples`` of them, converted to ``uint8`` arrays, stacked, and saved
    as ``{sample_dir}.npz`` under the key ``arr_0``.

    Args:
        sample_dir: Directory containing the sample images.
        num_samples: Maximum number of images to include.

    Returns:
        The path of the written ``.npz`` file, or ``None`` when there is
        nothing to pack.
    """
    png_files = sorted(name for name in os.listdir(sample_dir) if name.endswith('.png'))
    # Never more images than exist; a non-positive limit selects none.
    selected = png_files[:max(num_samples, 0)]
    samples = []
    for filename in tqdm(selected, desc="Building .npz file from samples"):
        # Close each image file promptly instead of relying on garbage collection.
        with Image.open(os.path.join(sample_dir, filename)) as sample_pil:
            samples.append(np.asarray(sample_pil).astype(np.uint8))
    if not samples:
        print("No samples found to create npz file.")
        return None
    samples = np.stack(samples)
    npz_path = f"{sample_dir}.npz"
    np.savez(npz_path, arr_0=samples)
    print(f"Saved .npz file to {npz_path} [shape={samples.shape}].")
    return npz_path
def get_existing_sample_count(sample_dir):
    """Count the numbered PNG samples in ``sample_dir`` and find the highest index.

    Only files named ``<digits>.png`` are counted. A warning is printed when
    fewer files exist than the highest index implies (indices are expected to
    run from 0 to the maximum without gaps).

    Returns:
        ``(count, max_index)``, or ``(0, -1)`` when the directory does not
        exist or holds no numbered PNG file.
    """
    nothing_found = (0, -1)
    if not os.path.exists(sample_dir):
        return nothing_found
    indices = []
    for entry in os.listdir(sample_dir):
        stem = entry[:-4]
        if not (entry.endswith('.png') and stem.isdigit()):
            continue
        try:
            indices.append(int(stem))
        except ValueError:
            # isdigit() accepts some characters that int() rejects.
            pass
    if not indices:
        return nothing_found
    highest = max(indices)
    found = len(indices)
    expected = highest + 1
    if found < expected:
        print(f"Warning: Found {found} files but expected {expected} (missing some indices)")
    return found, highest
def load_sit_weights(rectified_module, weights_path: str, rank=0):
    """Load Rectified Noise (SIT) weights from ``.safetensors`` / ``.bin`` / ``.pt`` files.

    When ``weights_path`` is a directory, the directory itself and then its
    ``sit_weights`` sub-directory are searched. In each, the well-known file
    names are tried first (safetensors before bin/pt), followed by any other
    ``.pt`` / ``.bin`` file in sorted order. A candidate that fails to load is
    reported and the next candidate is tried, so a broken safetensors file
    does not hide a usable ``.bin`` / ``.pt`` file next to it. Otherwise
    ``weights_path`` is loaded as a single file.

    State dicts are applied with ``strict=False``; the numbers of missing and
    unexpected keys are printed. Messages are printed on rank 0 only.

    Args:
        rectified_module: Module exposing ``load_state_dict``.
        weights_path: Directory or file holding the weights.
        rank: Process rank; only rank 0 prints.

    Returns:
        ``True`` as soon as one file was loaded, ``False`` otherwise.
    """
    preferred_names = (
        "pytorch_sit_weights.safetensors",
        "pytorch_sit_weights.bin",
        "pytorch_sit_weights.pt",
        "sit_weights.pt",
        "sit.pt",
    )

    def _log(message):
        if rank == 0:
            print(message)

    def _try_load(path, start_message, failure_prefix):
        """Load one file into the module; report and return False on any failure."""
        _log(start_message)
        try:
            if path.endswith(".safetensors"):
                from safetensors.torch import load_file
                state = load_file(path)
            else:
                # NOTE(security): torch.load is pickle-based; only load
                # checkpoints from trusted sources.
                state = torch.load(path, map_location="cpu")
            missing_keys, unexpected_keys = rectified_module.load_state_dict(state, strict=False)
        except Exception as e:
            # Best effort by design: report and let the caller try the next candidate.
            _log(f"{failure_prefix}: {e}")
            return False
        _log(f"  Loaded rectified weights: {len(state)} keys")
        if missing_keys:
            _log(f"  Missing keys: {len(missing_keys)}")
        if unexpected_keys:
            _log(f"  Unexpected keys: {len(unexpected_keys)}")
        return True

    if not os.path.isdir(weights_path):
        # A single weight file.
        return _try_load(
            weights_path,
            f"Loading rectified weights from file: {weights_path}",
            f"  ❌ Failed to load rectified weights from {weights_path}",
        )

    for search_dir in (weights_path, os.path.join(weights_path, "sit_weights")):
        if not os.path.exists(search_dir):
            continue
        candidates = [
            os.path.join(search_dir, name)
            for name in preferred_names
            if os.path.exists(os.path.join(search_dir, name))
        ]
        # Fallback: any other .pt/.bin file, in a deterministic order. The
        # well-known names are excluded because they were already queued above.
        try:
            extra_names = sorted(
                fn for fn in os.listdir(search_dir)
                if fn.endswith((".pt", ".bin")) and fn not in preferred_names
            )
        except OSError:
            extra_names = []
        candidates.extend(os.path.join(search_dir, fn) for fn in extra_names)
        for cand in candidates:
            if _try_load(cand, f"Loading rectified weights from: {cand}", f"  Failed to load from {cand}"):
                return True
    _log(f"  ❌ No rectified weights found in {weights_path} or {os.path.join(weights_path, 'sit_weights')}")
    return False
def check_lora_weights_exist(lora_path):
    """Tell whether ``lora_path`` points at LoRA weights.

    A regular file qualifies when its name ends with ``.safetensors``. A
    directory qualifies when it contains ``pytorch_lora_weights.safetensors``
    or any ``.safetensors`` file whose name contains "lora" (case-insensitive).
    An empty or non-existent path does not qualify.
    """
    if not lora_path:
        return False
    if os.path.isfile(lora_path):
        return lora_path.endswith(".safetensors")
    if not os.path.isdir(lora_path):
        return False
    default_weights = os.path.join(lora_path, "pytorch_lora_weights.safetensors")
    if os.path.exists(default_weights):
        return True
    return any(
        entry.endswith(".safetensors") and "lora" in entry.lower()
        for entry in os.listdir(lora_path)
    )
+def load_lora_from_checkpoint(pipeline, checkpoint_path, rank=0, lora_rank=64):
+    """
+    从accelerator checkpoint目录加载LoRA权重或完整模型权重
+    如果checkpoint包含完整的模型权重（合并后的），直接加载
+    如果只包含LoRA权重，则按LoRA方式加载
+    """
+    if rank == 0:
+        print(f"Loading weights from accelerator checkpoint: {checkpoint_path}")
+    try:
+        from safetensors.torch import load_file
+        model_file = os.path.join(checkpoint_path, "model.safetensors")
+        if not os.path.exists(model_file):
+            if rank == 0:
+                print(f"Model file not found: {model_file}")
+            return False
+        # 加载state dict
+        state_dict = load_file(model_file)
+        all_keys = list(state_dict.keys())
+        # 检测checkpoint类型：
+        # 1. 是否包含base_layer（PEFT格式，需要合并）
+        # 2. 是否包含完整的模型权重（合并后的，直接可用）
+        # 3. 是否只包含LoRA权重（需要添加适配器）
+        lora_keys = [k for k in all_keys if 'lora' in k.lower() and 'transformer' in k.lower()]
+        base_layer_keys = [k for k in all_keys if 'base_layer' in k.lower() and 'transformer' in k.lower()]
+        non_lora_transformer_keys = [k for k in all_keys if 'lora' not in k.lower() and 'base_layer' not in k.lower() and 'transformer' in k.lower()]
+        if rank == 0:
+            print(f"Checkpoint analysis:")
+            print(f"  Total keys: {len(all_keys)}")
+            print(f"  LoRA keys: {len(lora_keys)}")
+            print(f"  Base layer keys: {len(base_layer_keys)}")
+            print(f"  Direct transformer weight keys (merged): {len(non_lora_transformer_keys)}")
+        # 如果包含base_layer，说明是PEFT格式，需要合并base_layer + lora
+        if len(base_layer_keys) > 0:
+            if rank == 0:
+                print(f"✓ Detected PEFT format (base_layer + LoRA), merging weights...")
+            # 合并base_layer和lora权重
+            merged_state_dict = {}
+            # 首先收集所有需要合并的模块
+            modules_to_merge = {}
+            # 记录所有非LoRA的transformer权重键名（用于调试）
+            non_lora_keys_found = []
+            for key in all_keys:
+                # 移除前缀
+                new_key = key
+                has_transformer_prefix = False
+                if key.startswith('base_model.model.transformer.'):
+                    new_key = key[len('base_model.model.transformer.'):]
+                    has_transformer_prefix = True
+                elif key.startswith('model.transformer.'):
+                    new_key = key[len('model.transformer.'):]
+                    has_transformer_prefix = True
+                elif key.startswith('transformer.'):
+                    new_key = key[len('transformer.'):]
+                    has_transformer_prefix = True
+                elif 'transformer' in key.lower():
+                    # 可能没有前缀，但包含transformer（如直接是transformer_blocks.0...）
+                    has_transformer_prefix = True
+                if not has_transformer_prefix:
+                    continue
+                # 检查是否是base_layer或lora权重
+                if '.base_layer.weight' in new_key:
+                    # 提取模块名（去掉.base_layer.weight部分）
+                    module_key = new_key.replace('.base_layer.weight', '.weight')
+                    if module_key not in modules_to_merge:
+                        modules_to_merge[module_key] = {'base_weight': None, 'base_bias': None, 'lora_A': None, 'lora_B': None}
+                    modules_to_merge[module_key]['base_weight'] = (key, state_dict[key])
+                elif '.base_layer.bias' in new_key:
+                    module_key = new_key.replace('.base_layer.bias', '.bias')
+                    if module_key not in modules_to_merge:
+                        modules_to_merge[module_key] = {'base_weight': None, 'base_bias': None, 'lora_A': None, 'lora_B': None}
+                    modules_to_merge[module_key]['base_bias'] = (key, state_dict[key])
+                elif '.lora_A.default.weight' in new_key:
+                    module_key = new_key.replace('.lora_A.default.weight', '.weight')
+                    if module_key not in modules_to_merge:
+                        modules_to_merge[module_key] = {'base_weight': None, 'base_bias': None, 'lora_A': None, 'lora_B': None}
+                    modules_to_merge[module_key]['lora_A'] = (key, state_dict[key])
+                elif '.lora_B.default.weight' in new_key:
+                    module_key = new_key.replace('.lora_B.default.weight', '.weight')
+                    if module_key not in modules_to_merge:
+                        modules_to_merge[module_key] = {'base_weight': None, 'base_bias': None, 'lora_A': None, 'lora_B': None}
+                    modules_to_merge[module_key]['lora_B'] = (key, state_dict[key])
+                elif 'lora' not in new_key.lower() and 'base_layer' not in new_key.lower():
+                    # 其他非LoRA权重（如pos_embed、time_text_embed、context_embedder等），直接使用
+                    # 这些权重不在LoRA适配范围内，应该直接从checkpoint加载
+                    merged_state_dict[new_key] = state_dict[key]
+                    non_lora_keys_found.append(new_key)
+            if rank == 0:
+                print(f"  Found {len(non_lora_keys_found)} non-LoRA transformer keys in checkpoint")
+                if non_lora_keys_found:
+                    print(f"  Sample non-LoRA keys: {non_lora_keys_found[:10]}")
+            # 合并权重：weight = base_weight + lora_B @ lora_A * (alpha / rank)
+            if rank == 0:
+                print(f"  Merging {len(modules_to_merge)} modules...")
+            import torch
+            for module_key, weights in modules_to_merge.items():
+                # 处理权重（.weight）
+                if weights['base_weight'] is not None:
+                    base_key, base_weight = weights['base_weight']
+                    base_weight = base_weight.clone()
+                    if weights['lora_A'] is not None and weights['lora_B'] is not None:
+                        lora_A_key, lora_A = weights['lora_A']
+                        lora_B_key, lora_B = weights['lora_B']
+                        # 检测rank和alpha
+                        # lora_A: [rank, in_features], lora_B: [out_features, rank]
+                        rank_value = lora_A.shape[0]
+                        alpha = rank_value  # 通常alpha = rank
+                        # 合并：weight = base + (lora_B @ lora_A) * (alpha / rank)
+                        # lora_B @ lora_A 得到 [out_features, in_features]
+                        lora_delta = torch.matmul(lora_B, lora_A)
+                        if lora_delta.shape == base_weight.shape:
+                            merged_weight = base_weight + lora_delta * (alpha / rank_value)
+                            merged_state_dict[module_key] = merged_weight
+                            if rank == 0 and len(modules_to_merge) <= 20:
+                                print(f"  ✓ Merged {module_key}: {base_weight.shape}")
+                        else:
+                            if rank == 0:
+                                print(f"  ⚠️ Shape mismatch for {module_key}: base={base_weight.shape}, lora_delta={lora_delta.shape}, using base only")
+                            merged_state_dict[module_key] = base_weight
+                    else:
+                        # 只有base权重，没有LoRA
+                        merged_state_dict[module_key] = base_weight
+                # 处理bias（.bias）- bias通常不需要合并，直接使用base_bias
+                if '.bias' in module_key and weights['base_bias'] is not None:
+                    bias_key, base_bias = weights['base_bias']
+                    merged_state_dict[module_key] = base_bias.clone()
+            if rank == 0:
+                print(f"  Merged {len(merged_state_dict)} weights")
+                print(f"  Sample merged keys: {list(merged_state_dict.keys())[:5]}")
+            # 加载合并后的权重
+            try:
+                missing_keys, unexpected_keys = pipeline.transformer.load_state_dict(merged_state_dict, strict=False)
+                if rank == 0:
+                    print(f"  Loaded merged weights:")
+                    print(f"    Missing keys: {len(missing_keys)}")
+                    print(f"    Unexpected keys: {len(unexpected_keys)}")
+                    if missing_keys:
+                        print(f"    Missing keys: {missing_keys}")
+                        # 检查缺失的keys是否关键
+                        critical_keys = ['pos_embed', 'time_text_embed', 'context_embedder', 'norm_out', 'proj_out']
+                        has_critical = any(any(ck in mk for ck in critical_keys) for mk in missing_keys)
+                        if has_critical:
+                            print(f"    ⚠️ WARNING: Missing critical keys! These should be loaded from pretrained model.")
+                            print(f"    The missing keys will use values from the pretrained model (not fine-tuned).")
+                # 如果缺失的keys太多或包含关键组件，给出警告
+                if len(missing_keys) > 0:
+                    # 这些缺失的keys会使用pretrained model的默认值
+                    # 这是正常的，因为LoRA只适配了部分层，其他层保持原样
+                    if rank == 0:
+                        print(f"  Note: Missing keys will use pretrained model weights (not fine-tuned)")
+                if rank == 0:
+                    print(f"  ✓ Successfully loaded merged model weights")
+                return True
+            except Exception as e:
+                if rank == 0:
+                    print(f"  ❌ Error loading merged weights: {e}")
+                    import traceback
+                    traceback.print_exc()
+                return False
+        # 如果包含非LoRA的transformer权重（且没有base_layer），说明是合并后的完整模型
+        elif len(non_lora_transformer_keys) > 0:
+            if rank == 0:
+                print(f"✓ Detected merged model weights (contains full transformer weights)")
+                print(f"  Loading full model weights directly...")
+            # 提取transformer相关的权重（包括LoRA和基础权重）
+            transformer_state_dict = {}
+            for key, value in state_dict.items():
+                # 移除可能的accelerator包装前缀
+                new_key = key
+                if key.startswith('base_model.model.transformer.'):
+                    new_key = key[len('base_model.model.transformer.'):]
+                elif key.startswith('model.transformer.'):
+                    new_key = key[len('model.transformer.'):]
+                elif key.startswith('transformer.'):
+                    new_key = key[len('transformer.'):]
+                # 只保留transformer相关的权重（包括所有transformer子模块）
+                # 检查是否是transformer的权重（不包含text_encoder等）
+                if (new_key.startswith('transformer_blocks') or
+                    new_key.startswith('pos_embed') or
+                    new_key.startswith('time_text_embed') or
+                    'lora' in new_key.lower()):  # 也包含LoRA权重（如果存在）
+                    transformer_state_dict[new_key] = value
+            if rank == 0:
+                print(f"  Extracted {len(transformer_state_dict)} transformer weight keys")
+                print(f"  Sample keys: {list(transformer_state_dict.keys())[:5]}")
+            # 直接加载到transformer（不使用LoRA适配器）
+            try:
+                missing_keys, unexpected_keys = pipeline.transformer.load_state_dict(transformer_state_dict, strict=False)
+                if rank == 0:
+                    print(f"  Loaded full model weights:")
+                    print(f"    Missing keys: {len(missing_keys)}")
+                    print(f"    Unexpected keys: {len(unexpected_keys)}")
+                    if missing_keys:
+                        print(f"    Sample missing keys: {missing_keys[:5]}")
+                    if unexpected_keys:
+                        print(f"    Sample unexpected keys: {unexpected_keys[:5]}")
+                # 如果missing keys太多，可能有问题
+                if len(missing_keys) > len(transformer_state_dict) * 0.5:
+                    if rank == 0:
+                        print(f"  ⚠️ WARNING: Too many missing keys, weights may not be fully loaded")
+                    return False
+                if rank == 0:
+                    print(f"  ✓ Successfully loaded merged model weights")
+                return True
+            except Exception as e:
+                if rank == 0:
+                    print(f"  ❌ Error loading full model weights: {e}")
+                    import traceback
+                    traceback.print_exc()
+                return False
+        # 如果只包含LoRA权重，按原来的方式加载
+        if rank == 0:
+            print(f"Detected LoRA-only weights, loading as LoRA adapter...")
+        # 首先尝试从checkpoint中检测实际的rank
+        detected_rank = None
+        for key, value in state_dict.items():
+            if 'lora_A' in key and 'transformer' in key and len(value.shape) == 2:
+                # lora_A的形状是 [rank, hidden_size]
+                detected_rank = value.shape[0]
+                if rank == 0:
+                    print(f"✓ Detected LoRA rank from checkpoint: {detected_rank} (from key: {key})")
+                break
+        # 如果检测到rank，使用检测到的rank；否则使用传入的rank
+        actual_rank = detected_rank if detected_rank is not None else lora_rank
+        if detected_rank is not None and detected_rank != lora_rank:
+            if rank == 0:
+                print(f"⚠️ Warning: Detected rank ({detected_rank}) differs from requested rank ({lora_rank}), using detected rank")
+        # 检查适配器是否已存在，如果存在则先卸载
+        # SD3Transformer2DModel没有delete_adapter方法，需要使用unload_lora_weights
+        if hasattr(pipeline.transformer, 'peft_config') and pipeline.transformer.peft_config:
+            if "default" in pipeline.transformer.peft_config:
+                if rank == 0:
+                    print("Removing existing 'default' adapter before adding new one...")
+                try:
+                    # 使用pipeline的unload_lora_weights方法
+                    pipeline.unload_lora_weights()
+                    if rank == 0:
+                        print("Successfully unloaded existing LoRA adapter")
+                except Exception as e:
+                    if rank == 0:
+                        print(f"❌ ERROR: Could not unload existing adapter: {e}")
+                        print("Cannot proceed without cleaning up adapter")
+                    return False
+        # 先配置LoRA适配器（必须在加载之前配置）
+        # 使用检测到的或传入的rank
+        transformer_lora_config = LoraConfig(
+            r=actual_rank,
+            lora_alpha=actual_rank,
+            init_lora_weights="gaussian",
+            target_modules=["attn.to_k", "attn.to_q", "attn.to_v", "attn.to_out.0"],
+        )
+        # 为transformer添加LoRA适配器
+        pipeline.transformer.add_adapter(transformer_lora_config)
+        if rank == 0:
+            print(f"LoRA adapter configured with rank={actual_rank}")
+        # 继续处理LoRA权重加载（state_dict已经在上面加载了）
+        # 提取LoRA权重 - accelerator保存的格式
+        # 从accelerator checkpoint的model.safetensors中，键名格式可能是：
+        # - transformer_blocks.X.attn.to_q.lora_A.default.weight (PEFT格式，直接可用)
+        # - 或者包含其他前缀
+        lora_state_dict = {}
+        for key, value in state_dict.items():
+            if 'lora' in key.lower() and 'transformer' in key.lower():
+                # 检查键名格式
+                new_key = key
+                # 移除可能的accelerator包装前缀
+                # accelerator可能保存为: model.transformer.transformer_blocks...
+                # 或者: base_model.model.transformer.transformer_blocks...
+                if key.startswith('base_model.model.transformer.'):
+                    new_key = key[len('base_model.model.transformer.'):]
+                elif key.startswith('model.transformer.'):
+                    new_key = key[len('model.transformer.'):]
+                elif key.startswith('transformer.'):
+                    # 如果已经是transformer_blocks开头，不需要移除transformer.前缀
+                    # 因为transformer_blocks是transformer的子模块
+                    if not key[len('transformer.'):].startswith('transformer_blocks'):
+                        new_key = key[len('transformer.'):]
+                    else:
+                        new_key = key[len('transformer.'):]
+                # 只保留transformer相关的LoRA权重
+                if 'transformer_blocks' in new_key or 'transformer' in new_key:
+                    lora_state_dict[new_key] = value
+        if not lora_state_dict:
+            if rank == 0:
+                print("No LoRA weights found in checkpoint")
+                # 打印所有键名用于调试
+                all_keys = list(state_dict.keys())
+                print(f"Total keys: {len(all_keys)}")
+                print(f"First 20 keys: {all_keys[:20]}")
+                # 查找包含lora的键
+                lora_related = [k for k in all_keys if 'lora' in k.lower()]
+                if lora_related:
+                    print(f"Keys containing 'lora': {lora_related[:10]}")
+            return False
+        if rank == 0:
+            print(f"Found {len(lora_state_dict)} LoRA weight keys")
+            sample_keys = list(lora_state_dict.keys())[:5]
+            print(f"Sample LoRA keys: {sample_keys}")
+        # 加载LoRA权重到transformer
+        # 注意：从checkpoint提取的键名格式已经是PEFT格式（如：transformer_blocks.0.attn.to_q.lora_A.default.weight）
+        # 不需要使用convert_unet_state_dict_to_peft转换，直接使用即可
+        try:
+            # 检查键名格式
+            sample_key = list(lora_state_dict.keys())[0] if lora_state_dict else ""
+            if rank == 0:
+                print(f"Original key format: {sample_key}")
+            # 关键问题：set_peft_model_state_dict期望的键名格式
+            # 从back/train_dreambooth_lora.py看，需要移除.default后缀
+            # 格式应该是：transformer_blocks.X.attn.to_q.lora_A.weight（没有.default）
+            # 但accelerator保存的格式是：transformer_blocks.X.attn.to_q.lora_A.default.weight（有.default）
+            # 检查键名格式
+            sample_key = list(lora_state_dict.keys())[0] if lora_state_dict else ""
+            has_default_suffix = '.default.weight' in sample_key or '.default.bias' in sample_key
+            if rank == 0:
+                print(f"Sample key: {sample_key}")
+                print(f"Has .default suffix: {has_default_suffix}")
+            # 如果键名包含.default.weight或.default.bias，需要移除.default部分
+            # 因为set_peft_model_state_dict期望的格式是：lora_A.weight，而不是lora_A.default.weight
+            converted_dict = {}
+            for key, value in lora_state_dict.items():
+                # 移除.default后缀（如果存在）
+                # transformer_blocks.0.attn.to_q.lora_A.default.weight -> transformer_blocks.0.attn.to_q.lora_A.weight
+                new_key = key
+                if '.default.weight' in new_key:
+                    new_key = new_key.replace('.default.weight', '.weight')
+                elif '.default.bias' in new_key:
+                    new_key = new_key.replace('.default.bias', '.bias')
+                elif '.default' in new_key and (new_key.endswith('.weight') or new_key.endswith('.bias')):
+                    # 处理其他可能的.default位置
+                    new_key = new_key.replace('.default', '')
+                converted_dict[new_key] = value
+            if rank == 0:
+                print(f"Converted {len(converted_dict)} keys (removed .default suffix if present)")
+                print(f"Sample converted keys: {list(converted_dict.keys())[:5]}")
+            # 调用set_peft_model_state_dict并检查返回值
+            incompatible_keys = set_peft_model_state_dict(
+                pipeline.transformer,
+                converted_dict,
+                adapter_name="default"
+            )
+            # 检查加载结果
+            if incompatible_keys is not None:
+                missing_keys = getattr(incompatible_keys, "missing_keys", [])
+                unexpected_keys = getattr(incompatible_keys, "unexpected_keys", [])
+                if rank == 0:
+                    print(f"LoRA loading result:")
+                    print(f"  Missing keys: {len(missing_keys)}")
+                    print(f"  Unexpected keys: {len(unexpected_keys)}")
+                    if len(missing_keys) > 100:
+                        print(f"  ⚠️ WARNING: Too many missing keys ({len(missing_keys)}), LoRA may not be fully loaded!")
+                        print(f"  Sample missing keys: {missing_keys[:10]}")
+                    elif missing_keys:
+                        print(f"  Sample missing keys: {missing_keys[:10]}")
+                    if unexpected_keys:
+                        print(f"  Unexpected keys: {unexpected_keys[:10]}")
+                # 如果missing keys太多，说明加载失败
+                if len(missing_keys) > len(converted_dict) * 0.5:  # 超过50%的键缺失
+                    if rank == 0:
+                        print("❌ ERROR: Too many missing keys, LoRA weights not loaded correctly!")
+                    return False
+            else:
+                if rank == 0:
+                    print("✓ LoRA weights loaded (no incompatible keys reported)")
+        except RuntimeError as e:
+            # 检查是否是size mismatch错误
+            error_str = str(e)
+            if "size mismatch" in error_str:
+                if rank == 0:
+                    print(f"❌ Size mismatch error: The checkpoint rank doesn't match the adapter rank")
+                    print(f"   This usually means the checkpoint was trained with a different rank")
+                    # 尝试从错误信息中提取期望的rank
+                    import re
+                    # 错误信息格式: "copying a param with shape torch.Size([32, 1536]) from checkpoint"
+                    match = re.search(r'copying a param with shape torch\.Size\(\[(\d+),', error_str)
+                    if match:
+                        checkpoint_rank = int(match.group(1))
+                        if rank == 0:
+                            print(f"   Detected checkpoint rank: {checkpoint_rank}")
+                            print(f"   Adapter was configured with rank: {actual_rank}")
+                            if checkpoint_rank != actual_rank:
+                                print(f"   ⚠️ Mismatch! Need to recreate adapter with rank={checkpoint_rank}")
+            else:
+                if rank == 0:
+                    print(f"❌ Error setting LoRA state dict: {e}")
+                    import traceback
+                    traceback.print_exc()
+            # 清理适配器以便下次尝试
+            try:
+                pipeline.unload_lora_weights()
+            except:
+                pass
+            return False
+        except Exception as e:
+            if rank == 0:
+                print(f"❌ Error setting LoRA state dict: {e}")
+                import traceback
+                traceback.print_exc()
+            # 清理适配器以便下次尝试
+            try:
+                pipeline.unload_lora_weights()
+            except:
+                pass
+            return False
+        # 启用LoRA适配器
+        pipeline.transformer.set_adapter("default")
+        # 验证LoRA是否已加载和应用
+        if hasattr(pipeline.transformer, 'peft_config'):
+            adapters = list(pipeline.transformer.peft_config.keys())
+            if rank == 0:
+                print(f"LoRA adapters configured: {adapters}")
+                # 检查适配器是否启用
+                if hasattr(pipeline.transformer, 'active_adapters'):
+                    # active_adapters 是一个方法，需要调用
+                    try:
+                        if callable(pipeline.transformer.active_adapters):
+                            active = pipeline.transformer.active_adapters()
+                        else:
+                            active = pipeline.transformer.active_adapters
+                        if rank == 0:
+                            print(f"Active adapters: {active}")
+                    except:
+                        if rank == 0:
+                            print("Could not get active adapters, but LoRA is configured")
+        # 验证LoRA权重是否真的被应用
+        # 检查LoRA层的权重是否非零
+        lora_layers_found = 0
+        nonzero_lora_layers = 0
+        total_lora_weight_sum = 0.0
+        for name, module in pipeline.transformer.named_modules():
+            if 'lora_A' in name or 'lora_B' in name:
+                lora_layers_found += 1
+                if hasattr(module, 'weight') and module.weight is not None:
+                    weight_sum = module.weight.abs().sum().item()
+                    total_lora_weight_sum += weight_sum
+                    if weight_sum > 1e-6:  # 非零阈值
+                        nonzero_lora_layers += 1
+                        if rank == 0 and nonzero_lora_layers <= 3:  # 只打印前3个
+                            print(f"✓ Found non-zero LoRA weight in: {name}, sum={weight_sum:.6f}")
+        if rank == 0:
+            print(f"LoRA verification:")
+            print(f"  Total LoRA layers found: {lora_layers_found}")
+            print(f"  Non-zero LoRA layers: {nonzero_lora_layers}")
+            print(f"  Total LoRA weight sum: {total_lora_weight_sum:.6f}")
+            if lora_layers_found == 0:
+                print("❌ ERROR: No LoRA layers found in transformer!")
+                return False
+            elif nonzero_lora_layers == 0:
+                print("❌ ERROR: All LoRA weights are zero, LoRA not loaded correctly!")
+                return False
+            elif nonzero_lora_layers < lora_layers_found * 0.5:
+                print(f"⚠️ WARNING: Only {nonzero_lora_layers}/{lora_layers_found} LoRA layers have non-zero weights!")
+                print("⚠️ LoRA may not be fully applied!")
+            else:
+                print(f"✓ LoRA weights verified: {nonzero_lora_layers}/{lora_layers_found} layers have non-zero weights")
+        if nonzero_lora_layers == 0:
+            return False
+        if rank == 0:
+            print("✓ Successfully loaded and verified LoRA weights from checkpoint")
+        return True
+    except Exception as e:
+        if rank == 0:
+            print(f"Error loading LoRA from checkpoint: {e}")
+            import traceback
+            traceback.print_exc()
+        return False
+def load_captions_from_jsonl(jsonl_path):
+    captions = []
+    with open(jsonl_path, 'r', encoding='utf-8') as f:
+        for line in f:
+            line = line.strip()
+            if not line:
+                continue
+            try:
+                data = json.loads(line)
+                cap = None
+                for field in ['caption', 'text', 'prompt', 'description']:
+                    if field in data and isinstance(data[field], str):
+                        cap = data[field].strip()
+                        break
+                if cap:
+                    captions.append(cap)
+            except Exception:
+                continue
+    return captions if captions else ["a beautiful high quality image"]
+def main(args):
+    assert torch.cuda.is_available(), "需要GPU运行"
+    dist.init_process_group("nccl")
+    rank = dist.get_rank()
+    world_size = dist.get_world_size()
+    device = rank % torch.cuda.device_count()
+    torch.cuda.set_device(device)
+    seed = args.global_seed * world_size + rank
+    torch.manual_seed(seed)
+    print(f"[rank{rank}] DDP initialized, device={device}, seed={seed}, world_size={world_size}")
+    # 调试：打印接收到的参数
+    if rank == 0:
+        print("=" * 80)
+        print("参数检查:")
+        print(f"  lora_path: {args.lora_path}")
+        print(f"  rectified_weights: {args.rectified_weights}")
+        print(f"  lora_path is None: {args.lora_path is None}")
+        print(f"  lora_path is empty: {args.lora_path == '' if args.lora_path else 'N/A'}")
+        print(f"  rectified_weights is None: {args.rectified_weights is None}")
+        print(f"  rectified_weights is empty: {args.rectified_weights == '' if args.rectified_weights else 'N/A'}")
+        print("=" * 80)
+    # 导入训练脚本中的类
+    RectifiedNoiseModule, SD3WithRectifiedNoise = dynamic_import_training_classes(str(Path(__file__).parent))
+    # 加载 pipeline
+    dtype = torch.float16 if args.mixed_precision == "fp16" else (torch.bfloat16 if args.mixed_precision == "bf16" else torch.float32)
+    if rank == 0:
+        print(f"Loading SD3 pipeline from {args.pretrained_model_name_or_path} (dtype={dtype})")
+    pipeline = StableDiffusion3Pipeline.from_pretrained(
+        args.pretrained_model_name_or_path,
+        revision=args.revision,
+        variant=args.variant,
+        torch_dtype=dtype,
+    ).to(device)
+    print(f"[rank{rank}] Pipeline loaded and moved to device {device}")
+    # 加载 LoRA（可选）
+    lora_loaded = False
+    if args.lora_path:
+        if rank == 0:
+            print(f"Attempting to load LoRA weights from: {args.lora_path}")
+            print(f"LoRA path exists: {os.path.exists(args.lora_path) if args.lora_path else False}")
+        # 首先检查是否是标准的LoRA权重文件/目录
+        if check_lora_weights_exist(args.lora_path):
+            if rank == 0:
+                print("Found standard LoRA weights, loading...")
+            try:
+                # 检查加载前的transformer参数（用于验证）
+                if rank == 0:
+                    sample_param_before = next(iter(pipeline.transformer.parameters())).clone()
+                    print(f"Sample transformer param before LoRA (first 5 values): {sample_param_before.flatten()[:5]}")
+                pipeline.load_lora_weights(args.lora_path)
+                lora_loaded = True
+                # 验证LoRA是否真的被加载
+                if rank == 0:
+                    sample_param_after = next(iter(pipeline.transformer.parameters())).clone()
+                    param_diff = (sample_param_after - sample_param_before).abs().max().item()
+                    print(f"Sample transformer param after LoRA (first 5 values): {sample_param_after.flatten()[:5]}")
+                    print(f"Max parameter change after LoRA loading: {param_diff}")
+                    if param_diff < 1e-6:
+                        print("⚠️ WARNING: LoRA weights may not have been applied (parameter change is very small)")
+                    else:
+                        print("✓ LoRA weights appear to have been applied")
+                    # 检查是否有peft_config
+                    if hasattr(pipeline.transformer, 'peft_config'):
+                        print(f"✓ PEFT config found: {list(pipeline.transformer.peft_config.keys())}")
+                    else:
+                        print("⚠️ WARNING: No peft_config found after loading LoRA")
+                if rank == 0:
+                    print("LoRA loaded successfully from standard format.")
+            except Exception as e:
+                if rank == 0:
+                    print(f"Failed to load LoRA from standard format: {e}")
+                    import traceback
+                    traceback.print_exc()
+        # 如果不是标准格式，尝试从accelerator checkpoint加载
+        if not lora_loaded and os.path.isdir(args.lora_path):
+            if rank == 0:
+                print("Standard LoRA weights not found, trying accelerator checkpoint format...")
+            # 首先尝试从checkpoint的model.safetensors中检测实际的rank
+            # 通过检查LoRA权重的形状来推断rank
+            detected_rank = None
+            try:
+                from safetensors.torch import load_file
+                model_file = os.path.join(args.lora_path, "model.safetensors")
+                if os.path.exists(model_file):
+                    state_dict = load_file(model_file)
+                    # 查找一个LoRA权重来确定rank
+                    for key, value in state_dict.items():
+                        if 'lora_A' in key and 'transformer' in key and len(value.shape) == 2:
+                            # lora_A的形状是 [rank, hidden_size]
+                            detected_rank = value.shape[0]
+                            if rank == 0:
+                                print(f"✓ Detected LoRA rank from checkpoint: {detected_rank} (from key: {key})")
+                            break
+            except Exception as e:
+                if rank == 0:
+                    print(f"Could not detect rank from checkpoint: {e}")
+            # 构建rank尝试列表
+            # 如果检测到rank，优先使用检测到的rank，只尝试一次
+            # 如果未检测到，尝试常见的rank值
+            if detected_rank is not None:
+                rank_list = [detected_rank]
+                if rank == 0:
+                    print(f"Using detected rank: {detected_rank}")
+            else:
+                # 如果检测失败，尝试常见的rank值（按用户指定的rank优先）
+                rank_list = []
+                # 如果用户指定了rank（从args.lora_rank），优先尝试
+                if hasattr(args, 'lora_rank') and args.lora_rank:
+                    rank_list.append(args.lora_rank)
+                # 添加其他常见的rank值
+                for r in [32, 64, 16, 128]:
+                    if r not in rank_list:
+                        rank_list.append(r)
+                if rank == 0:
+                    print(f"Rank detection failed, will try ranks in order: {rank_list}")
+            # 尝试不同的rank值
+            for lora_rank in rank_list:
+                # 在尝试新的rank之前，先清理已存在的适配器
+                # 重要：每次尝试前都要清理，否则适配器会保留之前的rank配置
+                if hasattr(pipeline.transformer, 'peft_config') and pipeline.transformer.peft_config:
+                    if "default" in pipeline.transformer.peft_config:
+                        try:
+                            # 使用pipeline的unload_lora_weights方法
+                            pipeline.unload_lora_weights()
+                            if rank == 0:
+                                print(f"Cleaned up existing adapter before trying rank={lora_rank}")
+                        except Exception as e:
+                            if rank == 0:
+                                print(f"Warning: Could not unload adapter: {e}")
+                                # 如果卸载失败，需要重新创建pipeline
+                                if rank == 0:
+                                    print("⚠️ WARNING: Cannot unload adapter, will recreate pipeline...")
+                                    # 重新加载pipeline（最后手段）
+                                    try:
+                                        pipeline = StableDiffusion3Pipeline.from_pretrained(
+                                            args.pretrained_model_name_or_path,
+                                            revision=args.revision,
+                                            variant=args.variant,
+                                            torch_dtype=dtype,
+                                        ).to(device)
+                                        if rank == 0:
+                                            print("Pipeline recreated to clear adapter state")
+                                    except Exception as e2:
+                                        if rank == 0:
+                                            print(f"Failed to recreate pipeline: {e2}")
+                if rank == 0:
+                    print(f"Trying to load with LoRA rank={lora_rank}...")
+                lora_loaded = load_lora_from_checkpoint(pipeline, args.lora_path, rank=rank, lora_rank=lora_rank)
+                if lora_loaded:
+                    if rank == 0:
+                        print(f"✓ Successfully loaded LoRA with rank={lora_rank}")
+                    break
+                elif rank == 0:
+                    print(f"✗ Failed to load with rank={lora_rank}, trying next rank...")
+        # 如果checkpoint目录加载失败，尝试从输出目录的根目录加载标准LoRA权重
+        if not lora_loaded and os.path.isdir(args.lora_path):
+            # 检查输出目录的根目录（checkpoint的父目录）
+            output_dir = os.path.dirname(args.lora_path.rstrip('/'))
+            if output_dir and os.path.exists(output_dir):
+                if rank == 0:
+                    print(f"Trying to load standard LoRA weights from output directory: {output_dir}")
+                if check_lora_weights_exist(output_dir):
+                    try:
+                        pipeline.load_lora_weights(output_dir)
+                        lora_loaded = True
+                        if rank == 0:
+                            print("LoRA loaded successfully from output directory.")
+                    except Exception as e:
+                        if rank == 0:
+                            print(f"Failed to load LoRA from output directory: {e}")
+        if not lora_loaded:
+            if rank == 0:
+                print(f"⚠️ WARNING: Failed to load LoRA weights from {args.lora_path}, using baseline model")
+        else:
+            # 最终验证LoRA是否真的被启用
+            if rank == 0:
+                print("=" * 80)
+                print("LoRA 加载验证:")
+                if hasattr(pipeline.transformer, 'peft_config') and pipeline.transformer.peft_config:
+                    print(f"  ✓ PEFT config exists: {list(pipeline.transformer.peft_config.keys())}")
+                    # 检查LoRA层的权重
+                    lora_layers_found = 0
+                    for name, module in pipeline.transformer.named_modules():
+                        if 'lora_A' in name or 'lora_B' in name:
+                            lora_layers_found += 1
+                            if lora_layers_found <= 3:  # 只打印前3个
+                                if hasattr(module, 'weight'):
+                                    weight_sum = module.weight.abs().sum().item() if module.weight is not None else 0
+                                    print(f"  ✓ Found LoRA layer: {name}, weight_sum={weight_sum:.6f}")
+                    print(f"  ✓ Total LoRA layers found: {lora_layers_found}")
+                    if lora_layers_found == 0:
+                        print("  ⚠️ WARNING: No LoRA layers found in transformer!")
+                else:
+                    print("  ⚠️ WARNING: No PEFT config found - LoRA may not be active!")
+                print("=" * 80)
+    # 构建 RectifiedNoiseModule 并加载权重（仅在提供了 rectified_weights 时）
+    # 安全地检查 rectified_weights 是否有效
+    use_rectified = False
+    rectified_weights_path = None
+    if args.rectified_weights:
+        rectified_weights_str = str(args.rectified_weights).strip()
+        if rectified_weights_str:
+            use_rectified = True
+            rectified_weights_path = rectified_weights_str
+    if rank == 0:
+        print(f"use_rectified: {use_rectified}, rectified_weights_path: {rectified_weights_path}")
+    if use_rectified:
+        if rank == 0:
+            print(f"Using Rectified Noise module with weights from: {rectified_weights_path}")
+        print(f"[rank{rank}] RectifiedNoiseModule configuration: num_sit_layers={args.num_sit_layers}")
+        # 从 transformer 配置推断必要尺寸
+        tfm = pipeline.transformer
+        if hasattr(tfm.config, 'joint_attention_dim') and tfm.config.joint_attention_dim is not None:
+            sit_hidden_size = tfm.config.joint_attention_dim
+        elif hasattr(tfm.config, 'inner_dim') and tfm.config.inner_dim is not None:
+            sit_hidden_size = tfm.config.inner_dim
+        elif hasattr(tfm.config, 'hidden_size') and tfm.config.hidden_size is not None:
+            sit_hidden_size = tfm.config.hidden_size
+        else:
+            sit_hidden_size = 4096
+        transformer_hidden_size = getattr(tfm.config, 'hidden_size', 1536)
+        num_attention_heads = getattr(tfm.config, 'num_attention_heads', 32)
+        input_dim = getattr(tfm.config, 'in_channels', 16)
+        rectified_module = RectifiedNoiseModule(
+            hidden_size=sit_hidden_size,
+            num_sit_layers=args.num_sit_layers,
+            num_attention_heads=num_attention_heads,
+            input_dim=input_dim,
+            transformer_hidden_size=transformer_hidden_size,
+        )
+        # 加载 SIT 权重
+        ok = load_sit_weights(rectified_module, rectified_weights_path, rank=rank)
+        if rank == 0:
+            if not ok:
+                print("⚠️ Warning: Failed to load rectified weights, will use baseline model without rectified noise")
+            else:
+                print("✓ Successfully loaded rectified noise weights")
+        # 组装 SD3WithRectifiedNoise
+        # 关键：SD3WithRectifiedNoise 会保留 transformer 的引用
+        # 但是，SD3WithRectifiedNoise 在 __init__ 中会冻结 transformer 参数
+        # 这不应该影响 LoRA，因为 LoRA 是作为适配器添加的，不是原始参数
+        # 我们需要确保在创建 SD3WithRectifiedNoise 之前，LoRA 适配器已经正确加载和启用
+        if lora_loaded and rank == 0:
+            print("Creating SD3WithRectifiedNoise with LoRA-enabled transformer...")
+        elif rank == 0:
+            print("Creating SD3WithRectifiedNoise...")
+        model = SD3WithRectifiedNoise(pipeline.transformer, rectified_module).to(device)
+        # 重要：SD3WithRectifiedNoise 的 __init__ 会冻结 transformer 参数
+        # 但 LoRA 适配器应该仍然有效，因为它们是独立的模块
+        # 我们需要确保 LoRA 适配器在包装后仍然可以访问
+        # 确保 LoRA 适配器在模型替换后仍然启用
+        if lora_loaded:
+            # 通过model.transformer访问，因为SD3WithRectifiedNoise包装了transformer
+            if hasattr(model.transformer, 'peft_config'):
+                try:
+                    # 确保适配器处于启用状态
+                    model.transformer.set_adapter("default_0")
+                    # 验证LoRA权重在包装后是否仍然存在
+                    lora_layers_after_wrap = 0
+                    nonzero_after_wrap = 0
+                    for name, module in model.transformer.named_modules():
+                        if 'lora_A' in name or 'lora_B' in name:
+                            lora_layers_after_wrap += 1
+                            if hasattr(module, 'weight') and module.weight is not None:
+                                if module.weight.abs().sum().item() > 1e-6:
+                                    nonzero_after_wrap += 1
+                    if rank == 0:
+                        print(f"LoRA after SD3WithRectifiedNoise wrapping:")
+                        print(f"  LoRA layers: {lora_layers_after_wrap}, Non-zero: {nonzero_after_wrap}")
+                        if nonzero_after_wrap == 0:
+                            print("  ❌ ERROR: All LoRA weights are zero after wrapping!")
+                        elif nonzero_after_wrap < lora_layers_after_wrap * 0.5:
+                            print(f"  ⚠️ WARNING: Only {nonzero_after_wrap}/{lora_layers_after_wrap} LoRA layers have weights!")
+                        else:
+                            print(f"  ✓ LoRA weights preserved after wrapping")
+                    # 验证适配器是否真的启用
+                    if hasattr(model.transformer, 'active_adapters'):
+                        try:
+                            if callable(model.transformer.active_adapters):
+                                active = model.transformer.active_adapters()
+                            else:
+                                active = model.transformer.active_adapters
+                            if rank == 0:
+                                print(f"  Active adapters: {active}")
+                        except:
+                            if rank == 0:
+                                print("  LoRA adapter re-enabled after model wrapping")
+                    else:
+                        if rank == 0:
+                            print("  LoRA adapter re-enabled after model wrapping")
+                except Exception as e:
+                    if rank == 0:
+                        print(f"❌ ERROR: Could not re-enable LoRA adapter: {e}")
+                        import traceback
+                        traceback.print_exc()
+            else:
+                # LoRA权重已经合并到transformer的基础权重中（合并加载方式）
+                # 这种情况下没有peft_config是正常的，因为LoRA已经合并了
+                if rank == 0:
+                    print("LoRA loaded via merged weights (no PEFT adapter needed)")
+                    print("  ✓ LoRA weights are already merged into transformer base weights")
+                    print("  Note: This is expected when loading from merged checkpoint format")
+        # 注册到 pipeline（pipeline_stable_diffusion_3.py 已支持 external model）
+        pipeline.model = model
+        # 确保模型处于评估模式（LoRA在eval模式下也应该工作）
+        model.eval()
+        model.transformer.eval()  # 确保transformer也处于eval模式
+    else:
+        if rank == 0:
+            print("Not using Rectified Noise module, using baseline SD3 pipeline")
+        # 不使用 SD3WithRectifiedNoise，保持原始 pipeline
+        # pipeline.model 保持为原始的 transformer
+    # 关键：确保LoRA适配器在推理时被使用
+    # PEFT模型在eval模式下，LoRA适配器应该自动启用，但我们需要确保
+    if lora_loaded:
+        # 获取正确的 transformer 引用
+        transformer_ref = model.transformer if use_rectified else pipeline.transformer
+        # 确保transformer的LoRA适配器处于启用状态
+        if hasattr(transformer_ref, 'set_adapter'):
+            try:
+                transformer_ref.set_adapter("default")
+            except:
+                pass
+        # 验证LoRA是否真的会被使用
+        if rank == 0:
+            # 检查一个LoRA层的权重
+            lora_found = False
+            for name, module in transformer_ref.named_modules():
+                if 'lora_A' in name and 'default' in name and hasattr(module, 'weight'):
+                    if module.weight is not None:
+                        weight_sum = module.weight.abs().sum().item()
+                        if weight_sum > 0:
+                            print(f"✓ Verified LoRA weight in {name}: sum={weight_sum:.6f}")
+                            lora_found = True
+                            break
+            if not lora_found:
+                print("⚠ Warning: Could not verify LoRA weights in model")
+            else:
+                # 额外检查：验证LoRA层是否真的会被调用
+                # 检查一个LoRA Linear层
+                for name, module in transformer_ref.named_modules():
+                    if hasattr(module, '__class__') and 'lora' in module.__class__.__name__.lower():
+                        if hasattr(module, 'lora_enabled'):
+                            enabled = module.lora_enabled
+                            if rank == 0:
+                                print(f"✓ Found LoRA layer {name}, enabled: {enabled}")
+                        break
+            print("Model set to eval mode, LoRA should be active during inference")
+    # 启用内存优化选项
+    if args.enable_attention_slicing:
+        if rank == 0:
+            print("Enabling attention slicing to save memory")
+        pipeline.enable_attention_slicing()
+    if args.enable_vae_slicing:
+        if rank == 0:
+            print("Enabling VAE slicing to save memory")
+        pipeline.enable_vae_slicing()
+    if args.enable_cpu_offload:
+        if rank == 0:
+            print("Enabling CPU offload to save memory")
+        pipeline.enable_model_cpu_offload()
+    # 禁用进度条以减少输出
+    pipeline.set_progress_bar_config(disable=True)
+    # 读入 captions
+    captions = load_captions_from_jsonl(args.captions_jsonl)
+    total_images_needed = min(len(captions) * args.images_per_caption, args.max_samples)
+    # 输出目录
+    if rank == 0:
+        os.makedirs(args.sample_dir, exist_ok=True)
+    dist.barrier()
+    # 检查已存在的样本
+    existing_count, max_existing_index = get_existing_sample_count(args.sample_dir)
+    if rank == 0:
+        print(f"Found {existing_count} existing samples, max index: {max_existing_index}")
+    # 调整需要生成的样本数量
+    remaining_images_needed = max(0, total_images_needed - existing_count)
+    if remaining_images_needed == 0:
+        if rank == 0:
+            print("All required samples already exist. Skipping generation.")
+            print(f"Creating npz from existing samples...")
+            create_npz_from_sample_folder(args.sample_dir, total_images_needed)
+        return
+    if rank == 0:
+        print(f"Need to generate {remaining_images_needed} more samples (total needed: {total_images_needed})")
+    n = args.per_proc_batch_size
+    global_batch = n * world_size
+    total_samples = int(math.ceil(remaining_images_needed / global_batch) * global_batch)
+    assert total_samples % world_size == 0
+    samples_per_gpu = total_samples // world_size
+    assert samples_per_gpu % n == 0
+    iterations = samples_per_gpu // n
+    if rank == 0:
+        print(f"Sampling remaining={remaining_images_needed}, total_samples={total_samples}, per_gpu={samples_per_gpu}, iterations={iterations}")
+    pbar = tqdm(range(iterations)) if rank == 0 else range(iterations)
+    saved = 0
+    autocast_device = "cuda" if torch.cuda.is_available() else "cpu"
+    for it in pbar:
+        if rank == 0 and it % 10 == 0:
+            print(f"[rank{rank}] Sampling iteration {it}/{iterations}")
+        batch_prompts = []
+        base_index = it * global_batch + rank
+        for j in range(n):
+            idx = it * global_batch + j * world_size + rank
+            if idx < remaining_images_needed:
+                cap_idx = idx // args.images_per_caption
+                batch_prompts.append(captions[cap_idx])
+            else:
+                batch_prompts.append("a beautiful high quality image")
+        with torch.autocast(autocast_device, dtype=dtype):
+            images = []
+            for k, prompt in enumerate(batch_prompts):
+                image_seed = seed + it * 10000 + k * 1000 + rank
+                generator = torch.Generator(device=device).manual_seed(image_seed)
+                img = pipeline(
+                    prompt=prompt,
+                    height=args.height,
+                    width=args.width,
+                    num_inference_steps=args.num_inference_steps,
+                    guidance_scale=args.guidance_scale,
+                    generator=generator,
+                    num_images_per_prompt=1,
+                ).images[0]
+                images.append(img)
+        # 保存
+        out_dir = Path(args.sample_dir)
+        if rank == 0 and it == 0:
+            print(f"Saving pngs to: {out_dir}")
+        for j, img in enumerate(images):
+            global_index = it * global_batch + j * world_size + rank + existing_count  # 加上已存在的数量
+            if global_index < total_images_needed:
+                filename = f"{global_index:07d}.png"
+                img.save(out_dir / filename)
+                saved += 1
+        dist.barrier()
+    if rank == 0:
+        print(f"Done. Saved {saved * world_size} images in total.")
+        actual_num_samples = len([name for name in os.listdir(args.sample_dir) if name.endswith(".png")])
+        print(f"Actually generated {actual_num_samples} images")
+        npz_samples = min(actual_num_samples, total_images_needed)
+        print(f"[rank{rank}] Creating npz from sample folder: {args.sample_dir}, npz_samples={npz_samples}")
+        create_npz_from_sample_folder(args.sample_dir, npz_samples)
+        print("Done creating npz.")
+        print("Done.")
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="SD3 LoRA + RectifiedNoise 分布式采样脚本")
+    # 模型
+    parser.add_argument("--pretrained_model_name_or_path", type=str, required=True)
+    parser.add_argument("--revision", type=str, default=None)
+    parser.add_argument("--variant", type=str, default=None)
+    # LoRA 与 Rectified
+    parser.add_argument("--lora_path", type=str, default=None, help="LoRA 权重路径(文件或目录)")
+    parser.add_argument("--rectified_weights", type=str, default=None, help="Rectified(SIT) 权重路径(文件或目录)")
+    parser.add_argument("--num_sit_layers", type=int, default=1, help="与训练一致的 SIT 层数")
+    # 采样
+    parser.add_argument("--num_inference_steps", type=int, default=28)
+    parser.add_argument("--guidance_scale", type=float, default=7.0)
+    parser.add_argument("--height", type=int, default=1024)
+    parser.add_argument("--width", type=int, default=1024)
+    parser.add_argument("--per_proc_batch_size", type=int, default=1)
+    parser.add_argument("--images_per_caption", type=int, default=1)
+    parser.add_argument("--max_samples", type=int, default=10000)
+    parser.add_argument("--captions_jsonl", type=str, required=True)
+    parser.add_argument("--sample_dir", type=str, default="sd3_rectified_samples")
+    parser.add_argument("--global_seed", type=int, default=42)
+    parser.add_argument("--mixed_precision", type=str, default="fp16", choices=["no", "fp16", "bf16"])
+    # 内存优化选项
+    parser.add_argument("--enable_attention_slicing", action="store_true", help="启用 attention slicing 以节省显存")
+    parser.add_argument("--enable_vae_slicing", action="store_true", help="启用 VAE slicing 以节省显存")
+    parser.add_argument("--enable_cpu_offload", action="store_true", help="启用 CPU offload 以节省显存")
+    args = parser.parse_args()
+    main(args)

sample_sd3_rectified_ddp_old.py ADDED Viewed

	@@ -0,0 +1,1317 @@

+#!/usr/bin/env python
+# coding=utf-8
+"""
+分布式采样脚本：支持指定 LoRA 权重与 Rectified Noise(SIT) 权重
+依据 train_rectified_noise.py 的模型结构，加载并组装 SD3WithRectifiedNoise 进行采样。
+"""
+import os
+import sys
+import json
+import math
+import argparse
+from pathlib import Path
+import torch
+import torch.distributed as dist
+from tqdm import tqdm
+import numpy as np
+from PIL import Image
+from accelerate import Accelerator
+from diffusers import StableDiffusion3Pipeline
+from peft import LoraConfig, get_peft_model_state_dict
+from peft.utils import set_peft_model_state_dict
+def dynamic_import_training_classes(project_root: str):
+    """从 train_rectified_noise.py 动态导入 RectifiedNoiseModule 和 SD3WithRectifiedNoise"""
+    sys.path.insert(0, project_root)
+    try:
+        import train_rectified_noise as trn
+        return trn.RectifiedNoiseModule, trn.SD3WithRectifiedNoise
+    except Exception as e:
+        raise ImportError(f"无法从 train_rectified_noise.py 导入类: {e}")
+def create_npz_from_sample_folder(sample_dir, num_samples):
+    """
+    从样本文件夹构建单个.npz文件，保持与sample_ddp_new相同的格式
+    """
+    samples = []
+    actual_files = []
+    # 收集所有PNG文件
+    for filename in sorted(os.listdir(sample_dir)):
+        if filename.endswith('.png'):
+            actual_files.append(filename)
+    # 按照数量限制处理
+    for i in tqdm(range(min(num_samples, len(actual_files))), desc="Building .npz file from samples"):
+        if i < len(actual_files):
+            sample_path = os.path.join(sample_dir, actual_files[i])
+            sample_pil = Image.open(sample_path)
+            sample_np = np.asarray(sample_pil).astype(np.uint8)
+            samples.append(sample_np)
+        else:
+            # 如果不够，创建空白图像
+            sample_np = np.zeros((512, 512, 3), dtype=np.uint8)
+            samples.append(sample_np)
+    if samples:
+        samples = np.stack(samples)
+        npz_path = f"{sample_dir}.npz"
+        np.savez(npz_path, arr_0=samples)
+        print(f"Saved .npz file to {npz_path} [shape={samples.shape}].")
+        return npz_path
+    else:
+        print("No samples found to create npz file.")
+        return None
+def load_sit_weights(rectified_module, weights_path: str, rank=0):
+    """加载 Rectified Noise(SIT) 权重，支持 .safetensors / .bin / .pt
+    支持以下目录结构：
+    - weights_path/pytorch_sit_weights.safetensors (直接在主目录)
+    - weights_path/sit_weights/pytorch_sit_weights.safetensors (在sit_weights子目录)
+    """
+    if os.path.isdir(weights_path):
+        # 首先尝试在主目录查找
+        search_paths = [
+            weights_path,  # 主目录
+            os.path.join(weights_path, "sit_weights"),  # sit_weights子目录
+        ]
+        for search_dir in search_paths:
+            if not os.path.exists(search_dir):
+                continue
+            # 优先寻找 safetensors
+            st_path = os.path.join(search_dir, "pytorch_sit_weights.safetensors")
+            if os.path.exists(st_path):
+                try:
+                    from safetensors.torch import load_file
+                    if rank == 0:
+                        print(f"Loading rectified weights from: {st_path}")
+                    state = load_file(st_path)
+                    missing_keys, unexpected_keys = rectified_module.load_state_dict(state, strict=False)
+                    if rank == 0:
+                        print(f"  Loaded rectified weights: {len(state)} keys")
+                        if missing_keys:
+                            print(f"  Missing keys: {len(missing_keys)}")
+                        if unexpected_keys:
+                            print(f"  Unexpected keys: {len(unexpected_keys)}")
+                    return True
+                except Exception as e:
+                    if rank == 0:
+                        print(f"  Failed to load from {st_path}: {e}")
+                    continue
+            # 其次寻找 bin/pt
+            for name in ["pytorch_sit_weights.bin", "pytorch_sit_weights.pt", "sit_weights.pt", "sit.pt"]:
+                cand = os.path.join(search_dir, name)
+                if os.path.exists(cand):
+                    try:
+                        if rank == 0:
+                            print(f"Loading rectified weights from: {cand}")
+                        state = torch.load(cand, map_location="cpu")
+                        missing_keys, unexpected_keys = rectified_module.load_state_dict(state, strict=False)
+                        if rank == 0:
+                            print(f"  Loaded rectified weights: {len(state)} keys")
+                            if missing_keys:
+                                print(f"  Missing keys: {len(missing_keys)}")
+                            if unexpected_keys:
+                                print(f"  Unexpected keys: {len(unexpected_keys)}")
+                        return True
+                    except Exception as e:
+                        if rank == 0:
+                            print(f"  Failed to load from {cand}: {e}")
+                        continue
+            # 兜底：目录下任意 pt/bin
+            try:
+                for fn in os.listdir(search_dir):
+                    if fn.endswith((".pt", ".bin")):
+                        cand = os.path.join(search_dir, fn)
+                        try:
+                            if rank == 0:
+                                print(f"Loading rectified weights from: {cand}")
+                            state = torch.load(cand, map_location="cpu")
+                            missing_keys, unexpected_keys = rectified_module.load_state_dict(state, strict=False)
+                            if rank == 0:
+                                print(f"  Loaded rectified weights: {len(state)} keys")
+                            return True
+                        except Exception as e:
+                            if rank == 0:
+                                print(f"  Failed to load from {cand}: {e}")
+                            continue
+            except Exception:
+                pass
+        if rank == 0:
+            print(f"  ❌ No rectified weights found in {weights_path} or {os.path.join(weights_path, 'sit_weights')}")
+        return False
+    else:
+        # 直接文件
+        try:
+            if rank == 0:
+                print(f"Loading rectified weights from file: {weights_path}")
+            if weights_path.endswith(".safetensors"):
+                from safetensors.torch import load_file
+                state = load_file(weights_path)
+            else:
+                state = torch.load(weights_path, map_location="cpu")
+            missing_keys, unexpected_keys = rectified_module.load_state_dict(state, strict=False)
+            if rank == 0:
+                print(f"  Loaded rectified weights: {len(state)} keys")
+                if missing_keys:
+                    print(f"  Missing keys: {len(missing_keys)}")
+                if unexpected_keys:
+                    print(f"  Unexpected keys: {len(unexpected_keys)}")
+            return True
+        except Exception as e:
+            if rank == 0:
+                print(f"  ❌ Failed to load rectified weights from {weights_path}: {e}")
+            return False
+def check_lora_weights_exist(lora_path):
+    """检查LoRA权重文件是否存在"""
+    if not lora_path:
+        return False
+    if os.path.isdir(lora_path):
+        # 检查目录中是否有pytorch_lora_weights.safetensors文件
+        weight_file = os.path.join(lora_path, "pytorch_lora_weights.safetensors")
+        if os.path.exists(weight_file):
+            return True
+        # 检查是否有其他.safetensors文件
+        for file in os.listdir(lora_path):
+            if file.endswith(".safetensors") and "lora" in file.lower():
+                return True
+        return False
+    elif os.path.isfile(lora_path):
+        return lora_path.endswith(".safetensors")
+    return False
+def load_lora_from_checkpoint(pipeline, checkpoint_path, rank=0, lora_rank=64):
+    """
+    从accelerator checkpoint目录加载LoRA权重或完整模型权重
+    如果checkpoint包含完整的模型权重（合并后的），直接加载
+    如果只包含LoRA权重，则按LoRA方式加载
+    """
+    if rank == 0:
+        print(f"Loading weights from accelerator checkpoint: {checkpoint_path}")
+    try:
+        from safetensors.torch import load_file
+        model_file = os.path.join(checkpoint_path, "model.safetensors")
+        if not os.path.exists(model_file):
+            if rank == 0:
+                print(f"Model file not found: {model_file}")
+            return False
+        # 加载state dict
+        state_dict = load_file(model_file)
+        all_keys = list(state_dict.keys())
+        # 检测checkpoint类型：
+        # 1. 是否包含base_layer（PEFT格式，需要合并）
+        # 2. 是否包含完整的模型权重（合并后的，直接可用）
+        # 3. 是否只包含LoRA权重（需要添加适配器）
+        lora_keys = [k for k in all_keys if 'lora' in k.lower() and 'transformer' in k.lower()]
+        base_layer_keys = [k for k in all_keys if 'base_layer' in k.lower() and 'transformer' in k.lower()]
+        non_lora_transformer_keys = [k for k in all_keys if 'lora' not in k.lower() and 'base_layer' not in k.lower() and 'transformer' in k.lower()]
+        if rank == 0:
+            print(f"Checkpoint analysis:")
+            print(f"  Total keys: {len(all_keys)}")
+            print(f"  LoRA keys: {len(lora_keys)}")
+            print(f"  Base layer keys: {len(base_layer_keys)}")
+            print(f"  Direct transformer weight keys (merged): {len(non_lora_transformer_keys)}")
+        # 如果包含base_layer，说明是PEFT格式，需要合并base_layer + lora
+        if len(base_layer_keys) > 0:
+            if rank == 0:
+                print(f"✓ Detected PEFT format (base_layer + LoRA), merging weights...")
+            # 合并base_layer和lora权重
+            merged_state_dict = {}
+            # 首先收集所有需要合并的模块
+            modules_to_merge = {}
+            # 记录所有非LoRA的transformer权重键名（用于调试）
+            non_lora_keys_found = []
+            for key in all_keys:
+                # 移除前缀
+                new_key = key
+                has_transformer_prefix = False
+                if key.startswith('base_model.model.transformer.'):
+                    new_key = key[len('base_model.model.transformer.'):]
+                    has_transformer_prefix = True
+                elif key.startswith('model.transformer.'):
+                    new_key = key[len('model.transformer.'):]
+                    has_transformer_prefix = True
+                elif key.startswith('transformer.'):
+                    new_key = key[len('transformer.'):]
+                    has_transformer_prefix = True
+                elif 'transformer' in key.lower():
+                    # 可能没有前缀，但包含transformer（如直接是transformer_blocks.0...）
+                    has_transformer_prefix = True
+                if not has_transformer_prefix:
+                    continue
+                # 检查是否是base_layer或lora权重
+                if '.base_layer.weight' in new_key:
+                    # 提取模块名（去掉.base_layer.weight部分）
+                    module_key = new_key.replace('.base_layer.weight', '.weight')
+                    if module_key not in modules_to_merge:
+                        modules_to_merge[module_key] = {'base_weight': None, 'base_bias': None, 'lora_A': None, 'lora_B': None}
+                    modules_to_merge[module_key]['base_weight'] = (key, state_dict[key])
+                elif '.base_layer.bias' in new_key:
+                    module_key = new_key.replace('.base_layer.bias', '.bias')
+                    if module_key not in modules_to_merge:
+                        modules_to_merge[module_key] = {'base_weight': None, 'base_bias': None, 'lora_A': None, 'lora_B': None}
+                    modules_to_merge[module_key]['base_bias'] = (key, state_dict[key])
+                elif '.lora_A.default.weight' in new_key:
+                    module_key = new_key.replace('.lora_A.default.weight', '.weight')
+                    if module_key not in modules_to_merge:
+                        modules_to_merge[module_key] = {'base_weight': None, 'base_bias': None, 'lora_A': None, 'lora_B': None}
+                    modules_to_merge[module_key]['lora_A'] = (key, state_dict[key])
+                elif '.lora_B.default.weight' in new_key:
+                    module_key = new_key.replace('.lora_B.default.weight', '.weight')
+                    if module_key not in modules_to_merge:
+                        modules_to_merge[module_key] = {'base_weight': None, 'base_bias': None, 'lora_A': None, 'lora_B': None}
+                    modules_to_merge[module_key]['lora_B'] = (key, state_dict[key])
+                elif 'lora' not in new_key.lower() and 'base_layer' not in new_key.lower():
+                    # 其他非LoRA权重（如pos_embed、time_text_embed、context_embedder等），直接使用
+                    # 这些权重不在LoRA适配范围内，应该直接从checkpoint加载
+                    merged_state_dict[new_key] = state_dict[key]
+                    non_lora_keys_found.append(new_key)
+            if rank == 0:
+                print(f"  Found {len(non_lora_keys_found)} non-LoRA transformer keys in checkpoint")
+                if non_lora_keys_found:
+                    print(f"  Sample non-LoRA keys: {non_lora_keys_found[:10]}")
+            # 合并权重：weight = base_weight + lora_B @ lora_A * (alpha / rank)
+            if rank == 0:
+                print(f"  Merging {len(modules_to_merge)} modules...")
+            import torch
+            for module_key, weights in modules_to_merge.items():
+                # 处理权重（.weight）
+                if weights['base_weight'] is not None:
+                    base_key, base_weight = weights['base_weight']
+                    base_weight = base_weight.clone()
+                    if weights['lora_A'] is not None and weights['lora_B'] is not None:
+                        lora_A_key, lora_A = weights['lora_A']
+                        lora_B_key, lora_B = weights['lora_B']
+                        # 检测rank和alpha
+                        # lora_A: [rank, in_features], lora_B: [out_features, rank]
+                        rank_value = lora_A.shape[0]
+                        alpha = rank_value  # 通常alpha = rank
+                        # 合并：weight = base + (lora_B @ lora_A) * (alpha / rank)
+                        # lora_B @ lora_A 得到 [out_features, in_features]
+                        lora_delta = torch.matmul(lora_B, lora_A)
+                        if lora_delta.shape == base_weight.shape:
+                            merged_weight = base_weight + lora_delta * (alpha / rank_value)
+                            merged_state_dict[module_key] = merged_weight
+                            if rank == 0 and len(modules_to_merge) <= 20:
+                                print(f"  ✓ Merged {module_key}: {base_weight.shape}")
+                        else:
+                            if rank == 0:
+                                print(f"  ⚠️ Shape mismatch for {module_key}: base={base_weight.shape}, lora_delta={lora_delta.shape}, using base only")
+                            merged_state_dict[module_key] = base_weight
+                    else:
+                        # 只有base权重，没有LoRA
+                        merged_state_dict[module_key] = base_weight
+                # 处理bias（.bias）- bias通常不需要合并，直接使用base_bias
+                if '.bias' in module_key and weights['base_bias'] is not None:
+                    bias_key, base_bias = weights['base_bias']
+                    merged_state_dict[module_key] = base_bias.clone()
+            if rank == 0:
+                print(f"  Merged {len(merged_state_dict)} weights")
+                print(f"  Sample merged keys: {list(merged_state_dict.keys())[:5]}")
+            # 加载合并后的权重
+            try:
+                missing_keys, unexpected_keys = pipeline.transformer.load_state_dict(merged_state_dict, strict=False)
+                if rank == 0:
+                    print(f"  Loaded merged weights:")
+                    print(f"    Missing keys: {len(missing_keys)}")
+                    print(f"    Unexpected keys: {len(unexpected_keys)}")
+                    if missing_keys:
+                        print(f"    Missing keys: {missing_keys}")
+                        # 检查缺失的keys是否关键
+                        critical_keys = ['pos_embed', 'time_text_embed', 'context_embedder', 'norm_out', 'proj_out']
+                        has_critical = any(any(ck in mk for ck in critical_keys) for mk in missing_keys)
+                        if has_critical:
+                            print(f"    ⚠️ WARNING: Missing critical keys! These should be loaded from pretrained model.")
+                            print(f"    The missing keys will use values from the pretrained model (not fine-tuned).")
+                # 如果缺失的keys太多或包含关键组件，给出警告
+                if len(missing_keys) > 0:
+                    # 这些缺失的keys会使用pretrained model的默认值
+                    # 这是正常的，因为LoRA只适配了部分层，其他层保持原样
+                    if rank == 0:
+                        print(f"  Note: Missing keys will use pretrained model weights (not fine-tuned)")
+                if rank == 0:
+                    print(f"  ✓ Successfully loaded merged model weights")
+                return True
+            except Exception as e:
+                if rank == 0:
+                    print(f"  ❌ Error loading merged weights: {e}")
+                    import traceback
+                    traceback.print_exc()
+                return False
+        # 如果包含非LoRA的transformer权重（且没有base_layer），说明是合并后的完整模型
+        elif len(non_lora_transformer_keys) > 0:
+            if rank == 0:
+                print(f"✓ Detected merged model weights (contains full transformer weights)")
+                print(f"  Loading full model weights directly...")
+            # 提取transformer相关的权重（包括LoRA和基础权重）
+            transformer_state_dict = {}
+            for key, value in state_dict.items():
+                # 移除可能的accelerator包装前缀
+                new_key = key
+                if key.startswith('base_model.model.transformer.'):
+                    new_key = key[len('base_model.model.transformer.'):]
+                elif key.startswith('model.transformer.'):
+                    new_key = key[len('model.transformer.'):]
+                elif key.startswith('transformer.'):
+                    new_key = key[len('transformer.'):]
+                # 只保留transformer相关的权重（包括所有transformer子模块）
+                # 检查是否是transformer的权重（不包含text_encoder等）
+                if (new_key.startswith('transformer_blocks') or
+                    new_key.startswith('pos_embed') or
+                    new_key.startswith('time_text_embed') or
+                    'lora' in new_key.lower()):  # 也包含LoRA权重（如果存在）
+                    transformer_state_dict[new_key] = value
+            if rank == 0:
+                print(f"  Extracted {len(transformer_state_dict)} transformer weight keys")
+                print(f"  Sample keys: {list(transformer_state_dict.keys())[:5]}")
+            # 直接加载到transformer（不使用LoRA适配器）
+            try:
+                missing_keys, unexpected_keys = pipeline.transformer.load_state_dict(transformer_state_dict, strict=False)
+                if rank == 0:
+                    print(f"  Loaded full model weights:")
+                    print(f"    Missing keys: {len(missing_keys)}")
+                    print(f"    Unexpected keys: {len(unexpected_keys)}")
+                    if missing_keys:
+                        print(f"    Sample missing keys: {missing_keys[:5]}")
+                    if unexpected_keys:
+                        print(f"    Sample unexpected keys: {unexpected_keys[:5]}")
+                # 如果missing keys太多，可能有问题
+                if len(missing_keys) > len(transformer_state_dict) * 0.5:
+                    if rank == 0:
+                        print(f"  ⚠️ WARNING: Too many missing keys, weights may not be fully loaded")
+                    return False
+                if rank == 0:
+                    print(f"  ✓ Successfully loaded merged model weights")
+                return True
+            except Exception as e:
+                if rank == 0:
+                    print(f"  ❌ Error loading full model weights: {e}")
+                    import traceback
+                    traceback.print_exc()
+                return False
+        # 如果只包含LoRA权重，按原来的方式加载
+        if rank == 0:
+            print(f"Detected LoRA-only weights, loading as LoRA adapter...")
+        # 首先尝试从checkpoint中检测实际的rank
+        detected_rank = None
+        for key, value in state_dict.items():
+            if 'lora_A' in key and 'transformer' in key and len(value.shape) == 2:
+                # lora_A的形状是 [rank, hidden_size]
+                detected_rank = value.shape[0]
+                if rank == 0:
+                    print(f"✓ Detected LoRA rank from checkpoint: {detected_rank} (from key: {key})")
+                break
+        # 如果检测到rank，使用检测到的rank；否则使用传入的rank
+        actual_rank = detected_rank if detected_rank is not None else lora_rank
+        if detected_rank is not None and detected_rank != lora_rank:
+            if rank == 0:
+                print(f"⚠️ Warning: Detected rank ({detected_rank}) differs from requested rank ({lora_rank}), using detected rank")
+        # 检查适配器是否已存在，如果存在则先卸载
+        # SD3Transformer2DModel没有delete_adapter方法，需要使用unload_lora_weights
+        if hasattr(pipeline.transformer, 'peft_config') and pipeline.transformer.peft_config:
+            if "default" in pipeline.transformer.peft_config:
+                if rank == 0:
+                    print("Removing existing 'default' adapter before adding new one...")
+                try:
+                    # 使用pipeline的unload_lora_weights方法
+                    pipeline.unload_lora_weights()
+                    if rank == 0:
+                        print("Successfully unloaded existing LoRA adapter")
+                except Exception as e:
+                    if rank == 0:
+                        print(f"❌ ERROR: Could not unload existing adapter: {e}")
+                        print("Cannot proceed without cleaning up adapter")
+                    return False
+        # 先配置LoRA适配器（必须在加载之前配置）
+        # 使用检测到的或传入的rank
+        transformer_lora_config = LoraConfig(
+            r=actual_rank,
+            lora_alpha=actual_rank,
+            init_lora_weights="gaussian",
+            target_modules=["attn.to_k", "attn.to_q", "attn.to_v", "attn.to_out.0"],
+        )
+        # 为transformer添加LoRA适配器
+        pipeline.transformer.add_adapter(transformer_lora_config)
+        if rank == 0:
+            print(f"LoRA adapter configured with rank={actual_rank}")
+        # 继续处理LoRA权重加载（state_dict已经在上面加载了）
+        # 提取LoRA权重 - accelerator保存的格式
+        # 从accelerator checkpoint的model.safetensors中，键名格式可能是：
+        # - transformer_blocks.X.attn.to_q.lora_A.default.weight (PEFT格式，直接可用)
+        # - 或者包含其他前缀
+        lora_state_dict = {}
+        for key, value in state_dict.items():
+            if 'lora' in key.lower() and 'transformer' in key.lower():
+                # 检查键名格式
+                new_key = key
+                # 移除可能的accelerator包装前缀
+                # accelerator可能保存为: model.transformer.transformer_blocks...
+                # 或者: base_model.model.transformer.transformer_blocks...
+                if key.startswith('base_model.model.transformer.'):
+                    new_key = key[len('base_model.model.transformer.'):]
+                elif key.startswith('model.transformer.'):
+                    new_key = key[len('model.transformer.'):]
+                elif key.startswith('transformer.'):
+                    # 如果已经是transformer_blocks开头，不需要移除transformer.前缀
+                    # 因为transformer_blocks是transformer的子模块
+                    if not key[len('transformer.'):].startswith('transformer_blocks'):
+                        new_key = key[len('transformer.'):]
+                    else:
+                        new_key = key[len('transformer.'):]
+                # 只保留transformer相关的LoRA权重
+                if 'transformer_blocks' in new_key or 'transformer' in new_key:
+                    lora_state_dict[new_key] = value
+        if not lora_state_dict:
+            if rank == 0:
+                print("No LoRA weights found in checkpoint")
+                # 打印所有键名用于调试
+                all_keys = list(state_dict.keys())
+                print(f"Total keys: {len(all_keys)}")
+                print(f"First 20 keys: {all_keys[:20]}")
+                # 查找包含lora的键
+                lora_related = [k for k in all_keys if 'lora' in k.lower()]
+                if lora_related:
+                    print(f"Keys containing 'lora': {lora_related[:10]}")
+            return False
+        if rank == 0:
+            print(f"Found {len(lora_state_dict)} LoRA weight keys")
+            sample_keys = list(lora_state_dict.keys())[:5]
+            print(f"Sample LoRA keys: {sample_keys}")
+        # 加载LoRA权重到transformer
+        # 注意：从checkpoint提取的键名格式已经是PEFT格式（如：transformer_blocks.0.attn.to_q.lora_A.default.weight）
+        # 不需要使用convert_unet_state_dict_to_peft转换，直接使用即可
+        try:
+            # 检查键名格式
+            sample_key = list(lora_state_dict.keys())[0] if lora_state_dict else ""
+            if rank == 0:
+                print(f"Original key format: {sample_key}")
+            # 关键问题：set_peft_model_state_dict期望的键名格式
+            # 从back/train_dreambooth_lora.py看，需要移除.default后缀
+            # 格式应该是：transformer_blocks.X.attn.to_q.lora_A.weight（没有.default）
+            # 但accelerator保存的格式是：transformer_blocks.X.attn.to_q.lora_A.default.weight（有.default）
+            # 检查键名格式
+            sample_key = list(lora_state_dict.keys())[0] if lora_state_dict else ""
+            has_default_suffix = '.default.weight' in sample_key or '.default.bias' in sample_key
+            if rank == 0:
+                print(f"Sample key: {sample_key}")
+                print(f"Has .default suffix: {has_default_suffix}")
+            # 如果键名包含.default.weight或.default.bias，需要移除.default部分
+            # 因为set_peft_model_state_dict期望的格式是：lora_A.weight，而不是lora_A.default.weight
+            converted_dict = {}
+            for key, value in lora_state_dict.items():
+                # 移除.default后缀（如果存在）
+                # transformer_blocks.0.attn.to_q.lora_A.default.weight -> transformer_blocks.0.attn.to_q.lora_A.weight
+                new_key = key
+                if '.default.weight' in new_key:
+                    new_key = new_key.replace('.default.weight', '.weight')
+                elif '.default.bias' in new_key:
+                    new_key = new_key.replace('.default.bias', '.bias')
+                elif '.default' in new_key and (new_key.endswith('.weight') or new_key.endswith('.bias')):
+                    # 处理其他可能的.default位置
+                    new_key = new_key.replace('.default', '')
+                converted_dict[new_key] = value
+            if rank == 0:
+                print(f"Converted {len(converted_dict)} keys (removed .default suffix if present)")
+                print(f"Sample converted keys: {list(converted_dict.keys())[:5]}")
+            # 调用set_peft_model_state_dict并检查返回值
+            incompatible_keys = set_peft_model_state_dict(
+                pipeline.transformer,
+                converted_dict,
+                adapter_name="default"
+            )
+            # 检查加载结果
+            if incompatible_keys is not None:
+                missing_keys = getattr(incompatible_keys, "missing_keys", [])
+                unexpected_keys = getattr(incompatible_keys, "unexpected_keys", [])
+                if rank == 0:
+                    print(f"LoRA loading result:")
+                    print(f"  Missing keys: {len(missing_keys)}")
+                    print(f"  Unexpected keys: {len(unexpected_keys)}")
+                    if len(missing_keys) > 100:
+                        print(f"  ⚠️ WARNING: Too many missing keys ({len(missing_keys)}), LoRA may not be fully loaded!")
+                        print(f"  Sample missing keys: {missing_keys[:10]}")
+                    elif missing_keys:
+                        print(f"  Sample missing keys: {missing_keys[:10]}")
+                    if unexpected_keys:
+                        print(f"  Unexpected keys: {unexpected_keys[:10]}")
+                # 如果missing keys太多，说明加载失败
+                if len(missing_keys) > len(converted_dict) * 0.5:  # 超过50%的键缺失
+                    if rank == 0:
+                        print("❌ ERROR: Too many missing keys, LoRA weights not loaded correctly!")
+                    return False
+            else:
+                if rank == 0:
+                    print("✓ LoRA weights loaded (no incompatible keys reported)")
+        except RuntimeError as e:
+            # 检查是否是size mismatch错误
+            error_str = str(e)
+            if "size mismatch" in error_str:
+                if rank == 0:
+                    print(f"❌ Size mismatch error: The checkpoint rank doesn't match the adapter rank")
+                    print(f"   This usually means the checkpoint was trained with a different rank")
+                    # 尝试从错误信息中提取期望的rank
+                    import re
+                    # 错误信息格式: "copying a param with shape torch.Size([32, 1536]) from checkpoint"
+                    match = re.search(r'copying a param with shape torch\.Size\(\[(\d+),', error_str)
+                    if match:
+                        checkpoint_rank = int(match.group(1))
+                        if rank == 0:
+                            print(f"   Detected checkpoint rank: {checkpoint_rank}")
+                            print(f"   Adapter was configured with rank: {actual_rank}")
+                            if checkpoint_rank != actual_rank:
+                                print(f"   ⚠️ Mismatch! Need to recreate adapter with rank={checkpoint_rank}")
+            else:
+                if rank == 0:
+                    print(f"❌ Error setting LoRA state dict: {e}")
+                    import traceback
+                    traceback.print_exc()
+            # 清理适配器以便下次尝试
+            try:
+                pipeline.unload_lora_weights()
+            except:
+                pass
+            return False
+        except Exception as e:
+            if rank == 0:
+                print(f"❌ Error setting LoRA state dict: {e}")
+                import traceback
+                traceback.print_exc()
+            # 清理适配器以便下次尝试
+            try:
+                pipeline.unload_lora_weights()
+            except:
+                pass
+            return False
+        # 启用LoRA适配器
+        pipeline.transformer.set_adapter("default")
+        # 验证LoRA是否已加载和应用
+        if hasattr(pipeline.transformer, 'peft_config'):
+            adapters = list(pipeline.transformer.peft_config.keys())
+            if rank == 0:
+                print(f"LoRA adapters configured: {adapters}")
+                # 检查适配器是否启用
+                if hasattr(pipeline.transformer, 'active_adapters'):
+                    # active_adapters 是一个方法，需要调用
+                    try:
+                        if callable(pipeline.transformer.active_adapters):
+                            active = pipeline.transformer.active_adapters()
+                        else:
+                            active = pipeline.transformer.active_adapters
+                        if rank == 0:
+                            print(f"Active adapters: {active}")
+                    except:
+                        if rank == 0:
+                            print("Could not get active adapters, but LoRA is configured")
+        # 验证LoRA权重是否真的被应用
+        # 检查LoRA层的权重是否非零
+        lora_layers_found = 0
+        nonzero_lora_layers = 0
+        total_lora_weight_sum = 0.0
+        for name, module in pipeline.transformer.named_modules():
+            if 'lora_A' in name or 'lora_B' in name:
+                lora_layers_found += 1
+                if hasattr(module, 'weight') and module.weight is not None:
+                    weight_sum = module.weight.abs().sum().item()
+                    total_lora_weight_sum += weight_sum
+                    if weight_sum > 1e-6:  # 非零阈值
+                        nonzero_lora_layers += 1
+                        if rank == 0 and nonzero_lora_layers <= 3:  # 只打印前3个
+                            print(f"✓ Found non-zero LoRA weight in: {name}, sum={weight_sum:.6f}")
+        if rank == 0:
+            print(f"LoRA verification:")
+            print(f"  Total LoRA layers found: {lora_layers_found}")
+            print(f"  Non-zero LoRA layers: {nonzero_lora_layers}")
+            print(f"  Total LoRA weight sum: {total_lora_weight_sum:.6f}")
+            if lora_layers_found == 0:
+                print("❌ ERROR: No LoRA layers found in transformer!")
+                return False
+            elif nonzero_lora_layers == 0:
+                print("❌ ERROR: All LoRA weights are zero, LoRA not loaded correctly!")
+                return False
+            elif nonzero_lora_layers < lora_layers_found * 0.5:
+                print(f"⚠️ WARNING: Only {nonzero_lora_layers}/{lora_layers_found} LoRA layers have non-zero weights!")
+                print("⚠️ LoRA may not be fully applied!")
+            else:
+                print(f"✓ LoRA weights verified: {nonzero_lora_layers}/{lora_layers_found} layers have non-zero weights")
+        if nonzero_lora_layers == 0:
+            return False
+        if rank == 0:
+            print("✓ Successfully loaded and verified LoRA weights from checkpoint")
+        return True
+    except Exception as e:
+        if rank == 0:
+            print(f"Error loading LoRA from checkpoint: {e}")
+            import traceback
+            traceback.print_exc()
+        return False
+def load_captions_from_jsonl(jsonl_path):
+    captions = []
+    with open(jsonl_path, 'r', encoding='utf-8') as f:
+        for line in f:
+            line = line.strip()
+            if not line:
+                continue
+            try:
+                data = json.loads(line)
+                cap = None
+                for field in ['caption', 'text', 'prompt', 'description']:
+                    if field in data and isinstance(data[field], str):
+                        cap = data[field].strip()
+                        break
+                if cap:
+                    captions.append(cap)
+            except Exception:
+                continue
+    return captions if captions else ["a beautiful high quality image"]
+def main(args):
+    assert torch.cuda.is_available(), "需要GPU运行"
+    dist.init_process_group("nccl")
+    rank = dist.get_rank()
+    world_size = dist.get_world_size()
+    device = rank % torch.cuda.device_count()
+    torch.cuda.set_device(device)
+    seed = args.global_seed * world_size + rank
+    torch.manual_seed(seed)
+    # 调试：打印接收到的参数
+    if rank == 0:
+        print("=" * 80)
+        print("参数检查:")
+        print(f"  lora_path: {args.lora_path}")
+        print(f"  rectified_weights: {args.rectified_weights}")
+        print(f"  lora_path is None: {args.lora_path is None}")
+        print(f"  lora_path is empty: {args.lora_path == '' if args.lora_path else 'N/A'}")
+        print(f"  rectified_weights is None: {args.rectified_weights is None}")
+        print(f"  rectified_weights is empty: {args.rectified_weights == '' if args.rectified_weights else 'N/A'}")
+        print("=" * 80)
+    lora_source = "baseline"
+    # 导入训练脚本中的类
+    RectifiedNoiseModule, SD3WithRectifiedNoise = dynamic_import_training_classes(str(Path(__file__).parent))
+    # 加载 pipeline
+    dtype = torch.float16 if args.mixed_precision == "fp16" else (torch.bfloat16 if args.mixed_precision == "bf16" else torch.float32)
+    if rank == 0:
+        print(f"Loading SD3 pipeline from {args.pretrained_model_name_or_path} (dtype={dtype})")
+    pipeline = StableDiffusion3Pipeline.from_pretrained(
+        args.pretrained_model_name_or_path,
+        revision=args.revision,
+        variant=args.variant,
+        torch_dtype=dtype,
+    ).to(device)
+    # 加载 LoRA（可选）
+    lora_loaded = False
+    if args.lora_path:
+        if rank == 0:
+            print(f"Attempting to load LoRA weights from: {args.lora_path}")
+            print(f"LoRA path exists: {os.path.exists(args.lora_path) if args.lora_path else False}")
+        # 首先检查是否是标准的LoRA权重文件/目录
+        if check_lora_weights_exist(args.lora_path):
+            if rank == 0:
+                print("Found standard LoRA weights, loading...")
+            try:
+                # 检查加载前的transformer参数（用于验证）
+                if rank == 0:
+                    sample_param_before = next(iter(pipeline.transformer.parameters())).clone()
+                    print(f"Sample transformer param before LoRA (first 5 values): {sample_param_before.flatten()[:5]}")
+                pipeline.load_lora_weights(args.lora_path)
+                lora_loaded = True
+                lora_source = os.path.basename(args.lora_path.rstrip('/'))
+                # 验证LoRA是否真的被加载
+                if rank == 0:
+                    sample_param_after = next(iter(pipeline.transformer.parameters())).clone()
+                    param_diff = (sample_param_after - sample_param_before).abs().max().item()
+                    print(f"Sample transformer param after LoRA (first 5 values): {sample_param_after.flatten()[:5]}")
+                    print(f"Max parameter change after LoRA loading: {param_diff}")
+                    if param_diff < 1e-6:
+                        print("⚠️ WARNING: LoRA weights may not have been applied (parameter change is very small)")
+                    else:
+                        print("✓ LoRA weights appear to have been applied")
+                    # 检查是否有peft_config
+                    if hasattr(pipeline.transformer, 'peft_config'):
+                        print(f"✓ PEFT config found: {list(pipeline.transformer.peft_config.keys())}")
+                    else:
+                        print("⚠️ WARNING: No peft_config found after loading LoRA")
+                if rank == 0:
+                    print("LoRA loaded successfully from standard format.")
+            except Exception as e:
+                if rank == 0:
+                    print(f"Failed to load LoRA from standard format: {e}")
+                    import traceback
+                    traceback.print_exc()
+        # 如果不是标准格式，尝试从accelerator checkpoint加载
+        if not lora_loaded and os.path.isdir(args.lora_path):
+            if rank == 0:
+                print("Standard LoRA weights not found, trying accelerator checkpoint format...")
+            # 首先尝试从checkpoint的model.safetensors中检测实际的rank
+            # 通过检查LoRA权重的形状来推断rank
+            detected_rank = None
+            try:
+                from safetensors.torch import load_file
+                model_file = os.path.join(args.lora_path, "model.safetensors")
+                if os.path.exists(model_file):
+                    state_dict = load_file(model_file)
+                    # 查找一个LoRA权重来确定rank
+                    for key, value in state_dict.items():
+                        if 'lora_A' in key and 'transformer' in key and len(value.shape) == 2:
+                            # lora_A的形状是 [rank, hidden_size]
+                            detected_rank = value.shape[0]
+                            if rank == 0:
+                                print(f"✓ Detected LoRA rank from checkpoint: {detected_rank} (from key: {key})")
+                            break
+            except Exception as e:
+                if rank == 0:
+                    print(f"Could not detect rank from checkpoint: {e}")
+            # 构建rank尝试列表
+            # 如果检测到rank，优先使用检测到的rank，只尝试一次
+            # 如果未检测到，尝试常见的rank值
+            if detected_rank is not None:
+                rank_list = [detected_rank]
+                if rank == 0:
+                    print(f"Using detected rank: {detected_rank}")
+            else:
+                # 如果检测失败，尝试常见的rank值（按用户指定的rank优先）
+                rank_list = []
+                # 如果用户指定了rank（从args.lora_rank），优先尝试
+                if hasattr(args, 'lora_rank') and args.lora_rank:
+                    rank_list.append(args.lora_rank)
+                # 添加其他常见的rank值
+                for r in [32, 64, 16, 128]:
+                    if r not in rank_list:
+                        rank_list.append(r)
+                if rank == 0:
+                    print(f"Rank detection failed, will try ranks in order: {rank_list}")
+            # 尝试不同的rank值
+            for lora_rank in rank_list:
+                # 在尝试新的rank之前，先清理已存在的适配器
+                # 重要：每次尝试前都要清理，否则适配器会保留之前的rank配置
+                if hasattr(pipeline.transformer, 'peft_config') and pipeline.transformer.peft_config:
+                    if "default" in pipeline.transformer.peft_config:
+                        try:
+                            # 使用pipeline的unload_lora_weights方法
+                            pipeline.unload_lora_weights()
+                            if rank == 0:
+                                print(f"Cleaned up existing adapter before trying rank={lora_rank}")
+                        except Exception as e:
+                            if rank == 0:
+                                print(f"Warning: Could not unload adapter: {e}")
+                                # 如果卸载失败，需要重新创建pipeline
+                                if rank == 0:
+                                    print("⚠️ WARNING: Cannot unload adapter, will recreate pipeline...")
+                                    # 重新加载pipeline（最后手段）
+                                    try:
+                                        pipeline = StableDiffusion3Pipeline.from_pretrained(
+                                            args.pretrained_model_name_or_path,
+                                            revision=args.revision,
+                                            variant=args.variant,
+                                            torch_dtype=dtype,
+                                        ).to(device)
+                                        if rank == 0:
+                                            print("Pipeline recreated to clear adapter state")
+                                    except Exception as e2:
+                                        if rank == 0:
+                                            print(f"Failed to recreate pipeline: {e2}")
+                if rank == 0:
+                    print(f"Trying to load with LoRA rank={lora_rank}...")
+                lora_loaded = load_lora_from_checkpoint(pipeline, args.lora_path, rank=rank, lora_rank=lora_rank)
+                if lora_loaded:
+                    if rank == 0:
+                        print(f"✓ Successfully loaded LoRA with rank={lora_rank}")
+                    lora_source = "checkpoint"
+                    break
+                elif rank == 0:
+                    print(f"✗ Failed to load with rank={lora_rank}, trying next rank...")
+        # 如果checkpoint目录加载失败，尝试从输出目录的根目录加载标准LoRA权重
+        if not lora_loaded and os.path.isdir(args.lora_path):
+            # 检查输出目录的根目录（checkpoint的父目录）
+            output_dir = os.path.dirname(args.lora_path.rstrip('/'))
+            if output_dir and os.path.exists(output_dir):
+                if rank == 0:
+                    print(f"Trying to load standard LoRA weights from output directory: {output_dir}")
+                if check_lora_weights_exist(output_dir):
+                    try:
+                        pipeline.load_lora_weights(output_dir)
+                        lora_loaded = True
+                        if rank == 0:
+                            print("LoRA loaded successfully from output directory.")
+                    except Exception as e:
+                        if rank == 0:
+                            print(f"Failed to load LoRA from output directory: {e}")
+        if not lora_loaded:
+            if rank == 0:
+                print(f"⚠️ WARNING: Failed to load LoRA weights from {args.lora_path}, using baseline model")
+        else:
+            # 最终验证LoRA是否真的被启用
+            if rank == 0:
+                print("=" * 80)
+                print("LoRA 加载验证:")
+                if hasattr(pipeline.transformer, 'peft_config') and pipeline.transformer.peft_config:
+                    print(f"  ✓ PEFT config exists: {list(pipeline.transformer.peft_config.keys())}")
+                    # 检查LoRA层的权重
+                    lora_layers_found = 0
+                    for name, module in pipeline.transformer.named_modules():
+                        if 'lora_A' in name or 'lora_B' in name:
+                            lora_layers_found += 1
+                            if lora_layers_found <= 3:  # 只打印前3个
+                                if hasattr(module, 'weight'):
+                                    weight_sum = module.weight.abs().sum().item() if module.weight is not None else 0
+                                    print(f"  ✓ Found LoRA layer: {name}, weight_sum={weight_sum:.6f}")
+                    print(f"  ✓ Total LoRA layers found: {lora_layers_found}")
+                    if lora_layers_found == 0:
+                        print("  ⚠️ WARNING: No LoRA layers found in transformer!")
+                else:
+                    print("  ⚠️ WARNING: No PEFT config found - LoRA may not be active!")
+                print("=" * 80)
+    # 构建 RectifiedNoiseModule 并加载权重（仅在提供了 rectified_weights 时）
+    # 安全地检查 rectified_weights 是否有效
+    use_rectified = False
+    rectified_weights_path = None
+    if args.rectified_weights:
+        rectified_weights_str = str(args.rectified_weights).strip()
+        if rectified_weights_str:
+            use_rectified = True
+            rectified_weights_path = rectified_weights_str
+    if rank == 0:
+        print(f"use_rectified: {use_rectified}, rectified_weights_path: {rectified_weights_path}")
+    if use_rectified:
+        if rank == 0:
+            print(f"Using Rectified Noise module with weights from: {rectified_weights_path}")
+        # 从 transformer 配置推断必要尺寸
+        tfm = pipeline.transformer
+        if hasattr(tfm.config, 'joint_attention_dim') and tfm.config.joint_attention_dim is not None:
+            sit_hidden_size = tfm.config.joint_attention_dim
+        elif hasattr(tfm.config, 'inner_dim') and tfm.config.inner_dim is not None:
+            sit_hidden_size = tfm.config.inner_dim
+        elif hasattr(tfm.config, 'hidden_size') and tfm.config.hidden_size is not None:
+            sit_hidden_size = tfm.config.hidden_size
+        else:
+            sit_hidden_size = 4096
+        transformer_hidden_size = getattr(tfm.config, 'hidden_size', 1536)
+        num_attention_heads = getattr(tfm.config, 'num_attention_heads', 32)
+        input_dim = getattr(tfm.config, 'in_channels', 16)
+        rectified_module = RectifiedNoiseModule(
+            hidden_size=sit_hidden_size,
+            num_sit_layers=args.num_sit_layers,
+            num_attention_heads=num_attention_heads,
+            input_dim=input_dim,
+            transformer_hidden_size=transformer_hidden_size,
+        )
+        # 加载 SIT 权重
+        ok = load_sit_weights(rectified_module, rectified_weights_path, rank=rank)
+        if rank == 0:
+            if not ok:
+                print("⚠️ Warning: Failed to load rectified weights, will use baseline model without rectified noise")
+            else:
+                print("✓ Successfully loaded rectified noise weights")
+        # 组装 SD3WithRectifiedNoise
+        # 关键：SD3WithRectifiedNoise 会保留 transformer 的引用
+        # 但是，SD3WithRectifiedNoise 在 __init__ 中会冻结 transformer 参数
+        # 这不应该影响 LoRA，因为 LoRA 是作为适配器添加的，不是原始参数
+        # 我们需要确保在创建 SD3WithRectifiedNoise 之前，LoRA 适配器已经正确加载和启用
+        if lora_loaded and rank == 0:
+            print("Creating SD3WithRectifiedNoise with LoRA-enabled transformer...")
+        elif rank == 0:
+            print("Creating SD3WithRectifiedNoise...")
+        model = SD3WithRectifiedNoise(pipeline.transformer, rectified_module).to(device)
+        # 重要：SD3WithRectifiedNoise 的 __init__ 会冻结 transformer 参数
+        # 但 LoRA 适配器应该仍然有效，因为它们是独立的模块
+        # 我们需要确保 LoRA 适配器在包装后仍然可以访问
+        # 确保 LoRA 适配器在模型替换后仍然启用
+        if lora_loaded:
+            # 通过model.transformer访问，因为SD3WithRectifiedNoise包装了transformer
+            if hasattr(model.transformer, 'peft_config'):
+                try:
+                    # 确保适配器处于启用状态
+                    model.transformer.set_adapter("default_0")
+                    # 验证LoRA权重在包装后是否仍然存在
+                    lora_layers_after_wrap = 0
+                    nonzero_after_wrap = 0
+                    for name, module in model.transformer.named_modules():
+                        if 'lora_A' in name or 'lora_B' in name:
+                            lora_layers_after_wrap += 1
+                            if hasattr(module, 'weight') and module.weight is not None:
+                                if module.weight.abs().sum().item() > 1e-6:
+                                    nonzero_after_wrap += 1
+                    if rank == 0:
+                        print(f"LoRA after SD3WithRectifiedNoise wrapping:")
+                        print(f"  LoRA layers: {lora_layers_after_wrap}, Non-zero: {nonzero_after_wrap}")
+                        if nonzero_after_wrap == 0:
+                            print("  ❌ ERROR: All LoRA weights are zero after wrapping!")
+                        elif nonzero_after_wrap < lora_layers_after_wrap * 0.5:
+                            print(f"  ⚠️ WARNING: Only {nonzero_after_wrap}/{lora_layers_after_wrap} LoRA layers have weights!")
+                        else:
+                            print(f"  ✓ LoRA weights preserved after wrapping")
+                    # 验证适配器是否真的启用
+                    if hasattr(model.transformer, 'active_adapters'):
+                        try:
+                            if callable(model.transformer.active_adapters):
+                                active = model.transformer.active_adapters()
+                            else:
+                                active = model.transformer.active_adapters
+                            if rank == 0:
+                                print(f"  Active adapters: {active}")
+                        except:
+                            if rank == 0:
+                                print("  LoRA adapter re-enabled after model wrapping")
+                    else:
+                        if rank == 0:
+                            print("  LoRA adapter re-enabled after model wrapping")
+                except Exception as e:
+                    if rank == 0:
+                        print(f"❌ ERROR: Could not re-enable LoRA adapter: {e}")
+                        import traceback
+                        traceback.print_exc()
+            else:
+                # LoRA权重已经合并到transformer的基础权重中（合并加载方式）
+                # 这种情况下没有peft_config是正常的，因为LoRA已经合并了
+                if rank == 0:
+                    print("LoRA loaded via merged weights (no PEFT adapter needed)")
+                    print("  ✓ LoRA weights are already merged into transformer base weights")
+                    print("  Note: This is expected when loading from merged checkpoint format")
+        # 注册到 pipeline（pipeline_stable_diffusion_3.py 已支持 external model）
+        pipeline.model = model
+        # 确保模型处于评估模式（LoRA在eval模式下也应该工作）
+        model.eval()
+        model.transformer.eval()  # 确保transformer也处于eval模式
+    else:
+        if rank == 0:
+            print("Not using Rectified Noise module, using baseline SD3 pipeline")
+        # 不使用 SD3WithRectifiedNoise，保持原始 pipeline
+        # pipeline.model 保持为原始的 transformer
+    # 关键：确保LoRA适配器在推理时被使用
+    # PEFT模型在eval模式下，LoRA适配器应该自动启用，但我们需要确保
+    if lora_loaded:
+        # 获取正确的 transformer 引用
+        transformer_ref = model.transformer if use_rectified else pipeline.transformer
+        # 确保transformer的LoRA适配器处于启用状态
+        if hasattr(transformer_ref, 'set_adapter'):
+            try:
+                transformer_ref.set_adapter("default")
+            except:
+                pass
+        # 验证LoRA是否真的会被使用
+        if rank == 0:
+            # 检查一个LoRA层的权重
+            lora_found = False
+            for name, module in transformer_ref.named_modules():
+                if 'lora_A' in name and 'default' in name and hasattr(module, 'weight'):
+                    if module.weight is not None:
+                        weight_sum = module.weight.abs().sum().item()
+                        if weight_sum > 0:
+                            print(f"✓ Verified LoRA weight in {name}: sum={weight_sum:.6f}")
+                            lora_found = True
+                            break
+            if not lora_found:
+                print("⚠ Warning: Could not verify LoRA weights in model")
+            else:
+                # 额外检查：验证LoRA层是否真的会被调用
+                # 检查一个LoRA Linear层
+                for name, module in transformer_ref.named_modules():
+                    if hasattr(module, '__class__') and 'lora' in module.__class__.__name__.lower():
+                        if hasattr(module, 'lora_enabled'):
+                            enabled = module.lora_enabled
+                            if rank == 0:
+                                print(f"✓ Found LoRA layer {name}, enabled: {enabled}")
+                        break
+            print("Model set to eval mode, LoRA should be active during inference")
+    # 启用内存优化选项
+    if args.enable_attention_slicing:
+        enable_attention_slicing_method = getattr(pipeline, 'enable_attention_slicing', None)
+        if enable_attention_slicing_method is not None and callable(enable_attention_slicing_method):
+            try:
+                if rank == 0:
+                    print("Enabling attention slicing to save memory")
+                enable_attention_slicing_method()
+            except Exception as e:
+                if rank == 0:
+                    print(f"Warning: Failed to enable attention slicing: {e}")
+        else:
+            if rank == 0:
+                print("Warning: Attention slicing not available for this pipeline")
+    if args.enable_vae_slicing:
+        # 使用 getattr 来安全地检查方法是否存在，避免触发 __getattr__ 异常
+        enable_vae_slicing_method = getattr(pipeline, 'enable_vae_slicing', None)
+        if enable_vae_slicing_method is not None and callable(enable_vae_slicing_method):
+            try:
+                if rank == 0:
+                    print("Enabling VAE slicing to save memory")
+                enable_vae_slicing_method()
+            except Exception as e:
+                if rank == 0:
+                    print(f"Warning: Failed to enable VAE slicing: {e}")
+        else:
+            if rank == 0:
+                print("Warning: VAE slicing not available for this pipeline (SD3 may not support this)")
+    if args.enable_cpu_offload:
+        if rank == 0:
+            print("Enabling CPU offload to save memory")
+        pipeline.enable_model_cpu_offload()
+    # 禁用进度条以减少输出
+    pipeline.set_progress_bar_config(disable=True)
+    # 读入 captions
+    captions = load_captions_from_jsonl(args.captions_jsonl)
+    total_images_needed = min(len(captions) * args.images_per_caption, args.max_samples)
+    # 生成caption和image的映射列表
+    caption_image_pairs = []
+    for i, caption in enumerate(captions):
+        for j in range(args.images_per_caption):
+            caption_image_pairs.append((caption, i, j))  # (caption, caption_idx, image_idx)
+    # 输出目录
+    folder_name = f"sd3-rectified-{lora_source}-guidance-{args.guidance_scale}-steps-{args.num_inference_steps}-size-{args.height}x{args.width}"
+    sample_folder_dir = os.path.join(args.sample_dir, folder_name)
+    if rank == 0:
+        os.makedirs(sample_folder_dir, exist_ok=True)
+        print(f"Saving .png samples at {sample_folder_dir}")
+        # 清空caption文件
+        caption_file = os.path.join(sample_folder_dir, "captions.txt")
+        if os.path.exists(caption_file):
+            os.remove(caption_file)
+    dist.barrier()
+    n = args.per_proc_batch_size
+    global_batch = n * world_size
+    total_samples = int(math.ceil(total_images_needed / global_batch) * global_batch)
+    assert total_samples % world_size == 0
+    samples_per_gpu = total_samples // world_size
+    assert samples_per_gpu % n == 0
+    iterations = samples_per_gpu // n
+    if rank == 0:
+        print(f"Sampling total={total_samples}, per_gpu={samples_per_gpu}, iterations={iterations}")
+    pbar = tqdm(range(iterations)) if rank == 0 else range(iterations)
+    saved = 0
+    autocast_device = "cuda" if torch.cuda.is_available() else "cpu"
+    for it in pbar:
+        # 获取这个batch对应的caption
+        batch_prompts = []
+        batch_caption_info = []
+        for j in range(n):
+            global_index = it * global_batch + j * world_size + rank
+            if global_index < len(caption_image_pairs):
+                caption, caption_idx, image_idx = caption_image_pairs[global_index]
+                batch_prompts.append(caption)
+                batch_caption_info.append((caption, caption_idx, image_idx))
+            else:
+                # 如果超出范围，使用最后一个caption
+                if caption_image_pairs:
+                    caption, caption_idx, image_idx = caption_image_pairs[-1]
+                    batch_prompts.append(caption)
+                    batch_caption_info.append((caption, caption_idx, image_idx))
+                else:
+                    batch_prompts.append("a beautiful high quality image")
+                    batch_caption_info.append(("a beautiful high quality image", 0, 0))
+        with torch.autocast(autocast_device, dtype=dtype):
+            images = []
+            for k, prompt in enumerate(batch_prompts):
+                image_seed = seed + it * 10000 + k * 1000 + rank
+                generator = torch.Generator(device=device).manual_seed(image_seed)
+                img = pipeline(
+                    prompt=prompt,
+                    height=args.height,
+                    width=args.width,
+                    num_inference_steps=args.num_inference_steps,
+                    guidance_scale=args.guidance_scale,
+                    generator=generator,
+                    num_images_per_prompt=1,
+                ).images[0]
+                images.append(img)
+        # 保存
+        for j, (image, (caption, caption_idx, image_idx)) in enumerate(zip(images, batch_caption_info)):
+            global_index = it * global_batch + j * world_size + rank
+            if global_index < len(caption_image_pairs):
+                # 保存图片，文件名包含caption索引和图片索引
+                filename = f"{global_index:06d}_cap{caption_idx:04d}_img{image_idx:02d}.png"
+                image_path = os.path.join(sample_folder_dir, filename)
+                image.save(image_path)
+                # 保存caption信息到文本文件（只在rank 0上操作）
+                if rank == 0:
+                    caption_file = os.path.join(sample_folder_dir, "captions.txt")
+                    with open(caption_file, "a", encoding="utf-8") as f:
+                        f.write(f"{filename}\t{caption}\n")
+        total_generated = saved * world_size  # 近似值
+        dist.barrier()
+    if rank == 0:
+        print(f"Done. Saved {saved * world_size} images in total.")
+        # 重新计算实际生成的图片数量
+        actual_num_samples = len([name for name in os.listdir(sample_folder_dir) if name.endswith(".png")])
+        print(f"Actually generated {actual_num_samples} images")
+        # 使用实际的图片数量或用户指定的数量，取较小值
+        npz_samples = min(actual_num_samples, total_images_needed, args.max_samples)
+        create_npz_from_sample_folder(sample_folder_dir, npz_samples)
+        print("Done.")
+    dist.barrier()
+    dist.destroy_process_group()
+    parser = argparse.ArgumentParser(description="SD3 LoRA + RectifiedNoise 分布式采样脚本")
+    # 模型
+    parser.add_argument("--pretrained_model_name_or_path", type=str, required=True)
+    parser.add_argument("--revision", type=str, default=None)
+    parser.add_argument("--variant", type=str, default=None)
+    # LoRA 与 Rectified
+    parser.add_argument("--lora_path", type=str, default=None, help="LoRA 权重路径(文件或目录)")
+    parser.add_argument("--rectified_weights", type=str, default=None, help="Rectified(SIT) 权重路径(文件或目录)")
+    parser.add_argument("--num_sit_layers", type=int, default=1, help="与训练一致的 SIT 层数")
+    # 采样
+    parser.add_argument("--num_inference_steps", type=int, default=28)
+    parser.add_argument("--guidance_scale", type=float, default=7.0)
+    parser.add_argument("--height", type=int, default=1024)
+    parser.add_argument("--width", type=int, default=1024)
+    parser.add_argument("--per_proc_batch_size", type=int, default=1)
+    parser.add_argument("--images_per_caption", type=int, default=1)
+    parser.add_argument("--max_samples", type=int, default=10000)
+    parser.add_argument("--captions_jsonl", type=str, required=True)
+    parser.add_argument("--sample_dir", type=str, default="sd3_rectified_samples")
+    parser.add_argument("--global_seed", type=int, default=42)
+    parser.add_argument("--mixed_precision", type=str, default="fp16", choices=["no", "fp16", "bf16"])
+    # 内存优化选项
+    parser.add_argument("--enable_attention_slicing", action="store_true", help="启用 attention slicing 以节省显存")
+    parser.add_argument("--enable_vae_slicing", action="store_true", help="启用 VAE slicing 以节省显存")
+    parser.add_argument("--enable_cpu_offload", action="store_true", help="启用 CPU offload 以节省显存")
+    args = parser.parse_args()
+    main(args)

sd3_rectified_samples_batch2_2200005011.01.01.0cfg_cond_true.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+Inception Score: 37.646392822265625
+FID: 21.19386100577333
+sFID: 71.79977998851734
+Precision: 0.690407122136641
+Recall: 0.358997247638176

train_lora_sd3.py ADDED Viewed

	@@ -0,0 +1,1597 @@

+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""SD3 LoRA fine-tuning script for text2image generation."""
+import argparse
+import copy
+import json
+import logging
+import math
+import os
+import random
+import shutil
+from contextlib import nullcontext
+from pathlib import Path
+import datasets
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+import transformers
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import DistributedDataParallelKwargs, DistributedType, ProjectConfiguration, set_seed
+from datasets import load_dataset
+from huggingface_hub import create_repo, upload_folder
+from packaging import version
+from peft import LoraConfig, set_peft_model_state_dict
+from peft.utils import get_peft_model_state_dict
+from PIL import Image
+from torchvision import transforms
+from torchvision.transforms.functional import crop
+from tqdm.auto import tqdm
+from transformers import CLIPTokenizer, PretrainedConfig, T5TokenizerFast
+import diffusers
+from diffusers import (
+    AutoencoderKL,
+    FlowMatchEulerDiscreteScheduler,
+    SD3Transformer2DModel,
+    StableDiffusion3Pipeline,
+)
+from diffusers.optimization import get_scheduler
+from diffusers.training_utils import (
+    _set_state_dict_into_text_encoder,
+    cast_training_params,
+    compute_density_for_timestep_sampling,
+    compute_loss_weighting_for_sd3,
+    free_memory,
+)
+from diffusers.utils import (
+    check_min_version,
+    convert_unet_state_dict_to_peft,
+    is_wandb_available,
+)
+from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
+from diffusers.utils.torch_utils import is_compiled_module
+if is_wandb_available():  # wandb is optional; it is imported only when this check succeeds
+    import wandb
+# Check the installed diffusers version against the minimum this script was written for
+check_min_version("0.30.0")
+logger = get_logger(__name__)  # module-level logger from accelerate.logging, used by the functions below
+def save_model_card(
+    repo_id: str,
+    images: list = None,
+    base_model: str = None,
+    dataset_name: str = None,
+    train_text_encoder: bool = False,
+    repo_folder: str = None,
+    vae_path: str = None,
+):
+    """Write the sample images and the README.md model card of an SD3 LoRA run.
+    Args:
+        repo_id: Identifier handed to ``load_or_create_model_card`` and shown in the card title.
+        images: Optional images; each one is saved as ``image_{i}.png`` inside ``repo_folder``
+            and referenced from the card.
+        base_model: Name of the base model the LoRA weights belong to.
+        dataset_name: Name of the dataset mentioned in the card text.
+        train_text_encoder: Reported verbatim in the card.
+        repo_folder: Directory that receives the images and ``README.md``.
+        vae_path: Reported verbatim in the card.
+    """
+    # Save every sample image and collect one markdown image link per file.
+    image_links = []
+    if images is not None:
+        for idx, sample in enumerate(images):
+            sample.save(os.path.join(repo_folder, f"image_{idx}.png"))
+            image_links.append(f"![img_{idx}](./image_{idx}.png)\n")
+    img_str = "".join(image_links)
+    model_description = f"""
+# SD3 LoRA text2image fine-tuning - {repo_id}
+These are LoRA adaption weights for {base_model}. The weights were fine-tuned on the {dataset_name} dataset. You can find some example images in the following. \n
+{img_str}
+LoRA for the text encoder was enabled: {train_text_encoder}.
+Special VAE used for training: {vae_path}.
+"""
+    card_tags = [
+        "stable-diffusion-3",
+        "stable-diffusion-3-diffusers",
+        "text-to-image",
+        "diffusers",
+        "diffusers-training",
+        "lora",
+        "sd3",
+    ]
+    # Create (or load) the card, attach the tags, then write it next to the images.
+    model_card = populate_model_card(
+        load_or_create_model_card(
+            repo_id_or_path=repo_id,
+            from_training=True,
+            license="other",
+            base_model=base_model,
+            model_description=model_description,
+            inference=True,
+        ),
+        tags=card_tags,
+    )
+    readme_path = os.path.join(repo_folder, "README.md")
+    model_card.save(readme_path)
+def log_validation(
+    pipeline,
+    args,
+    accelerator,
+    epoch,
+    is_final_validation=False,
+    global_step=None,
+):
+    """Generate validation images, save them on the main process and log them to the trackers.
+    Args:
+        pipeline: Pipeline object; it is moved to ``accelerator.device`` and called once per image.
+        args: Namespace providing ``validation_prompt``, ``num_validation_images``, ``seed``
+            and ``output_dir``.
+        accelerator: Supplies the device, the main-process flag and (optionally) the trackers.
+        epoch: Epoch number used in file names and as the tensorboard step.
+        is_final_validation: Log under ``"test"`` instead of ``"validation"`` when true.
+        global_step: Optional step number embedded in the saved file names.
+    Returns:
+        The list of generated images.
+    """
+    logger.info(
+        f"Running validation... \n Generating {args.num_validation_images} images with prompt:"
+        f" {args.validation_prompt}."
+    )
+    pipeline = pipeline.to(accelerator.device)
+    pipeline.set_progress_bar_config(disable=True)
+    # A seeded generator is only built when a seed was supplied.
+    generator = None
+    if args.seed is not None:
+        generator = torch.Generator(device=accelerator.device).manual_seed(args.seed)
+    pipeline_args = {"prompt": args.validation_prompt}
+    # Autocast is skipped whenever the MPS backend is available.
+    autocast_ctx = nullcontext() if torch.backends.mps.is_available() else torch.autocast(accelerator.device.type)
+    images = []
+    with autocast_ctx:
+        for _ in range(args.num_validation_images):
+            images.append(pipeline(**pipeline_args, generator=generator).images[0])
+    # Inference runs on every process, but only the main process writes the PNG files.
+    if accelerator.is_main_process:
+        validation_dir = os.path.join(args.output_dir, "validation_images")
+        os.makedirs(validation_dir, exist_ok=True)
+        for idx, sample in enumerate(images):
+            filename = (
+                f"validation_step_{global_step}_epoch_{epoch}_img_{idx}.png"
+                if global_step is not None
+                else f"validation_epoch_{epoch}_img_{idx}.png"
+            )
+            image_path = os.path.join(validation_dir, filename)
+            sample.save(image_path)
+            logger.info(f"Saved validation image: {image_path}")
+    phase_name = "test" if is_final_validation else "validation"
+    active_trackers = accelerator.trackers if hasattr(accelerator, 'trackers') and accelerator.trackers else []
+    for tracker in active_trackers:
+        # A failing tracker is reported as a warning and does not abort validation.
+        try:
+            if tracker.name == "tensorboard":
+                stacked = np.stack([np.asarray(img) for img in images])
+                tracker.writer.add_images(phase_name, stacked, epoch, dataformats="NHWC")
+            if tracker.name == "wandb":
+                captioned = [
+                    wandb.Image(image, caption=f"{i}: {args.validation_prompt}") for i, image in enumerate(images)
+                ]
+                tracker.log({phase_name: captioned})
+        except Exception as e:
+            logger.warning(f"Failed to log to {tracker.name}: {e}")
+    del pipeline
+    free_memory()
+    return images
+def import_model_class_from_model_name_or_path(
+    pretrained_model_name_or_path: str, revision: str, subfolder: str = "text_encoder"
+):
+    """Return the text-encoder class named by the first ``architectures`` entry of the config.
+    The config is read from ``subfolder`` of ``pretrained_model_name_or_path`` at ``revision``.
+    Raises:
+        ValueError: If the architecture is neither ``CLIPTextModelWithProjection`` nor ``T5EncoderModel``.
+    """
+    encoder_config = PretrainedConfig.from_pretrained(
+        pretrained_model_name_or_path, subfolder=subfolder, revision=revision
+    )
+    model_class = encoder_config.architectures[0]
+    # The transformers classes are imported lazily, only for the architecture actually requested.
+    if model_class == "CLIPTextModelWithProjection":
+        from transformers import CLIPTextModelWithProjection as encoder_cls
+        return encoder_cls
+    if model_class == "T5EncoderModel":
+        from transformers import T5EncoderModel as encoder_cls
+        return encoder_cls
+    raise ValueError(f"{model_class} is not supported.")
+def parse_args(input_args=None):
+    """Build the SD3 LoRA training parser and return the parsed, validated namespace.
+    Args:
+        input_args: Optional list of argument strings; when ``None`` argparse falls back to ``sys.argv``.
+    Returns:
+        The parsed ``argparse.Namespace``. ``local_rank`` is replaced by the ``LOCAL_RANK``
+        environment variable when that variable is set to a value other than -1 that differs
+        from the parsed one.
+    Raises:
+        ValueError: If neither ``--dataset_name`` nor ``--train_data_dir`` was given.
+    """
+    # One (flag, add_argument keyword arguments) entry per option, registered in this order.
+    argument_specs = (
+        # Model arguments
+        ("--pretrained_model_name_or_path", dict(type=str, default=None, required=True, help="Path to pretrained model or model identifier from huggingface.co/models.")),
+        ("--revision", dict(type=str, default=None, help="Revision of pretrained model identifier from huggingface.co/models.")),
+        ("--variant", dict(type=str, default=None, help="Variant of the model files, e.g. fp16")),
+        # Dataset arguments
+        ("--dataset_name", dict(type=str, default=None, help="The name of the Dataset to train on.")),
+        ("--dataset_config_name", dict(type=str, default=None, help="The config of the Dataset.")),
+        ("--train_data_dir", dict(type=str, default=None, help="A folder containing the training data.")),
+        ("--image_column", dict(type=str, default="image", help="The column of the dataset containing an image.")),
+        ("--caption_column", dict(type=str, default="caption", help="The column of the dataset containing a caption.")),
+        # Training arguments
+        ("--max_sequence_length", dict(type=int, default=77, help="Maximum sequence length to use with the T5 text encoder")),
+        ("--validation_prompt", dict(type=str, default=None, help="A prompt used during validation.")),
+        ("--num_validation_images", dict(type=int, default=4, help="Number of images for validation.")),
+        ("--validation_epochs", dict(type=int, default=1, help="Run validation every X epochs.")),
+        ("--max_train_samples", dict(type=int, default=None, help="Truncate the number of training examples.")),
+        ("--output_dir", dict(type=str, default="sd3-lora-finetuned", help="Output directory for model predictions and checkpoints.")),
+        ("--cache_dir", dict(type=str, default=None, help="Directory to store downloaded models and datasets.")),
+        ("--seed", dict(type=int, default=None, help="A seed for reproducible training.")),
+        ("--resolution", dict(type=int, default=1024, help="Image resolution for training.")),
+        ("--center_crop", dict(default=False, action="store_true", help="Whether to center crop input images.")),
+        ("--random_flip", dict(action="store_true", help="Whether to randomly flip images horizontally.")),
+        ("--train_text_encoder", dict(action="store_true", help="Whether to train the text encoder.")),
+        ("--train_batch_size", dict(type=int, default=16, help="Batch size for training dataloader.")),
+        ("--num_train_epochs", dict(type=int, default=100)),
+        ("--max_train_steps", dict(type=int, default=None, help="Total number of training steps.")),
+        ("--checkpointing_steps", dict(type=int, default=500, help="Save checkpoint every X updates.")),
+        ("--checkpoints_total_limit", dict(type=int, default=None, help="Max number of checkpoints to store.")),
+        ("--resume_from_checkpoint", dict(type=str, default=None, help="Path to resume training from checkpoint.")),
+        ("--gradient_accumulation_steps", dict(type=int, default=1, help="Number of update steps to accumulate.")),
+        ("--gradient_checkpointing", dict(action="store_true", help="Use gradient checkpointing to save memory.")),
+        ("--learning_rate", dict(type=float, default=1e-4, help="Initial learning rate.")),
+        ("--scale_lr", dict(action="store_true", default=False, help="Scale learning rate by number of GPUs, etc.")),
+        ("--lr_scheduler", dict(type=str, default="constant", help="Learning rate scheduler type.")),
+        ("--lr_warmup_steps", dict(type=int, default=500, help="Number of warmup steps.")),
+        # SD3 specific arguments
+        ("--weighting_scheme", dict(type=str, default="logit_normal", choices=["sigma_sqrt", "logit_normal", "mode", "cosmap"], help="Weighting scheme for flow matching loss.")),
+        ("--logit_mean", dict(type=float, default=0.0, help="Mean for logit_normal weighting.")),
+        ("--logit_std", dict(type=float, default=1.0, help="Std for logit_normal weighting.")),
+        ("--mode_scale", dict(type=float, default=1.29, help="Scale for mode weighting scheme.")),
+        ("--precondition_outputs", dict(type=int, default=1, help="Whether to precondition model outputs.")),
+        # Optimization arguments
+        ("--allow_tf32", dict(action="store_true", help="Allow TF32 on Ampere GPUs.")),
+        ("--dataloader_num_workers", dict(type=int, default=0, help="Number of data loading workers.")),
+        ("--use_8bit_adam", dict(action="store_true", help="Use 8-bit Adam optimizer.")),
+        ("--adam_beta1", dict(type=float, default=0.9, help="Beta1 for Adam optimizer.")),
+        ("--adam_beta2", dict(type=float, default=0.999, help="Beta2 for Adam optimizer.")),
+        ("--adam_weight_decay", dict(type=float, default=1e-2, help="Weight decay for Adam.")),
+        ("--adam_epsilon", dict(type=float, default=1e-08, help="Epsilon for Adam optimizer.")),
+        ("--max_grad_norm", dict(default=1.0, type=float, help="Max gradient norm.")),
+        # Hub and logging arguments
+        ("--push_to_hub", dict(action="store_true", help="Push model to the Hub.")),
+        ("--hub_token", dict(type=str, default=None, help="Token for Model Hub.")),
+        ("--hub_model_id", dict(type=str, default=None, help="Repository name for the Hub.")),
+        ("--logging_dir", dict(type=str, default="logs", help="TensorBoard log directory.")),
+        ("--report_to", dict(type=str, default="tensorboard", help="Logging integration to use.")),
+        ("--mixed_precision", dict(type=str, default=None, choices=["no", "fp16", "bf16"], help="Mixed precision type.")),
+        ("--local_rank", dict(type=int, default=-1, help="Local rank for distributed training.")),
+        # LoRA arguments
+        ("--rank", dict(type=int, default=64, help="LoRA rank dimension.")),
+    )
+    parser = argparse.ArgumentParser(description="SD3 LoRA training script.")
+    for flag, options in argument_specs:
+        parser.add_argument(flag, **options)
+    # parse_args(None) reads sys.argv, so a single call covers both the explicit and the CLI case.
+    args = parser.parse_args(input_args)
+    # A LOCAL_RANK set in the environment takes precedence over the command-line value.
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank not in (-1, args.local_rank):
+        args.local_rank = env_local_rank
+    # Sanity checks
+    if args.dataset_name is None and args.train_data_dir is None:
+        raise ValueError("Need either a dataset name or a training folder.")
+    return args
+# Per-dataset column names keyed by hub dataset id; the tuple is presumably
+# (image column, caption column) - the lookup site is outside this chunk, confirm there.
+DATASET_NAME_MAPPING = {"lambdalabs/naruto-blip-captions": ("image", "text")}
+def tokenize_prompt(tokenizer, prompt, max_sequence_length=77):
+    """Tokenize ``prompt`` into fixed-length input ids.
+    Args:
+        tokenizer: Callable tokenizer; it is invoked with ``padding="max_length"``,
+            ``truncation=True`` and ``return_tensors="pt"``.
+        prompt: Prompt (or list of prompts) forwarded unchanged to the tokenizer.
+        max_sequence_length: Length every sequence is padded/truncated to. Defaults to 77,
+            the value that used to be hard-coded, so existing two-argument callers are
+            unaffected; pass the ``--max_sequence_length`` setting to honour it.
+    Returns:
+        The ``input_ids`` attribute of the tokenizer output.
+    """
+    text_inputs = tokenizer(
+        prompt,
+        padding="max_length",
+        max_length=max_sequence_length,
+        truncation=True,
+        return_tensors="pt",
+    )
+    return text_inputs.input_ids
+def _encode_prompt_with_t5(
+    text_encoder,
+    tokenizer,
+    max_sequence_length,
+    prompt=None,
+    num_images_per_prompt=1,
+    device=None,
+    text_input_ids=None,
+):
+    """Encode a prompt (or pre-tokenized ids) with the T5 text encoder.
+    Args:
+        text_encoder: Callable encoder; element 0 of its output is used as the embeddings
+            and its ``dtype`` attribute decides the dtype of the result.
+        tokenizer: Optional tokenizer. When given together with ``prompt`` it produces the
+            input ids (padded/truncated to ``max_sequence_length``).
+        max_sequence_length: Token length used when tokenizing ``prompt``.
+        prompt: A string or list of strings, or ``None`` when ``text_input_ids`` is supplied.
+        num_images_per_prompt: Number of consecutive copies of each embedding to return.
+        device: Device the ids and the embeddings are moved to.
+        text_input_ids: Pre-tokenized ids, used when ``tokenizer`` or ``prompt`` is ``None``.
+    Returns:
+        Embeddings of shape ``(batch * num_images_per_prompt, seq_len, dim)``.
+    Raises:
+        ValueError: If no input ids can be obtained from the arguments.
+    """
+    if isinstance(prompt, str):
+        prompt = [prompt]
+    if tokenizer is not None and prompt is not None:
+        text_input_ids = tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=max_sequence_length,
+            truncation=True,
+            add_special_tokens=True,
+            return_tensors="pt",
+        ).input_ids
+    elif text_input_ids is None:
+        if prompt is None:
+            raise ValueError("Either prompt or text_input_ids must be provided")
+        raise ValueError("text_input_ids must be provided when tokenizer is not specified or prompt is None")
+    prompt_embeds = text_encoder(text_input_ids.to(device))[0]
+    prompt_embeds = prompt_embeds.to(dtype=text_encoder.dtype, device=device)
+    # Take the batch size from the tensor that is actually reshaped: when ``tokenizer`` is
+    # None the ids come from ``text_input_ids``, whose row count may differ from len(prompt).
+    batch_size, seq_len, _ = prompt_embeds.shape
+    # duplicate text embeddings for each generation per prompt
+    prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+    return prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+def _encode_prompt_with_clip(
+    text_encoder,
+    tokenizer,
+    prompt: str,
+    device=None,
+    text_input_ids=None,
+    num_images_per_prompt: int = 1,
+):
+    """Encode a prompt with one CLIP text encoder.
+    Args:
+        text_encoder: Callable encoder; called with ``output_hidden_states=True``.
+            Its result must support ``[0]`` and expose ``hidden_states``.
+        tokenizer: Tokenizer used when ``prompt`` is given; may be None when
+            ``text_input_ids`` is supplied instead.
+        prompt: A string, a list of strings, or None.
+        device: Device the token ids and embeddings are moved to.
+        text_input_ids: Pre-tokenized ids, used when ``prompt`` or
+            ``tokenizer`` is None.
+        num_images_per_prompt: Number of copies of each prompt's embeddings.
+    Returns:
+        ``(prompt_embeds, pooled_prompt_embeds)``: the penultimate hidden state
+        and element 0 of the encoder output. Both have
+        ``batch_size * num_images_per_prompt`` rows, with the copies of each
+        prompt stored consecutively.
+    Raises:
+        ValueError: If no usable source of token ids is available.
+    """
+    if prompt is not None:
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        batch_size = len(prompt)
+    else:
+        # Without a prompt the caller must supply pre-tokenized ids.
+        if text_input_ids is None:
+            raise ValueError("Either prompt or text_input_ids must be provided")
+        batch_size = text_input_ids.shape[0]
+    if tokenizer is not None and prompt is not None:
+        text_inputs = tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=77,
+            truncation=True,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+    elif text_input_ids is None:
+        raise ValueError("text_input_ids must be provided when tokenizer is not specified or prompt is None")
+    encoder_output = text_encoder(text_input_ids.to(device), output_hidden_states=True)
+    pooled_prompt_embeds = encoder_output[0]
+    prompt_embeds = encoder_output.hidden_states[-2]
+    prompt_embeds = prompt_embeds.to(dtype=text_encoder.dtype, device=device)
+    _, seq_len, _ = prompt_embeds.shape
+    # Duplicate text embeddings for each generation per prompt.
+    prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+    prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+    if num_images_per_prompt != 1:
+        # Keep the pooled embeddings aligned row-for-row with prompt_embeds;
+        # previously they kept the un-duplicated batch size.
+        pooled_prompt_embeds = pooled_prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+    return prompt_embeds, pooled_prompt_embeds
+def encode_prompt(
+    text_encoders,
+    tokenizers,
+    prompt: str,
+    max_sequence_length,
+    device=None,
+    num_images_per_prompt: int = 1,
+    text_input_ids_list=None,
+):
+    """Build the combined prompt embeddings from two CLIP encoders and one T5 encoder.
+    The first two tokenizer/encoder pairs go through ``_encode_prompt_with_clip``
+    and the last pair through ``_encode_prompt_with_t5``. Returns
+    ``(prompt_embeds, pooled_prompt_embeds)``: the CLIP embeddings (padded on
+    the last axis to the T5 width) concatenated with the T5 embeddings along
+    the sequence axis, and the two CLIP pooled outputs concatenated on the
+    last axis.
+    """
+    if prompt is not None and isinstance(prompt, str):
+        prompt = [prompt]
+    # CLIP branch: run both encoders, then join their outputs feature-wise.
+    clip_outputs = [
+        _encode_prompt_with_clip(
+            text_encoder=clip_encoder,
+            tokenizer=clip_tokenizer,
+            prompt=prompt,
+            device=clip_encoder.device if device is None else device,
+            num_images_per_prompt=num_images_per_prompt,
+            text_input_ids=text_input_ids_list[idx] if text_input_ids_list else None,
+        )
+        for idx, (clip_tokenizer, clip_encoder) in enumerate(zip(tokenizers[:2], text_encoders[:2]))
+    ]
+    clip_prompt_embeds = torch.cat([embeds for embeds, _ in clip_outputs], dim=-1)
+    pooled_prompt_embeds = torch.cat([pooled for _, pooled in clip_outputs], dim=-1)
+    # T5 branch: the last tokenizer/encoder pair.
+    t5_encoder = text_encoders[-1]
+    t5_prompt_embed = _encode_prompt_with_t5(
+        t5_encoder,
+        tokenizers[-1],
+        max_sequence_length,
+        prompt=prompt,
+        num_images_per_prompt=num_images_per_prompt,
+        text_input_ids=text_input_ids_list[-1] if text_input_ids_list else None,
+        device=t5_encoder.device if device is None else device,
+    )
+    # Right-pad the CLIP feature axis so both branches share the T5 width.
+    width_gap = t5_prompt_embed.shape[-1] - clip_prompt_embeds.shape[-1]
+    clip_prompt_embeds = torch.nn.functional.pad(clip_prompt_embeds, (0, width_gap))
+    # Stack CLIP tokens followed by T5 tokens along the sequence axis.
+    prompt_embeds = torch.cat([clip_prompt_embeds, t5_prompt_embed], dim=-2)
+    return prompt_embeds, pooled_prompt_embeds
+def load_dataset_from_jsonl(metadata_path, data_dir, accelerator=None):
+    """Build a ``datasets.DatasetDict`` from a ``metadata.jsonl`` index file.
+    Reading the index avoids scanning every file in the dataset directory.
+    Every process that calls this function reads the file itself; ``accelerator``
+    is only used to restrict progress and warning output to the main process.
+    Each non-blank line must be a JSON object. Its ``file_name`` value
+    (default ``''``) is joined onto ``data_dir`` and stored as a path string in
+    the ``image`` column; its ``caption`` value (default ``''``) is stored in
+    the ``text`` column. Image files are neither opened nor checked for
+    existence here. Blank lines are ignored; lines that are not valid JSON
+    objects are skipped with a warning.
+    Args:
+        metadata_path: Path to the ``metadata.jsonl`` file.
+        data_dir: Dataset root directory that ``file_name`` is relative to.
+        accelerator: Optional object exposing ``is_main_process``; when given,
+            only the main process prints.
+    Returns:
+        ``datasets.DatasetDict`` with a single ``train`` split.
+    Raises:
+        FileNotFoundError: If ``metadata_path`` does not exist.
+    """
+    should_report = accelerator is None or accelerator.is_main_process
+    if should_report:
+        print(f"[INFO] Loading dataset from metadata.jsonl: {metadata_path}", flush=True)
+    if not os.path.exists(metadata_path):
+        raise FileNotFoundError(f"metadata.jsonl not found at: {metadata_path}")
+    data_list = []
+    with open(metadata_path, 'r', encoding='utf-8') as f:
+        for line_num, line in enumerate(f, start=1):
+            line = line.strip()
+            if not line:
+                # Blank lines (e.g. a trailing newline) are not records.
+                continue
+            try:
+                item = json.loads(line)
+            except json.JSONDecodeError as e:
+                if should_report:
+                    print(f"[WARNING] Skipping invalid JSON at line {line_num}: {e}", flush=True)
+                continue
+            if not isinstance(item, dict):
+                # Valid JSON but not an object: there are no fields to read.
+                if should_report:
+                    print(f"[WARNING] Skipping non-object JSON at line {line_num}", flush=True)
+                continue
+            # File existence is deliberately not checked here: touching the
+            # filesystem per entry is slow, and missing files are handled
+            # when the image is actually loaded.
+            data_list.append({
+                'image': os.path.join(data_dir, item.get('file_name', '')),
+                'text': item.get('caption', ''),
+            })
+            # Report progress once every 100000 lines.
+            if line_num % 100000 == 0 and should_report:
+                print(f"[INFO] Processed {line_num} entries from metadata.jsonl", flush=True)
+    if should_report:
+        print(f"[INFO] Loaded {len(data_list)} image-caption pairs from metadata.jsonl", flush=True)
+    # The 'image' column holds path strings, not PIL images; the images are
+    # loaded lazily by the preprocessing function.
+    dataset = datasets.Dataset.from_list(data_list)
+    return datasets.DatasetDict({'train': dataset})
+def main(args):
+    """Main training function."""
+    if args.report_to == "wandb" and args.hub_token is not None:
+        raise ValueError(
+            "You cannot use both --report_to=wandb and --hub_token due to security risk."
+        )
+    logging_dir = Path(args.output_dir, args.logging_dir)
+    if torch.backends.mps.is_available() and args.mixed_precision == "bf16":
+        raise ValueError(
+            "Mixed precision training with bfloat16 is not supported on MPS."
+        )
+    # GPU多卡训练检查
+    if torch.cuda.is_available():
+        num_gpus = torch.cuda.device_count()
+        print(f"Found {num_gpus} GPUs available")
+        if num_gpus > 1:
+            print(f"Multi-GPU training enabled with {num_gpus} GPUs")
+    else:
+        print("No CUDA GPUs found, training on CPU")
+    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
+    # 优化多GPU训练的DDP参数
+    kwargs = DistributedDataParallelKwargs(
+        find_unused_parameters=True,
+        gradient_as_bucket_view=True,  # 提高多GPU训练效率
+        static_graph=False,  # 动态图支持
+    )
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.report_to,
+        project_config=accelerator_project_config,
+        kwargs_handlers=[kwargs],
+    )
+    # Logging setup
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+    if accelerator.is_main_process:
+        print("[INFO] Accelerator initialized", flush=True)
+    # 记录多GPU训练信息
+    if accelerator.is_main_process:
+        logger.info(f"Number of processes: {accelerator.num_processes}")
+        logger.info(f"Distributed type: {accelerator.distributed_type}")
+        logger.info(f"Mixed precision: {accelerator.mixed_precision}")
+        if torch.cuda.is_available():
+            for i in range(torch.cuda.device_count()):
+                logger.info(f"GPU {i}: {torch.cuda.get_device_name(i)}")
+                logger.info(f"GPU {i} memory: {torch.cuda.get_device_properties(i).total_memory / 1024**3:.1f} GB")
+    if accelerator.is_local_main_process:
+        datasets.utils.logging.set_verbosity_warning()
+        transformers.utils.logging.set_verbosity_warning()
+        diffusers.utils.logging.set_verbosity_info()
+    else:
+        datasets.utils.logging.set_verbosity_error()
+        transformers.utils.logging.set_verbosity_error()
+        diffusers.utils.logging.set_verbosity_error()
+    # Set training seed
+    if args.seed is not None:
+        set_seed(args.seed)
+        if accelerator.is_main_process:
+            print(f"[INFO] Seed set to {args.seed}", flush=True)
+    # Create output directory
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name,
+                exist_ok=True,
+                token=args.hub_token
+            ).repo_id
+    if accelerator.is_main_process:
+        print("[INFO] Loading tokenizers...", flush=True)
+    # Load tokenizers (three for SD3)
+    tokenizer_one = CLIPTokenizer.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="tokenizer",
+        revision=args.revision,
+    )
+    tokenizer_two = CLIPTokenizer.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="tokenizer_2",
+        revision=args.revision,
+    )
+    if accelerator.is_main_process:
+        print("[INFO] Tokenizers loaded. Loading text encoders, VAE, and transformer...", flush=True)
+    tokenizer_three = T5TokenizerFast.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="tokenizer_3",
+        revision=args.revision,
+    )
+    # Import text encoder classes
+    text_encoder_cls_one = import_model_class_from_model_name_or_path(
+        args.pretrained_model_name_or_path, args.revision
+    )
+    text_encoder_cls_two = import_model_class_from_model_name_or_path(
+        args.pretrained_model_name_or_path, args.revision, subfolder="text_encoder_2"
+    )
+    text_encoder_cls_three = import_model_class_from_model_name_or_path(
+        args.pretrained_model_name_or_path, args.revision, subfolder="text_encoder_3"
+    )
+    # Load models
+    noise_scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="scheduler"
+    )
+    noise_scheduler_copy = copy.deepcopy(noise_scheduler)
+    text_encoder_one = text_encoder_cls_one.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="text_encoder",
+        revision=args.revision,
+        variant=args.variant
+    )
+    text_encoder_two = text_encoder_cls_two.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="text_encoder_2",
+        revision=args.revision,
+        variant=args.variant
+    )
+    text_encoder_three = text_encoder_cls_three.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="text_encoder_3",
+        revision=args.revision,
+        variant=args.variant
+    )
+    vae = AutoencoderKL.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="vae",
+        revision=args.revision,
+        variant=args.variant,
+    )
+    transformer = SD3Transformer2DModel.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="transformer",
+        revision=args.revision,
+        variant=args.variant
+    )
+    if accelerator.is_main_process:
+        print("[INFO] Text encoders, VAE, and transformer loaded", flush=True)
+    # Freeze non-trainable weights
+    transformer.requires_grad_(False)
+    vae.requires_grad_(False)
+    text_encoder_one.requires_grad_(False)
+    text_encoder_two.requires_grad_(False)
+    text_encoder_three.requires_grad_(False)
+    # Set precision
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif accelerator.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+    # Move models to device
+    vae.to(accelerator.device, dtype=torch.float32)  # VAE stays in fp32
+    transformer.to(accelerator.device, dtype=weight_dtype)
+    text_encoder_one.to(accelerator.device, dtype=weight_dtype)
+    text_encoder_two.to(accelerator.device, dtype=weight_dtype)
+    text_encoder_three.to(accelerator.device, dtype=weight_dtype)
+    # Enable gradient checkpointing
+    if args.gradient_checkpointing:
+        transformer.enable_gradient_checkpointing()
+        if args.train_text_encoder:
+            text_encoder_one.gradient_checkpointing_enable()
+            text_encoder_two.gradient_checkpointing_enable()
+    # Configure LoRA for transformer
+    transformer_lora_config = LoraConfig(
+        r=args.rank,
+        lora_alpha=args.rank,
+        init_lora_weights="gaussian",
+        target_modules=["attn.to_k", "attn.to_q", "attn.to_v", "attn.to_out.0"],
+    )
+    transformer.add_adapter(transformer_lora_config)
+    # Configure LoRA for text encoders if enabled
+    if args.train_text_encoder:
+        text_lora_config = LoraConfig(
+            r=args.rank,
+            lora_alpha=args.rank,
+            init_lora_weights="gaussian",
+            target_modules=["q_proj", "k_proj", "v_proj", "out_proj"],
+        )
+        text_encoder_one.add_adapter(text_lora_config)
+        text_encoder_two.add_adapter(text_lora_config)
+        # Note: T5 encoder typically doesn't use LoRA
+    def unwrap_model(model):
+        """Strip the accelerator wrapper; for a compiled module return its ``_orig_mod``."""
+        unwrapped = accelerator.unwrap_model(model)
+        if is_compiled_module(unwrapped):
+            return unwrapped._orig_mod
+        return unwrapped
+    # Enable TF32 for faster training
+    if args.allow_tf32 and torch.cuda.is_available():
+        torch.backends.cuda.matmul.allow_tf32 = True
+    # Scale learning rate
+    if args.scale_lr:
+        args.learning_rate = (
+            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
+        )
+    # Cast trainable parameters to float32
+    if args.mixed_precision == "fp16":
+        models = [transformer]
+        if args.train_text_encoder:
+            models.extend([text_encoder_one, text_encoder_two])
+        cast_training_params(models, dtype=torch.float32)
+    # Setup optimizer
+    transformer_lora_parameters = list(filter(lambda p: p.requires_grad, transformer.parameters()))
+    if args.train_text_encoder:
+        text_lora_parameters_one = list(filter(lambda p: p.requires_grad, text_encoder_one.parameters()))
+        text_lora_parameters_two = list(filter(lambda p: p.requires_grad, text_encoder_two.parameters()))
+        params_to_optimize = (
+            transformer_lora_parameters
+            + text_lora_parameters_one
+            + text_lora_parameters_two
+        )
+    else:
+        params_to_optimize = transformer_lora_parameters
+    # Create optimizer
+    if args.use_8bit_adam:
+        try:
+            import bitsandbytes as bnb
+        except ImportError:
+            raise ImportError("To use 8-bit Adam, install bitsandbytes: pip install bitsandbytes")
+        optimizer_class = bnb.optim.AdamW8bit
+    else:
+        optimizer_class = torch.optim.AdamW
+    optimizer = optimizer_class(
+        params_to_optimize,
+        lr=args.learning_rate,
+        betas=(args.adam_beta1, args.adam_beta2),
+        weight_decay=args.adam_weight_decay,
+        eps=args.adam_epsilon,
+    )
+    if accelerator.is_main_process:
+        print("[INFO] Optimizer created. Loading dataset...", flush=True)
+    # Load dataset - 使用 main_process_first 避免多进程竞争
+    # 优先使用 metadata.jsonl 文件，避免扫描所有文件
+    with accelerator.main_process_first():
+        metadata_path = None
+        if args.train_data_dir is not None:
+            # 检查是否存在 metadata.jsonl
+            potential_metadata = os.path.join(args.train_data_dir, "metadata.jsonl")
+            if os.path.exists(potential_metadata):
+                metadata_path = potential_metadata
+        if metadata_path is not None:
+            # 使用 metadata.jsonl 加载数据集（更高效，避免扫描所有文件）
+            if accelerator.is_main_process:
+                print(f"[INFO] Found metadata.jsonl, using efficient loading method", flush=True)
+            dataset = load_dataset_from_jsonl(metadata_path, args.train_data_dir, accelerator)
+        elif args.dataset_name is not None:
+            dataset = load_dataset(
+                args.dataset_name,
+                args.dataset_config_name,
+                cache_dir=args.cache_dir,
+                data_dir=args.train_data_dir
+            )
+        else:
+            # 回退到 imagefolder（可能会很慢）
+            if accelerator.is_main_process:
+                print("[WARNING] No metadata.jsonl found, using imagefolder (may be slow for large datasets)", flush=True)
+            data_files = {}
+            if args.train_data_dir is not None:
+                data_files["train"] = os.path.join(args.train_data_dir, "**")
+            dataset = load_dataset(
+                "imagefolder",
+                data_files=data_files,
+                cache_dir=args.cache_dir,
+            )
+        if accelerator.is_main_process:
+            print("[INFO] Dataset loaded successfully.", flush=True)
+    # 确保所有进程等待数据集加载完成
+    accelerator.wait_for_everyone()
+    if accelerator.is_main_process:
+        print("[INFO] All processes synchronized. Building transforms and DataLoader...", flush=True)
+    # Preprocessing
+    column_names = dataset["train"].column_names
+    dataset_columns = DATASET_NAME_MAPPING.get(args.dataset_name, None)
+    if accelerator.is_main_process:
+        print(f"[INFO] Dataset columns: {column_names}", flush=True)
+    # 智能选择 image 列：优先使用指定的列，如果不存在则自动回退
+    if args.image_column is not None and args.image_column in column_names:
+        # 如果指定了列名且存在，使用指定的列
+        image_column = args.image_column
+    else:
+        # 自动选择可用的 image 列
+        if 'image' in column_names:
+            image_column = 'image'
+        else:
+            image_column = dataset_columns[0] if dataset_columns is not None else column_names[0]
+        # 如果用户指定了列名但不存在，给出警告
+        if args.image_column is not None and args.image_column != image_column:
+            if accelerator.is_main_process:
+                print(f"[WARNING] Specified image_column '{args.image_column}' not found. Using '{image_column}' instead.", flush=True)
+    if accelerator.is_main_process:
+        print(f"[INFO] Using image column: {image_column}", flush=True)
+    # 智能选择 caption 列：优先使用指定的列，如果不存在则自动回退
+    if args.caption_column is not None and args.caption_column in column_names:
+        # 如果指定了列名且存在，使用指定的列
+        caption_column = args.caption_column
+    else:
+        # 自动选择可用的 caption 列
+        if 'text' in column_names:
+            caption_column = 'text'
+        elif 'caption' in column_names:
+            caption_column = 'caption'
+        else:
+            caption_column = dataset_columns[1] if dataset_columns is not None else (column_names[1] if len(column_names) > 1 else column_names[0])
+        # 如果用户指定了列名但不存在，给出警告
+        if args.caption_column is not None and args.caption_column != caption_column:
+            if accelerator.is_main_process:
+                print(f"[WARNING] Specified caption_column '{args.caption_column}' not found. Using '{caption_column}' instead.", flush=True)
+    if accelerator.is_main_process:
+        print(f"[INFO] Using caption column: {caption_column}", flush=True)
+    def tokenize_captions(examples, is_train=True):
+        """Choose one caption per example and tokenize it with all three tokenizers.
+        String captions are used as-is. For a list or array of captions a random
+        entry is chosen when ``is_train`` is true, otherwise the first entry.
+        Returns the token ids from tokenizer one, two and three, in that order.
+        """
+        def pick_caption(entry):
+            if isinstance(entry, str):
+                return entry
+            if isinstance(entry, (list, np.ndarray)):
+                return random.choice(entry) if is_train else entry[0]
+            raise ValueError("Caption column should contain strings or lists of strings.")
+        chosen = [pick_caption(entry) for entry in examples[caption_column]]
+        return tuple(
+            tokenize_prompt(tok, chosen)
+            for tok in (tokenizer_one, tokenizer_two, tokenizer_three)
+        )
+    # Image transforms
+    train_resize = transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR)
+    train_crop = transforms.CenterCrop(args.resolution) if args.center_crop else transforms.RandomCrop(args.resolution)
+    train_flip = transforms.RandomHorizontalFlip(p=1.0)
+    train_transforms = transforms.Compose([
+        transforms.ToTensor(),
+        transforms.Normalize([0.5], [0.5]),
+    ])
+    def preprocess_train(examples):
+        """Turn a batch of raw examples into model-ready training fields.
+        Loads/converts each image to RGB, resizes, optionally flips, crops and
+        normalizes it, then tokenizes the captions. Adds ``original_sizes``,
+        ``crop_top_lefts``, ``pixel_values`` and ``input_ids_one/two/three`` to
+        ``examples`` and returns it.
+        Raises ValueError for an image entry that is neither a path string nor
+        an object with a ``convert`` method.
+        """
+        # Image entries may be path strings (loaded here) or PIL-like images (used directly).
+        images = []
+        for img in examples[image_column]:
+            if isinstance(img, str):
+                # Path string: load the image from disk.
+                try:
+                    img = Image.open(img).convert("RGB")
+                except Exception as e:
+                    # Loading failed: warn and substitute a black square placeholder
+                    # of side args.resolution instead of raising.
+                    if accelerator.is_main_process:
+                        print(f"[WARNING] Failed to load image {img}: {e}", flush=True)
+                    img = Image.new('RGB', (args.resolution, args.resolution), color='black')
+            elif hasattr(img, 'convert'):
+                # Already an image object: just normalize the mode to RGB.
+                img = img.convert("RGB")
+            else:
+                raise ValueError(f"Unexpected image type: {type(img)}")
+            images.append(img)
+        original_sizes = []
+        all_images = []
+        crop_top_lefts = []
+        for image in images:
+            # Record the size before any resizing, as (height, width).
+            original_sizes.append((image.height, image.width))
+            image = train_resize(image)
+            # The flip happens before cropping, so the recorded crop offsets
+            # refer to the (possibly flipped) resized image.
+            if args.random_flip and random.random() < 0.5:
+                image = train_flip(image)
+            if args.center_crop:
+                # Compute the centered crop's top-left corner for bookkeeping.
+                y1 = max(0, int(round((image.height - args.resolution) / 2.0)))
+                x1 = max(0, int(round((image.width - args.resolution) / 2.0)))
+                image = train_crop(image)
+            else:
+                # Random crop: sample the window once and apply it explicitly so
+                # its offsets can be recorded.
+                y1, x1, h, w = train_crop.get_params(image, (args.resolution, args.resolution))
+                image = crop(image, y1, x1, h, w)
+            crop_top_left = (y1, x1)
+            crop_top_lefts.append(crop_top_left)
+            image = train_transforms(image)
+            all_images.append(image)
+        examples["original_sizes"] = original_sizes
+        examples["crop_top_lefts"] = crop_top_lefts
+        examples["pixel_values"] = all_images
+        tokens_one, tokens_two, tokens_three = tokenize_captions(examples)
+        examples["input_ids_one"] = tokens_one
+        examples["input_ids_two"] = tokens_two
+        examples["input_ids_three"] = tokens_three
+        return examples
+    with accelerator.main_process_first():
+        if args.max_train_samples is not None:
+            dataset["train"] = dataset["train"].shuffle(seed=args.seed).select(range(args.max_train_samples))
+        train_dataset = dataset["train"].with_transform(preprocess_train, output_all_columns=True)
+    def collate_fn(examples):
+        """Merge a list of per-example dicts into a single batch dict.
+        Tensor fields are stacked along a new leading axis; ``original_sizes``
+        and ``crop_top_lefts`` are kept as plain Python lists.
+        """
+        def stacked(key):
+            return torch.stack([sample[key] for sample in examples])
+        def gathered(key):
+            return [sample[key] for sample in examples]
+        images = stacked("pixel_values").to(memory_format=torch.contiguous_format).float()
+        batch = {"pixel_values": images}
+        for key in ("input_ids_one", "input_ids_two", "input_ids_three"):
+            batch[key] = stacked(key)
+        for key in ("original_sizes", "crop_top_lefts"):
+            batch[key] = gathered(key)
+        return batch
+    # 针对多GPU训练优化dataloader设置
+    if args.dataloader_num_workers == 0 and accelerator.num_processes > 1:
+        # 多GPU训练时自动设置数据加载器worker数量
+        args.dataloader_num_workers = min(4, os.cpu_count() // accelerator.num_processes)
+        logger.info(f"Auto-setting dataloader_num_workers to {args.dataloader_num_workers} for multi-GPU training")
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset,
+        shuffle=True,
+        collate_fn=collate_fn,
+        batch_size=args.train_batch_size,
+        num_workers=args.dataloader_num_workers,
+        pin_memory=True,  # 提高GPU数据传输效率
+        persistent_workers=args.dataloader_num_workers > 0,  # 保持worker进程活跃
+    )
+    if accelerator.is_main_process:
+        print("[INFO] DataLoader ready. Computing training steps and scheduler...", flush=True)
+    # Scheduler and math around training steps
+    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        overrode_max_train_steps = True
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps,
+        num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
+    )
+    # Prepare everything with accelerator
+    if args.train_text_encoder:
+        transformer, text_encoder_one, text_encoder_two, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+            transformer, text_encoder_one, text_encoder_two, optimizer, train_dataloader, lr_scheduler
+        )
+    else:
+        transformer, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+            transformer, optimizer, train_dataloader, lr_scheduler
+        )
+    # Recalculate training steps
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+    # Initialize trackers
+    if accelerator.is_main_process:
+        try:
+            accelerator.init_trackers("text2image-fine-tune", config=vars(args))
+        except Exception as e:
+            logger.warning(f"Failed to initialize trackers: {e}")
+            logger.warning("Continuing without tracking. You can monitor training through console logs.")
+            # Set report_to to None to avoid further tracking attempts
+            args.report_to = None
+    # Train!
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    logger.info(f"  Number of GPU processes = {accelerator.num_processes}")
+    if accelerator.num_processes > 1:
+        logger.info(f"  Effective batch size per GPU = {args.train_batch_size * args.gradient_accumulation_steps}")
+        logger.info(f"  Total effective batch size across all GPUs = {total_batch_size}")
+    global_step = 0
+    first_epoch = 0
+    if accelerator.is_main_process:
+        print(
+            f"[INFO] Training setup complete. num_examples={len(train_dataset)}, "
+            f"max_train_steps={args.max_train_steps}, num_epochs={args.num_train_epochs}",
+            flush=True,
+        )
+    # Resume from checkpoint if specified
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+        if path is None:
+            accelerator.print(f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting new training.")
+            args.resume_from_checkpoint = None
+            initial_global_step = 0
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+            initial_global_step = global_step
+            first_epoch = global_step // num_update_steps_per_epoch
+    else:
+        initial_global_step = 0
+    progress_bar = tqdm(
+        range(0, args.max_train_steps),
+        initial=initial_global_step,
+        desc="Steps",
+        disable=not accelerator.is_local_main_process,
+    )
+    def get_sigmas(timesteps, n_dim=4, dtype=torch.float32):
+        """Look up the scheduler sigma for each timestep.
+        The result is flattened and then given trailing singleton axes until it
+        has ``n_dim`` dimensions.
+        """
+        target_device = accelerator.device
+        all_sigmas = noise_scheduler_copy.sigmas.to(device=target_device, dtype=dtype)
+        all_timesteps = noise_scheduler_copy.timesteps.to(target_device)
+        timesteps = timesteps.to(target_device)
+        # Position of every requested timestep inside the scheduler's schedule.
+        step_indices = [(all_timesteps == t).nonzero().item() for t in timesteps]
+        sigma = all_sigmas[step_indices].flatten()
+        for _ in range(n_dim - len(sigma.shape)):
+            sigma = sigma.unsqueeze(-1)
+        return sigma
+    # Training loop
+    for epoch in range(first_epoch, args.num_train_epochs):
+        transformer.train()
+        if args.train_text_encoder:
+            text_encoder_one.train()
+            text_encoder_two.train()
+        if accelerator.is_main_process:
+            print(
+                f"[INFO] Starting epoch {epoch + 1}/{args.num_train_epochs}, current global_step={global_step}",
+                flush=True,
+            )
+        train_loss = 0.0
+        for step, batch in enumerate(train_dataloader):
+            with accelerator.accumulate(transformer):
+                # Convert images to latent space
+                pixel_values = batch["pixel_values"].to(dtype=vae.dtype)
+                model_input = vae.encode(pixel_values).latent_dist.sample()
+                # Apply VAE scaling
+                vae_config_shift_factor = vae.config.shift_factor
+                vae_config_scaling_factor = vae.config.scaling_factor
+                model_input = (model_input - vae_config_shift_factor) * vae_config_scaling_factor
+                model_input = model_input.to(dtype=weight_dtype)
+                # Encode prompts
+                prompt_embeds, pooled_prompt_embeds = encode_prompt(
+                    text_encoders=[text_encoder_one, text_encoder_two, text_encoder_three],
+                    tokenizers=[tokenizer_one, tokenizer_two, tokenizer_three],
+                    prompt=None,
+                    max_sequence_length=args.max_sequence_length,
+                    text_input_ids_list=[batch["input_ids_one"], batch["input_ids_two"], batch["input_ids_three"]],
+                )
+                # Sample noise and timesteps
+                noise = torch.randn_like(model_input)
+                bsz = model_input.shape[0]
+                # Flow Matching timestep sampling
+                u = compute_density_for_timestep_sampling(
+                    weighting_scheme=args.weighting_scheme,
+                    batch_size=bsz,
+                    logit_mean=args.logit_mean,
+                    logit_std=args.logit_std,
+                    mode_scale=args.mode_scale,
+                )
+                indices = (u * noise_scheduler_copy.config.num_train_timesteps).long()
+                timesteps = noise_scheduler_copy.timesteps[indices].to(device=model_input.device)
+                # Flow Matching interpolation
+                sigmas = get_sigmas(timesteps, n_dim=model_input.ndim, dtype=model_input.dtype)
+                noisy_model_input = (1.0 - sigmas) * model_input + sigmas * noise
+                # Predict using SD3 Transformer
+                model_pred = transformer(
+                    hidden_states=noisy_model_input,
+                    timestep=timesteps,
+                    encoder_hidden_states=prompt_embeds,
+                    pooled_projections=pooled_prompt_embeds,
+                    return_dict=False,
+                )[0]
+                # Compute target for Flow Matching
+                if args.precondition_outputs:
+                    model_pred = model_pred * (-sigmas) + noisy_model_input
+                    target = model_input
+                else:
+                    target = noise - model_input
+                # Compute loss with weighting
+                weighting = compute_loss_weighting_for_sd3(weighting_scheme=args.weighting_scheme, sigmas=sigmas)
+                loss = torch.mean(
+                    (weighting.float() * (model_pred.float() - target.float()) ** 2).reshape(target.shape[0], -1),
+                    1,
+                )
+                loss = loss.mean()
+                # Gather loss across processes
+                avg_loss = accelerator.gather(loss.repeat(args.train_batch_size)).mean()
+                train_loss += avg_loss.item() / args.gradient_accumulation_steps
+                # Backpropagate
+                accelerator.backward(loss)
+                if accelerator.sync_gradients:
+                    accelerator.clip_grad_norm_(params_to_optimize, args.max_grad_norm)
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+            # Checks if the accelerator has performed an optimization step
+            if accelerator.sync_gradients:
+                progress_bar.update(1)
+                global_step += 1
+                if hasattr(accelerator, 'trackers') and accelerator.trackers:
+                    accelerator.log({"train_loss": train_loss}, step=global_step)
+                train_loss = 0.0
+                if accelerator.is_main_process and global_step % 1000 == 0:
+                    print(
+                        f"[INFO] Optimization step completed at global_step={global_step}, "
+                        f"recent step_loss={loss.detach().item():.4f}",
+                        flush=True,
+                    )
+                # Save checkpoint
+                if accelerator.distributed_type == DistributedType.DEEPSPEED or accelerator.is_main_process:
+                    if global_step % args.checkpointing_steps == 0:
+                        if args.checkpoints_total_limit is not None:
+                            checkpoints = os.listdir(args.output_dir)
+                            checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
+                            checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
+                            if len(checkpoints) >= args.checkpoints_total_limit:
+                                num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
+                                removing_checkpoints = checkpoints[0:num_to_remove]
+                                logger.info(f"Removing {len(removing_checkpoints)} checkpoints")
+                                for removing_checkpoint in removing_checkpoints:
+                                    removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
+                                    shutil.rmtree(removing_checkpoint)
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+                        # 同时保存标准的LoRA权重格式，方便采样时直接加载
+                        try:
+                            # 获取当前模型的LoRA权重
+                            unwrapped_transformer = unwrap_model(transformer)
+                            transformer_lora_layers = get_peft_model_state_dict(unwrapped_transformer)
+                            text_encoder_lora_layers = None
+                            text_encoder_2_lora_layers = None
+                            if args.train_text_encoder:
+                                unwrapped_text_encoder_one = unwrap_model(text_encoder_one)
+                                unwrapped_text_encoder_two = unwrap_model(text_encoder_two)
+                                text_encoder_lora_layers = get_peft_model_state_dict(unwrapped_text_encoder_one)
+                                text_encoder_2_lora_layers = get_peft_model_state_dict(unwrapped_text_encoder_two)
+                            # 保存为标准LoRA格式到checkpoint目录
+                            StableDiffusion3Pipeline.save_lora_weights(
+                                save_directory=save_path,
+                                transformer_lora_layers=transformer_lora_layers,
+                                text_encoder_lora_layers=text_encoder_lora_layers,
+                                text_encoder_2_lora_layers=text_encoder_2_lora_layers,
+                            )
+                            logger.info(f"Saved LoRA weights in standard format to {save_path}")
+                        except Exception as e:
+                            logger.warning(f"Failed to save LoRA weights in standard format: {e}")
+                            logger.warning("Checkpoint saved with accelerator format only. You can extract LoRA weights later.")
+            logs = {"step_loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+            if global_step >= args.max_train_steps:
+                break
+        # Validation
+        if accelerator.is_main_process:
+            if args.validation_prompt is not None :#and epoch % args.validation_epochs == 0:
+                print(f"[INFO] Running validation for epoch {epoch + 1}, global_step={global_step}", flush=True)
+                pipeline = StableDiffusion3Pipeline.from_pretrained(
+                    args.pretrained_model_name_or_path,
+                    vae=vae,
+                    text_encoder=unwrap_model(text_encoder_one),
+                    text_encoder_2=unwrap_model(text_encoder_two),
+                    text_encoder_3=unwrap_model(text_encoder_three),
+                    transformer=unwrap_model(transformer),
+                    revision=args.revision,
+                    variant=args.variant,
+                    torch_dtype=weight_dtype,
+                )
+                images = log_validation(pipeline, args, accelerator, epoch, global_step=global_step)
+                del pipeline
+                torch.cuda.empty_cache()
+    # Save final LoRA weights
+    accelerator.wait_for_everyone()
+    if accelerator.is_main_process:
+        transformer = unwrap_model(transformer)
+        transformer_lora_layers = get_peft_model_state_dict(transformer)
+        if args.train_text_encoder:
+            text_encoder_one = unwrap_model(text_encoder_one)
+            text_encoder_two = unwrap_model(text_encoder_two)
+            text_encoder_lora_layers = get_peft_model_state_dict(text_encoder_one)
+            text_encoder_2_lora_layers = get_peft_model_state_dict(text_encoder_two)
+        else:
+            text_encoder_lora_layers = None
+            text_encoder_2_lora_layers = None
+        StableDiffusion3Pipeline.save_lora_weights(
+            save_directory=args.output_dir,
+            transformer_lora_layers=transformer_lora_layers,
+            text_encoder_lora_layers=text_encoder_lora_layers,
+            text_encoder_2_lora_layers=text_encoder_2_lora_layers,
+        )
+        # Final inference
+        if args.mixed_precision == "fp16":
+            vae.to(weight_dtype)
+        pipeline = StableDiffusion3Pipeline.from_pretrained(
+            args.pretrained_model_name_or_path,
+            vae=vae,
+            revision=args.revision,
+            variant=args.variant,
+            torch_dtype=weight_dtype,
+        )
+        pipeline.load_lora_weights(args.output_dir)
+        if args.validation_prompt and args.num_validation_images > 0:
+            images = log_validation(pipeline, args, accelerator, epoch, is_final_validation=True, global_step=global_step)
+        if args.push_to_hub:
+            save_model_card(
+                repo_id,
+                images=images,
+                base_model=args.pretrained_model_name_or_path,
+                dataset_name=args.dataset_name,
+                train_text_encoder=args.train_text_encoder,
+                repo_folder=args.output_dir,
+            )
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+    accelerator.end_training()
+# Script entry point: parse the command-line arguments and hand them to main().
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)

train_lora_sd3_new.py ADDED Viewed

	@@ -0,0 +1,1422 @@

+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""SD3 LoRA fine-tuning script for text2image generation."""
+import argparse
+import copy
+import logging
+import math
+import os
+import random
+import shutil
+from contextlib import nullcontext
+from pathlib import Path
+import datasets
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+import transformers
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import DistributedDataParallelKwargs, DistributedType, ProjectConfiguration, set_seed
+from datasets import load_dataset
+from huggingface_hub import create_repo, upload_folder
+from packaging import version
+from peft import LoraConfig, set_peft_model_state_dict
+from peft.utils import get_peft_model_state_dict
+from torchvision import transforms
+from torchvision.transforms.functional import crop
+from tqdm.auto import tqdm
+from transformers import CLIPTokenizer, PretrainedConfig, T5TokenizerFast
+import diffusers
+from diffusers import (
+    AutoencoderKL,
+    FlowMatchEulerDiscreteScheduler,
+    SD3Transformer2DModel,
+    StableDiffusion3Pipeline,
+)
+from diffusers.optimization import get_scheduler
+from diffusers.training_utils import (
+    _set_state_dict_into_text_encoder,
+    cast_training_params,
+    compute_density_for_timestep_sampling,
+    compute_loss_weighting_for_sd3,
+    free_memory,
+)
+from diffusers.utils import (
+    check_min_version,
+    convert_unet_state_dict_to_peft,
+    is_wandb_available,
+)
+from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
+from diffusers.utils.torch_utils import is_compiled_module
+# wandb is optional: it is imported only when diffusers reports it as available.
+if is_wandb_available():
+    import wandb
+# Check minimum diffusers version
+check_min_version("0.30.0")
+# Module-level logger created through accelerate's get_logger.
+logger = get_logger(__name__)
+def save_model_card(
+    repo_id: str,
+    images: list = None,
+    base_model: str = None,
+    dataset_name: str = None,
+    train_text_encoder: bool = False,
+    repo_folder: str = None,
+    vae_path: str = None,
+):
+    """Save model card for SD3 LoRA model."""
+    img_str = ""
+    if images is not None:
+        for i, image in enumerate(images):
+            image.save(os.path.join(repo_folder, f"image_{i}.png"))
+            img_str += f"![img_{i}](./image_{i}.png)\n"
+    model_description = f"""
+# SD3 LoRA text2image fine-tuning - {repo_id}
+These are LoRA adaption weights for {base_model}. The weights were fine-tuned on the {dataset_name} dataset. You can find some example images in the following. \n
+{img_str}
+LoRA for the text encoder was enabled: {train_text_encoder}.
+Special VAE used for training: {vae_path}.
+"""
+    model_card = load_or_create_model_card(
+        repo_id_or_path=repo_id,
+        from_training=True,
+        license="other",
+        base_model=base_model,
+        model_description=model_description,
+        inference=True,
+    )
+    tags = [
+        "stable-diffusion-3",
+        "stable-diffusion-3-diffusers",
+        "text-to-image",
+        "diffusers",
+        "diffusers-training",
+        "lora",
+        "sd3",
+    ]
+    model_card = populate_model_card(model_card, tags=tags)
+    model_card.save(os.path.join(repo_folder, "README.md"))
+def log_validation(
+    pipeline,
+    args,
+    accelerator,
+    epoch,
+    is_final_validation=False,
+    global_step=None,
+):
+    """Run validation and log images."""
+    logger.info(
+        f"Running validation... \n Generating {args.num_validation_images} images with prompt:"
+        f" {args.validation_prompt}."
+    )
+    pipeline = pipeline.to(accelerator.device)
+    pipeline.set_progress_bar_config(disable=True)
+    # run inference
+    generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed is not None else None
+    pipeline_args = {"prompt": args.validation_prompt}
+    if torch.backends.mps.is_available():
+        autocast_ctx = nullcontext()
+    else:
+        autocast_ctx = torch.autocast(accelerator.device.type)
+    with autocast_ctx:
+        images = [pipeline(**pipeline_args, generator=generator).images[0] for _ in range(args.num_validation_images)]
+    # Save images to output directory
+    if accelerator.is_main_process:
+        validation_dir = os.path.join(args.output_dir, "validation_images")
+        os.makedirs(validation_dir, exist_ok=True)
+        for i, image in enumerate(images):
+            # Create filename with step and epoch information
+            if global_step is not None:
+                filename = f"validation_step_{global_step}_epoch_{epoch}_img_{i}.png"
+            else:
+                filename = f"validation_epoch_{epoch}_img_{i}.png"
+            image_path = os.path.join(validation_dir, filename)
+            image.save(image_path)
+            logger.info(f"Saved validation image: {image_path}")
+    for tracker in accelerator.trackers if hasattr(accelerator, 'trackers') and accelerator.trackers else []:
+        phase_name = "test" if is_final_validation else "validation"
+        try:
+            if tracker.name == "tensorboard":
+                np_images = np.stack([np.asarray(img) for img in images])
+                tracker.writer.add_images(phase_name, np_images, epoch, dataformats="NHWC")
+            if tracker.name == "wandb":
+                tracker.log(
+                    {
+                        phase_name: [
+                            wandb.Image(image, caption=f"{i}: {args.validation_prompt}") for i, image in enumerate(images)
+                        ]
+                    }
+                )
+        except Exception as e:
+            logger.warning(f"Failed to log to {tracker.name}: {e}")
+    del pipeline
+    free_memory()
+    return images
+def import_model_class_from_model_name_or_path(
+    pretrained_model_name_or_path: str, revision: str, subfolder: str = "text_encoder"
+):
+    """Import the correct text encoder class."""
+    text_encoder_config = PretrainedConfig.from_pretrained(
+        pretrained_model_name_or_path, subfolder=subfolder, revision=revision
+    )
+    model_class = text_encoder_config.architectures[0]
+    if model_class == "CLIPTextModelWithProjection":
+        from transformers import CLIPTextModelWithProjection
+        return CLIPTextModelWithProjection
+    elif model_class == "T5EncoderModel":
+        from transformers import T5EncoderModel
+        return T5EncoderModel
+    else:
+        raise ValueError(f"{model_class} is not supported.")
+def parse_args(input_args=None):
+    """Parse command line arguments."""
+    parser = argparse.ArgumentParser(description="SD3 LoRA training script.")
+    # Model arguments
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        help="Revision of pretrained model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--variant",
+        type=str,
+        default=None,
+        help="Variant of the model files, e.g. fp16",
+    )
+    # Dataset arguments
+    parser.add_argument(
+        "--dataset_name",
+        type=str,
+        default=None,
+        help="The name of the Dataset to train on.",
+    )
+    parser.add_argument(
+        "--dataset_config_name",
+        type=str,
+        default=None,
+        help="The config of the Dataset.",
+    )
+    parser.add_argument(
+        "--train_data_dir",
+        type=str,
+        default=None,
+        help="A folder containing the training data.",
+    )
+    parser.add_argument(
+        "--image_column",
+        type=str,
+        default="image",
+        help="The column of the dataset containing an image."
+    )
+    parser.add_argument(
+        "--caption_column",
+        type=str,
+        default="caption",
+        help="The column of the dataset containing a caption.",
+    )
+    # Training arguments
+    parser.add_argument(
+        "--max_sequence_length",
+        type=int,
+        default=77,
+        help="Maximum sequence length to use with the T5 text encoder",
+    )
+    parser.add_argument(
+        "--validation_prompt",
+        type=str,
+        default=None,
+        help="A prompt used during validation.",
+    )
+    parser.add_argument(
+        "--num_validation_images",
+        type=int,
+        default=4,
+        help="Number of images for validation.",
+    )
+    parser.add_argument(
+        "--validation_epochs",
+        type=int,
+        default=1,
+        help="Run validation every X epochs.",
+    )
+    parser.add_argument(
+        "--max_train_samples",
+        type=int,
+        default=None,
+        help="Truncate the number of training examples.",
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="sd3-lora-finetuned",
+        help="Output directory for model predictions and checkpoints.",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        type=str,
+        default=None,
+        help="Directory to store downloaded models and datasets.",
+    )
+    parser.add_argument(
+        "--seed",
+        type=int,
+        default=None,
+        help="A seed for reproducible training."
+    )
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=1024,
+        help="Image resolution for training.",
+    )
+    parser.add_argument(
+        "--center_crop",
+        default=False,
+        action="store_true",
+        help="Whether to center crop input images.",
+    )
+    parser.add_argument(
+        "--random_flip",
+        action="store_true",
+        help="Whether to randomly flip images horizontally.",
+    )
+    parser.add_argument(
+        "--train_text_encoder",
+        action="store_true",
+        help="Whether to train the text encoder.",
+    )
+    parser.add_argument(
+        "--train_batch_size",
+        type=int,
+        default=16,
+        help="Batch size for training dataloader."
+    )
+    parser.add_argument(
+        "--num_train_epochs",
+        type=int,
+        default=100
+    )
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps.",
+    )
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help="Save checkpoint every X updates.",
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=None,
+        help="Max number of checkpoints to store.",
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help="Path to resume training from checkpoint.",
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of update steps to accumulate.",
+    )
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Use gradient checkpointing to save memory.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=1e-4,
+        help="Initial learning rate.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale learning rate by number of GPUs, etc.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help="Learning rate scheduler type.",
+    )
+    parser.add_argument(
+        "--lr_warmup_steps",
+        type=int,
+        default=500,
+        help="Number of warmup steps."
+    )
+    # SD3 specific arguments
+    parser.add_argument(
+        "--weighting_scheme",
+        type=str,
+        default="logit_normal",
+        choices=["sigma_sqrt", "logit_normal", "mode", "cosmap"],
+        help="Weighting scheme for flow matching loss.",
+    )
+    parser.add_argument(
+        "--logit_mean",
+        type=float,
+        default=0.0,
+        help="Mean for logit_normal weighting."
+    )
+    parser.add_argument(
+        "--logit_std",
+        type=float,
+        default=1.0,
+        help="Std for logit_normal weighting."
+    )
+    parser.add_argument(
+        "--mode_scale",
+        type=float,
+        default=1.29,
+        help="Scale for mode weighting scheme.",
+    )
+    parser.add_argument(
+        "--precondition_outputs",
+        type=int,
+        default=1,
+        help="Whether to precondition model outputs.",
+    )
+    # Optimization arguments
+    parser.add_argument(
+        "--allow_tf32",
+        action="store_true",
+        help="Allow TF32 on Ampere GPUs.",
+    )
+    parser.add_argument(
+        "--dataloader_num_workers",
+        type=int,
+        default=0,
+        help="Number of data loading workers.",
+    )
+    parser.add_argument(
+        "--use_8bit_adam",
+        action="store_true",
+        help="Use 8-bit Adam optimizer."
+    )
+    parser.add_argument(
+        "--adam_beta1",
+        type=float,
+        default=0.9,
+        help="Beta1 for Adam optimizer."
+    )
+    parser.add_argument(
+        "--adam_beta2",
+        type=float,
+        default=0.999,
+        help="Beta2 for Adam optimizer."
+    )
+    parser.add_argument(
+        "--adam_weight_decay",
+        type=float,
+        default=1e-2,
+        help="Weight decay for Adam."
+    )
+    parser.add_argument(
+        "--adam_epsilon",
+        type=float,
+        default=1e-08,
+        help="Epsilon for Adam optimizer."
+    )
+    parser.add_argument(
+        "--max_grad_norm",
+        default=1.0,
+        type=float,
+        help="Max gradient norm."
+    )
+    # Hub and logging arguments
+    parser.add_argument(
+        "--push_to_hub",
+        action="store_true",
+        help="Push model to the Hub."
+    )
+    parser.add_argument(
+        "--hub_token",
+        type=str,
+        default=None,
+        help="Token for Model Hub."
+    )
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="Repository name for the Hub.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help="TensorBoard log directory.",
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="tensorboard",
+        help="Logging integration to use.",
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp16", "bf16"],
+        help="Mixed precision type.",
+    )
+    parser.add_argument(
+        "--local_rank",
+        type=int,
+        default=-1,
+        help="Local rank for distributed training."
+    )
+    # LoRA arguments
+    parser.add_argument(
+        "--rank",
+        type=int,
+        default=64,
+        help="LoRA rank dimension.",
+    )
+    if input_args is not None:
+        args = parser.parse_args(input_args)
+    else:
+        args = parser.parse_args()
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+    # Sanity checks
+    if args.dataset_name is None and args.train_data_dir is None:
+        raise ValueError("Need either a dataset name or a training folder.")
+    return args
+# Maps a known hub dataset name to a pair of column names.
+# NOTE(review): the pair looks like (image column, caption column); the consumer is
+# outside this view, so confirm the tuple order where the mapping is used.
+DATASET_NAME_MAPPING = {
+    "lambdalabs/naruto-blip-captions": ("image", "text"),
+}
+def tokenize_prompt(tokenizer, prompt):
+    """Tokenize prompt using the given tokenizer."""
+    text_inputs = tokenizer(
+        prompt,
+        padding="max_length",
+        max_length=77,
+        truncation=True,
+        return_tensors="pt",
+    )
+    return text_inputs.input_ids
+def _encode_prompt_with_t5(
+    text_encoder,
+    tokenizer,
+    max_sequence_length,
+    prompt=None,
+    num_images_per_prompt=1,
+    device=None,
+    text_input_ids=None,
+):
+    """Encode prompt using T5 text encoder."""
+    if prompt is not None:
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        batch_size = len(prompt)
+    else:
+        # When prompt is None, we must have text_input_ids
+        if text_input_ids is None:
+            raise ValueError("Either prompt or text_input_ids must be provided")
+        batch_size = text_input_ids.shape[0]
+    if tokenizer is not None and prompt is not None:
+        text_inputs = tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=max_sequence_length,
+            truncation=True,
+            add_special_tokens=True,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+    else:
+        if text_input_ids is None:
+            raise ValueError("text_input_ids must be provided when tokenizer is not specified or prompt is None")
+    prompt_embeds = text_encoder(text_input_ids.to(device))[0]
+    dtype = text_encoder.dtype
+    prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
+    _, seq_len, _ = prompt_embeds.shape
+    # duplicate text embeddings for each generation per prompt
+    prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+    prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+    return prompt_embeds
+def _encode_prompt_with_clip(
+    text_encoder,
+    tokenizer,
+    prompt: str,
+    device=None,
+    text_input_ids=None,
+    num_images_per_prompt: int = 1,
+):
+    """Encode prompt using CLIP text encoder."""
+    if prompt is not None:
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        batch_size = len(prompt)
+    else:
+        # When prompt is None, we must have text_input_ids
+        if text_input_ids is None:
+            raise ValueError("Either prompt or text_input_ids must be provided")
+        batch_size = text_input_ids.shape[0]
+    if tokenizer is not None and prompt is not None:
+        text_inputs = tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=77,
+            truncation=True,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+    else:
+        if text_input_ids is None:
+            raise ValueError("text_input_ids must be provided when tokenizer is not specified or prompt is None")
+    prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
+    pooled_prompt_embeds = prompt_embeds[0]
+    prompt_embeds = prompt_embeds.hidden_states[-2]
+    prompt_embeds = prompt_embeds.to(dtype=text_encoder.dtype, device=device)
+    _, seq_len, _ = prompt_embeds.shape
+    # duplicate text embeddings for each generation per prompt
+    prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+    prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+    return prompt_embeds, pooled_prompt_embeds
+def encode_prompt(
+    text_encoders,
+    tokenizers,
+    prompt: str,
+    max_sequence_length,
+    device=None,
+    num_images_per_prompt: int = 1,
+    text_input_ids_list=None,
+):
+    """Encode prompt using all three text encoders (SD3 architecture)."""
+    if prompt is not None:
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+    # Process CLIP encoders (first two)
+    clip_tokenizers = tokenizers[:2]
+    clip_text_encoders = text_encoders[:2]
+    clip_prompt_embeds_list = []
+    clip_pooled_prompt_embeds_list = []
+    for i, (tokenizer, text_encoder) in enumerate(zip(clip_tokenizers, clip_text_encoders)):
+        prompt_embeds, pooled_prompt_embeds = _encode_prompt_with_clip(
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            prompt=prompt,
+            device=device if device is not None else text_encoder.device,
+            num_images_per_prompt=num_images_per_prompt,
+            text_input_ids=text_input_ids_list[i] if text_input_ids_list else None,
+        )
+        clip_prompt_embeds_list.append(prompt_embeds)
+        clip_pooled_prompt_embeds_list.append(pooled_prompt_embeds)
+    # Concatenate CLIP embeddings
+    clip_prompt_embeds = torch.cat(clip_prompt_embeds_list, dim=-1)
+    pooled_prompt_embeds = torch.cat(clip_pooled_prompt_embeds_list, dim=-1)
+    # Process T5 encoder (third encoder)
+    t5_prompt_embed = _encode_prompt_with_t5(
+        text_encoders[-1],
+        tokenizers[-1],
+        max_sequence_length,
+        prompt=prompt,
+        num_images_per_prompt=num_images_per_prompt,
+        text_input_ids=text_input_ids_list[-1] if text_input_ids_list else None,
+        device=device if device is not None else text_encoders[-1].device,
+    )
+    # Pad CLIP embeddings to match T5 embedding dimension
+    clip_prompt_embeds = torch.nn.functional.pad(
+        clip_prompt_embeds, (0, t5_prompt_embed.shape[-1] - clip_prompt_embeds.shape[-1])
+    )
+    # Concatenate all embeddings
+    prompt_embeds = torch.cat([clip_prompt_embeds, t5_prompt_embed], dim=-2)
+    return prompt_embeds, pooled_prompt_embeds
+def main(args):
+    """Main training function."""
+    if args.report_to == "wandb" and args.hub_token is not None:
+        raise ValueError(
+            "You cannot use both --report_to=wandb and --hub_token due to security risk."
+        )
+    logging_dir = Path(args.output_dir, args.logging_dir)
+    if torch.backends.mps.is_available() and args.mixed_precision == "bf16":
+        raise ValueError(
+            "Mixed precision training with bfloat16 is not supported on MPS."
+        )
+    # GPU多卡训练检查
+    if torch.cuda.is_available():
+        num_gpus = torch.cuda.device_count()
+        print(f"Found {num_gpus} GPUs available")
+        if num_gpus > 1:
+            print(f"Multi-GPU training enabled with {num_gpus} GPUs")
+    else:
+        print("No CUDA GPUs found, training on CPU")
+    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
+    # 优化多GPU训练的DDP参数
+    kwargs = DistributedDataParallelKwargs(
+        find_unused_parameters=True,
+        gradient_as_bucket_view=True,  # 提高多GPU训练效率
+        static_graph=False,  # 动态图支持
+    )
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.report_to,
+        project_config=accelerator_project_config,
+        kwargs_handlers=[kwargs],
+    )
+    # Logging setup
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+    # 记录多GPU训练信息
+    if accelerator.is_main_process:
+        logger.info(f"Number of processes: {accelerator.num_processes}")
+        logger.info(f"Distributed type: {accelerator.distributed_type}")
+        logger.info(f"Mixed precision: {accelerator.mixed_precision}")
+        if torch.cuda.is_available():
+            for i in range(torch.cuda.device_count()):
+                logger.info(f"GPU {i}: {torch.cuda.get_device_name(i)}")
+                logger.info(f"GPU {i} memory: {torch.cuda.get_device_properties(i).total_memory / 1024**3:.1f} GB")
+    if accelerator.is_local_main_process:
+        datasets.utils.logging.set_verbosity_warning()
+        transformers.utils.logging.set_verbosity_warning()
+        diffusers.utils.logging.set_verbosity_info()
+    else:
+        datasets.utils.logging.set_verbosity_error()
+        transformers.utils.logging.set_verbosity_error()
+        diffusers.utils.logging.set_verbosity_error()
+    # Set training seed
+    if args.seed is not None:
+        set_seed(args.seed)
+    # Create output directory
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name,
+                exist_ok=True,
+                token=args.hub_token
+            ).repo_id
+    # Load tokenizers (three for SD3)
+    tokenizer_one = CLIPTokenizer.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="tokenizer",
+        revision=args.revision,
+    )
+    tokenizer_two = CLIPTokenizer.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="tokenizer_2",
+        revision=args.revision,
+    )
+    tokenizer_three = T5TokenizerFast.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="tokenizer_3",
+        revision=args.revision,
+    )
+    # Import text encoder classes
+    text_encoder_cls_one = import_model_class_from_model_name_or_path(
+        args.pretrained_model_name_or_path, args.revision
+    )
+    text_encoder_cls_two = import_model_class_from_model_name_or_path(
+        args.pretrained_model_name_or_path, args.revision, subfolder="text_encoder_2"
+    )
+    text_encoder_cls_three = import_model_class_from_model_name_or_path(
+        args.pretrained_model_name_or_path, args.revision, subfolder="text_encoder_3"
+    )
+    # Load models
+    noise_scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="scheduler"
+    )
+    noise_scheduler_copy = copy.deepcopy(noise_scheduler)
+    text_encoder_one = text_encoder_cls_one.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="text_encoder",
+        revision=args.revision,
+        variant=args.variant
+    )
+    text_encoder_two = text_encoder_cls_two.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="text_encoder_2",
+        revision=args.revision,
+        variant=args.variant
+    )
+    text_encoder_three = text_encoder_cls_three.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="text_encoder_3",
+        revision=args.revision,
+        variant=args.variant
+    )
+    vae = AutoencoderKL.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="vae",
+        revision=args.revision,
+        variant=args.variant,
+    )
+    transformer = SD3Transformer2DModel.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="transformer",
+        revision=args.revision,
+        variant=args.variant
+    )
+    # Freeze non-trainable weights
+    transformer.requires_grad_(False)
+    vae.requires_grad_(False)
+    text_encoder_one.requires_grad_(False)
+    text_encoder_two.requires_grad_(False)
+    text_encoder_three.requires_grad_(False)
+    # Set precision
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif accelerator.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+    # Move models to device
+    vae.to(accelerator.device, dtype=torch.float32)  # VAE stays in fp32
+    transformer.to(accelerator.device, dtype=weight_dtype)
+    text_encoder_one.to(accelerator.device, dtype=weight_dtype)
+    text_encoder_two.to(accelerator.device, dtype=weight_dtype)
+    text_encoder_three.to(accelerator.device, dtype=weight_dtype)
+    # Enable gradient checkpointing
+    if args.gradient_checkpointing:
+        transformer.enable_gradient_checkpointing()
+        if args.train_text_encoder:
+            text_encoder_one.gradient_checkpointing_enable()
+            text_encoder_two.gradient_checkpointing_enable()
+    # Configure LoRA for transformer
+    transformer_lora_config = LoraConfig(
+        r=args.rank,
+        lora_alpha=args.rank,
+        init_lora_weights="gaussian",
+        target_modules=["attn.to_k", "attn.to_q", "attn.to_v", "attn.to_out.0"],
+    )
+    transformer.add_adapter(transformer_lora_config)
+    # Configure LoRA for text encoders if enabled
+    if args.train_text_encoder:
+        text_lora_config = LoraConfig(
+            r=args.rank,
+            lora_alpha=args.rank,
+            init_lora_weights="gaussian",
+            target_modules=["q_proj", "k_proj", "v_proj", "out_proj"],
+        )
+        text_encoder_one.add_adapter(text_lora_config)
+        text_encoder_two.add_adapter(text_lora_config)
+        # Note: T5 encoder typically doesn't use LoRA
+    def unwrap_model(model):
+        model = accelerator.unwrap_model(model)
+        model = model._orig_mod if is_compiled_module(model) else model
+        return model
+    # Enable TF32 for faster training
+    if args.allow_tf32 and torch.cuda.is_available():
+        torch.backends.cuda.matmul.allow_tf32 = True
+    # Scale learning rate
+    if args.scale_lr:
+        args.learning_rate = (
+            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
+        )
+    # Cast trainable parameters to float32
+    if args.mixed_precision == "fp16":
+        models = [transformer]
+        if args.train_text_encoder:
+            models.extend([text_encoder_one, text_encoder_two])
+        cast_training_params(models, dtype=torch.float32)
+    # Setup optimizer
+    transformer_lora_parameters = list(filter(lambda p: p.requires_grad, transformer.parameters()))
+    if args.train_text_encoder:
+        text_lora_parameters_one = list(filter(lambda p: p.requires_grad, text_encoder_one.parameters()))
+        text_lora_parameters_two = list(filter(lambda p: p.requires_grad, text_encoder_two.parameters()))
+        params_to_optimize = (
+            transformer_lora_parameters
+            + text_lora_parameters_one
+            + text_lora_parameters_two
+        )
+    else:
+        params_to_optimize = transformer_lora_parameters
+    # Create optimizer
+    if args.use_8bit_adam:
+        try:
+            import bitsandbytes as bnb
+        except ImportError:
+            raise ImportError("To use 8-bit Adam, install bitsandbytes: pip install bitsandbytes")
+        optimizer_class = bnb.optim.AdamW8bit
+    else:
+        optimizer_class = torch.optim.AdamW
+    optimizer = optimizer_class(
+        params_to_optimize,
+        lr=args.learning_rate,
+        betas=(args.adam_beta1, args.adam_beta2),
+        weight_decay=args.adam_weight_decay,
+        eps=args.adam_epsilon,
+    )
+    # Load dataset
+    if args.dataset_name is not None:
+        dataset = load_dataset(
+            args.dataset_name,
+            args.dataset_config_name,
+            cache_dir=args.cache_dir,
+            data_dir=args.train_data_dir
+        )
+    else:
+        data_files = {}
+        if args.train_data_dir is not None:
+            data_files["train"] = os.path.join(args.train_data_dir, "**")
+        dataset = load_dataset(
+            "imagefolder",
+            data_files=data_files,
+            cache_dir=args.cache_dir,
+        )
+    # Preprocessing
+    column_names = dataset["train"].column_names
+    dataset_columns = DATASET_NAME_MAPPING.get(args.dataset_name, None)
+    if args.image_column is None:
+        image_column = dataset_columns[0] if dataset_columns is not None else column_names[0]
+    else:
+        image_column = args.image_column
+        if image_column not in column_names:
+            raise ValueError(f"--image_column '{args.image_column}' not found in: {column_names}")
+    if args.caption_column is None:
+        caption_column = dataset_columns[1] if dataset_columns is not None else column_names[1]
+    else:
+        caption_column = args.caption_column
+        if caption_column not in column_names:
+            raise ValueError(f"--caption_column '{args.caption_column}' not found in: {column_names}")
+    def tokenize_captions(examples, is_train=True):
+        captions = []
+        for caption in examples[caption_column]:
+            if isinstance(caption, str):
+                captions.append(caption)
+            elif isinstance(caption, (list, np.ndarray)):
+                captions.append(random.choice(caption) if is_train else caption[0])
+            else:
+                raise ValueError(f"Caption column should contain strings or lists of strings.")
+        tokens_one = tokenize_prompt(tokenizer_one, captions)
+        tokens_two = tokenize_prompt(tokenizer_two, captions)
+        tokens_three = tokenize_prompt(tokenizer_three, captions)
+        return tokens_one, tokens_two, tokens_three
+    # Image transforms
+    train_resize = transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR)
+    train_crop = transforms.CenterCrop(args.resolution) if args.center_crop else transforms.RandomCrop(args.resolution)
+    train_flip = transforms.RandomHorizontalFlip(p=1.0)
+    train_transforms = transforms.Compose([
+        transforms.ToTensor(),
+        transforms.Normalize([0.5], [0.5]),
+    ])
+    def preprocess_train(examples):
+        images = [image.convert("RGB") for image in examples[image_column]]
+        original_sizes = []
+        all_images = []
+        crop_top_lefts = []
+        for image in images:
+            original_sizes.append((image.height, image.width))
+            image = train_resize(image)
+            if args.random_flip and random.random() < 0.5:
+                image = train_flip(image)
+            if args.center_crop:
+                y1 = max(0, int(round((image.height - args.resolution) / 2.0)))
+                x1 = max(0, int(round((image.width - args.resolution) / 2.0)))
+                image = train_crop(image)
+            else:
+                y1, x1, h, w = train_crop.get_params(image, (args.resolution, args.resolution))
+                image = crop(image, y1, x1, h, w)
+            crop_top_left = (y1, x1)
+            crop_top_lefts.append(crop_top_left)
+            image = train_transforms(image)
+            all_images.append(image)
+        examples["original_sizes"] = original_sizes
+        examples["crop_top_lefts"] = crop_top_lefts
+        examples["pixel_values"] = all_images
+        tokens_one, tokens_two, tokens_three = tokenize_captions(examples)
+        examples["input_ids_one"] = tokens_one
+        examples["input_ids_two"] = tokens_two
+        examples["input_ids_three"] = tokens_three
+        return examples
+    with accelerator.main_process_first():
+        if args.max_train_samples is not None:
+            dataset["train"] = dataset["train"].shuffle(seed=args.seed).select(range(args.max_train_samples))
+        train_dataset = dataset["train"].with_transform(preprocess_train, output_all_columns=True)
+    def collate_fn(examples):
+        pixel_values = torch.stack([example["pixel_values"] for example in examples])
+        pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
+        original_sizes = [example["original_sizes"] for example in examples]
+        crop_top_lefts = [example["crop_top_lefts"] for example in examples]
+        input_ids_one = torch.stack([example["input_ids_one"] for example in examples])
+        input_ids_two = torch.stack([example["input_ids_two"] for example in examples])
+        input_ids_three = torch.stack([example["input_ids_three"] for example in examples])
+        return {
+            "pixel_values": pixel_values,
+            "input_ids_one": input_ids_one,
+            "input_ids_two": input_ids_two,
+            "input_ids_three": input_ids_three,
+            "original_sizes": original_sizes,
+            "crop_top_lefts": crop_top_lefts,
+        }
+    # 针对多GPU训练优化dataloader设置
+    if args.dataloader_num_workers == 0 and accelerator.num_processes > 1:
+        # 多GPU训练时自动设置数据加载器worker数量
+        args.dataloader_num_workers = min(4, os.cpu_count() // accelerator.num_processes)
+        logger.info(f"Auto-setting dataloader_num_workers to {args.dataloader_num_workers} for multi-GPU training")
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset,
+        shuffle=True,
+        collate_fn=collate_fn,
+        batch_size=args.train_batch_size,
+        num_workers=args.dataloader_num_workers,
+        pin_memory=True,  # 提高GPU数据传输效率
+        persistent_workers=args.dataloader_num_workers > 0,  # 保持worker进程活跃
+    )
+    # Scheduler and math around training steps
+    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        overrode_max_train_steps = True
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps,
+        num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
+    )
+    # Prepare everything with accelerator
+    if args.train_text_encoder:
+        transformer, text_encoder_one, text_encoder_two, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+            transformer, text_encoder_one, text_encoder_two, optimizer, train_dataloader, lr_scheduler
+        )
+    else:
+        transformer, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+            transformer, optimizer, train_dataloader, lr_scheduler
+        )
+    # Recalculate training steps
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+    # Initialize trackers
+    if accelerator.is_main_process:
+        try:
+            accelerator.init_trackers("text2image-fine-tune", config=vars(args))
+        except Exception as e:
+            logger.warning(f"Failed to initialize trackers: {e}")
+            logger.warning("Continuing without tracking. You can monitor training through console logs.")
+            # Set report_to to None to avoid further tracking attempts
+            args.report_to = None
+    # Train!
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    logger.info(f"  Number of GPU processes = {accelerator.num_processes}")
+    if accelerator.num_processes > 1:
+        logger.info(f"  Effective batch size per GPU = {args.train_batch_size * args.gradient_accumulation_steps}")
+        logger.info(f"  Total effective batch size across all GPUs = {total_batch_size}")
+    global_step = 0
+    first_epoch = 0
+    # Resume from checkpoint if specified
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+        if path is None:
+            accelerator.print(f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting new training.")
+            args.resume_from_checkpoint = None
+            initial_global_step = 0
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+            initial_global_step = global_step
+            first_epoch = global_step // num_update_steps_per_epoch
+    else:
+        initial_global_step = 0
+    progress_bar = tqdm(
+        range(0, args.max_train_steps),
+        initial=initial_global_step,
+        desc="Steps",
+        disable=not accelerator.is_local_main_process,
+    )
+    def get_sigmas(timesteps, n_dim=4, dtype=torch.float32):
+        sigmas = noise_scheduler_copy.sigmas.to(device=accelerator.device, dtype=dtype)
+        schedule_timesteps = noise_scheduler_copy.timesteps.to(accelerator.device)
+        timesteps = timesteps.to(accelerator.device)
+        step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]
+        sigma = sigmas[step_indices].flatten()
+        while len(sigma.shape) < n_dim:
+            sigma = sigma.unsqueeze(-1)
+        return sigma
+    # Training loop
+    for epoch in range(first_epoch, args.num_train_epochs):
+        transformer.train()
+        if args.train_text_encoder:
+            text_encoder_one.train()
+            text_encoder_two.train()
+        train_loss = 0.0
+        for step, batch in enumerate(train_dataloader):
+            with accelerator.accumulate(transformer):
+                # Convert images to latent space
+                pixel_values = batch["pixel_values"].to(dtype=vae.dtype)
+                model_input = vae.encode(pixel_values).latent_dist.sample()
+                # Apply VAE scaling
+                vae_config_shift_factor = vae.config.shift_factor
+                vae_config_scaling_factor = vae.config.scaling_factor
+                model_input = (model_input - vae_config_shift_factor) * vae_config_scaling_factor
+                model_input = model_input.to(dtype=weight_dtype)
+                # Encode prompts
+                prompt_embeds, pooled_prompt_embeds = encode_prompt(
+                    text_encoders=[text_encoder_one, text_encoder_two, text_encoder_three],
+                    tokenizers=[tokenizer_one, tokenizer_two, tokenizer_three],
+                    prompt=None,
+                    max_sequence_length=args.max_sequence_length,
+                    text_input_ids_list=[batch["input_ids_one"], batch["input_ids_two"], batch["input_ids_three"]],
+                )
+                # Sample noise and timesteps
+                noise = torch.randn_like(model_input)
+                bsz = model_input.shape[0]
+                # Flow Matching timestep sampling
+                u = compute_density_for_timestep_sampling(
+                    weighting_scheme=args.weighting_scheme,
+                    batch_size=bsz,
+                    logit_mean=args.logit_mean,
+                    logit_std=args.logit_std,
+                    mode_scale=args.mode_scale,
+                )
+                indices = (u * noise_scheduler_copy.config.num_train_timesteps).long()
+                timesteps = noise_scheduler_copy.timesteps[indices].to(device=model_input.device)
+                # Flow Matching interpolation
+                sigmas = get_sigmas(timesteps, n_dim=model_input.ndim, dtype=model_input.dtype)
+                noisy_model_input = (1.0 - sigmas) * model_input + sigmas * noise
+                # Predict using SD3 Transformer
+                model_pred = transformer(
+                    hidden_states=noisy_model_input,
+                    timestep=timesteps,
+                    encoder_hidden_states=prompt_embeds,
+                    pooled_projections=pooled_prompt_embeds,
+                    return_dict=False,
+                )[0]
+                # Compute target for Flow Matching
+                if args.precondition_outputs:
+                    model_pred = model_pred * (-sigmas) + noisy_model_input
+                    target = model_input
+                else:
+                    target = noise - model_input
+                # Compute loss with weighting
+                weighting = compute_loss_weighting_for_sd3(weighting_scheme=args.weighting_scheme, sigmas=sigmas)
+                loss = torch.mean(
+                    (weighting.float() * (model_pred.float() - target.float()) ** 2).reshape(target.shape[0], -1),
+                    1,
+                )
+                loss = loss.mean()
+                # Gather loss across processes
+                avg_loss = accelerator.gather(loss.repeat(args.train_batch_size)).mean()
+                train_loss += avg_loss.item() / args.gradient_accumulation_steps
+                # Backpropagate
+                accelerator.backward(loss)
+                if accelerator.sync_gradients:
+                    accelerator.clip_grad_norm_(params_to_optimize, args.max_grad_norm)
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+            # Checks if the accelerator has performed an optimization step
+            if accelerator.sync_gradients:
+                progress_bar.update(1)
+                global_step += 1
+                if hasattr(accelerator, 'trackers') and accelerator.trackers:
+                    accelerator.log({"train_loss": train_loss}, step=global_step)
+                train_loss = 0.0
+                # Save checkpoint
+                if accelerator.distributed_type == DistributedType.DEEPSPEED or accelerator.is_main_process:
+                    if global_step % args.checkpointing_steps == 0:
+                        if args.checkpoints_total_limit is not None:
+                            checkpoints = os.listdir(args.output_dir)
+                            checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
+                            checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
+                            if len(checkpoints) >= args.checkpoints_total_limit:
+                                num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
+                                removing_checkpoints = checkpoints[0:num_to_remove]
+                                logger.info(f"Removing {len(removing_checkpoints)} checkpoints")
+                                for removing_checkpoint in removing_checkpoints:
+                                    removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
+                                    shutil.rmtree(removing_checkpoint)
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+                        # 同时保存标准的LoRA权重格式，方便采样时直接加载
+                        try:
+                            # 获取当前模型的LoRA权重
+                            unwrapped_transformer = unwrap_model(transformer)
+                            transformer_lora_layers = get_peft_model_state_dict(unwrapped_transformer)
+                            text_encoder_lora_layers = None
+                            text_encoder_2_lora_layers = None
+                            if args.train_text_encoder:
+                                unwrapped_text_encoder_one = unwrap_model(text_encoder_one)
+                                unwrapped_text_encoder_two = unwrap_model(text_encoder_two)
+                                text_encoder_lora_layers = get_peft_model_state_dict(unwrapped_text_encoder_one)
+                                text_encoder_2_lora_layers = get_peft_model_state_dict(unwrapped_text_encoder_two)
+                            # 保存为标准LoRA格式到checkpoint目录
+                            StableDiffusion3Pipeline.save_lora_weights(
+                                save_directory=save_path,
+                                transformer_lora_layers=transformer_lora_layers,
+                                text_encoder_lora_layers=text_encoder_lora_layers,
+                                text_encoder_2_lora_layers=text_encoder_2_lora_layers,
+                            )
+                            logger.info(f"Saved LoRA weights in standard format to {save_path}")
+                        except Exception as e:
+                            logger.warning(f"Failed to save LoRA weights in standard format: {e}")
+                            logger.warning("Checkpoint saved with accelerator format only. You can extract LoRA weights later.")
+            logs = {"step_loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+            if global_step >= args.max_train_steps:
+                break
+        # Validation
+        if accelerator.is_main_process:
+            if args.validation_prompt is not None :#and epoch % args.validation_epochs == 0:
+                pipeline = StableDiffusion3Pipeline.from_pretrained(
+                    args.pretrained_model_name_or_path,
+                    vae=vae,
+                    text_encoder=unwrap_model(text_encoder_one),
+                    text_encoder_2=unwrap_model(text_encoder_two),
+                    text_encoder_3=unwrap_model(text_encoder_three),
+                    transformer=unwrap_model(transformer),
+                    revision=args.revision,
+                    variant=args.variant,
+                    torch_dtype=weight_dtype,
+                )
+                images = log_validation(pipeline, args, accelerator, epoch, global_step=global_step)
+                del pipeline
+                torch.cuda.empty_cache()
+    # Save final LoRA weights
+    accelerator.wait_for_everyone()
+    if accelerator.is_main_process:
+        transformer = unwrap_model(transformer)
+        transformer_lora_layers = get_peft_model_state_dict(transformer)
+        if args.train_text_encoder:
+            text_encoder_one = unwrap_model(text_encoder_one)
+            text_encoder_two = unwrap_model(text_encoder_two)
+            text_encoder_lora_layers = get_peft_model_state_dict(text_encoder_one)
+            text_encoder_2_lora_layers = get_peft_model_state_dict(text_encoder_two)
+        else:
+            text_encoder_lora_layers = None
+            text_encoder_2_lora_layers = None
+        StableDiffusion3Pipeline.save_lora_weights(
+            save_directory=args.output_dir,
+            transformer_lora_layers=transformer_lora_layers,
+            text_encoder_lora_layers=text_encoder_lora_layers,
+            text_encoder_2_lora_layers=text_encoder_2_lora_layers,
+        )
+        # Final inference
+        if args.mixed_precision == "fp16":
+            vae.to(weight_dtype)
+        pipeline = StableDiffusion3Pipeline.from_pretrained(
+            args.pretrained_model_name_or_path,
+            vae=vae,
+            revision=args.revision,
+            variant=args.variant,
+            torch_dtype=weight_dtype,
+        )
+        pipeline.load_lora_weights(args.output_dir)
+        if args.validation_prompt and args.num_validation_images > 0:
+            images = log_validation(pipeline, args, accelerator, epoch, is_final_validation=True, global_step=global_step)
+        if args.push_to_hub:
+            save_model_card(
+                repo_id,
+                images=images,
+                base_model=args.pretrained_model_name_or_path,
+                dataset_name=args.dataset_name,
+                train_text_encoder=args.train_text_encoder,
+                repo_folder=args.output_dir,
+            )
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+    accelerator.end_training()
+if __name__ == "__main__":  # script entry point: runs only when executed directly, not on import
+    args = parse_args()  # parse the command-line options into the namespace that main() consumes
+    main(args)

train_rectified_noise.py ADDED Viewed

The diff for this file is too large to render. See raw diff

train_rectified_noise.sh ADDED Viewed

	@@ -0,0 +1,104 @@

+#!/bin/bash
+# SD3 Rectified Noise Training Script
+# This script shows how to run training with train_rectified_noise.py
+set -e
+# Activate the correct conda environment
+source /root/miniconda3/etc/profile.d/conda.sh
+conda activate SiT
+# Basic configuration
+export CUDA_VISIBLE_DEVICES=0,1,2,3  # use 4 GPUs (0,1,2,3)
+#export OMP_NUM_THREADS=1
+# Memory optimization settings
+export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
+export TOKENIZERS_PARALLELISM=false
+# Model and data paths
+PRETRAINED_MODEL="/gemini/space/hsd/project/pretrained_model/huggingface/hub/models--stabilityai--stable-diffusion-3-medium-diffusers/snapshots/ea42f8cef0f178587cf766dc8129abd379c90671"
+LORA_MODEL_PATH="/gemini/space/gzy_new/models/Sida/sd3-lora-finetuned-batch-4/checkpoint-500000"  # path to the LoRA fine-tuned SD3 model
+TRAIN_DATA_DIR="/gemini/space/hsd/project/dataset/cc3m-wds/train"  # training data directory
+OUTPUT_DIR="./rectified-noise-batch-2"  # output directory
+# Training parameters
+NUM_SIT_LAYERS=1         # number of SIT block layers
+SIT_LEARNING_RATE=1e-5    # learning rate for the SIT blocks
+KL_LOSS_WEIGHT=0.5        # KL-divergence loss weight
+RESOLUTION=512            # image resolution
+BATCH_SIZE=2              # batch size
+GRADIENT_ACCUMULATION=2  # gradient accumulation steps
+MAX_TRAIN_STEPS=500000     # maximum number of training steps
+# Validation parameters
+VALIDATION_PROMPT="A bicycle replica with a clock as the front wheel."
+NUM_VALIDATION_IMAGES=1
+echo "开始 SD3 Rectified Noise 训练..."
+echo "LoRA模型路径: $LORA_MODEL_PATH"
+echo "SIT层数: $NUM_SIT_LAYERS"
+echo "输出目录: $OUTPUT_DIR"
+# Check that the LoRA model path exists; abort with an error message otherwise
+if [ ! -d "$LORA_MODEL_PATH" ]; then
+    echo "错误: LoRA模型路径不存在: $LORA_MODEL_PATH"
+    echo "请先使用 train_lora_sd3.py 训练LoRA模型"
+    exit 1
+fi
+# Launch training with accelerate
+# Note: the mixed_precision argument was removed from the command line because it is already set in accelerate_config.yaml
+accelerate launch --config_file accelerate_config.yaml train_rectified_noise.py \
+  --pretrained_model_name_or_path="$PRETRAINED_MODEL" \
+  --lora_model_path="$LORA_MODEL_PATH" \
+  --train_data_dir="$TRAIN_DATA_DIR" \
+  --num_sit_layers=$NUM_SIT_LAYERS \
+  --sit_learning_rate=$SIT_LEARNING_RATE \
+  --kl_loss_weight=$KL_LOSS_WEIGHT \
+  --resolution=$RESOLUTION \
+  --train_batch_size=$BATCH_SIZE \
+  --gradient_accumulation_steps=$GRADIENT_ACCUMULATION \
+  --gradient_checkpointing \
+  --learning_rate=1e-5 \
+  --time_weight_alpha=5.0 \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --max_train_steps=$MAX_TRAIN_STEPS \
+  --output_dir="$OUTPUT_DIR" \
+  --validation_prompt="$VALIDATION_PROMPT" \
+  --num_validation_images=$NUM_VALIDATION_IMAGES \
+  --validation_steps=20000 \
+  --seed=42 \
+  --dataloader_num_workers=8 \
+  --save_sit_weights_only \
+  --checkpointing_steps=20000 \
+  --checkpoints_total_limit=10 \
+  --report_to="tensorboard" \
+  --logging_dir="./logs"
+echo "训练完成！"
+echo "SIT权重保存在: $OUTPUT_DIR/sit_weights/"
+echo "验证图像保存在: $OUTPUT_DIR/validation_images/"
+# 可选：快速测试训练命令
+# cat << 'EOF'
+# # 快速测试命令（少量步数）:
+# accelerate launch train_rectified_noise.py \
+#   --pretrained_model_name_or_path="stabilityai/stable-diffusion-3-medium-diffusers" \
+#   --lora_model_path="./sd3-lora-finetuned" \
+#   --train_data_dir="./dataset" \
+#   --num_sit_layers=2 \
+#   --resolution=256 \
+#   --train_batch_size=1 \
+#   --gradient_accumulation_steps=4 \
+#   --max_train_steps=100 \
+#   --output_dir="./test-rectified-noise" \
+#   --mixed_precision="fp16" \
+#   --save_sit_weights_only
+# EOF
+# nohup bash train_rectified_noise.sh > train_rectified_noise.log 2>&1 &

train_rectified_noise2.py ADDED Viewed

The diff for this file is too large to render. See raw diff

train_sd3_lora.log ADDED Viewed

	@@ -0,0 +1,27 @@

+nohup: ignoring input
+检测到 4 个GPU
+每个GPU批次大小: 4
+总有效批次大小: 16
+===== SD3 LoRA 多GPU训练开始 =====
+模型: /gemini/space/hsd/project/pretrained_model/huggingface/hub/models--stabilityai--stable-diffusion-3-medium-diffusers/snapshots/ea42f8cef0f178587cf766dc8129abd379c90671
+输出目录: sd3-lora-finetuned-batch-8
+分辨率: 512
+每个GPU批次大小: 4
+梯度累积步数: 1
+总有效批次大小: 16
+学习率: 1e-5
+最大训练步数: 500000
+LoRA Rank: 32
+使用GPU: 0,1,2,3
+断点重训: latest
+===========================================
+使用 accelerate 启动多GPU训练...
+/root/miniconda3/envs/SiT/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
+  warnings.warn(
+Terminated
+===========================================
+训练完成！
+模型保存在: sd3-lora-finetuned-batch-8
+日志保存在: sd3-lora-finetuned-batch-8/logs
+验证图片保存在: sd3-lora-finetuned-batch-8/validation_images
+===========================================

train_sd3_lora.sh ADDED Viewed

	@@ -0,0 +1,109 @@

+#!/bin/bash
+# SD3 LoRA Fine-tuning Training Script
+# 使用 Stable Diffusion 3 进行 LoRA 微调训练 - 多GPU优化版本
+# 设置环境变量
+export CUDA_VISIBLE_DEVICES=0,1,2,3  # 根据可用GPU数量调整
+# export PYTHONPATH=$PYTHONPATH:/gemini/space/gzy_new/Rectified_Noise/Finetune/finetune-coco
+# 检查GPU数量
+num_gpus=$(nvidia-smi --list-gpus | wc -l)
+echo "检测到 $num_gpus 个GPU"
+# 训练参数配置
+MODEL_NAME="/gemini/space/hsd/project/pretrained_model/huggingface/hub/models--stabilityai--stable-diffusion-3-medium-diffusers/snapshots/ea42f8cef0f178587cf766dc8129abd379c90671"
+#DATASET_NAME="lambdalabs/naruto-blip-captions"  # 或者使用本地数据集路径
+TRAIN_DATA_DIR="/gemini/space/hsd/project/dataset/cc3m-wds/train"  # 本地数据集路径0
+OUTPUT_DIR="sd3-lora-finetuned-batch-8"
+RESOLUTION=512
+# 多GPU训练时调整批次大小 - 每个GPU的批次大小
+TRAIN_BATCH_SIZE=4  # 每个GPU的批次大小，总批次大小 = TRAIN_BATCH_SIZE * num_gpus * GRADIENT_ACCUMULATION_STEPS
+GRADIENT_ACCUMULATION_STEPS=1  # 梯度累积步数
+LEARNING_RATE=1e-5
+MAX_TRAIN_STEPS=500000
+NUM_TRAIN_EPOCHS=50
+VALIDATION_PROMPT="A photo of a beautiful landscape with mountains and lake"
+NUM_VALIDATION_IMAGES=2
+VALIDATION_EPOCHS=1  # 每10个epoch验证一次
+LORA_RANK=32
+SEED=42
+RESUME_FROM_CHECKPOINT="latest"  # 设置为 "latest" 以自动从最新checkpoint恢复，或指定checkpoint路径，或设为 "" 以不恢复
+# 计算有效批次大小
+effective_batch_size=$((TRAIN_BATCH_SIZE * num_gpus * GRADIENT_ACCUMULATION_STEPS))
+echo "每个GPU批次大小: $TRAIN_BATCH_SIZE"
+echo "总有效批次大小: $effective_batch_size"
+# 创建输出目录
+mkdir -p $OUTPUT_DIR
+echo "===== SD3 LoRA 多GPU训练开始 ====="
+echo "模型: $MODEL_NAME"
+echo "输出目录: $OUTPUT_DIR"
+echo "分辨率: $RESOLUTION"
+echo "每个GPU批次大小: $TRAIN_BATCH_SIZE"
+echo "梯度累积步数: $GRADIENT_ACCUMULATION_STEPS"
+echo "总有效批次大小: $effective_batch_size"
+echo "学习率: $LEARNING_RATE"
+echo "最大训练步数: $MAX_TRAIN_STEPS"
+echo "LoRA Rank: $LORA_RANK"
+echo "使用GPU: $CUDA_VISIBLE_DEVICES"
+echo "断点重训: $RESUME_FROM_CHECKPOINT"
+echo "==========================================="
+# 检查accelerate配置
+if [ ! -f "accelerate_config.yaml" ]; then
+    echo "错误: 未找到 accelerate_config.yaml 文件"
+    echo "请运行: accelerate config 来配置多GPU训练"
+    exit 1
+fi
+echo "使用 accelerate 启动多GPU训练..."
+# 使用 accelerate 启动训练
+accelerate launch --config_file=accelerate_config.yaml train_lora_sd3.py \
+  --pretrained_model_name_or_path="$MODEL_NAME" \
+  --train_data_dir="$TRAIN_DATA_DIR" \
+  --output_dir="$OUTPUT_DIR" \
+  --mixed_precision="no" \
+  --resolution=$RESOLUTION \
+  --train_batch_size=$TRAIN_BATCH_SIZE \
+  --gradient_accumulation_steps=$GRADIENT_ACCUMULATION_STEPS \
+  --learning_rate=$LEARNING_RATE \
+  --scale_lr \
+  --lr_scheduler="cosine" \
+  --lr_warmup_steps=100 \
+  --max_train_steps=$MAX_TRAIN_STEPS \
+  --num_train_epochs=$NUM_TRAIN_EPOCHS \
+  --validation_prompt="$VALIDATION_PROMPT" \
+  --num_validation_images=$NUM_VALIDATION_IMAGES \
+  --validation_epochs=$VALIDATION_EPOCHS \
+  --checkpointing_steps=20000 \
+  --checkpoints_total_limit=10 \
+  --seed=$SEED \
+  --rank=$LORA_RANK \
+  --gradient_checkpointing \
+  --use_8bit_adam \
+  --dataloader_num_workers=0 \
+  --report_to="tensorboard" \
+  --logging_dir="logs" \
+  --adam_beta1=0.9 \
+  --adam_beta2=0.999 \
+  --adam_weight_decay=1e-2 \
+  --adam_epsilon=1e-8 \
+  --max_grad_norm=1.0 \
+  --allow_tf32 \
+  --weighting_scheme="logit_normal" \
+  --logit_mean=0.0 \
+  --logit_std=1.0 \
+  --precondition_outputs=1
+echo "==========================================="
+echo "训练完成！"
+echo "模型保存在: $OUTPUT_DIR"
+echo "日志保存在: $OUTPUT_DIR/logs"
+echo "验证图片保存在: $OUTPUT_DIR/validation_images"
+echo "==========================================="
+# nohup bash train_sd3_lora.sh > train_sd3_lora.log 2>&1 &

train_sd3_lora2.log ADDED Viewed

	@@ -0,0 +1,216 @@

+nohup: ignoring input
+检测到 4 个GPU
+每个GPU批次大小: 8
+总有效批次大小: 32
+===== SD3 LoRA 多GPU训练开始 =====
+模型: /gemini/space/hsd/project/pretrained_model/huggingface/hub/models--stabilityai--stable-diffusion-3-medium-diffusers/snapshots/ea42f8cef0f178587cf766dc8129abd379c90671
+输出目录: sd3-lora-finetuned-batch-32
+分辨率: 512
+每个GPU批次大小: 8
+梯度累积步数: 1
+总有效批次大小: 32
+学习率: 1e-5
+最大训练步数: 500000
+LoRA Rank: 32
+使用GPU: 0,1,2,3
+===========================================
+使用 accelerate 启动多GPU训练...
+[NOTICE] The application is pending for GPU resource in asynchronous queue. The longest waiting time in queue is 1800 seconds.
+[NOTICE] The application is pending for GPU resource in asynchronous queue. The longest waiting time in queue is 1800 seconds.
+[NOTICE] The application is pending for GPU resource in asynchronous queue. The longest waiting time in queue is 1800 seconds.
+[NOTICE] The application is pending for GPU resource in asynchronous queue. The longest waiting time in queue is 1800 seconds.
+Found 4 GPUs available
+Multi-GPU training enabled with 4 GPUs
+Found 4 GPUs available
+Multi-GPU training enabled with 4 GPUs
+Found 4 GPUs available
+Multi-GPU training enabled with 4 GPUs
+Found 4 GPUs available
+Multi-GPU training enabled with 4 GPUs
+03/09/2026 10:48:06 - INFO - __main__ - Distributed environment: MULTI_GPU  Backend: nccl
+Num processes: 4
+Process index: 2
+Local process index: 2
+Device: cuda:2
+Mixed precision type: no
+Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
+03/09/2026 10:48:07 - INFO - __main__ - Distributed environment: MULTI_GPU  Backend: nccl
+Num processes: 4
+Process index: 0
+Local process index: 0
+Device: cuda:0
+Mixed precision type: no
+[INFO] Accelerator initialized
+03/09/2026 10:48:07 - INFO - __main__ - Number of processes: 4
+03/09/2026 10:48:07 - INFO - __main__ - Distributed type: MULTI_GPU
+03/09/2026 10:48:07 - INFO - __main__ - Mixed precision: no
+03/09/2026 10:48:07 - INFO - __main__ - GPU 0: NVIDIA A100-PCIE-40GB
+03/09/2026 10:48:07 - INFO - __main__ - GPU 0 memory: 39.4 GB
+03/09/2026 10:48:07 - INFO - __main__ - GPU 1: NVIDIA A100-PCIE-40GB
+03/09/2026 10:48:07 - INFO - __main__ - GPU 1 memory: 39.4 GB
+03/09/2026 10:48:07 - INFO - __main__ - GPU 2: NVIDIA A100-PCIE-40GB
+03/09/2026 10:48:07 - INFO - __main__ - GPU 2 memory: 39.4 GB
+03/09/2026 10:48:07 - INFO - __main__ - GPU 3: NVIDIA A100-PCIE-40GB
+03/09/2026 10:48:07 - INFO - __main__ - GPU 3 memory: 39.4 GB
+[INFO] Seed set to 42
+[INFO] Loading tokenizers...
+03/09/2026 10:48:07 - INFO - __main__ - Distributed environment: MULTI_GPU  Backend: nccl
+Num processes: 4
+Process index: 1
+Local process index: 1
+Device: cuda:1
+Mixed precision type: no
+03/09/2026 10:48:07 - INFO - __main__ - Distributed environment: MULTI_GPU  Backend: nccl
+Num processes: 4
+Process index: 3
+Local process index: 3
+Device: cuda:3
+Mixed precision type: no
+[INFO] Tokenizers loaded. Loading text encoders, VAE, and transformer...
+You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers
+You are using a model of type clip_text_model to instantiate a model of type . This is not supported for all configurations of models and can yield errors.
+You are using a model of type clip_text_model to instantiate a model of type . This is not supported for all configurations of models and can yield errors.
+You are using a model of type t5 to instantiate a model of type . This is not supported for all configurations of models and can yield errors.
+{'max_image_seq_len', 'shift_terminal', 'use_beta_sigmas', 'time_shift_type', 'use_dynamic_shifting', 'stochastic_sampling', 'base_shift', 'invert_sigmas', 'use_exponential_sigmas', 'use_karras_sigmas', 'max_shift', 'base_image_seq_len'} was not found in config. Values will be initialized to default values.
+{'mid_block_add_attention'} was not found in config. Values will be initialized to default values.
+If your task is similar to the task the model of the checkpoint was trained on, you can already use AutoencoderKL for predictions without further training.
+{'qk_norm', 'dual_attention_layers'} was not found in config. Values will be initialized to default values.
+All model checkpoint weights were used when initializing SD3Transformer2DModel.
+All the weights of SD3Transformer2DModel were initialized from the model checkpoint at /gemini/space/hsd/project/pretrained_model/huggingface/hub/models--stabilityai--stable-diffusion-3-medium-diffusers/snapshots/ea42f8cef0f178587cf766dc8129abd379c90671.
+If your task is similar to the task the model of the checkpoint was trained on, you can already use SD3Transformer2DModel for predictions without further training.
+[INFO] Text encoders, VAE, and transformer loaded
+[rank2]:[W309 10:48:36.300339912 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 2]  using GPU 2 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
+[INFO] Optimizer created. Loading dataset...
+[INFO] Found metadata.jsonl, using efficient loading method
+[INFO] Loading dataset from metadata.jsonl: /gemini/space/hsd/project/dataset/cc3m-wds/train/metadata.jsonl
+[rank1]:[W309 10:48:38.132724538 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 1]  using GPU 1 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
+[INFO] Processed 100000 entries from metadata.jsonl
+[rank3]:[W309 10:48:39.804821476 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 3]  using GPU 3 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
+[INFO] Processed 200000 entries from metadata.jsonl
+[INFO] Processed 300000 entries from metadata.jsonl
+[INFO] Processed 400000 entries from metadata.jsonl
+[INFO] Processed 500000 entries from metadata.jsonl
+[INFO] Processed 600000 entries from metadata.jsonl
+[INFO] Processed 700000 entries from metadata.jsonl
+[INFO] Processed 800000 entries from metadata.jsonl
+[INFO] Processed 900000 entries from metadata.jsonl
+[INFO] Processed 1000000 entries from metadata.jsonl
+[INFO] Processed 1100000 entries from metadata.jsonl
+[INFO] Processed 1200000 entries from metadata.jsonl
+[INFO] Processed 1300000 entries from metadata.jsonl
+[INFO] Processed 1400000 entries from metadata.jsonl
+[INFO] Processed 1500000 entries from metadata.jsonl
+[INFO] Processed 1600000 entries from metadata.jsonl
+[INFO] Processed 1700000 entries from metadata.jsonl
+[INFO] Processed 1800000 entries from metadata.jsonl
+[INFO] Processed 1900000 entries from metadata.jsonl
+[INFO] Processed 2000000 entries from metadata.jsonl
+[INFO] Processed 2100000 entries from metadata.jsonl
+[INFO] Processed 2200000 entries from metadata.jsonl
+[INFO] Processed 2300000 entries from metadata.jsonl
+[INFO] Processed 2400000 entries from metadata.jsonl
+[INFO] Processed 2500000 entries from metadata.jsonl
+[INFO] Processed 2600000 entries from metadata.jsonl
+[INFO] Processed 2700000 entries from metadata.jsonl
+[INFO] Processed 2800000 entries from metadata.jsonl
+[INFO] Processed 2900000 entries from metadata.jsonl
+[INFO] Loaded 2905954 image-caption pairs from metadata.jsonl
+[INFO] Dataset loaded successfully.
+[rank0]:[W309 10:49:04.100729496 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 0]  using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
+[INFO] All processes synchronized. Building transforms and DataLoader...
+[INFO] Dataset columns: ['image', 'text']
+[INFO] Using image column: image
+[WARNING] Specified caption_column 'caption' not found. Using 'text' instead.
+[INFO] Using caption column: text
+03/09/2026 10:49:34 - INFO - __main__ - Auto-setting dataloader_num_workers to 4 for multi-GPU training
+[INFO] DataLoader ready. Computing training steps and scheduler...
+03/09/2026 10:49:36 - WARNING - __main__ - Failed to initialize trackers: Descriptors cannot be created directly.
+If this call came from a _pb2.py file, your generated code is out of date and must be regenerated with protoc >= 3.19.0.
+If you cannot immediately regenerate your protos, some other possible workarounds are:
+ 1. Downgrade the protobuf package to 3.20.x or lower.
+ 2. Set PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python (but this will use pure-Python parsing and will be much slower).
+More information: https://developers.google.com/protocol-buffers/docs/news/2022-05-06#python-updates
+03/09/2026 10:49:36 - WARNING - __main__ - Continuing without tracking. You can monitor training through console logs.
+03/09/2026 10:49:36 - INFO - __main__ - ***** Running training *****
+03/09/2026 10:49:36 - INFO - __main__ -   Num examples = 2905954
+03/09/2026 10:49:36 - INFO - __main__ -   Num Epochs = 6
+03/09/2026 10:49:36 - INFO - __main__ -   Instantaneous batch size per device = 8
+03/09/2026 10:49:36 - INFO - __main__ -   Total train batch size (w. parallel, distributed & accumulation) = 32
+03/09/2026 10:49:36 - INFO - __main__ -   Gradient Accumulation steps = 1
+03/09/2026 10:49:36 - INFO - __main__ -   Total optimization steps = 500000
+03/09/2026 10:49:36 - INFO - __main__ -   Number of GPU processes = 4
+03/09/2026 10:49:36 - INFO - __main__ -   Effective batch size per GPU = 8
+03/09/2026 10:49:36 - INFO - __main__ -   Total effective batch size across all GPUs = 32
+[INFO] Training setup complete. num_examples=2905954, max_train_steps=500000, num_epochs=6
+[rank3]:[W309 10:49:40.986061745 reducer.cpp:1400] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration,  which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
+[rank2]:[W309 10:49:40.986962214 reducer.cpp:1400] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration,  which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
+[rank1]:[W309 10:49:40.988288933 reducer.cpp:1400] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration,  which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
+[rank0]:[W309 10:49:40.989585595 reducer.cpp:1400] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration,  which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
+[rank1]:   File "/gemini/space/gzy_new/models/Sida/train_lora_sd3.py", line 1597, in <module>
+[rank1]:     main(args)
+[rank1]:   File "/gemini/space/gzy_new/models/Sida/train_lora_sd3.py", line 1410, in main
+[rank1]:     timesteps = noise_scheduler_copy.timesteps[indices].to(device=model_input.device)
+[rank1]:   File "/root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/utils/data/_utils/signal_handling.py", line 73, in handler
+[rank1]:     _error_if_any_worker_fails()
+[rank1]: RuntimeError: DataLoader worker (pid 8746) is killed by signal: Terminated.
+W0309 10:54:03.295000 7448 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 7639 closing signal SIGTERM
+W0309 10:54:03.296000 7448 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 7640 closing signal SIGTERM
+W0309 10:54:03.296000 7448 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 7641 closing signal SIGTERM
+E0309 10:54:03.762000 7448 site-packages/torch/distributed/elastic/multiprocessing/api.py:869] failed (exitcode: -15) local_rank: 0 (pid: 7638) of binary: /root/miniconda3/envs/SiT/bin/python3.10
+[NOTICE] The application is pending for GPU resource in asynchronous queue. The longest waiting time in queue is 1800 seconds.
+Traceback (most recent call last):
+  File "/root/miniconda3/envs/SiT/bin/accelerate", line 6, in <module>
+    sys.exit(main())
+  File "/root/miniconda3/envs/SiT/lib/python3.10/site-packages/accelerate/commands/accelerate_cli.py", line 50, in main
+    args.func(args)
+  File "/root/miniconda3/envs/SiT/lib/python3.10/site-packages/accelerate/commands/launch.py", line 1189, in launch_command
+    multi_gpu_launcher(args)
+  File "/root/miniconda3/envs/SiT/lib/python3.10/site-packages/accelerate/commands/launch.py", line 815, in multi_gpu_launcher
+    distrib_run.run(args)
+  File "/root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/distributed/run.py", line 910, in run
+    elastic_launch(
+  File "/root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 138, in __call__
+    return launch_agent(self._config, self._entrypoint, list(args))
+  File "/root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 269, in launch_agent
+    raise ChildFailedError(
+torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
+==========================================================
+train_lora_sd3.py FAILED
+----------------------------------------------------------
+Failures:
+  <NO_OTHER_FAILURES>
+----------------------------------------------------------
+Root Cause (first observed failure):
+[0]:
+  time      : 2026-03-09_10:54:03
+  host      : 1406241bacf2123cb0cdd1395a94d0f5-taskrole1-0
+  rank      : 0 (local_rank: 0)
+  exitcode  : -15 (pid: 7638)
+  error_file: <N/A>
+  traceback : Signal 15 (SIGTERM) received by PID 7638
+==========================================================
+===========================================
+训练完成！
+模型保存在: sd3-lora-finetuned-batch-32
+日志保存在: sd3-lora-finetuned-batch-32/logs
+验证图片保存在: sd3-lora-finetuned-batch-32/validation_images
+===========================================

train_sd3_lora2.sh ADDED Viewed

	@@ -0,0 +1,107 @@

+#!/bin/bash
+# SD3 LoRA Fine-tuning Training Script
+# 使用 Stable Diffusion 3 进行 LoRA 微调训练 - 多GPU优化版本
+# 设置环境变量
+export CUDA_VISIBLE_DEVICES=0,1,2,3  # 根据可用GPU数量调整
+# export PYTHONPATH=$PYTHONPATH:/gemini/space/gzy_new/Rectified_Noise/Finetune/finetune-coco
+# 检查GPU数量
+num_gpus=$(nvidia-smi --list-gpus | wc -l)
+echo "检测到 $num_gpus 个GPU"
+# 训练参数配置
+MODEL_NAME="/gemini/space/hsd/project/pretrained_model/huggingface/hub/models--stabilityai--stable-diffusion-3-medium-diffusers/snapshots/ea42f8cef0f178587cf766dc8129abd379c90671"
+#DATASET_NAME="lambdalabs/naruto-blip-captions"  # 或者使用本地数据集路径
+TRAIN_DATA_DIR="/gemini/space/hsd/project/dataset/cc3m-wds/train"  # 本地数据集路径0
+OUTPUT_DIR="sd3-lora-finetuned-batch-32"
+RESOLUTION=512
+# 多GPU训练时调整批次大小 - 每个GPU的批次大小
+TRAIN_BATCH_SIZE=8  # 每个GPU的批次大小，总批次大小 = TRAIN_BATCH_SIZE * num_gpus * GRADIENT_ACCUMULATION_STEPS
+GRADIENT_ACCUMULATION_STEPS=1  # 梯度累积步数
+LEARNING_RATE=1e-5
+MAX_TRAIN_STEPS=500000
+NUM_TRAIN_EPOCHS=50
+VALIDATION_PROMPT="A photo of a beautiful landscape with mountains and lake"
+NUM_VALIDATION_IMAGES=2
+VALIDATION_EPOCHS=1  # 每10个epoch验证一次
+LORA_RANK=32
+SEED=42
+# 计算有效批次大小
+effective_batch_size=$((TRAIN_BATCH_SIZE * num_gpus * GRADIENT_ACCUMULATION_STEPS))
+echo "每个GPU批次大小: $TRAIN_BATCH_SIZE"
+echo "总有效批次大小: $effective_batch_size"
+# 创建输出目录
+mkdir -p $OUTPUT_DIR
+echo "===== SD3 LoRA 多GPU训练开始 ====="
+echo "模型: $MODEL_NAME"
+echo "输出目录: $OUTPUT_DIR"
+echo "分辨率: $RESOLUTION"
+echo "每个GPU批次大小: $TRAIN_BATCH_SIZE"
+echo "梯度累积步数: $GRADIENT_ACCUMULATION_STEPS"
+echo "总有效批次大小: $effective_batch_size"
+echo "学习率: $LEARNING_RATE"
+echo "最大训练步数: $MAX_TRAIN_STEPS"
+echo "LoRA Rank: $LORA_RANK"
+echo "使用GPU: $CUDA_VISIBLE_DEVICES"
+echo "==========================================="
+# 检查accelerate配置
+if [ ! -f "accelerate_config.yaml" ]; then
+    echo "错误: 未找到 accelerate_config.yaml 文件"
+    echo "请运行: accelerate config 来配置多GPU训练"
+    exit 1
+fi
+echo "使用 accelerate 启动多GPU训练..."
+# 使用 accelerate 启动训练
+accelerate launch --config_file=accelerate_config.yaml train_lora_sd3.py \
+  --pretrained_model_name_or_path="$MODEL_NAME" \
+  --train_data_dir="$TRAIN_DATA_DIR" \
+  --output_dir="$OUTPUT_DIR" \
+  --mixed_precision="no" \
+  --resolution=$RESOLUTION \
+  --train_batch_size=$TRAIN_BATCH_SIZE \
+  --gradient_accumulation_steps=$GRADIENT_ACCUMULATION_STEPS \
+  --learning_rate=$LEARNING_RATE \
+  --scale_lr \
+  --lr_scheduler="cosine" \
+  --lr_warmup_steps=100 \
+  --max_train_steps=$MAX_TRAIN_STEPS \
+  --num_train_epochs=$NUM_TRAIN_EPOCHS \
+  --validation_prompt="$VALIDATION_PROMPT" \
+  --num_validation_images=$NUM_VALIDATION_IMAGES \
+  --validation_epochs=$VALIDATION_EPOCHS \
+  --checkpointing_steps=50000 \
+  --checkpoints_total_limit=10 \
+  --seed=$SEED \
+  --rank=$LORA_RANK \
+  --gradient_checkpointing \
+  --use_8bit_adam \
+  --dataloader_num_workers=0 \
+  --report_to="tensorboard" \
+  --logging_dir="logs" \
+  --adam_beta1=0.9 \
+  --adam_beta2=0.999 \
+  --adam_weight_decay=1e-2 \
+  --adam_epsilon=1e-8 \
+  --max_grad_norm=1.0 \
+  --allow_tf32 \
+  --weighting_scheme="logit_normal" \
+  --logit_mean=0.0 \
+  --logit_std=1.0 \
+  --precondition_outputs=1
+echo "==========================================="
+echo "训练完成！"
+echo "模型保存在: $OUTPUT_DIR"
+echo "日志保存在: $OUTPUT_DIR/logs"
+echo "验证图片保存在: $OUTPUT_DIR/validation_images"
+echo "==========================================="
+# nohup bash train_sd3_lora2.sh > train_sd3_lora2.log 2>&1 &

visual.sh ADDED Viewed

	@@ -0,0 +1,78 @@

+#!/bin/bash
+set -euo pipefail
+# Run the main model in separate passes and stitch the figure at the end,
+# to avoid the OOM caused by loading both models at the same time.
+export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0}"
+PYTHON_BIN="${PYTHON_BIN:-/root/miniconda3/envs/SiT/bin/python}"
+PRETRAINED_MODEL="/gemini/space/hsd/project/pretrained_model/huggingface/hub/models--stabilityai--stable-diffusion-3-medium-diffusers/snapshots/ea42f8cef0f178587cf766dc8129abd379c90671"
+LOCAL_PIPELINE_PATH="/gemini/space/gzy_new/models/Sida/pipeline_stable_diffusion_3.py"
+# Visualisation entry point, invoked once per stage below
+VISUALIZE_SCRIPT="/gemini/space/gzy_new/models/Sida/visualize_lora_rn_4x8.py"
+LORA_PATH="/gemini/space/gzy_new/models/Sida/sd3-lora-finetuned-batch-4/checkpoint-500000"
+RECTIFIED_WEIGHTS="/gemini/space/gzy_new/models/Sida/rectified-noise-batch-2/checkpoint-220000/sit_weights"
+# Change this to the text you want to visualise
+PROMPT="young man beside a dog wearing sunglasses"
+STEPS=180
+OUTPUT_DIR="/gemini/space/gzy_new/models/Sida/sd3_lora_rn_pair_samples"
+# The file name follows STEPS so outputs of different step counts do not overwrite each other
+OUTPUT_FILE="${OUTPUT_DIR}/lora_rn_4x8_step${STEPS}.png"
+LORA_NPZ="${OUTPUT_DIR}/lora_trace_4x8.npz"
+RN_NPZ="${OUTPUT_DIR}/rn_trace_4x8.npz"
+GUIDANCE_SCALE=7.0
+HEIGHT=512
+WIDTH=512
+SEED=42
+MIXED_PRECISION="fp16"   # no / fp16 / bf16
+NUM_SIT_LAYERS=1    # must match the value used for training
+mkdir -p "$OUTPUT_DIR"
+if [ ! -x "$PYTHON_BIN" ]; then
+  echo "ERROR: PYTHON_BIN not executable: $PYTHON_BIN"
+  echo "Hint: export PYTHON_BIN=/path/to/your/python"
+  exit 1
+fi
+if [ ! -e "$PRETRAINED_MODEL" ]; then
+  echo "ERROR: PRETRAINED_MODEL not found: $PRETRAINED_MODEL"
+  exit 1
+fi
+if [ ! -f "$LOCAL_PIPELINE_PATH" ]; then
+  echo "ERROR: LOCAL_PIPELINE_PATH not found: $LOCAL_PIPELINE_PATH"
+  exit 1
+fi
+if [ ! -f "$VISUALIZE_SCRIPT" ]; then
+  echo "ERROR: VISUALIZE_SCRIPT not found: $VISUALIZE_SCRIPT"
+  exit 1
+fi
+if [ ! -e "$LORA_PATH" ]; then
+  echo "ERROR: LORA_PATH not found: $LORA_PATH"
+  exit 1
+fi
+if [ ! -e "$RECTIFIED_WEIGHTS" ]; then
+  echo "ERROR: RECTIFIED_WEIGHTS not found: $RECTIFIED_WEIGHTS"
+  exit 1
+fi
+COMMON_ARGS=(
+  --pretrained_model_name_or_path "$PRETRAINED_MODEL"
+  --local_pipeline_path "$LOCAL_PIPELINE_PATH"
+  --lora_path "$LORA_PATH"
+  --rectified_weights "$RECTIFIED_WEIGHTS"
+  --num_sit_layers "$NUM_SIT_LAYERS"
+  --prompt "$PROMPT"
+  --output "$OUTPUT_FILE"
+  --lora_npz "$LORA_NPZ"
+  --rn_npz "$RN_NPZ"
+  --steps "$STEPS"
+  --guidance_scale "$GUIDANCE_SCALE"
+  --height "$HEIGHT"
+  --width "$WIDTH"
+  --seed "$SEED"
+  --mixed_precision "$MIXED_PRECISION"
+)
+# The three stages run in order: LoRA trace, RN trace, then the combined figure
+for stage in lora rn pair; do
+  "$PYTHON_BIN" "$VISUALIZE_SCRIPT" "${COMMON_ARGS[@]}" --stage "$stage"
+done
+echo "Done. Saved to: $OUTPUT_FILE"
+# nohup bash visual.sh > visual.log 2>&1 &

visualize_lora_rn_4x8.py ADDED Viewed

	@@ -0,0 +1,406 @@

+#!/usr/bin/env python
+# coding=utf-8
+"""
+分阶段生成 4x8 对比图（默认 180 步）：
+- stage=lora: 仅生成 LoRA 轨迹中间结果并保存中间 npz
+- stage=rn:   仅生成 RN 轨迹中间结果并保存中间 npz
+- stage=pair: 读取两阶段 npz，计算第3/4行并拼接总图
+"""
+import argparse
+import importlib.util
+from pathlib import Path
+import numpy as np
+import torch
+from PIL import Image
+from diffusers import AutoencoderKL
+from diffusers import StableDiffusion3Pipeline as DiffusersStableDiffusion3Pipeline
+def dynamic_import_training_classes(project_root: str):
+    import sys
+    sys.path.insert(0, project_root)
+    import train_rectified_noise as trn
+    return trn.RectifiedNoiseModule, trn.SD3WithRectifiedNoise
+def load_local_pipeline_class(local_pipeline_path: str):
+    module_name = "diffusers.pipelines.stable_diffusion_3.local_pipeline_stable_diffusion_3"
+    spec = importlib.util.spec_from_file_location(module_name, local_pipeline_path)
+    if spec is None or spec.loader is None:
+        raise ImportError(f"Failed to import local pipeline: {local_pipeline_path}")
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+    return module.StableDiffusion3Pipeline
+def load_sit_weights(rectified_module, weights_path: str):
+    p = Path(weights_path)
+    if p.is_dir():
+        search_dirs = [p, p / "sit_weights"]
+        for d in search_dirs:
+            if not d.exists():
+                continue
+            st = d / "pytorch_sit_weights.safetensors"
+            if st.exists():
+                from safetensors.torch import load_file
+                rectified_module.load_state_dict(load_file(str(st)), strict=False)
+                return True
+            for name in ["pytorch_sit_weights.bin", "pytorch_sit_weights.pt", "sit_weights.pt", "sit.pt"]:
+                ck = d / name
+                if ck.exists():
+                    rectified_module.load_state_dict(torch.load(str(ck), map_location="cpu"), strict=False)
+                    return True
+        return False
+    if str(p).endswith(".safetensors"):
+        from safetensors.torch import load_file
+        state = load_file(str(p))
+    else:
+        state = torch.load(str(p), map_location="cpu")
+    rectified_module.load_state_dict(state, strict=False)
+    return True
+def build_rn_model(base_pipeline, rectified_weights, num_sit_layers, device):
+    RectifiedNoiseModule, SD3WithRectifiedNoise = dynamic_import_training_classes(str(Path(__file__).parent))
+    tfm = base_pipeline.transformer
+    sit_hidden_size = getattr(tfm.config, "joint_attention_dim", None) or getattr(tfm.config, "inner_dim", 4096)
+    rectified_module = RectifiedNoiseModule(
+        hidden_size=sit_hidden_size,
+        num_sit_layers=num_sit_layers,
+        num_attention_heads=getattr(tfm.config, "num_attention_heads", 32),
+        input_dim=getattr(tfm.config, "in_channels", 16),
+        transformer_hidden_size=getattr(tfm.config, "hidden_size", 1536),
+    )
+    if not load_sit_weights(rectified_module, rectified_weights):
+        raise RuntimeError(f"Failed to load rectified weights: {rectified_weights}")
+    model = SD3WithRectifiedNoise(base_pipeline.transformer, rectified_module).to(device)
+    model.eval()
+    return model
+def set_pipeline_modules_eval(pipe):
+    for name in ["transformer", "vae", "text_encoder", "text_encoder_2", "text_encoder_3", "model"]:
+        m = getattr(pipe, name, None)
+        if m is not None and hasattr(m, "eval"):
+            m.eval()
+def align_rn_branch_dtype(pipe, dtype):
+    model = getattr(pipe, "model", None)
+    if model is None:
+        return
+    rn_branch = getattr(model, "rectified_noise_module", None)
+    if rn_branch is not None:
+        rn_branch.to(device=pipe._execution_device, dtype=dtype)
+    if hasattr(model, "to"):
+        model.to(device=pipe._execution_device)
+@torch.no_grad()
+def decode_latent_to_uint8(vae, latents, normalize=False):
+    shift = getattr(vae.config, "shift_factor", 0.0) or 0.0
+    scaled = (latents / vae.config.scaling_factor) + shift
+    image = vae.decode(scaled, return_dict=False)[0]
+    x = image[0].float()
+    if normalize:
+        x = (x - x.min()) / (x.max() - x.min() + 1e-6)
+    else:
+        x = (x / 2.0) + 0.5
+    x = x.clamp(0, 1)
+    return (x.permute(1, 2, 0).cpu().numpy() * 255.0).astype(np.uint8)
+def encode_prompt_for_pipe(pipe, prompt, guidance_scale):
+    do_cfg = guidance_scale > 1.0
+    pe, npe, pp, npp = pipe.encode_prompt(
+        prompt=prompt,
+        prompt_2=prompt,
+        prompt_3=prompt,
+        do_classifier_free_guidance=do_cfg,
+        num_images_per_prompt=1,
+        device=pipe._execution_device,
+    )
+    if do_cfg:
+        pe = torch.cat([npe, pe], dim=0)
+        pp = torch.cat([npp, pp], dim=0)
+    return pe, pp
+@torch.no_grad()
+def run_single_trajectory(pipe, prompt, steps, guidance_scale, height, width, seed, use_rn_model=False, autocast_dtype=None):
+    device = pipe._execution_device
+    dtype = next(pipe.transformer.parameters()).dtype
+    sample_idx = np.linspace(0, steps - 1, 8, dtype=int).tolist()
+    latent_h = height // pipe.vae_scale_factor
+    latent_w = width // pipe.vae_scale_factor
+    g = torch.Generator(device=device).manual_seed(seed)
+    init_latents = torch.randn(
+        (1, pipe.transformer.config.in_channels, latent_h, latent_w),
+        device=device,
+        dtype=dtype,
+        generator=g,
+    )
+    if use_rn_model:
+        effective_model = getattr(pipe, "model", None)
+        if effective_model is not None:
+            pipe.model = effective_model
+    step_latents = {}
+    def _capture_callback(_pipe, i, _t, callback_kwargs):
+        if i in sample_idx:
+            step_latents[i] = callback_kwargs["latents"].detach().clone()
+        return callback_kwargs
+    # 跟参考脚本一致，主路径走 pipeline(...)，通过 callback 抓中间 latent
+    if autocast_dtype is None:
+        _ = pipe(
+            prompt=prompt,
+            height=height,
+            width=width,
+            num_inference_steps=steps,
+            guidance_scale=guidance_scale,
+            latents=init_latents,
+            num_images_per_prompt=1,
+            callback_on_step_end=_capture_callback,
+            callback_on_step_end_tensor_inputs=["latents"],
+        )
+    else:
+        with torch.autocast("cuda", dtype=autocast_dtype):
+            _ = pipe(
+                prompt=prompt,
+                height=height,
+                width=width,
+                num_inference_steps=steps,
+                guidance_scale=guidance_scale,
+                latents=init_latents,
+                num_images_per_prompt=1,
+                callback_on_step_end=_capture_callback,
+                callback_on_step_end_tensor_inputs=["latents"],
+            )
+    images = []
+    latents = []
+    noises = []
+    prev_lat = init_latents
+    pe = pp = None
+    do_cfg = guidance_scale > 1.0
+    timesteps = None
+    if use_rn_model:
+        pe, pp = encode_prompt_for_pipe(pipe, prompt, guidance_scale)
+        pipe.scheduler.set_timesteps(steps, device=device)
+        timesteps = pipe.scheduler.timesteps
+    for i in sample_idx:
+        cur_lat = step_latents[i]
+        images.append(decode_latent_to_uint8(pipe.vae, cur_lat, normalize=False))
+        latents.append(cur_lat.squeeze(0).float().cpu().numpy())
+        if use_rn_model:
+            # 第3行严格使用 RN 噪声分支（速度场）输出
+            model_in = torch.cat([cur_lat] * 2) if do_cfg else cur_lat
+            ts = timesteps[i].expand(model_in.shape[0])
+            if autocast_dtype is not None and device == "cuda":
+                with torch.autocast("cuda", dtype=autocast_dtype):
+                    rn_out = pipe.model(
+                        hidden_states=model_in,
+                        timestep=ts,
+                        encoder_hidden_states=pe,
+                        pooled_projections=pp,
+                        return_dict=False,
+                    )
+            else:
+                rn_out = pipe.model(
+                    hidden_states=model_in,
+                    timestep=ts,
+                    encoder_hidden_states=pe,
+                    pooled_projections=pp,
+                    return_dict=False,
+                )
+            # SD3WithRectifiedNoise: (final_output, mean_out, var_out)
+            rn_branch = rn_out[1] if isinstance(rn_out, tuple) and len(rn_out) > 1 else rn_out[0]
+            if do_cfg:
+                ru, rt = rn_branch.chunk(2)
+                rn_branch = ru + guidance_scale * (rt - ru)
+            noises.append(rn_branch.squeeze(0).float().cpu().numpy())
+        else:
+            # lora 阶段保留占位，pair 阶段不会用到
+            delta = (cur_lat - prev_lat)
+            noises.append(delta.squeeze(0).float().cpu().numpy())
+        prev_lat = cur_lat
+    return {
+        "images": np.stack(images, axis=0),
+        "latents": np.stack(latents, axis=0),
+        "noises": np.stack(noises, axis=0),
+        "sample_idx": np.array(sample_idx, dtype=np.int32),
+    }
+def save_grid_4x8(rows, sample_idx, out_path, cell_w=512, cell_h=512):
+    cols = 8
+    grid = Image.new("RGB", (cols * cell_w, 4 * cell_h), color=(245, 245, 245))
+    for r in range(4):
+        for c in range(cols):
+            img = Image.fromarray(rows[r][c]).resize((cell_w, cell_h), Image.BILINEAR)
+            x = c * cell_w
+            y = r * cell_h
+            grid.paste(img, (x, y))
+    grid.save(out_path)
+def stage_lora(args, dtype, device):
+    pipe = DiffusersStableDiffusion3Pipeline.from_pretrained(
+        args.pretrained_model_name_or_path,
+        revision=args.revision,
+        variant=args.variant,
+        torch_dtype=dtype,
+    ).to(device)
+    pipe.load_lora_weights(args.lora_path)
+    set_pipeline_modules_eval(pipe)
+    pipe.set_progress_bar_config(disable=True)
+    data = run_single_trajectory(
+        pipe=pipe,
+        prompt=args.prompt,
+        steps=args.steps,
+        guidance_scale=args.guidance_scale,
+        height=args.height,
+        width=args.width,
+        seed=args.seed,
+        use_rn_model=False,
+        autocast_dtype=dtype if args.mixed_precision != "no" and device == "cuda" else None,
+    )
+    np.savez_compressed(args.lora_npz, **data)
+    print(f"[lora] saved: {args.lora_npz}")
+def stage_rn(args, dtype, device):
+    LocalPipe = load_local_pipeline_class(args.local_pipeline_path)
+    pipe = LocalPipe.from_pretrained(
+        args.pretrained_model_name_or_path,
+        revision=args.revision,
+        variant=args.variant,
+        torch_dtype=dtype,
+    ).to(device)
+    pipe.load_lora_weights(args.lora_path)
+    pipe.model = build_rn_model(pipe, args.rectified_weights, args.num_sit_layers, device)
+    # 避免 RN 速度场额外前向时出现 Half/Float 冲突
+    align_rn_branch_dtype(pipe, dtype)
+    set_pipeline_modules_eval(pipe)
+    pipe.set_progress_bar_config(disable=True)
+    data = run_single_trajectory(
+        pipe=pipe,
+        prompt=args.prompt,
+        steps=args.steps,
+        guidance_scale=args.guidance_scale,
+        height=args.height,
+        width=args.width,
+        seed=args.seed,
+        use_rn_model=True,
+        autocast_dtype=dtype if args.mixed_precision != "no" and device == "cuda" else None,
+    )
+    np.savez_compressed(args.rn_npz, **data)
+    print(f"[rn] saved: {args.rn_npz}")
+def stage_pair(args, dtype, device):
+    lora = np.load(args.lora_npz)
+    rn = np.load(args.rn_npz)
+    lora_images = lora["images"]
+    rn_images = rn["images"]
+    sample_idx = lora["sample_idx"]
+    rn_noises = rn["noises"]  # RN 速度场，shape: [8, C, H, W]
+    def _velocity_to_sparse_points(vel_chw, out_h, out_w, q=99.6, point_color=(245, 245, 245)):
+        # 通道聚合成强度图，再转成黑底稀疏点图
+        mag = np.sqrt(np.sum(np.square(vel_chw.astype(np.float32)), axis=0))  # [H, W]
+        thr = np.percentile(mag, q)
+        mask = mag >= thr
+        # 最近邻放大到输出分辨率
+        sy = max(1, out_h // mask.shape[0])
+        sx = max(1, out_w // mask.shape[1])
+        up = np.repeat(np.repeat(mask, sy, axis=0), sx, axis=1)
+        up = up[:out_h, :out_w]
+        canvas = np.zeros((out_h, out_w, 3), dtype=np.uint8)
+        canvas[up] = np.array(point_color, dtype=np.uint8)
+        return canvas, mag
+    # 第3行：黑底 + 稀疏速度点（接近你最初图风格）
+    step_noise_imgs = []
+    # 第4行：速度场累积后再做稀疏点图
+    sum_noise_imgs = []
+    running_mag = None
+    for i in range(8):
+        step_vis, mag = _velocity_to_sparse_points(rn_noises[i], args.height, args.width, q=99.6)
+        if running_mag is None:
+            running_mag = mag
+        else:
+            running_mag = running_mag + mag
+        thr_sum = np.percentile(running_mag, 99.2)
+        mask_sum = running_mag >= thr_sum
+        sy = max(1, args.height // mask_sum.shape[0])
+        sx = max(1, args.width // mask_sum.shape[1])
+        up_sum = np.repeat(np.repeat(mask_sum, sy, axis=0), sx, axis=1)[:args.height, :args.width]
+        sum_vis = np.zeros((args.height, args.width, 3), dtype=np.uint8)
+        sum_vis[up_sum] = np.array([245, 245, 245], dtype=np.uint8)
+        step_noise_imgs.append(step_vis)
+        sum_noise_imgs.append(sum_vis)
+    rows = [
+        [lora_images[i] for i in range(8)],
+        [rn_images[i] for i in range(8)],
+        step_noise_imgs,
+        sum_noise_imgs,
+    ]
+    save_grid_4x8(rows, sample_idx, args.output, cell_w=args.width, cell_h=args.height)
+    print(f"[pair] saved: {args.output}")
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--stage", type=str, default="all", choices=["all", "lora", "rn", "pair"])
+    parser.add_argument("--pretrained_model_name_or_path", type=str, required=True)
+    parser.add_argument("--revision", type=str, default=None)
+    parser.add_argument("--variant", type=str, default=None)
+    parser.add_argument("--local_pipeline_path", type=str, required=True)
+    parser.add_argument("--lora_path", type=str, required=True)
+    parser.add_argument("--rectified_weights", type=str, required=True)
+    parser.add_argument("--num_sit_layers", type=int, default=1)
+    parser.add_argument("--prompt", type=str, required=True)
+    parser.add_argument("--output", type=str, default="lora_rn_4x8.png")
+    parser.add_argument("--lora_npz", type=str, default="lora_trace_4x8.npz")
+    parser.add_argument("--rn_npz", type=str, default="rn_trace_4x8.npz")
+    parser.add_argument("--steps", type=int, default=180)
+    parser.add_argument("--guidance_scale", type=float, default=7.0)
+    parser.add_argument("--height", type=int, default=512)
+    parser.add_argument("--width", type=int, default=512)
+    parser.add_argument("--seed", type=int, default=42)
+    parser.add_argument("--mixed_precision", type=str, default="fp16", choices=["no", "fp16", "bf16"])
+    args = parser.parse_args()
+    dtype = torch.float16 if args.mixed_precision == "fp16" else (torch.bfloat16 if args.mixed_precision == "bf16" else torch.float32)
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    Path(args.output).parent.mkdir(parents=True, exist_ok=True)
+    Path(args.lora_npz).parent.mkdir(parents=True, exist_ok=True)
+    Path(args.rn_npz).parent.mkdir(parents=True, exist_ok=True)
+    if args.stage in ("all", "lora"):
+        stage_lora(args, dtype, device)
+        if device == "cuda":
+            torch.cuda.empty_cache()
+    if args.stage in ("all", "rn"):
+        stage_rn(args, dtype, device)
+        if device == "cuda":
+            torch.cuda.empty_cache()
+    if args.stage in ("all", "pair"):
+        stage_pair(args, dtype, device)
+if __name__ == "__main__":
+    main()

visualize_sitf2_noise_evolution.py ADDED Viewed

	@@ -0,0 +1,169 @@

+import torch
+import numpy as np
+import imageio
+from tqdm import tqdm
+import torch.distributed as dist
+from models import SiTF1, SiTF2, SiT, CombinedModel
+from download import find_model
+from diffusers.models import AutoencoderKL
+def tensor_to_img(tensor):
+    arr = tensor.detach().cpu().numpy()
+    if arr.ndim == 3:
+        arr = arr[0]
+    arr = (arr - arr.min()) / (arr.max() - arr.min() + 1e-8) * 255
+    return arr.astype(np.uint8)
+def main(
+    sit_ckpt, sitf2_ckpt,
+    image_size=256,
+    patch_size=2,
+    hidden_size=1152,
+    out_channels=8,
+    steps=50,
+    gif_path='sitf2_noise_evolution.gif',
+    device='cuda'
+):
+    dist.init_process_group("nccl")
+    rank = dist.get_rank()
+    device = rank % torch.cuda.device_count()
+    latent_size = image_size // 8
+    sitf1 = SiTF1(
+        input_size=latent_size,
+        patch_size=2,
+        in_channels=4,
+        hidden_size=768,
+        depth=12,
+        num_heads=12,
+        mlp_ratio=4.0,
+        class_dropout_prob=0.1,
+        num_classes=3,
+        learn_sigma=False
+    ).to(device)
+    sitf1_state = find_model(sit_ckpt)
+    try:
+        sitf1.load_state_dict(sitf1_state["model"], strict=False)
+    except Exception:
+        sitf1.load_state_dict(sitf1_state, strict=False)
+    sitf1.eval()
+    sitf2 = SiTF2(
+        hidden_size=768,
+        out_channels=8,
+        patch_size=2,
+        num_heads=12,
+        mlp_ratio=4.0,
+        depth=2,
+        learn_sigma=False,
+        learn_mu=True
+    ).to(device)
+    from torch.nn.parallel import DistributedDataParallel as DDP
+    sitf2 = DDP(sitf2, device_ids=[device])
+    sitf2_state = find_model(args.sitf2_ckpt)
+    try:
+        sitf2.load_state_dict(sitf2_state["ema"])
+    except Exception:
+        sitf2.load_state_dict(sitf2_state)
+    sitf2.eval()
+    batch = 1
+    x = torch.randn(batch, 4, latent_size, latent_size, device=device)
+    x0= x
+    y = torch.zeros(batch, dtype=torch.long, device=device)
+    t = torch.ones(batch, device=device)
+    imgs = []
+    imgs1 = []
+    imgs2 = []
+    img_original=[]
+    sit = SiT(
+        input_size=latent_size,
+        patch_size=2,
+        in_channels=4,
+        hidden_size=768,
+        depth=12,
+        num_heads=12,
+        mlp_ratio=4.0,
+        class_dropout_prob=0.1,
+        num_classes=3,
+        learn_sigma=False
+    ).to(device)
+    try:
+        sit.load_state_dict(sitf1_state["model"])
+    except Exception:
+        sit.load_state_dict(sitf1_state)
+    sit.eval()
+    combined_model = CombinedModel(sitf1, sitf2).to(device)
+    combined_model.eval()
+    vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-ema").to(device)
+    for step in tqdm(range(steps)):
+        t = torch.full((batch,), step / (steps - 1), device=device)
+        with torch.no_grad():
+            patch_tokens = sitf1(x, t, y)
+            t_emb = sitf1.t_embedder(t)
+            y_emb = sitf1.y_embedder(y, False)
+            c = t_emb + y_emb
+            std = sitf2.module.forward_noise(patch_tokens, c)
+            x1=x
+            drift = sit(x1, t, y)
+            delta_t = 1.0 / steps
+            x1 = x1 + drift * delta_t
+            x_dec = vae.decode(x1 / 0.18215).sample
+            img = torch.clamp(127.5 * x_dec + 128.0, 0, 255).permute(0, 2, 3, 1).to("cpu", dtype=torch.uint8).numpy()
+            img1 = img[0]
+            drift = sit(x, t, y)
+            delta_t = 1.0 / steps
+            noise = torch.randn_like(x)
+            x = x + drift * delta_t
+            x_dec = vae.decode(x / 0.18215).sample
+            img = torch.clamp(127.5 * x_dec + 128.0, 0, 255).permute(0, 2, 3, 1).to("cpu", dtype=torch.uint8).numpy()
+            img2 = img[0]
+            imgs.append(img2-img1)
+            imgs1.append(img1)
+            imgs2.append(img2)
+    x=x0
+    for step in tqdm(range(steps)):
+        t = torch.full((batch,), step / (steps - 1), device=device)
+        with torch.no_grad():
+            patch_tokens = sitf1(x, t, y)
+            t_emb = sitf1.t_embedder(t)
+            y_emb = sitf1.y_embedder(y, False)
+            c = t_emb + y_emb
+            std = sitf2.module.forward_noise(patch_tokens, c)
+            drift = sit(x, t, y)
+            delta_t = 1.0 / steps
+            x = x + drift * delta_t
+            x_dec = vae.decode(x / 0.18215).sample
+            img = torch.clamp(127.5 * x_dec + 128.0, 0, 255).permute(0, 2, 3, 1).to("cpu", dtype=torch.uint8).numpy()
+            img1 = img[0]
+            img_original.append(img1)
+    imageio.mimsave(gif_path, imgs, duration=0.1)
+    print(f"Saved gif to {gif_path}")
+    imageio.mimsave('noise.gif', imgs1, duration=0.1)
+    print(f"Saved gif to {gif_path}")
+    imageio.mimsave('std.gif', imgs2, duration=0.1)
+    imageio.mimsave('nothing.gif', img_original, duration=0.1)
+    print(f"Saved gif to {gif_path}")
+if __name__ == '__main__':
+    import argparse
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--ckpt', type=str, default='/gemini/space/gzy/w_w_last/Celeba/w_w_sit_1/0200000.pt')
+    parser.add_argument('--sitf2-ckpt', type=str,default='/gemini/space/gzy/w_w_last/Celeba/w_w_sit_1/results/depth-mu-2-014-SiT-XL-2-Linear-velocity-None/checkpoints/0010000.pt')
+    parser.add_argument('--steps', type=int, default=100)
+    parser.add_argument('--gif-path', type=str, default='sitf2_noise_evolution.gif')
+    parser.add_argument('--gif-path2', type=str, default='noise.gif')
+    parser.add_argument('--gif-path1', type=str, default='std.gif')
+    parser.add_argument('--image-size', type=int, default=256)
+    parser.add_argument('--device', type=str, default='cuda')
+    args = parser.parse_args()
+    main(
+        sit_ckpt=args.ckpt,
+        sitf2_ckpt=args.sitf2_ckpt,
+        image_size=args.image_size,
+        steps=args.steps,
+        gif_path=args.gif_path,
+        device=args.device
+    )