Mrbizarro committed 13 days ago

Commit

ffe929e

verified ·

1 Parent(s): 269dd41

Initial release: code, docs, hero samples

Browse files

Files changed (34) hide show

.gitattributes +17 -32
.gitignore +25 -0
CLAUDE.md +95 -0
LICENSE +27 -0
README.md +210 -0
STATE.md +83 -0
UPLOAD_TO_HF.md +102 -0
docs/EVALUATION.md +145 -0
docs/HIDREAM_O1_MLX_PORT_REPORT.md +146 -0
docs/PHOSPHENE_INTEGRATION_PLAN.md +158 -0
requirements.txt +15 -0
sample_outputs/hero/01_tea_master.png +3 -0
sample_outputs/hero/02_tropical_beach.png +3 -0
sample_outputs/hero/03_astronaut.png +3 -0
sample_outputs/hero/04_construction_worker.png +3 -0
sample_outputs/hero/05_mountain_peak.png +3 -0
sample_outputs/hero/06_alice_cyberpunk.png +3 -0
sample_outputs/hero/07_kitchen_morning.png +3 -0
sample_outputs/hero/08_fitness_BF16.png +3 -0
scripts/hidream_o1/__init__.py +0 -0
scripts/hidream_o1/_compile_bench.py +90 -0
scripts/hidream_o1/_edit_diag.py +81 -0
scripts/hidream_o1/_precompute_diag.py +88 -0
scripts/hidream_o1/anti_plastic_batch.sh +42 -0
scripts/hidream_o1/cinematic_batch.sh +49 -0
scripts/hidream_o1/convert_hidream_o1_to_mlx.py +210 -0
scripts/hidream_o1/creative_showcase.sh +56 -0
scripts/hidream_o1/flow_match.py +102 -0
scripts/hidream_o1/generate_hidream_o1_mlx.py +327 -0
scripts/hidream_o1/hidream_model.py +175 -0
scripts/hidream_o1/pipeline_helpers.py +420 -0
scripts/hidream_o1/postprocess.py +106 -0
scripts/hidream_o1/realism_batch.sh +51 -0
scripts/hidream_o1/showcase_batch.sh +39 -0

.gitattributes CHANGED Viewed

@@ -1,35 +1,20 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
 *.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
 *.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
 *.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text

+# Git LFS configuration for HuggingFace model release.
+# Weights live under mlx_models/ and are git-LFS-tracked.
+*.safetensors filter=lfs diff=lfs merge=lfs -text
 *.bin filter=lfs diff=lfs merge=lfs -text
+*.gguf filter=lfs diff=lfs merge=lfs -text
 *.ot filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
 *.tar filter=lfs diff=lfs merge=lfs -text
+*.tar.gz filter=lfs diff=lfs merge=lfs -text
+# Sample images default to regular git; the hero PNGs listed below override this rule and are LFS-tracked
+sample_outputs/**/*.png -filter -diff -merge text=auto
+sample_outputs/hero/01_tea_master.png filter=lfs diff=lfs merge=lfs -text
+sample_outputs/hero/02_tropical_beach.png filter=lfs diff=lfs merge=lfs -text
+sample_outputs/hero/03_astronaut.png filter=lfs diff=lfs merge=lfs -text
+sample_outputs/hero/04_construction_worker.png filter=lfs diff=lfs merge=lfs -text
+sample_outputs/hero/05_mountain_peak.png filter=lfs diff=lfs merge=lfs -text
+sample_outputs/hero/06_alice_cyberpunk.png filter=lfs diff=lfs merge=lfs -text
+sample_outputs/hero/07_kitchen_morning.png filter=lfs diff=lfs merge=lfs -text
+sample_outputs/hero/08_fitness_BF16.png filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,25 @@

+__pycache__/
+*.pyc
+*.pyo
+.venv/
+env/
+# Model weights — too large for code repo. Push to HF Hub separately
+# (see UPLOAD_TO_HF.md). Hero samples are kept under sample_outputs/hero/
+# for the model card; per-batch outputs are gitignored.
+mlx_models/
+# Per-batch sample outputs are scratch; only the curated hero set is checked in.
+sample_outputs/*.png
+sample_outputs/showcase/
+sample_outputs/showcase_q6/
+sample_outputs/showcase_q8/
+sample_outputs/showcase_creative/
+sample_outputs/showcase_realism/
+sample_outputs/showcase_antiplastic/
+sample_outputs/cinematic_*/
+sample_outputs/artifact_test/
+sample_outputs/ab_mflux/
+# Logs are local-only
+logs/

CLAUDE.md ADDED Viewed

	@@ -0,0 +1,95 @@

+# HIDREAM-O1-MLX-LAB — agent manual
+**Read this first** when entering this lab.
+## What this is
+A standalone lab for porting **HiDream-O1-Image-Dev** (8B Qwen3-VL-based unified pixel-patch transformer, MIT licence) to **MLX** for fast local image generation on Apple Silicon. Status as of 2026-05-09: **shipped to Phosphene `dev` branch**. Lab continues to host the conversion + inference scripts and serve as the home for future work (edit/multi-ref, 2048 generation, post-process experiments).
+## Where it lives
+- **This dir**: `/Users/salo/HIDREAM-O1-MLX-LAB-active/`
+- **Branch**: `perf-lab-hidream-o1-mlx` (local-only git, no remote yet)
+- **Outside `~/pinokio/`** deliberately so Pinokio cleanup can't touch it
+- **README.md** marker at root: "DO NOT DELETE"
+- **Phosphene integration** lives in `~/pinokio/api/phosphene-dev.git/agent/image_engine.py` (`kind="hidream"`), shipped on `dev` branch
+## Session-start protocol
+1. `git fetch && git status -sb` — check the branch
+2. Read **STATE.md** — current state, recent work, open items
+3. Read **docs/EVALUATION.md** — what HiDream is good/weak at, A/B vs mflux, perf numbers
+4. Read **docs/HIDREAM_O1_MLX_PORT_REPORT.md** — architecture details, weight conversion, Q4 vs Q8 finding
+5. Read **docs/PHOSPHENE_INTEGRATION_PLAN.md** — what we shipped to Phosphene and how
+## Layout
+```
+.
+├── README.md                                    DO NOT DELETE marker
+├── CLAUDE.md                                    this file
+├── STATE.md                                     current state, open items
+├── docs/
+│   ├── EVALUATION.md                            quality + perf, A/B vs mflux, blend experiment
+│   ├── HIDREAM_O1_MLX_PORT_REPORT.md            architecture, weight conversion
+│   └── PHOSPHENE_INTEGRATION_PLAN.md            integration plan + actual diff
+├── scripts/hidream_o1/
+│   ├── flow_match.py                            FlashFlowMatch scheduler in MLX
+│   ├── pipeline_helpers.py                      T2I sample, mrope, mask, patchify
+│   ├── hidream_model.py                         custom heads + forward_generation
+│   ├── convert_hidream_o1_to_mlx.py             HF safetensors -> MLX, Q4/6/8
+│   ├── generate_hidream_o1_mlx.py               T2I generator (CLI entry-point)
+│   ├── _compile_bench.py                        mx.compile A/B bench (0% gain — bandwidth-bound)
+│   └── showcase_batch.sh                        10-prompt showcase battery
+├── notes/weight_map.json                        cached HF safetensors index
+├── mlx_models/
+│   ├── hidream-o1-dev-q4/   (5.6 GB backbone + 75 MB custom heads)
+│   └── hidream-o1-dev-q8/   (9.96 GB backbone + 75 MB custom heads)
+├── sample_outputs/                              generated samples (gitignored)
+├── logs/                                        run logs
+└── .venv/                                       uv venv (mlx 0.31.2, mlx-vlm 0.5.0, transformers 5.8.0)
+```
+## How to run
+```bash
+# Generate one image (Q8 recommended)
+.venv/bin/python scripts/hidream_o1/generate_hidream_o1_mlx.py \
+  --model-path mlx_models/hidream-o1-dev-q8 \
+  --prompt "your prompt here" \
+  --width 1024 --height 1024 \
+  --output sample_outputs/whatever.png \
+  --seed 42
+# Re-convert from HF (only needed if you delete mlx_models/)
+.venv/bin/python scripts/hidream_o1/convert_hidream_o1_to_mlx.py \
+  --hf-source HiDream-ai/HiDream-O1-Image-Dev \
+  --out-dir mlx_models/hidream-o1-dev-q8 \
+  --bits 8 --check-disk
+```
+## Hard rules
+1. **Q8 only.** Q4 ships dark; the bright/colourful ground truth comes back at Q8. Documented in EVALUATION.md.
+2. **`s_noise=7.5` is load-bearing.** Lowering it collapses the image. FlashFlowMatch tuned for the Dev distillation.
+3. **28 steps.** Dev was distilled to 28; lower is undertrained.
+4. **Splitting safetensors after conversion = land mine.** The original converter overwrote source mmap mid-read and zeroed every weight silently. Now split happens inside the converter in one pass; never re-read+overwrite the same file.
+5. **Custom heads go into `extras/custom_heads.safetensors`** (subfolder so mlx-vlm's `glob *.safetensors` doesn't pick them up).
+6. **Phosphene `agent/image_engine.py` calls this lab via subprocess** — don't import mlx-vlm into Phosphene's interpreter.
+7. **No edit/multi-ref support yet.** Architecture supports it, lab pipeline doesn't. Refs through Phosphene continue to use `mflux qwen-edit`.
+## Performance ceiling
+`mx.compile` on the forward pass = **0% gain**. We are bandwidth-bound on the 36-layer Q8 decoder. **2.36 s/step at 1024 is the floor** on this hardware. To go faster you need a smaller distillation, fewer steps, or text-cache reuse across denoising steps (~2-5% gain at most, very invasive).
+## Identity rules
+- Lab repo is local-only, **no remote** — commit author is `hidream-o1-mlx-lab <lab@local>` (cosmetic; doesn't matter)
+- **Phosphene-dev.git commits**: identity is `mrbizarro <mrbizarro@users.noreply.github.com>`. **No Co-Authored-By trailer.** Branch is `dev`, never `main` without explicit OK.
+## Cross-references
+- Phosphene CLAUDE.md: `~/pinokio/api/phosphene-dev.git/CLAUDE.md`
+- HF model: https://huggingface.co/HiDream-ai/HiDream-O1-Image-Dev
+- Reference repo: https://github.com/HiDream-ai/HiDream-O1-Image
+- mlx-vlm qwen3_vl: https://github.com/Blaizzy/mlx-vlm/tree/main/mlx_vlm/models/qwen3_vl

LICENSE ADDED Viewed

	@@ -0,0 +1,27 @@

+MIT License
+Copyright (c) 2026 mrbizarro and contributors
+This project (hidream-o1-mlx) is an MLX port of HiDream-O1-Image-Dev for Apple
+Silicon. The upstream HiDream-O1-Image source code (https://github.com/HiDream-ai/HiDream-O1-Image)
+and the model weights (https://huggingface.co/HiDream-ai/HiDream-O1-Image-Dev)
+are released under the MIT License by HiDream-ai. This port preserves that
+license.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

README.md ADDED Viewed

	@@ -0,0 +1,210 @@

+---
+license: mit
+base_model: HiDream-ai/HiDream-O1-Image-Dev
+tags:
+  - mlx
+  - mlx-vlm
+  - hidream
+  - text-to-image
+  - apple-silicon
+  - bf16
+language:
+  - en
+pipeline_tag: text-to-image
+library_name: mlx
+inference: false
+---
+# HiDream-O1-Image-Dev — MLX port for Apple Silicon
+A native MLX port of [HiDream-ai/HiDream-O1-Image-Dev](https://huggingface.co/HiDream-ai/HiDream-O1-Image-Dev) for fast local image generation on Apple Silicon Macs. **No PyTorch, no CUDA, no flash-attn required at inference time.**
+HiDream-O1 is an 8B Qwen3-VL-based **unified pixel-patch transformer** — it predicts raw 32×32 RGB patches directly through the same backbone that handles text, with no separate VAE. The Dev variant is a 28-step distillation of the 50-step Full model, released under the MIT license.
+This port:
+- Reuses [`mlx-vlm`](https://github.com/Blaizzy/mlx-vlm)'s Qwen3-VL backbone (vision tower, decoder layers, mrope-3D)
+- Adds the three diffusion-side custom heads (`t_embedder1`, `x_embedder`, `final_layer2`)
+- Ports the `FlashFlowMatchEulerDiscreteScheduler` and the unified-token-sequence builder
+- Ships **BF16 weights** (no quantization — see "Why BF16" below)
+## Hero samples
+All generated by the included generator script on a 64 GB Mac Studio. Click any image to open full-resolution.
+<table>
+<tr>
+<td><a href="sample_outputs/hero/04_construction_worker.png"><img src="sample_outputs/hero/04_construction_worker.png" width="350"/></a></td>
+<td><a href="sample_outputs/hero/01_tea_master.png"><img src="sample_outputs/hero/01_tea_master.png" width="350"/></a></td>
+</tr>
+<tr>
+<td>Construction worker on a rainy rooftop, Kodak Tri-X B&amp;W. 2048×2048, BF16, 213s.</td>
+<td>Elderly Japanese tea master holding a ceramic cup. 1024×1024, Q6 (showcase), 36s.</td>
+</tr>
+<tr>
+<td><a href="sample_outputs/hero/02_tropical_beach.png"><img src="sample_outputs/hero/02_tropical_beach.png" width="350"/></a></td>
+<td><a href="sample_outputs/hero/07_kitchen_morning.png"><img src="sample_outputs/hero/07_kitchen_morning.png" width="350"/></a></td>
+</tr>
+<tr>
+<td>Tropical beach with turquoise water and palms. 1024×1024, Q8, 67s.</td>
+<td>Candid morning portrait, woman with coffee + toast, soft window light. 1440×2560, BF16, 127s.</td>
+</tr>
+<tr>
+<td><a href="sample_outputs/hero/03_astronaut.png"><img src="sample_outputs/hero/03_astronaut.png" width="350"/></a></td>
+<td><a href="sample_outputs/hero/05_mountain_peak.png"><img src="sample_outputs/hero/05_mountain_peak.png" width="350"/></a></td>
+</tr>
+<tr>
+<td>Astronaut in space-station corridor, anamorphic lens flare. 2560×1440, BF16, 187s.</td>
+<td>Snow-capped mountain peak at sunset. 2048×2048, Q4 (early), 236s.</td>
+</tr>
+<tr>
+<td><a href="sample_outputs/hero/06_alice_cyberpunk.png"><img src="sample_outputs/hero/06_alice_cyberpunk.png" width="350"/></a></td>
+<td><a href="sample_outputs/hero/08_fitness_BF16.png"><img src="sample_outputs/hero/08_fitness_BF16.png" width="350"/></a></td>
+</tr>
+<tr>
+<td>Alice in cyberpunk, neon Cheshire cat hologram. 2048×2048, Q8, 276s.</td>
+<td>Fitness influencer mid-deadlift in industrial gym. 1440×2560, BF16, 127s.</td>
+</tr>
+</table>
+More: [`sample_outputs/hero/`](sample_outputs/hero/).
+## Why BF16, not Q4/Q6/Q8
+| Quant | Backbone size | 1024×1024 wall | Quality |
+|---|---|---|---|
+| Q4 | 5.6 GB | 25 s | ❌ Brightness collapses — ships dark |
+| Q6 | 8 GB | 36 s | ⚠ Visible 32-px patch grid at non-square dims |
+| Q8 | 10 GB | 67 s | ⚠ Same — works only at square 2048×2048 |
+| **BF16** | **17.55 GB** | **67 s** | ✅ Clean across all trained dimensions |
+Per-group dequantization rounding compounds across the 36 decoder layers and shows up as a 32-pixel grid in flat regions (skies, walls, water). BF16 matches the upstream's `torch_dtype=torch.float32 + autocast(bfloat16)` precision and is the only precision we tested that produces clean output across all trained dimensions. On a 64 GB Mac the 16 GB working set is comfortable; on 32 GB it's tight — use Q8 at square 2048×2048 there.
+## Install
+Requires macOS on Apple Silicon (M1 or newer). Tested on macOS 14+ with a 64 GB Mac Studio.
+```bash
+git clone https://github.com/<you>/hidream-o1-mlx
+cd hidream-o1-mlx
+uv venv --python 3.11
+uv pip install -r requirements.txt
+# Convert the upstream HF weights to MLX BF16 (~5 minutes, requires ~50 GB free disk)
+.venv/bin/python scripts/hidream_o1/convert_hidream_o1_to_mlx.py \
+  --hf-source HiDream-ai/HiDream-O1-Image-Dev \
+  --out-dir mlx_models/hidream-o1-dev-bf16 \
+  --bits 16
+```
+## Usage
+```bash
+# Single image, default 1024×1024 BF16
+.venv/bin/python scripts/hidream_o1/generate_hidream_o1_mlx.py \
+  --model-path mlx_models/hidream-o1-dev-bf16 \
+  --prompt "your prompt here" \
+  --output sample_outputs/whatever.png \
+  --seed 42
+# Higher resolution (2048×2048 = upstream default)
+.venv/bin/python scripts/hidream_o1/generate_hidream_o1_mlx.py \
+  --model-path mlx_models/hidream-o1-dev-bf16 \
+  --prompt "..." \
+  --width 2048 --height 2048 \
+  --output sample_outputs/big.png
+# Vertical / cinema (auto-snaps to nearest trained ratio)
+.venv/bin/python scripts/hidream_o1/generate_hidream_o1_mlx.py \
+  --model-path mlx_models/hidream-o1-dev-bf16 \
+  --prompt "..." \
+  --width 1440 --height 2560 \
+  --output sample_outputs/portrait.png
+```
+### Trained resolutions
+HiDream-O1 was trained on a fixed list of resolutions. The generator auto-snaps to the closest. Off-spec dims produce visible patch artifacts. The trained list:
+```
+2048×2048, 2304×1728, 1728×2304, 2560×1440, 1440×2560,
+2496×1664, 1664×2496, 3104×1312, 1312×3104, 2304×1792, 1792×2304
+```
+## Prompt tips for realism
+HiDream is responsive to camera/film terminology. To avoid the AI-glossy look:
+- Lead with `masterpiece, best quality` (a phrase the community has found the model responds to)
+- Subject + Actions → Setting → Style → Details ordering
+- Specify equipment: `Leica M6 with Kodak Tri-X 400`, `Pentax K1000 + Cinestill 800T`, `Hasselblad H6D medium format`
+- Reference real photographers: Sebastião Salgado, Saul Leiter, Wim Wenders, Annie Leibovitz, Anders Petersen
+- Spell out skin imperfection: "natural pores", "faint laugh lines", "weathered hands", "no retouching"
+- Avoid "stunning", "perfect", "beautiful" — they push toward AI-glamour aesthetics
+The Dev model uses `guidance_scale=0.0` so negative prompts have no effect — push positive prompts harder instead.
+## What's in this repo
+```
+hidream-o1-mlx/
+├── README.md                                 (this file)
+├── LICENSE                                   (MIT)
+├── requirements.txt                          (mlx-vlm 0.5.0, transformers 5.8+, deps)
+├── scripts/hidream_o1/
+│   ├── convert_hidream_o1_to_mlx.py          (HF → MLX, BF16 / Q4 / Q6 / Q8)
+│   ├── generate_hidream_o1_mlx.py            (T2I generator + experimental edit/multi-ref)
+│   ├── hidream_model.py                      (custom heads + forward_generation)
+│   ├── pipeline_helpers.py                   (T2I sample, mrope, mask, patchify)
+│   └── flow_match.py                         (FlashFlowMatchScheduler in MLX)
+├── docs/
+│   ├── EVALUATION.md                         (perf + quality findings, A/B vs mflux)
+│   ├── HIDREAM_O1_MLX_PORT_REPORT.md         (architecture + weight conversion details)
+│   └── PHOSPHENE_INTEGRATION_PLAN.md         (how it slots into a host app)
+├── sample_outputs/                           (gallery)
+└── mlx_models/                               (where converted weights land)
+```
+## Performance
+| Resolution | Per step | Total (28 steps) | Peak RAM |
+|---|---|---|---|
+| 1024×1024 | 2.4 s | 67 s | 16 GB |
+| 1440×2560 | 4.5 s | 127 s | 16 GB |
+| 2048×2048 | 6.7 s | 187 s | 16 GB |
+| 3104×1312 | 7.6 s | 213 s | 16 GB |
+`mx.compile` gives 0% speedup — the inference loop is bandwidth-bound on the 36-layer BF16 decoder. To go faster you'd need a smaller distillation (none public) or text-cache reuse across denoising steps.
+## Status
+- ✅ Text-to-image: production-quality, BF16 default
+- ✅ Native MLX, no PyTorch / CUDA / flash-attn at inference time
+- ⚠ Edit / multi-reference: scaffolding present (`--ref-images` flag) but produces degenerate output — needs debugging. Refs through other engines (e.g. `mflux qwen-edit`) work correctly.
+- ❌ Multi-reference subject personalization: same as above
+## Acknowledgements
+- [HiDream-ai](https://github.com/HiDream-ai) for the original HiDream-O1-Image model + MIT license
+- [Blaizzy/mlx-vlm](https://github.com/Blaizzy/mlx-vlm) for the Qwen3-VL MLX backbone (this port reuses their vision tower + decoder layers + mrope-3D wholesale)
+- [Apple ml-explore/mlx](https://github.com/ml-explore/mlx) for the MLX framework
+- The Civitai community's [HiDream prompt-engineering guide](https://civitai.com/articles/16050/hi-dream-prompt-engineering)
+## Citation
+If you use this in research, cite the upstream model:
+```bibtex
+@misc{hidream-o1-image,
+  author = {HiDream-ai},
+  title = {HiDream-O1-Image: Pixel-Level Unified Transformer},
+  year = {2026},
+  url = {https://github.com/HiDream-ai/HiDream-O1-Image}
+}
+```
+## License
+MIT — see [LICENSE](LICENSE).

STATE.md ADDED Viewed

	@@ -0,0 +1,83 @@

+# HIDREAM-O1-MLX-LAB — STATE
+**Last updated:** 2026-05-09 (session that landed Q8 + Phosphene integration)
+## TL;DR — where we are
+- **Q6 is the new sweet spot.** 1.30 s/step at 1024×1024, ~36 s per image, ~8.5 GB RAM. 2× faster than Q8 with equivalent quality.
+- Q8 still works (2.36 s/step, 11.5 GB RAM) — keep it for deterministic upper-bound RAM use cases.
+- Q4 deleted from disk: ships dark, no reason to keep around (regenerable in 5 min if needed).
+- Backbone sizes: Q6 backbone 7.95 GB, Q8 backbone 9.96 GB. Custom heads 75 MB.
+- **Shipped to Phosphene `dev`** as `kind="hidream"` in `agent/image_engine.py` (commits `45cad69`, `962b353`). Default model on Phosphene side will be updated to Q6.
+- Showcase battery + A/B vs mflux Z-Image-Turbo done at Q8. At Q6, HiDream is now ~2× faster than Z-Image-Turbo (36 s vs 80 s) AND its RAM use is deterministic (a steady 8.5 GB, vs Z-Image-Turbo's, which varies between 5.9 and 29.4 GB).
+- 19+ sample images in `sample_outputs/`.
+- Lab branch: `perf-lab-hidream-o1-mlx`, **no remote**.
+## What's been done
+| Date | Work | Commit |
+|---|---|---|
+| 2026-05-09 | Initial scaffolding (Path B chosen) | `746efe9` |
+| 2026-05-09 | Wire mlx-vlm Qwen3VLModel directly (4D mask path) | `53eb605` |
+| 2026-05-09 | First working images (mushroom 512, cat/beach/portrait 1024 at Q4) | `d944a31` |
+| 2026-05-09 | Q8 conversion + samples (dark aesthetic was Q4, not the model) | `2bf029a` |
+| 2026-05-09 | Showcase battery + evaluation + Phosphene plan | `0bac049` |
+| 2026-05-09 | Phosphene integration shipped to `dev` | phos `45cad69` |
+| 2026-05-09 | A/B vs mflux Z-Image-Turbo on 3 prompts | `2761ad8` |
+| 2026-05-09 | Phosphene IMAGE_GEN_RESEARCH doc updated | phos `962b353` |
+| 2026-05-09 | --blend-seams post-process (opt-in, below-threshold at Q8) | `0583356` |
+| 2026-05-09 | Q6 = sweet spot (2× faster than Q8, same quality) — Phosphene default switched | `f4fb0ba` + phos `8a48953` |
+| 2026-05-09 | Q6 verified across 10-prompt showcase battery | `4d3f18c` |
+| 2026-05-09 | Edit/multi-ref scaffold (WIP — runs but output degenerate) | `525b7ec` |
+| 2026-05-09 | BF16 default — Q4/Q6/Q8 all show patch-grid at non-square dims | (next) |
+| 2026-05-09 | Phosphene default switched to BF16 | phos `af94bd0` |
+| 2026-05-09 | OSS release prep: HF model card, LICENSE, requirements, gitignore | (next) |
+## Known characteristics (not bugs)
+- **Patch grid in flat regions** — architectural (PATCH_SIZE=32 with no overlap). Mild at Q8. `--blend-seams 1` is opt-in but doesn't visibly help.
+- **Text rendering** — short, structured signs work ("BLOOM CAFE"). Long text falls apart.
+- **Deterministic per-prompt RAM** — 11.5 GB at 1024 Q8 regardless of prompt complexity. Z-Image-Turbo varies wildly (5.9–29.4 GB).
+## Open work / next session candidates
+Pick from these, listed roughly cheapest-first:
+1. ~~**2048×2048 Q8 generation pass**~~ — DONE 2026-05-09. `sample_outputs/v4_2048_alice_q8.png` — 276 s (9.86 s/step), peak RAM 10.8 GB. Q8 at 2048 is slower per step than Q4 (10s vs 8.4s) due to bandwidth. Output is showcase-grade: detailed cybernetic dress, holographic Cheshire cat, near-legible neon signs.
+2. **Test the Phosphene integration through the dev panel UI** (port 8199). Generate one shot via the Image Studio dropdown, confirm pill goes green, the PNG lands.
+3. **Edit / multi-reference path** — SCAFFOLD LANDED, NEEDS DEBUGGING.
+   - `build_edit_text_sample`, `resize_pilimage`, `calculate_dimensions`, `patchify_ref_image` all ported from upstream pipeline.py + utils.py.
+   - `--ref-images` flag wired in generate_hidream_o1_mlx.py.
+   - `precompute_text_embeds_with_vision` precomputes the text+vision embeds once before the loop (since they don't change with timestep) — a meaningful perf win.
+   - **Smoke test (synthesized two-color ref, K=1, 28 steps, Q6) runs end-to-end without errors but output is uniform tan/khaki.** T2I path with same prompt+seed produces a vibrant abstract correctly, so the model and weights are fine.
+   - Debugging done so far (see `scripts/hidream_o1/_edit_diag.py` and `_precompute_diag.py`):
+     - **All shapes verified correct** (input_ids 174 with 144 image-placeholders, vision tower outputs 144 features, vinput_mask = 256 tgt + 256 ref, position_ids 686 covering all spans).
+     - **Vision feature scatter verified mathematically correct** — at image_token positions `combined` equals `image_features` exactly (diff=0); at text positions `combined` equals `embed_tokens(input_ids)` exactly (diff=0). Vision features are well-behaved (mean ~0, std ~0.4).
+     - **Position_ids structure looks right** — text positions are sequential, target span gets fix_point=4096 base (per upstream), ref diffusion span continues sequentially.
+   - **Remaining suspects** (in order of likelihood):
+     - Mask construction: maybe text-row causal needs to ALSO see the K image_placeholder positions inside proc.input_ids? Upstream `_run_decoder_flash` has special handling — the non-flash 4D mask path may treat text positions as needing to see embedded vision features. Worth re-reading qwen3_vl_transformers.py:1486-1520.
+     - Position_ids semantic alignment: my appended-vinputs at positions [174..686) get mrope codes from input_ids_pad's vision_tokens portion, but maybe these need to match the appended embedding ORDER not just their positions in input_ids_pad.
+     - bf16 underflow in attention with the larger 686-token sequence vs T2I's 268.
+   - Samples: `sample_outputs/v6_edit_smoke.png` (degenerate, synthesized 2-color ref), `sample_outputs/v6_edit_cat_real.png` (degenerate, real cat photo as ref), `sample_outputs/v6_edit_t2i_baseline.png` (T2I works fine same prompt+seed).
+   - This is the single biggest open item. Would let HiDream replace mflux qwen-edit functionally.
+4. **Promote Phosphene integration to `main`** after the user has tested on dev panel.
+5. **Quality-aware post-process** — try a cheap learned upscaler instead of the seam blend (e.g. SeedVR2 via mflux's `mflux-upscale-seedvr2` to take 1024 → 2048).
+6. **Text-cache reuse across denoising steps** — fork mlx-vlm's Qwen3VLModel to cache the text-portion KV across the 28 denoising calls. ~2-5% speedup max but a real architectural improvement.
+## Hard stop conditions (still relevant)
+- Q4 ships dark — established. Use Q8.
+- mx.compile = 0% gain — established. Inference loop is at the floor.
+- Splitting safetensors mid-read zeroed weights — fixed in converter; don't re-introduce.
+## How to ramp up fast (next session)
+1. `cd /Users/salo/HIDREAM-O1-MLX-LAB-active`
+2. `cat README.md CLAUDE.md STATE.md docs/EVALUATION.md` (in that order)
+3. `git log --oneline | head -10` to see where we are
+4. `ls sample_outputs/` to see what's been generated
+5. To regenerate or extend: see the commands in CLAUDE.md
+## Disk situation snapshot
+As of 2026-05-09 the data volume `/dev/disk3s5` had ~45 GB free of 926 GB after the user's mid-session cleanup (deleted `phosphene-model-lab.git` and `comfy.git`, freed ~83 GB). The lab itself is ~16 GB on disk (10 GB Q8 + 6 GB Q4 models + 1.5 GB venv + samples + lab code). **Do not re-download** the HiDream HF source unless `mlx_models/hidream-o1-dev-q4/` AND `mlx_models/hidream-o1-dev-q8/` both go missing — both can be regenerated from one HF download.

UPLOAD_TO_HF.md ADDED Viewed

	@@ -0,0 +1,102 @@

+# How to publish this to Hugging Face
+Two-repo plan (recommended):
+1. **Code repo** on GitHub: `https://github.com/<you>/hidream-o1-mlx`
+2. **Weights repo** on HF Hub: `https://huggingface.co/<you-or-org>/HiDream-O1-Image-Dev-mlx-bf16`
+Splitting them keeps clones fast (people who just want the code don't pull the 17 GB safetensors via LFS) while still making the weights one-click pullable via `huggingface_hub`.
+## Option A — separate code + weights repos
+### 1. Code repo on GitHub
+```bash
+cd /Users/salo/HIDREAM-O1-MLX-LAB-active
+# Initialize a new public-friendly remote
+git remote add origin git@github.com:<you>/hidream-o1-mlx.git
+# .gitignore mlx_models/ so weights don't go to GitHub (HF Hub will host them)
+echo "mlx_models/" >> .gitignore
+git add -A
+git commit -m "Initial public release"
+git branch -M main
+git push -u origin main
+```
+### 2. Weights repo on HF Hub
+```bash
+# Install hf CLI if needed
+pip install huggingface_hub
+# Login once
+hf auth login   # paste a write-token from https://huggingface.co/settings/tokens
+# Create the repo
+hf repo create HiDream-O1-Image-Dev-mlx-bf16 --type model
+# Upload only the weights dir + config + tokenizer + custom heads
+cd mlx_models/hidream-o1-dev-bf16
+hf upload <you>/HiDream-O1-Image-Dev-mlx-bf16 . . \
+  --repo-type model \
+  --commit-message "Initial BF16 release"
+```
+What gets uploaded (~17.5 GB total):
+- `model.safetensors` (17 GB) — backbone, mlx-vlm-loadable
+- `extras/custom_heads.safetensors` (75 MB) — diffusion-side heads
+- `config.json` — Qwen3-VL config (no `quantization` field for BF16)
+- `tokenizer.json`, `tokenizer_config.json`, `vocab.json`, `merges.txt`, `chat_template.json`
+- `preprocessor_config.json`, `video_preprocessor_config.json`
+- `mlx_lab_meta.json` — provenance marker
+### 3. Cross-reference
+In the GitHub README, point to the HF weights repo. In the HF model card README (which we already prepped), point to the GitHub code.
+## Option B — single HF repo with everything
+If you want the simplest user experience (`hf download <repo>` → ready to run):
+```bash
+hf repo create hidream-o1-mlx --type model
+cd /Users/salo/HIDREAM-O1-MLX-LAB-active
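+# Track .py + .md as plain files; .safetensors via LFS (already in .gitattributes).
+# NOTE: .gitignore excludes mlx_models/, so `git add -A` below will skip the weights —
+# force-add them first (`git add -f mlx_models/hidream-o1-dev-bf16`) or drop that ignore entry.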
+git remote add hf https://huggingface.co/<you>/hidream-o1-mlx
+git lfs install
+git lfs track "*.safetensors"
+git add -A
+git commit -m "Initial release"
+git push hf main
+```
+People then do:
+```bash
+hf download <you>/hidream-o1-mlx --local-dir hidream-o1-mlx
+cd hidream-o1-mlx
+uv venv --python 3.11 && uv pip install -r requirements.txt
+.venv/bin/python scripts/hidream_o1/generate_hidream_o1_mlx.py --prompt "..." --output out.png
+```
+## What NOT to upload
+- `.venv/` — gitignored
+- `logs/` — gitignored
+- `notes/` — internal scratch, optional
+- `__pycache__/` — gitignored
+- `mlx_models/hidream-o1-dev-q4/`, `mlx_models/hidream-o1-dev-q6/`, and `mlx_models/hidream-o1-dev-q8/` — only ship BF16. The quantized variants are regenerable with `--bits 4|6|8` and have known quality issues at non-square dims.
+## Pre-flight checklist
+- [ ] LICENSE file (MIT) at root — done
+- [ ] README.md as HF-format model card — done
+- [ ] requirements.txt with pinned versions — done
+- [ ] .gitattributes for LFS — done
+- [ ] No personal paths (`/Users/salo/...`) hardcoded in scripts that aren't optional — verify with `grep -r "/Users/salo" scripts/`
+- [ ] Sample images included for the model card — copy 4-6 best to `sample_outputs/hero/`
+- [ ] Test fresh clone install on a different machine if possible

docs/EVALUATION.md ADDED Viewed

	@@ -0,0 +1,145 @@

+# HiDream-O1-Image-Dev (Q8 MLX) — evaluation
+**Setup:** lab branch `perf-lab-hidream-o1-mlx`, mlx-vlm 0.5.0 + mlx 0.31.2, Mac Studio (64 GB).
+**Recipe:** Dev — 28 steps, FlashFlowMatch, `s_noise=7.5`, `noise_clip_std=2.5`, `shift=1.0`.
+**All times** are honest wall-clock with `mx.eval` per step. **All RAM** is peak `maximum resident set size`.
+## Q6 showcase verification (2026-05-09 evening)
+Re-ran the same 10-prompt battery at Q6 with identical seeds. **All 10 are visually equivalent or better than the Q8 versions:**
+- 9/10 are near-pixel-identical aesthetics (different latent noise from quant differences yields same compositions / lighting / subjects)
+- **10 (text rendering) is visibly better at Q6** — "BLOOM CAFE" neon sign is crisp at Q6 vs a glitched "M" at Q8
+Per-image timing was rock-steady at **35.9 s** (1.28 s/step). Total battery time: ~6 minutes vs ~12 minutes at Q8.
+Outputs: `sample_outputs/showcase_q6/` (compare against `sample_outputs/showcase/` for the Q8 originals).
+## Battery: 10 prompts, 1024×1024, all Q8
+| # | Genre | Prompt summary | Result | Time |
+|---|---|---|---|---|
+| 01 | photo portrait | elderly Japanese tea master | **Excellent** — face character, gentle smile, paper screens, calligraphy | 81.5 s* |
+| 02 | anime / illustration | pink-haired girl on Tokyo rooftop at dusk | **Excellent** — anime style + cherry blossoms + neon city below | 65.3 s |
+| 03 | macro photo | dewdrop on spiderweb | **Excellent** — refractions, blurred leaf bg, crisp web detail | 65.9 s |
+| 04 | architecture | futuristic library, holographic displays | **Excellent** — vaulted ceiling, stained glass, holo screens | 66.3 s |
+| 05 | surreal painting | whale floating over desert at sunset | **Excellent** — magical realism, painterly clouds | 65.8 s |
+| 06 | food flatlay | rustic Italian breakfast on marble | **Excellent** — golden croissants, espresso, berries, soft light | 66.4 s |
+| 07 | cinematic action | samurai mid-leap with katana, Mt. Fuji bg | **Excellent** — dynamic pose, cherry blossoms, real mountain | 66.1 s |
+| 08 | fantasy | dragon on crystal mountain with aurora | **Excellent** — iridescent scales, snow swirling, aurora visible | 66.4 s |
+| 09 | wildlife photo | snow leopard staring at camera | **Excellent** — direct gaze, falling snow, mountain bg | 67.1 s |
+| 10 | text rendering | "BLOOM CAFE" pink neon diner | **Good** — sign legible (small "M" glitch), retro diner, rainy street | 67.1 s |
+*Image 01 included cold model load (~12-15 s).
+**Steady-state per-image: 65-67 s at 1024×1024 Q8.** Dead-consistent across genres.
+## Honest timings
+| Resolution | Quant | Per step | Total (28 steps) | Peak RAM |
+|---|---|---|---|---|
+| 512×512 | Q4 | 0.89 s | 24.9 s | ~6 GB |
+| 1024×1024 | Q4 | 2.37 s | 66 s | ~6 GB |
+| 1024×1024 | **Q6** | **1.30 s** | **36 s** | **~8.5 GB** |
+| 1024×1024 | Q8 | 2.36 s | 66 s | ~11.5 GB |
+| 1280×704 | Q8 | 2.53 s | 70.7 s | ~7 GB |
+| 704×1280 | Q8 | 2.35 s | 65.9 s | ~3 GB (warm cache) |
+| 2048×2048 | Q4 | 8.44 s | 236 s | ~7.2 GB |
+| 2048×2048 | Q8 | 9.86 s | 276 s | ~10.8 GB |
+**Q6 is the sweet spot.** 2× faster than Q8 at 1024 with the same prompt fidelity (cat in sunlit kitchen + beach with palm trees both rendered identically to Q8 outputs). 30% less RAM. The bandwidth-bound theory holds: fewer bits per param → less weight bandwidth → faster per-step.
+**Q4 corrupts brightness** (ships dark) so the speed of Q4 vs Q6 is academic — never use Q4 for production. Q6 has the speed and Q8 has the steady-state safety; Q6 wins on perf, Q8 wins on a deterministic upper bound on RAM.
+## Where HiDream-O1-Image-Dev shines
+- **Subject identity** — every prompt subject was rendered correctly. No "vibrant orange tabby" → cat-shape-blob. The model knows what things look like.
+- **Multi-element scenes** — samurai + Fuji + cherry blossoms; cyberpunk Alice + neon Cheshire cat + circuit dress + rain. Composition stays coherent.
+- **Style adherence** — anime ≠ photorealism ≠ oil painting ≠ macro. Got all four right.
+- **Light realism** — the architecture image's light through stained glass; the food flatlay's morning warmth; the action scene's sunset rim lighting. Light feels real, not stamped on.
+- **Text rendering** (limited) — "BLOOM CAFE" in neon was readable. Better than most diffusion models; not as clean as a model with explicit OCR pretraining.
+## Where it's weak
+- **Patch-grid artifact** in flat regions. PATCH_SIZE=32 with no overlap → visible 32×32 grid in skies, water, walls. Most visible at low-frequency content. Architectural — not fixable without retraining or an overlap-blending postprocess.
+- **Q4 brightness collapse** — Q4 desaturates and darkens everything. Q6 and Q8 both fix it. **Ship Q6 or Q8, never Q4.**
+- **Hands** — hands when present in scenes (e.g. tea master holding cup) look fine at moderate detail, but the model isn't immune to the standard diffusion hand failure modes; haven't stress-tested.
+- **Dense long text** — "BLOOM CAFE" is short and structured. A paragraph of text would likely fall apart.
+- **Speed at 2048** — 4 minutes per image is slow for iterative work. Fine for a final pass.
+## Sweet spot
+**1024×1024, Q6, default Dev recipe, ~36 s/image, ~8.5 GB RAM.** Bright/colourful output equivalent to Q8, half the wall time, 30% less RAM. 512 is fast (~25 s) but loses detail. 2048 is gorgeous but iterative-unfriendly.
+**Quant decision tree:**
+- 16 GB Mac → don't run HiDream; use mflux Z-Image-Turbo
+- 32 GB Mac → Q6 is comfortable, Q8 leaves no headroom alongside LTX
+- 64 GB Mac → Q6 default; Q8 only when you want deterministic upper-bound RAM
+## A/B vs mflux Z-Image-Turbo
+Same prompts, same seeds, both at 1024×1024.
+| # | Prompt | HiDream Q8 | Z-Image-Turbo Q4 (mflux) | Subjective winner |
+|---|---|---|---|---|
+| 1 | tea master | [v3](../sample_outputs/showcase/01_portrait_photo.png) — wide scene, paper screens, calligraphy | [zimg](../sample_outputs/ab_mflux/01_portrait_zimage.png) — tighter portrait, gray garment, smile | **Tie** — different framings, both excellent |
+| 2 | sunlit beach | [v3](../sample_outputs/v3_1024_beach_q8.png) — turquoise water, palm trees, beach chair | [zimg](../sample_outputs/ab_mflux/02_beach_zimage.png) — vivid blue water, palms, big sand foreground | **Tie** — both nail the prompt |
+| 3 | alice cyberpunk | [v3](../sample_outputs/v3_alice_horizontal_q8.png) (horizontal) — clear dress + face + Cheshire | [zimg](../sample_outputs/ab_mflux/03_alice_zimage.png) — more painterly, atmospheric Cheshire silhouette | **HiDream** for face/dress detail; **Z-Image** for atmosphere |
+### Speed + RAM (measured, not estimated)
+| Engine | Steps | Wall (1024) | Per step | Peak RAM |
+|---|---|---|---|---|
+| HiDream-O1-Dev / Q8 | 28 | **67 s** | 2.41 s | **11.5 GB** |
+| Z-Image-Turbo / Q4 | 9 | 80 s | 8.85 s | **5.9–29.4 GB** (varies by prompt) |
+Surprises:
+- HiDream is **faster per image** despite needing 28 steps vs Z-Image-Turbo's 9 — Z-Image's per-step cost is ~3.7× HiDream's.
+- Z-Image's peak RAM **varied wildly across prompts** (5.9 GB for portrait, 29.4 GB for the alice cyberpunk). HiDream's peak was steady at ~11.5 GB regardless of prompt complexity.
+### Verdict
+Both are excellent local engines. Pick by the workload:
+- **Default/compact**: keep **Z-Image-Turbo** — 5.9 GB RAM on most prompts, runs anywhere.
+- **Hero shots / max prompt fidelity**: **HiDream-O1-Q8** — faster wall time, deterministic memory, more environmental detail in the output.
+- **Editing / multi-ref**: keep **mflux qwen-edit** — HiDream lab pipeline doesn't support refs yet.
+## Patch-grid post-blend experiment
+Implemented `--blend-seams <radius>` post-process in `generate_hidream_o1_mlx.py`: after decoding the final image, average a thin band across each 32-pixel patch boundary line (radius=1 → blend the seam row with one neighbour on each side, then 50% blend back into the seam itself).
+**Result on the same beach prompt + seed 11 + Q8:**
+| Comparison | Mean abs diff (out of 255) |
+|---|---|
+| baseline vs blend r=1 | 0.18 |
+| baseline vs blend r=2 | 0.23 |
+Per-row breakdown confirms the blend is **surgical** — only seam rows (every 32) change, by 1–2.7 pixel values; non-seam rows shift by <0.2. So the math is doing exactly what it says.
+**But visually**: at Q8 the seam artifact is already mild. The blend's 1–2 pixel-value smoothing is below visual threshold. No win, but no harm — and zero added latency (numpy vector ops on a 1024×1024 image are sub-ms).
+Bottom line: kept as opt-in flag `--blend-seams 1`. Did not enable by default. The real fix for the patch grid would need overlap-blended patches (architectural change) or a stronger spatial filter (which would visibly blur the image).
+## Software-side speed: nothing left
+Tested `mx.compile` on the forward pass: **0% improvement** (2.366 s/step compiled vs 2.368 s/step uncompiled). The forward is already bandwidth-bound by the 36-layer Q8 decoder's matmul stream — MLX is already at near-GPU-saturation. Same conclusion for `mx.fast.scaled_dot_product_attention` (already used inside mlx-vlm's Qwen3VLAttention).
+**The path to faster is architectural, not algorithmic:**
+- Fewer steps (would need a smaller distillation; Dev is already the distilled variant)
+- Smaller backbone (would need re-distillation onto a 4B Qwen3-VL — no public version)
+- Caching the text-portion hidden states across denoising steps — possible but invasive (would need to subclass mlx-vlm's Qwen3VLModel; ~2-5% speedup at best since text is <2% of seq length)
+## Verdict
+- **Working.** Q8 produces real, prompt-faithful, high-quality images at ~67 s/1024.
+- **No more easy speedups.** The lab's inference loop is already at the floor for this architecture on this hardware.
+- **Patch artifacts are real but mild.** Low-frequency regions show a 32-pixel grid. Subjects-with-content scenes hide it well.
+- **Q6 and Q8 are the acceptable quants.** Q4 ships dark. Q6 is already the smaller, faster variant (see the Q6 showcase verification above); going below Q6 would need different bit packing or selectively keeping sensitive layers at Q6.
+## Recommendation for Phosphene
+Slot it in as a third local engine alongside `mflux Z-Image-Turbo` (compact tier) and `mflux FLUX.2-klein-4B` (comfortable tier). Mark HiDream as **comfortable+** (32 GB+) due to the 11.5 GB working set. Don't make it the default — although it measured faster per image in the A/B above (67 s vs 80 s), its ~11.5 GB working set is roughly twice Z-Image-Turbo's typical 5.9 GB, so it is not suitable for compact-tier machines. Make it **the option** for users who want max prompt fidelity and license clarity (MIT, no NC restriction).
+See [PHOSPHENE_INTEGRATION_PLAN.md](PHOSPHENE_INTEGRATION_PLAN.md) for the patch.

docs/HIDREAM_O1_MLX_PORT_REPORT.md ADDED Viewed

	@@ -0,0 +1,146 @@

+# HiDream-O1-Image MLX port — working
+Lab branch: `perf-lab-hidream-o1-mlx`
+Lab path:   `/Users/salo/HIDREAM-O1-MLX-LAB-active/`
+Date:       2026-05-09
+Status:     **Shipped. Q8 inference end-to-end on Apple Silicon. Phosphene `dev` integration live (commit 45cad69).**
+---
+## TL;DR
+- Path B (standalone MLX wrapper around `mlx-vlm` Qwen3-VL backbone) — confirmed viable.
+- **Q8 is the right configuration.** Q4 corrupts the brightness distribution badly enough to make every image dark/moody. Q8 produces clean, fully prompt-faithful images.
+- Sizes: Q4 backbone 5.6 GB, Q8 backbone 10 GB. Custom heads 75 MB.
+- 512×512 in **24.9 s** Q4 (28 steps × 0.89 s). 1024×1024 in **65–67 s** at both Q4 and Q8 (28 steps × 2.4 s — Q8 not measurably slower because the bottleneck is bandwidth, not arithmetic). 2048×2048 in **236 s** Q4 (8.4 s/step).
+- Peak RAM: Q4 ≈ 6 GB at 1024, ≈ 7.2 GB at 2048. Q8 ≈ 11.5 GB at 1024.
+- 32 GB Mac plausible at Q4 only; 64 GB comfortable at Q8 + 2048.
+- **Phosphene integration shipped to `dev`** (`agent/image_engine.py` `kind="hidream"`, commits `45cad69` + `962b353`). Available in the Image Studio engine dropdown.
+- A/B vs `mflux` Z-Image-Turbo done — see [EVALUATION.md](EVALUATION.md). Both engines competitive; HiDream is faster per image (67s vs 80s) and uses deterministic memory; Z-Image is leaner most of the time but spikes on complex prompts.
+- Edit + multi-reference paths still TODO — refs continue to flow through `mflux qwen-edit` per existing convention.
+## What landed
+```
+/Users/salo/HIDREAM-O1-MLX-LAB-active/
+├── README.md                                       (DO NOT DELETE marker)
+├── docs/HIDREAM_O1_MLX_PORT_REPORT.md              (this file)
+├── notes/weight_map.json                           (HF safetensors index)
+├── scripts/hidream_o1/
+│   ├── flow_match.py                               (FlashFlowMatch in MLX)
+│   ├── pipeline_helpers.py                         (T2I sample, mrope, mask)
+│   ├── hidream_model.py                            (custom heads + forward)
+│   ├── convert_hidream_o1_to_mlx.py                (HF -> MLX, Q4/6/8)
+│   └── generate_hidream_o1_mlx.py                  (T2I generator)
+├── mlx_models/hidream-o1-dev-q4/
+│   ├── model.safetensors                           (5.6 GB, mlx-vlm-loadable)
+│   ├── extras/custom_heads.safetensors             (75 MB)
+│   ├── config.json                                 (with "quantization" field)
+│   └── tokenizer/processor metadata
+├── sample_outputs/
+│   ├── v2_512_mushroom.png         (24.9 s, "red mushroom on moss")
+│   ├── v2_1024_cat.png             (67.1 s, "tabby on wooden chair")
+│   ├── v2_1024_beach.png           (66.6 s, "sunlit beach with palms")
+│   └── v2_1024_portrait.png        (65.8 s, "portrait, red curly hair")
+└── .venv/                                          (uv venv: mlx 0.31.2, mlx-vlm 0.5.0)
+```
+## How to run
+```bash
+cd /Users/salo/HIDREAM-O1-MLX-LAB-active
+.venv/bin/python scripts/hidream_o1/generate_hidream_o1_mlx.py \
+  --model-path mlx_models/hidream-o1-dev-q4 \
+  --prompt "your prompt here" \
+  --width 1024 --height 1024 \
+  --output sample_outputs/your_image.png \
+  --seed 42
+```
+To re-convert from a fresh HF download:
+```bash
+.venv/bin/python scripts/hidream_o1/convert_hidream_o1_to_mlx.py \
+  --hf-source HiDream-ai/HiDream-O1-Image-Dev \
+  --out-dir mlx_models/hidream-o1-dev-q4 \
+  --bits 4 --check-disk
+```
+## Implementation summary
+- **Backbone**: mlx-vlm `qwen3_vl.Model` (vision tower + text decoder + mrope-3D), unchanged. 36 layers, hidden 4096, 32 heads, 8 KV heads, head_dim 128. Vision: 27 blocks, hidden 1152, deepstack at [8, 16, 24].
+- **Custom heads** (under `model.` in HF, mapped to root in MLX):
+  - `t_embedder1` — sinusoidal-256 → SiLU → 4096 (timestep embedding)
+  - `x_embedder` — 32×32×3 → 1024 → 4096 (patch embedding)
+  - `final_layer2` — 4096 → 32×32×3 (patch output)
+- **Forward**: text tokens via embed_tokens, replace tms positions with `t_emb`, append `x_embedder(vinputs)` to the sequence, run all decoder layers with a custom 4D additive mask (text causal, image bidirectional), apply `final_layer2`, slice at `vinput_mask`. Calls `mlx-vlm`'s `Qwen3VLModel.__call__` directly — it already accepts the 4D mask.
+- **Scheduler**: `FlashFlowMatchScheduler` ported verbatim from `models/flash_scheduler.py` (Euler with optional fresh-noise injection). Dev recipe: 28 steps, custom `DEFAULT_TIMESTEPS`, `s_noise=7.5`, `noise_clip_std=2.5`.
+- **Quantisation**: `mx.quantize(group_size=64, bits=4)` on Linear weights where the inner dim is divisible by 64. Vision MLP `linear_fc2` (1152, 4304) doesn't qualify and stays bf16 (~270 MB extra). Custom heads kept bf16 (small + sensitive).
+## Bugs found and fixed
+1. **mlx-vlm strict-load rejects the 9 custom-head keys.** Fix: write the diffusion-side weights to `extras/custom_heads.safetensors` (subdir, so mlx-vlm's `glob *.safetensors` doesn't pick it up). Wrapper loads both.
+2. **mlx-vlm needs `quantization` in config.json** to wrap `Linear → QuantizedLinear` before loading weights. Converter writes it.
+3. **Splitting the safetensors AFTER conversion overwrote the source mmap mid-read**, zeroing all weights silently. Fix: do the split inside the converter (write backbone and custom heads to different paths in one pass; never re-read and overwrite the same path).
+4. **bf16 → numpy raises** ("PEP 3118 buffer format string"). numpy has no bf16 dtype. Cast to fp32 first.
+5. **`mx.array([float], dtype=mx.float32)` is invalid syntax** in mlx 0.31.2. Use `mx.array(np.asarray([float], dtype=np.float32))`.
+6. **`vinput_mask` included the tms position**, causing `gen_patches` to be one row too long. Fix: tag tms positions as `3` so `(token_types == 1)` excludes them.
+7. **512×512 was being snapped to 2048×2048** by the predefined-resolution table (smallest entry is 1440×2560). Fix: snapping is now opt-in via `--snap-resolution`. By default we just patch-align (multiple of 32) and use the requested size.
+## Numbers
+| Resolution | Quant | Steps | Wall time | s/step | Patches | Peak RAM |
+|---|---|---|---|---|---|---|
+| 512×512 | Q4 | 28 | 24.9 s | 0.89 | 256 | ~6 GB |
+| 1024×1024 | Q4 | 28 | 65–67 s | 2.36 | 1024 | ~6 GB |
+| 1024×1024 | Q8 | 28 | 67–68 s | 2.41 | 1024 | ~11.5 GB |
+| 1280×704 | Q8 | 28 | 70.7 s | 2.53 | 880 | ~7 GB |
+| 704×1280 | Q8 | 28 | 65.9 s | 2.35 | 880 | ~3 GB (warm cache) |
+| 2048×2048 | Q4 | 28 | 236 s | 8.44 | 4096 | ~7.2 GB |
+Model load: 0.5–4.6 s. Custom-head load: <0.1 s. Disk: Q4 backbone 5.6 GB, Q8 backbone 9.96 GB, custom heads 75 MB.
+Q8 is **not measurably slower than Q4** at the same resolution — bandwidth-bound, not compute-bound. Use Q8 unless RAM is tight.
+## Aesthetic notes
+**The "dark mood" was Q4 quantisation, not the model.** Q8 of the same prompt + seed produces fully prompt-faithful images:
+- Cat prompt: Q4 → tabby in dim room. Q8 → vibrant orange tabby in bright sunlit kitchen with plant on windowsill.
+- Beach prompt: Q4 → moonlit silhouette beach. Q8 → bright tropical beach with turquoise water, white sand, blue sky, beach chair.
+Bottom line: **Q4 distorts the brightness/colour distribution of HiDream-O1's outputs significantly. Q8 is fine.** If you need Q4, expect dark images.
+A small remaining artifact in flat regions (sky, water): a **patch grid** at the 32×32 boundary. This is intrinsic to the architecture — `final_layer2` predicts each patch independently with no overlap. Not fixable without architectural changes (e.g. a lightweight overlap-blending pass, or finetuning with patch-edge loss).
+`s_noise=7.5` is load-bearing across both Q4 and Q8 — lowering it collapses the image to a near-uniform colour. This is the FlashFlowMatch scheduler's tuned configuration for the Dev distillation; don't change it.
+## Showcase prompts (Q8)
+- "alice in cyberpunk" — vertical 704×1280 ([sample_outputs/v3_alice_vertical_q8.png](sample_outputs/v3_alice_vertical_q8.png), 65.9 s)
+- "alice in cyberpunk" — horizontal 1280×704 ([sample_outputs/v3_alice_horizontal_q8.png](sample_outputs/v3_alice_horizontal_q8.png), 70.7 s)
+- "vibrant orange tabby in sunlit kitchen" 1024×1024 ([sample_outputs/v3_1024_cat_q8.png](sample_outputs/v3_1024_cat_q8.png))
+- "bright sunlit beach" 1024×1024 ([sample_outputs/v3_1024_beach_q8.png](sample_outputs/v3_1024_beach_q8.png))
+## Open questions / next steps if you want to keep going
+1. **Compare with full-precision reference** on the same prompts to isolate Q4 vs Dev-distillation effects on brightness.
+2. **Try Q6 or Q8** of just the decoder layers (vision can stay Q4) to see if attention values get under-represented.
+3. **Implement edit + multi-reference paths** (the build_*_sample helpers + ref_patches concat from `pipeline.py`).
+4. **Higher resolution (2048×2048)** — should fit on 64 GB. ~4 min predicted (4× the seq length, but attention is O(S²), so closer to ~6 min).
+5. **Promote a path forward**: package as `hidream-o1-mlx` Python module that can be imported into a Phosphene engine. NOT yet — wait for an apples-to-apples vs `mflux` Qwen-Image-Edit comparison on ≥5 prompts.
+## Hard-stop conditions — where we landed
+- ✅ mlx-vlm Qwen3-VL is reusable (Path B confirmed).
+- ✅ Q4 output is recognisable and not slow.
+- ✅ Memory stays well under 64 GB.
+- ✅ No new MLX kernels needed.
+- ✅ No CUDA, no PyTorch at runtime.
+- 🟡 Q4 quality leans dark — traced to Q4 quantisation rather than the model (see "Aesthetic notes" above); Q8 output is prompt-faithful.
+## Recommendation
+**Continue.** The hard parts are done: backbone reuse works, custom heads load, the forward pass produces real predictions, the flow-matching loop converges, and we have a working converter + generator. The next session can iterate on quality (compare with reference, try Q6, run more prompts) without any new architectural work.
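+The original gate here was "don't promote to Phosphene yet", pending: (a) the brightness question being resolved, (b) at least 5 side-by-side prompts vs `mflux` Qwen-Image-Edit, (c) edit and multi-ref paths being wired up so it can replace Qwen-Image-Edit functionally. Current state (see Status and TL;DR above): (a) is resolved by using Q8, an A/B against `mflux` Z-Image-Turbo has been run (see [EVALUATION.md](EVALUATION.md)), and the text-to-image integration has shipped to Phosphene `dev`. (c) is still open, so reference-image work continues to go through `mflux qwen-edit`.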

docs/PHOSPHENE_INTEGRATION_PLAN.md ADDED Viewed

	@@ -0,0 +1,158 @@

+# HiDream-O1 → Phosphene integration plan
+**Status:** plan only. No edits to Phosphene yet. Show this to Salo for approval first.
+## Where it slots in
+Phosphene's `agent/image_engine.py` already abstracts image generation behind
+`generate(prompt, n, output_dir, ..., config)` with a `kind` discriminator.
+Three kinds exist today: `mock`, `mflux`, `bfl`. We add a fourth: `hidream`.
+Pattern matches `mflux`: subprocess invocation of an external Python that owns
+its own venv. Phosphene stays clean, dependencies stay isolated.
+## Files touched (3)
+### 1. `agent/image_engine.py` — add config fields, dispatch, generator
+```python
+# Inside ImageEngineConfig (after mflux_quantize):
+hidream_python: str = ""                 # path to lab venv python; empty = autodetect
+hidream_model_path: str = ""             # path to converted MLX model dir; empty = autodetect
+hidream_steps: int = 28
+hidream_noise_scale: float = 7.5         # Dev recipe default; do not change
+hidream_noise_clip_std: float = 2.5
+```
+```python
+# Inside generate():
+if config.kind == "hidream":
+    return _generate_hidream(prompt, n, width, height, output_dir, base_seed, config, on_log=on_log)
+```
+```python
+# Inside health_check():
+if config.kind == "hidream":
+    py = _resolve_hidream_python(config)
+    model = _resolve_hidream_model(config)
+    if not py:
+        return False, "HiDream python not found. Install lab at /Users/salo/HIDREAM-O1-MLX-LAB-active/"
+    if not model:
+        return False, f"HiDream model dir not found at {config.hidream_model_path or 'autodetect'}"
+    return True, f"HiDream ready: {py} + {model}"
+```
+```python
+# New module-level constants + helpers:
+HIDREAM_LAB_DIR = Path("/Users/salo/HIDREAM-O1-MLX-LAB-active")
+HIDREAM_DEFAULT_PY = HIDREAM_LAB_DIR / ".venv" / "bin" / "python"
+HIDREAM_DEFAULT_MODEL = HIDREAM_LAB_DIR / "mlx_models" / "hidream-o1-dev-q8"
+HIDREAM_GENERATE_SCRIPT = HIDREAM_LAB_DIR / "scripts" / "hidream_o1" / "generate_hidream_o1_mlx.py"
+def _resolve_hidream_python(config) -> str | None:
+    p = Path(config.hidream_python) if config.hidream_python else HIDREAM_DEFAULT_PY
+    return str(p) if p.is_file() and os.access(p, os.X_OK) else None
+def _resolve_hidream_model(config) -> str | None:
+    p = Path(config.hidream_model_path) if config.hidream_model_path else HIDREAM_DEFAULT_MODEL
+    return str(p) if (p / "model.safetensors").exists() else None
+def _generate_hidream(prompt, n, width, height, output_dir, base_seed, config, on_log=None):
+    """Subprocess pattern matching _generate_mflux. One PNG per call to the
+    generator script, n calls total. Each candidate uses base_seed+i."""
+    py = _resolve_hidream_python(config) or sys.exit("HiDream python missing")
+    model = _resolve_hidream_model(config) or sys.exit("HiDream model missing")
+    script = str(HIDREAM_GENERATE_SCRIPT)
+    out: list[dict] = []
+    for i in range(n):
+        seed = (base_seed + i) if base_seed is not None else random.randint(0, 2**31 - 1)
+        png = output_dir / f"hidream_{int(time.time()*1000)}_{i:02d}.png"
+        cmd = [
+            py, script,
+            "--model-path", model,
+            "--prompt", prompt,
+            "--width", str(width),
+            "--height", str(height),
+            "--output", str(png),
+            "--seed", str(seed),
+            "--num-inference-steps", str(config.hidream_steps),
+            "--noise-scale-start", str(config.hidream_noise_scale),
+            "--noise-scale-end", str(config.hidream_noise_scale),
+            "--noise-clip-std", str(config.hidream_noise_clip_std),
+        ]
+        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
+        for line in proc.stdout:
+            if on_log: on_log(line.rstrip())
+        rc = proc.wait()
+        if rc != 0 or not png.exists():
+            raise RuntimeError(f"hidream gen failed (rc={rc})")
+        out.append({
+            "png_path": str(png),
+            "seed": seed,
+            "engine": "hidream-o1-dev-q8",
+            "width": width,
+            "height": height,
+        })
+    return out
+```
+### 2. `mlx_ltx_panel.py` — settings UI option (one dropdown entry)
+`update_settings()` and `_load_agent_image_config()` already accept `kind`
+strings. Just add `"hidream"` to whatever validation lists exist (likely a
+single line). The panel already shows config.kind in the agent settings card.
+### 3. `docs/IMAGE_GEN_RESEARCH_2026-05.md` — note the new option
+Add a row to the engine comparison table:
+| Engine | Local | Speed (1024) | RAM | Quality | License |
+|---|---|---|---|---|---|
+| FLUX.2 klein 4B / mflux | yes | ~50 s | ~16 GB | great | Apache 2.0 |
+| Z-Image-Turbo / mflux | yes | ~30 s | ~6 GB | good | Apache 2.0 |
+| **HiDream-O1-Image-Dev / Q8** | **yes** | **~67 s** | **~11 GB** | **great** | **MIT** |
+## What does NOT need to change
+- `start.js` / `install.js` / `pinokio.js` — HiDream's lab is **outside**
+  Pinokio; Phosphene just shells out to the lab's python. No new install step.
+- `mlx_warm_helper.py` — that's LTX-only. HiDream is sub-minute, no warm
+  helper needed for now (could add one later if we go to a long session of
+  many shots).
+- Phosphene's venv (`ltx-2-mlx/env`) — untouched. mlx-vlm is in the lab's
+  separate `.venv`.
+## Risks & mitigations
+| Risk | Mitigation |
+|---|---|
+| Lab path is hard-coded — moves break it | Configurable via `hidream_python` / `hidream_model_path`. Defaults are absolute; users can override in `state/agent_image_config.json`. |
+| HiDream + LTX run at the same time (both want GPU) | Already a problem with mflux + LTX; Phosphene queue serialises shot generation. No new mitigation needed. |
+| Lab dir gets nuked again | `README.md` marker is in place; user is aware. If it goes, Phosphene's `health_check` returns clearly and panel surfaces it. |
+| Quality-tier defaults: most users won't have a 64 GB Mac | Mark HiDream as **Comfortable+ (32 GB+)** tier in the docs. Don't make it the default — keep mflux Z-Image-Turbo as default for compact tier, FLUX.2 klein as default for comfortable. |
+## Cost / size
+- Disk: ~10 GB additional in lab (already there)
+- RAM at 1024×1024: ~11.5 GB (Q8). Same RAM tier as FLUX.2 klein.
+- One-time setup: lab venv install (~1.5 GB, already done).
+## Roll-out
+1. Patch `image_engine.py` (above).
+2. Add `"hidream"` to settings validation in `mlx_ltx_panel.py`.
+3. Switch agent_image_config.json kind to `"hidream"` in a single test session.
+4. Generate one shot through the agent UI; confirm PNG lands.
+5. Compare to the same prompt through `mflux qwen-image-edit`.
+6. If quality wins on at least 3 prompts → make it a real option in docs.
+7. Don't switch the default until we have ≥5 prompts where HiDream is clearly better than mflux Z-Image-Turbo, AND the dark-aesthetic concern is fully ruled out.
+## What I'd want before merging this
+1. ✅ Q8 conversion of HiDream-O1-Image-Dev (DONE)
+2. ✅ Stable single-shot text-to-image (DONE — sample images in `sample_outputs/`)
+3. 🟡 Showcase pass to characterise quality across genres (RUNNING)
+4. ❌ Side-by-side vs Phosphene's existing mflux engines on ≥5 matched prompts (NOT YET — needs the showcase to finish + a parallel run on mflux)
+5. ❌ One real agent-flow render that uses HiDream as the anchor engine and
+   feeds the result into LTX 2.3 (NOT YET — easy once health_check passes)

requirements.txt ADDED Viewed

	@@ -0,0 +1,15 @@

+# Apple Silicon only. Tested on macOS 14+ with Python 3.11 in a uv venv.
+# The lower bounds below are the versions the lab was developed against
+# (they are minimums, not exact pins). Newer minor versions of mlx-vlm and
+# transformers have been observed to break the Qwen3-VL backbone import
+# path; if you upgrade, re-test before shipping.
+mlx>=0.31.2
+mlx-vlm>=0.5.0
+transformers>=4.57.0,<6.0
+huggingface_hub>=0.30
+safetensors>=0.6
+numpy>=2.0
+pillow>=10.0
+tqdm>=4.66
+sentencepiece>=0.2.0
+hf_transfer>=0.1.9   # optional, speeds up the HF download

sample_outputs/hero/01_tea_master.png ADDED Viewed

Git LFS Details

SHA256: 38ac862dd666dfca0ef0f632661a4f93298e6113eb851c3a048ddd3c1b36422f
Pointer size: 132 Bytes
Size of remote file: 1.47 MB

sample_outputs/hero/02_tropical_beach.png ADDED Viewed

Git LFS Details

SHA256: 20743ab3e89312e3e348a42b43eb3c405986ced8792968f8c772fde2363a7a4d
Pointer size: 132 Bytes
Size of remote file: 1.39 MB

sample_outputs/hero/03_astronaut.png ADDED Viewed

Git LFS Details

SHA256: 7e876f6c286fc12f521308cc24acfe39647aff29e820b2c81b595825ed7f6de1
Pointer size: 132 Bytes
Size of remote file: 5.14 MB

sample_outputs/hero/04_construction_worker.png ADDED Viewed

Git LFS Details

SHA256: 03649c1acb92b57175c9055245f3e5da04c6e2eb5ccd3aa01a8ff1d7edc6287c
Pointer size: 132 Bytes
Size of remote file: 4.38 MB

sample_outputs/hero/05_mountain_peak.png ADDED Viewed

Git LFS Details

SHA256: fdb3a292322f91601f3204d6f507d8a496c146533dc6d5c33c881355d909b93a
Pointer size: 132 Bytes
Size of remote file: 3.05 MB

sample_outputs/hero/06_alice_cyberpunk.png ADDED Viewed

Git LFS Details

SHA256: 7e842b91de564975224ef9097b8ad91b26bd0253d58beac20ea31644a0463f70
Pointer size: 132 Bytes
Size of remote file: 5.68 MB

sample_outputs/hero/07_kitchen_morning.png ADDED Viewed

Git LFS Details

SHA256: 178da6df36debb128ce756bd05e165e5ae8af10789e321cd3c205e030f3c541d
Pointer size: 132 Bytes
Size of remote file: 3.65 MB

sample_outputs/hero/08_fitness_BF16.png ADDED Viewed

Git LFS Details

SHA256: 515265ae082e4596fdd6f2373be9f29aee0c3a2f408b614963396c8176d44289
Pointer size: 132 Bytes
Size of remote file: 3.61 MB

scripts/hidream_o1/__init__.py ADDED Viewed

File without changes

scripts/hidream_o1/_compile_bench.py ADDED Viewed

	@@ -0,0 +1,90 @@

+"""Quick A/B: forward_generation with vs without mx.compile.
+Times 5 forward passes after warm-up. Same shapes as a 1024x1024 inference.
+"""
+from __future__ import annotations
+import sys, time
+from pathlib import Path
+HERE = Path(__file__).parent
+sys.path.insert(0, str(HERE))
+import numpy as np
+import mlx.core as mx
+from mlx_vlm import load as mlx_vlm_load
+from pipeline_helpers import build_t2i_text_sample, build_attention_mask, PATCH_SIZE
+from hidream_model import HiDreamConfig, build_model, forward_generation
+# Repo root: this file lives in scripts/hidream_o1/, so parents[2] is the
+# directory that contains scripts/ and mlx_models/.
+LAB = Path(__file__).resolve().parents[2]
+MODEL_PATH = LAB / "mlx_models" / "hidream-o1-dev-q8"
+# Load backbone + processor through mlx-vlm and report the wall-clock load time.
+print("loading model...")
+t0 = time.time()
+backbone, processor = mlx_vlm_load(str(MODEL_PATH))
+print(f"  {time.time()-t0:.1f}s")
+cfg = HiDreamConfig()
+model = build_model(cfg, backbone)
+# Custom-head weights are stored separately under extras/ and loaded on top
+# of the backbone. strict=False is presumably needed because this file only
+# holds the custom-head subset of the parameters — TODO confirm.
+custom = mx.load(str(MODEL_PATH / "extras" / "custom_heads.safetensors"))
+model.load_weights(list(custom.items()), strict=False)
+# MLX evaluates lazily; materialise the parameters now so loading work is not
+# attributed to the timed forward passes below.
+mx.eval(model.parameters())
+print("model ready")
+# Build inputs at 1024x1024
+WIDTH, HEIGHT = 1024, 1024
+N_PATCH = (WIDTH // PATCH_SIZE) * (HEIGHT // PATCH_SIZE)  # 1024
+# Some processors wrap the tokenizer, others are the tokenizer themselves.
+tokenizer = processor.tokenizer if hasattr(processor, "tokenizer") else processor
+# Make sure the tokenizer exposes a <name>_token attribute for each of these
+# five names; any that is missing is set to the literal "<|name_token|>".
+for n in ("boi", "bor", "eor", "bot", "tms"):
+    if not hasattr(tokenizer, f"{n}_token"):
+        setattr(tokenizer, f"{n}_token", f"<|{n}_token|>")
+sample = build_t2i_text_sample(
+    "a small red mushroom on a bed of moss",
+    HEIGHT, WIDTH, tokenizer, processor, backbone.config,
+)
+input_ids = mx.array(sample["input_ids"])
+position_ids = mx.array(sample["position_ids"])
+token_types = mx.array(sample["token_types"])
+# -1e4 is the second argument to build_attention_mask — presumably the additive
+# value used for masked positions; verify against pipeline_helpers.
+mask4d = mx.array(build_attention_mask(sample["token_types"], -1e4)).astype(mx.bfloat16)
+# Random stand-in for the patch inputs: N_PATCH rows of 3*PATCH_SIZE*PATCH_SIZE
+# values, cast to bf16.
+vinputs = mx.random.normal((1, N_PATCH, 3 * PATCH_SIZE * PATCH_SIZE)).astype(mx.bfloat16)
+# A single fixed timestep is reused for every timed pass.
+timestep = mx.array([0.5], dtype=mx.float32)
+print(f"shapes: input_ids={input_ids.shape} pos={position_ids.shape} "
+      f"vinputs={vinputs.shape} mask={mask4d.shape}")
+# --- Uncompiled baseline ---
+print("\n=== baseline (uncompiled) ===")
+# warmup
+for _ in range(2):
+    out = forward_generation(model, cfg, input_ids, position_ids, vinputs, timestep, token_types, mask4d)
+    mx.eval(out)
+# time
+N = 5
+t0 = time.time()
+for _ in range(N):
+    out = forward_generation(model, cfg, input_ids, position_ids, vinputs, timestep, token_types, mask4d)
+    mx.eval(out)
+elapsed = time.time() - t0
+print(f"  baseline: {elapsed/N:.3f}s/step over {N} steps")
+# --- Compiled ---
+print("\n=== mx.compile ===")
+def fwd(input_ids, position_ids, vinputs, timestep, token_types, mask4d):
+    return forward_generation(model, cfg, input_ids, position_ids, vinputs, timestep, token_types, mask4d)
+try:
+    fwd_c = mx.compile(fwd)
+    # warmup (first call compiles)
+    for _ in range(2):
+        out = fwd_c(input_ids, position_ids, vinputs, timestep, token_types, mask4d)
+        mx.eval(out)
+    t0 = time.time()
+    for _ in range(N):
+        out = fwd_c(input_ids, position_ids, vinputs, timestep, token_types, mask4d)
+        mx.eval(out)
+    elapsed_c = time.time() - t0
+    print(f"  compiled: {elapsed_c/N:.3f}s/step over {N} steps  (speedup {elapsed/elapsed_c:.2f}x)")
+except Exception as e:
+    print(f"  mx.compile failed: {type(e).__name__}: {e}")

scripts/hidream_o1/_edit_diag.py ADDED Viewed

	@@ -0,0 +1,81 @@

+"""Diagnose what build_edit_text_sample produces. Loads the mlx-vlm backbone and processor only; builds no HiDream model and runs no forward pass."""
+from __future__ import annotations
+import sys
+from pathlib import Path
+HERE = Path(__file__).parent
+sys.path.insert(0, str(HERE))
+import numpy as np
+from mlx_vlm import load as mlx_vlm_load
+LAB = Path(__file__).resolve().parents[2]
+MODEL_PATH = LAB / "mlx_models" / "hidream-o1-dev-q6"
+REF = "/tmp/hidream_edit_smoke/ref.png"
+# Use mlx-vlm to get a working processor that skips the video-processor dep issue
+backbone, processor = mlx_vlm_load(str(MODEL_PATH))
+tokenizer = processor.tokenizer if hasattr(processor, "tokenizer") else processor
+for n in ("boi", "bor", "eor", "bot", "tms"):
+    if not hasattr(tokenizer, f"{n}_token"):
+        setattr(tokenizer, f"{n}_token", f"<|{n}_token|>")
+MC = backbone.config
+from pipeline_helpers import build_edit_text_sample, PATCH_SIZE
+prompt = "in the style of the reference image, a vibrant abstract composition, vivid colors, modern art"
+H = W = 512
+sample = build_edit_text_sample(prompt, [REF], H, W, tokenizer, processor, MC)
+print("=== build_edit_text_sample shapes ===")
+for k, v in sample.items():
+    if hasattr(v, "shape"):
+        print(f"  {k}: shape={v.shape} dtype={v.dtype}")
+    else:
+        print(f"  {k}: {v}")
+iid = sample["input_ids"][0]
+img_token_id = MC.image_token_id
+vs_token_id = MC.vision_start_token_id
+img_count = int((iid == img_token_id).sum())
+vs_count = int((iid == vs_token_id).sum())
+tms_count = int((iid == 151673).sum())   # tms_token_id
+print(f"\n=== input_ids breakdown (text-side, length {iid.shape[0]}) ===")
+print(f"  image_token_id ({img_token_id}): {img_count} positions  <-- vision tower fills these")
+print(f"  vision_start_token_id ({vs_token_id}): {vs_count}")
+print(f"  tms_token_id (151673): {tms_count}")
+print(f"  first 30 ids: {list(iid[:30])}")
+print(f"  last 5 ids:   {list(iid[-5:])}")
+pix = sample["pixel_values"]
+g = sample["image_grid_thw"]
+print(f"\n=== vision tower input ===")
+print(f"  pixel_values shape: {pix.shape}")
+print(f"  image_grid_thw: {g}")
+# Per-image vision patch count = T*H*W, post-merge = T*H/m*W/m
+m = backbone.config.vision_config.spatial_merge_size
+for i, row in enumerate(g):
+    t, h, w = row
+    pre_merge = int(t * h * w)
+    post_merge = int(t * (h//m) * (w//m))
+    print(f"  ref {i}: pre-merge patches={pre_merge}, post-merge={post_merge}")
+print(f"  TOTAL post-merge features (what vision tower outputs): {sum(int(r[0])*(int(r[1])//m)*(int(r[2])//m) for r in g)}")
+print(f"  TOTAL image_token_id positions in input_ids: {img_count}")
+print(f"  ** these must match for scatter to work **")
+vinput_mask = sample["vinput_mask"][0]
+vinput_mask_tgt = sample["vinput_mask_tgt_only"][0]
+print(f"\n=== mask checks ===")
+print(f"  total vinput positions (tgt+refs): {int(vinput_mask.sum())} = {sample['tgt_image_len']} + {int(vinput_mask.sum()) - sample['tgt_image_len']}")
+print(f"  total tgt-only positions: {int(vinput_mask_tgt.sum())} (expect {sample['tgt_image_len']})")
+# Position IDs
+pids = sample["position_ids"]
+print(f"\n=== position_ids ===")
+print(f"  shape: {pids.shape}  (3D mrope: rope_dim, batch, seq)")
+print(f"  ranges per dim: {[(int(pids[d].min()), int(pids[d].max())) for d in range(pids.shape[0])]}")
+# Where are the discontinuities? Look at the boundary between text-side and vision-token-side
+txt_seq_len = iid.shape[0]
+print(f"  text/vision boundary at position {txt_seq_len}")
+print(f"  pids[:, 0, txt_seq_len-3:txt_seq_len+3] (around the boundary):")
+print(pids[:, 0, max(0, txt_seq_len-3):txt_seq_len+3])

scripts/hidream_o1/_precompute_diag.py ADDED Viewed

	@@ -0,0 +1,88 @@

+"""Verify precompute_text_embeds_with_vision actually scatters image features
+into the right positions of inputs_embeds, without mangling text positions.
+"""
+from __future__ import annotations
+import sys
+from pathlib import Path
+HERE = Path(__file__).parent
+sys.path.insert(0, str(HERE))
+import numpy as np
+import mlx.core as mx
+from mlx_vlm import load as mlx_vlm_load
+from pipeline_helpers import build_edit_text_sample
+from hidream_model import HiDreamConfig, build_model, precompute_text_embeds_with_vision
+LAB = Path(__file__).resolve().parents[2]
+MODEL_PATH = LAB / "mlx_models" / "hidream-o1-dev-q6"
+REF = "sample_outputs/v3_1024_cat_q8.png"
+print("loading model...")
+backbone, processor = mlx_vlm_load(str(MODEL_PATH))
+cfg = HiDreamConfig()
+model = build_model(cfg, backbone)
+custom = mx.load(str(MODEL_PATH / "extras" / "custom_heads.safetensors"))
+model.load_weights(list(custom.items()), strict=False)
+mx.eval(model.parameters())
+tokenizer = processor.tokenizer if hasattr(processor, "tokenizer") else processor
+for n in ("boi", "bor", "eor", "bot", "tms"):
+    if not hasattr(tokenizer, f"{n}_token"):
+        setattr(tokenizer, f"{n}_token", f"<|{n}_token|>")
+sample = build_edit_text_sample(
+    "a cat", [str(LAB / REF)], 1024, 1024, tokenizer, processor, backbone.config,
+)
+input_ids = mx.array(sample["input_ids"])
+pixel_values = mx.array(sample["pixel_values"]).astype(mx.bfloat16)
+image_grid_thw = mx.array(sample["image_grid_thw"])
+# 1) Just the embed_tokens output (no scatter)
+embed_tokens = model.language_model.model.embed_tokens
+text_only_embeds = embed_tokens(input_ids)
+mx.eval(text_only_embeds)
+print(f"\ntext-only embeds shape: {text_only_embeds.shape} dtype: {text_only_embeds.dtype}")
+# 2) Vision tower output
+vt_out = model.visual(pixel_values, image_grid_thw)
+img_features = vt_out[0] if isinstance(vt_out, tuple) else vt_out
+mx.eval(img_features)
+print(f"image_features shape: {img_features.shape} dtype: {img_features.dtype}")
+print(f"  stats: mean={float(mx.mean(img_features.astype(mx.float32))):.4f} std={float(mx.std(img_features.astype(mx.float32))):.4f} min={float(mx.min(img_features.astype(mx.float32))):.3f} max={float(mx.max(img_features.astype(mx.float32))):.3f}")
+# 3) Run our precompute
+combined = precompute_text_embeds_with_vision(model, cfg, input_ids, pixel_values, image_grid_thw)
+mx.eval(combined)
+print(f"\ncombined embeds shape: {combined.shape} dtype: {combined.dtype}")
+# 4) Inspect: at image positions, combined should equal image_features
+ids_np = np.asarray(input_ids[0])
+img_pos = np.where(ids_np == cfg.image_token_id)[0]
+text_pos = np.where(ids_np != cfg.image_token_id)[0]
+print(f"\nimage_token positions: {len(img_pos)} (first 5: {img_pos[:5].tolist()}, last 5: {img_pos[-5:].tolist()})")
+print(f"text positions: {len(text_pos)} (first 5: {text_pos[:5].tolist()})")
+# At image positions: combined should be image_features (in same order)
+# combined[0, img_pos[i], :] should equal img_features[i, :]
+combined_np = np.asarray(combined[0].astype(mx.float32))
+img_feat_np = np.asarray(img_features.astype(mx.float32))
+print("\n--- check: combined[0, img_pos[0]] vs img_features[0] ---")
+print(f"  combined[0, {img_pos[0]}, :8] = {combined_np[img_pos[0], :8]}")
+print(f"  image_features[0, :8]         = {img_feat_np[0, :8]}")
+print(f"  diff: {np.abs(combined_np[img_pos[0]] - img_feat_np[0]).max():.4f}")
+print("\n--- check: combined[0, img_pos[5]] vs img_features[5] ---")
+print(f"  combined[0, {img_pos[5]}, :8] = {combined_np[img_pos[5], :8]}")
+print(f"  image_features[5, :8]         = {img_feat_np[5, :8]}")
+print(f"  diff: {np.abs(combined_np[img_pos[5]] - img_feat_np[5]).max():.4f}")
+# At text positions: combined should equal embed_tokens output
+text_only_np = np.asarray(text_only_embeds[0].astype(mx.float32))
+diff_at_text = np.abs(combined_np[text_pos] - text_only_np[text_pos]).max()
+print(f"\n--- check: combined matches text embeddings at text positions ---")
+print(f"  max abs diff at text positions: {diff_at_text:.6f} (should be 0)")
+# Also: at image positions, embed_tokens gives the image_token's WEIRD embedding (since the token is just a placeholder)
+print(f"\n  embed_tokens at img_pos[0] (the placeholder embedding): {text_only_np[img_pos[0], :8]}")

scripts/hidream_o1/anti_plastic_batch.sh ADDED Viewed

	@@ -0,0 +1,42 @@

+#!/usr/bin/env bash
+# Anti-plastic batch v2 — incorporates HiDream-specific prompt tips:
+# - "masterpiece, best quality" prefix (Civitai community finding)
+# - Subject + Actions → Setting → Style → Details ordering
+# - Specific cameras (Leica 50mm, Pentax K1000, Hasselblad)
+# - Specific film stocks (Tri-X 400, Portra 400, Cinestill 800T)
+# - Documentary photographer references
+# - BF16 weights (no quantization)
+set -euo pipefail
+LAB="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
+PY="$LAB/.venv/bin/python"
+MODEL="$LAB/mlx_models/hidream-o1-dev-bf16"
+OUT="$LAB/sample_outputs/showcase_antiplastic"
+mkdir -p "$OUT"
+run() {
+  local name="$1" w="$2" h="$3" prompt="$4" seed="${5:-42}"
+  echo "=== $name ${w}x${h} (seed=$seed) ==="
+  cd "$LAB" && "$PY" scripts/hidream_o1/generate_hidream_o1_mlx.py \
+    --model-path "$MODEL" \
+    --prompt "$prompt" \
+    --width "$w" --height "$h" \
+    --output "$OUT/$name.png" \
+    --seed "$seed" 2>&1 | grep -E "loaded|using|generation:|saved" | tail -3
+  echo ""
+}
+run "01_construction_rain" 2048 2048 \
+  "masterpiece, best quality, 35mm DSLR photograph. A construction worker leans against a steel I-beam, taking a long drag from a cigarette between gloved fingers. On a half-built skyscraper rooftop in heavy rain, water streaming off his hard hat and reflective vest. Documentary photojournalism, Sebastião Salgado aesthetic, shot on Leica M6 with Kodak Tri-X 400 black and white film, harsh overcast daylight, deep grain, raw skin texture with rain droplets and stubble visible, no retouching, 50mm Summicron lens" \
+  701
+run "02_pub_musician" 2048 2048 \
+  "masterpiece, best quality, 35mm DSLR photograph. A bearded musician in his late thirties sings into a vintage Shure SM58 microphone, eyes closed mid-note, fingers callused on a worn acoustic guitar. In a dim London pub on a Tuesday night, three half-empty pint glasses on a small wooden stage edge, a single warm tungsten spotlight from above creating sharp shadows. Cinematic documentary, shot on Pentax K1000 with Cinestill 800T film, visible grain and halation around the spotlight, real sweat on his forehead, natural skin pores and laugh lines, Anders Petersen mood" \
+  702
+run "03_mechanic_garage" 3104 1312 \
+  "masterpiece, best quality, ultrawide editorial photograph. A female mechanic in her mid forties wipes engine grease from her hands with a faded red rag, standing beside the open hood of a 1967 Pontiac GTO. In her cluttered garage on a quiet Sunday afternoon, tool chests and stacks of car magazines along the wall, sun streaming through high windows catching dust motes in the air. Annie Leibovitz Vanity Fair aesthetic, shot on Hasselblad H6D medium format with natural skin texture retention, soft fill light, visible pores and faint freckles, weathered hands with chipped nail polish, no glamour retouching, real and lived-in" \
+  703
+echo "=== batch complete ==="
+ls -la "$OUT"

scripts/hidream_o1/cinematic_batch.sh ADDED Viewed

	@@ -0,0 +1,49 @@

+#!/usr/bin/env bash
+# Cinematic batch — people doing things, specific dress, photoreal/cinematic style.
+# Usage: cinematic_batch.sh [quant] [width] [height]   (all optional; defaults: q6 1920 1088)
+set -euo pipefail
+LAB="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
+PY="$LAB/.venv/bin/python"
+QUANT="${1:-q6}"
+WIDTH="${2:-1920}"
+HEIGHT="${3:-1088}"
+MODEL="$LAB/mlx_models/hidream-o1-dev-${QUANT}"
+OUT="$LAB/sample_outputs/cinematic_${QUANT}_${WIDTH}x${HEIGHT}"
+mkdir -p "$OUT"
+echo "cinematic batch: quant=${QUANT}, ${WIDTH}x${HEIGHT}, model=${MODEL}, out=${OUT}"
+run() {
+  local name="$1" prompt="$2" seed="${3:-42}"
+  echo "=== $name (seed=$seed) ==="
+  cd "$LAB" && /usr/bin/time -l "$PY" scripts/hidream_o1/generate_hidream_o1_mlx.py \
+    --model-path "$MODEL" \
+    --prompt "$prompt" \
+    --width "$WIDTH" --height "$HEIGHT" \
+    --output "$OUT/$name.png" \
+    --seed "$seed" 2>&1 | grep -E "loaded|using|generation:|saved|maximum resident" | tail -5
+  echo ""
+}
+run "01_jazz_pianist" \
+  "cinematic medium shot of a jazz pianist in his fifties at a baby grand piano, wearing a navy three-piece suit with a thin gold pocket watch chain, fingers blurred mid-arpeggio, dim smoky club lighting from above, deep shadows, anamorphic 35mm film grain, shallow depth of field" \
+  3
+run "02_street_photographer" \
+  "cinematic shot of a young street photographer crouched on a Shinjuku crosswalk at night, holding a Leica M11 camera up to her eye, wearing an olive-green oversized trench coat over black jeans and black leather boots, neon signs in Japanese reflecting off wet asphalt, wide-angle lens, blade runner color palette, photorealistic" \
+  17
+run "03_michelin_chef" \
+  "close-up cinematic shot of a Michelin-star chef in a crisp white double-breasted chef coat with rolled sleeves, tweezers placing a single edible flower onto a black slate plate, steam rising, kitchen brigade in soft focus behind, warm copper-pan lighting, food cinematography, hyperreal" \
+  29
+run "04_ballet_dancer" \
+  "cinematic full-body shot of a ballerina mid-grand-jeté across an empty rehearsal studio, wearing a slate-grey leotard and pink satin pointe shoes, hair in a tight bun, golden afternoon sunlight streaming through tall windows, dust particles visible in the light beams, motion blur on her trailing foot" \
+  41
+run "05_astronaut" \
+  "cinematic wide shot of an astronaut in a battered orange ACES launch-and-entry suit walking down a long curved corridor inside a space station, helmet tucked under one arm, clipboard in the other, fluorescent overhead strip lighting, scratched white wall panels, anamorphic lens flare, sci-fi realism" \
+  53
+echo "=== batch complete ==="
+ls -la "$OUT"

scripts/hidream_o1/convert_hidream_o1_to_mlx.py ADDED Viewed

	@@ -0,0 +1,210 @@

+#!/usr/bin/env python3
+"""Convert HiDream-O1-Image-Dev safetensors -> MLX format."""
+from __future__ import annotations
+import argparse
+import json
+import shutil
+import sys
+import time
+from pathlib import Path
+from typing import Iterable
+import mlx.core as mx
+HF_TO_MLX_CUSTOM = {
+    "model.t_embedder1.mlp.0.weight":   "t_embedder1.fc1.weight",
+    "model.t_embedder1.mlp.0.bias":     "t_embedder1.fc1.bias",
+    "model.t_embedder1.mlp.2.weight":   "t_embedder1.fc2.weight",
+    "model.t_embedder1.mlp.2.bias":     "t_embedder1.fc2.bias",
+    "model.x_embedder.proj1.weight":    "x_embedder.proj1.weight",
+    "model.x_embedder.proj2.weight":    "x_embedder.proj2.weight",
+    "model.x_embedder.proj2.bias":      "x_embedder.proj2.bias",
+    "model.final_layer2.linear.weight": "final_layer2.linear.weight",
+    "model.final_layer2.linear.bias":   "final_layer2.linear.bias",
+}
+CUSTOM_HEAD_PREFIXES = ("t_embedder1.", "x_embedder.", "final_layer2.")
+def remap_hf_to_mlx(hf_key: str) -> str | None:
+    if hf_key == "lm_head.weight":
+        # mlx-vlm Qwen3-VL Model has language_model.lm_head when tie_word_embeddings=False.
+        # We don't call it (HiDream uses final_layer2 for image patches), but keeping the
+        # weight avoids strict-load failures when mlx-vlm imports the checkpoint.
+        return "language_model.lm_head.weight"
+    if hf_key in HF_TO_MLX_CUSTOM:
+        return HF_TO_MLX_CUSTOM[hf_key]
+    if hf_key.startswith("model.language_model."):
+        return "language_model.model." + hf_key[len("model.language_model."):]
+    if hf_key.startswith("model.visual."):
+        return "vision_tower." + hf_key[len("model.visual."):]
+    return hf_key
+def stream_safetensors(shard_paths: Iterable[Path]) -> dict[str, mx.array]:
+    out: dict[str, mx.array] = {}
+    for p in shard_paths:
+        print(f"  loading {p.name} ({p.stat().st_size / 1e9:.2f} GB) ...", flush=True)
+        shard = mx.load(str(p))
+        for k, v in shard.items():
+            mlx_key = remap_hf_to_mlx(k)
+            if mlx_key is None:
+                continue
+            out[mlx_key] = v
+    return out
+def quantise(weights: dict[str, mx.array], bits: int, group_size: int = 64) -> dict[str, mx.array]:
+    """Quantise eligible weight tensors with mx.quantize and return a new weight dict.
+
+    Args:
+        weights: mapping of tensor name -> array (already remapped to MLX names).
+        bits: 4, 6 or 8 to quantise; 16 returns ``weights`` untouched.
+        group_size: group size forwarded to mx.quantize.
+
+    Returns:
+        A dict in which each quantised tensor keeps its original key (holding the
+        packed weights) and gains sibling ``<base>.scales`` / ``<base>.biases``
+        entries; every other tensor is carried over as-is.
+
+    Raises:
+        ValueError: if ``bits`` is not one of 4, 6, 8, 16.
+    """
+    if bits == 16:
+        # No quantization — return weights unchanged. Caller has already cast f32 -> bf16.
+        return weights
+    if bits not in (4, 6, 8):
+        raise ValueError(f"--bits must be 4, 6, 8, or 16 (got {bits})")
+    out: dict[str, mx.array] = {}
+    quantised = 0
+    skipped = 0
+    for k, v in weights.items():
+        # Diffusion-side custom heads are always kept at full precision.
+        if any(k.startswith(p) for p in CUSTOM_HEAD_PREFIXES):
+            out[k] = v
+            skipped += 1
+            continue
+        # Only 2-D tensors are candidates; embeddings, norms and biases are kept as-is.
+        if v.ndim != 2 or "embed_tokens" in k or "norm" in k or k.endswith(".bias"):
+            out[k] = v
+            skipped += 1
+            continue
+        try:
+            qw, scales, biases = mx.quantize(v, group_size=group_size, bits=bits)
+            # "<name>.weight" -> "<name>" so the scales/biases sit next to the weight.
+            base = k[: -len(".weight")] if k.endswith(".weight") else k
+            out[k] = qw
+            out[base + ".scales"] = scales
+            out[base + ".biases"] = biases
+            quantised += 1
+        except Exception as e:
+            # Best effort: a tensor mx.quantize rejects is stored unquantised.
+            print(f"  [warn] could not quantise {k}: {e!s}; keeping fp", flush=True)
+            out[k] = v
+            skipped += 1
+    print(f"quantised {quantised} tensors, skipped {skipped} (kept as-is)")
+    return out
+def copy_metadata(src_dir: Path, dst_dir: Path):
+    keep = {
+        "config.json", "configuration.json", "generation_config.json",
+        "preprocessor_config.json", "video_preprocessor_config.json",
+        "chat_template.json", "tokenizer.json", "tokenizer_config.json",
+        "vocab.json", "merges.txt", "README.md",
+    }
+    for name in keep:
+        src = src_dir / name
+        if src.exists():
+            shutil.copy2(src, dst_dir / name)
+def resolve_source(arg: str, cache_dir: Path | None) -> Path:
+    p = Path(arg)
+    if p.exists() and p.is_dir():
+        return p
+    try:
+        from huggingface_hub import snapshot_download
+    except ImportError as e:
+        sys.exit(f"huggingface_hub is required to download {arg!r}: {e}")
+    print(f"downloading {arg!r} ...", flush=True)
+    return Path(snapshot_download(arg, cache_dir=str(cache_dir) if cache_dir else None))
+def main(argv=None):
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--hf-source", default="HiDream-ai/HiDream-O1-Image-Dev")
+    ap.add_argument("--cache-dir", default=None)
+    ap.add_argument("--out-dir", default="./mlx_models/hidream-o1-dev-q4")
+    ap.add_argument("--bits", type=int, default=4, choices=[4, 6, 8, 16],
+                    help="16 = no quantization, store as bf16 (matches upstream's master-weight precision)")
+    ap.add_argument("--group-size", type=int, default=64)
+    ap.add_argument("--check-disk", action="store_true")
+    ap.add_argument("--dry-run", action="store_true")
+    args = ap.parse_args(argv)
+    out_dir = Path(args.out_dir).resolve()
+    out_dir.mkdir(parents=True, exist_ok=True)
+    if args.check_disk:
+        free_gb = shutil.disk_usage(out_dir).free / 1e9
+        if free_gb < 40:
+            sys.exit(f"free disk on {out_dir.parent}: {free_gb:.1f} GB; need >= 40 GB")
+    src_dir = resolve_source(args.hf_source, Path(args.cache_dir) if args.cache_dir else None)
+    print(f"source: {src_dir}")
+    idx_path = src_dir / "model.safetensors.index.json"
+    if not idx_path.exists():
+        sys.exit(f"no model.safetensors.index.json under {src_dir}")
+    idx = json.loads(idx_path.read_text())
+    shard_names = sorted({v for v in idx["weight_map"].values()})
+    shard_paths = [src_dir / name for name in shard_names]
+    total_gb = sum(p.stat().st_size for p in shard_paths) / 1e9
+    print(f"shards: {len(shard_paths)}, total {total_gb:.2f} GB")
+    if args.dry_run:
+        for p in shard_paths:
+            print(f"  {p.stat().st_size / 1e9:6.2f} GB  {p.name}")
+        return
+    t0 = time.time()
+    weights = stream_safetensors(shard_paths)
+    print(f"loaded {len(weights)} tensors in {time.time() - t0:.1f}s")
+    for k in list(weights.keys()):
+        if weights[k].dtype == mx.float32:
+            weights[k] = weights[k].astype(mx.bfloat16)
+    weights = quantise(weights, bits=args.bits, group_size=args.group_size)
+    # Split: mlx-vlm-loadable backbone goes to model.safetensors;
+    # our diffusion-side heads go to custom_heads.safetensors so mlx-vlm's
+    # strict load doesn't reject them.
+    backbone = {k: v for k, v in weights.items()
+                if not any(k.startswith(p) for p in CUSTOM_HEAD_PREFIXES)}
+    custom = {k: v for k, v in weights.items()
+              if any(k.startswith(p) for p in CUSTOM_HEAD_PREFIXES)}
+    out_path = out_dir / "model.safetensors"
+    print(f"saving {len(backbone)} backbone tensors to {out_path} ...")
+    mx.save_safetensors(str(out_path), backbone)
+    extras_dir = out_dir / "extras"
+    extras_dir.mkdir(exist_ok=True)
+    custom_path = extras_dir / "custom_heads.safetensors"
+    print(f"saving {len(custom)} custom-head tensors to {custom_path} ...")
+    mx.save_safetensors(str(custom_path), custom)
+    copy_metadata(src_dir, out_dir)
+    # Update config.json. For bits<16, write the quantization field so mlx-vlm
+    # wraps Linear -> QuantizedLinear before loading. For bits=16 (BF16, no quant),
+    # remove any pre-existing quantization field so mlx-vlm loads as plain Linear.
+    cfg_path = out_dir / "config.json"
+    cfg = json.loads(cfg_path.read_text())
+    if args.bits == 16:
+        cfg.pop("quantization", None)
+        print("config.json: bits=16 (BF16), no quantization field written")
+    else:
+        cfg["quantization"] = {"group_size": args.group_size, "bits": args.bits}
+        print(f"config.json: quantization={{group_size: {args.group_size}, bits: {args.bits}}}")
+    cfg_path.write_text(json.dumps(cfg, indent=2))
+    (out_dir / "mlx_lab_meta.json").write_text(json.dumps({
+        "format": "hidream-o1-mlx-lab/v0",
+        "bits": args.bits,
+        "group_size": args.group_size,
+        "source": str(src_dir),
+    }, indent=2))
+    print(f"done; output dir size: {sum(f.stat().st_size for f in out_dir.glob('*')) / 1e9:.2f} GB")
+if __name__ == "__main__":
+    main()

scripts/hidream_o1/creative_showcase.sh ADDED Viewed

	@@ -0,0 +1,56 @@

+#!/usr/bin/env bash
+# Creative showcase — push the envelope.
+# Vertical social-media format (1440x2560) + cinema ultrawide (3104x1312).
+set -euo pipefail
+LAB="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
+PY="$LAB/.venv/bin/python"
+MODEL="$LAB/mlx_models/hidream-o1-dev-q6"
+OUT_V="$LAB/sample_outputs/showcase_creative/vertical_1440x2560"
+OUT_W="$LAB/sample_outputs/showcase_creative/wide_3104x1312"
+mkdir -p "$OUT_V" "$OUT_W"
+run() {
+  local out_dir="$1" name="$2" w="$3" h="$4" prompt="$5" seed="${6:-42}"
+  echo "=== $name ${w}x${h} (seed=$seed) ==="
+  cd "$LAB" && /usr/bin/time -l "$PY" scripts/hidream_o1/generate_hidream_o1_mlx.py \
+    --model-path "$MODEL" \
+    --prompt "$prompt" \
+    --width "$w" --height "$h" \
+    --output "$out_dir/$name.png" \
+    --seed "$seed" 2>&1 | grep -E "loaded|using|generation:|saved" | tail -3
+  echo ""
+}
+# === VERTICAL — social media influencer aesthetic (9:16 trained = 1440x2560) ===
+run "$OUT_V" "01_fitness_influencer" 1440 2560 \
+  "vertical full-body shot of a strong female fitness influencer in matching black lululemon top and high-waist leggings, mid-deadlift in a sunlit industrial gym, hair in a slick high ponytail, chalk dust in the air around her hands, hyperreal sweat detail, dramatic side lighting from large factory windows, professional sports photography, hyper sharp focus, 50mm lens" \
+  101
+run "$OUT_V" "02_glamour_mirror" 1440 2560 \
+  "vertical full-body mirror selfie of a stylish female fashion influencer in a luxury Paris hotel bathroom, wearing a black silk slip dress and gold hoop earrings, holding a vintage Polaroid camera up to take the shot, marble walls and warm sconce lighting behind her, vogue magazine aesthetic, soft glamour, photoreal" \
+  202
+run "$OUT_V" "03_travel_iceland" 1440 2560 \
+  "vertical full-body shot of a male travel photographer standing alone on a black volcanic Iceland beach with massive basalt sea stacks behind him, wearing a heavy grey wool turtleneck, dark technical pants, and a beanie, breath visible in the cold air, dramatic overcast moody light, churning North Atlantic waves, cinematic landscape photography, 35mm" \
+  303
+run "$OUT_V" "04_streetwear_tokyo" 1440 2560 \
+  "vertical full-body shot of a Japanese streetwear influencer in front of a graffiti-covered wall in Harajuku, wearing oversized black raf simons hoodie with white text, baggy washed denim, chunky asics sneakers, ear-length neon green dyed hair, hands in pockets, head tilted with a slight smirk, golden-hour rim light, fashion editorial, photoreal" \
+  404
+# === CINEMA — ultrawide environments (21:9 trained = 3104x1312) ===
+run "$OUT_W" "05_samurai_bamboo" 3104 1312 \
+  "ultrawide cinematic shot of two samurai facing off in a misty bamboo forest at dawn, swords drawn, kimonos in deep red and indigo, golden first light filtering through tall bamboo stalks, particles of mist floating between them, anamorphic 35mm film, Akira Kurosawa composition, Roger Deakins cinematography" \
+  505
+run "$OUT_W" "06_astronaut_mars" 3104 1312 \
+  "ultrawide cinematic shot of a single astronaut walking across a vast Martian canyon floor, deep red rocks rising hundreds of meters on either side, footprints behind in the rust dust, faint planet earth visible as a blue dot in the salmon sky, helmet visor reflecting the alien landscape, sense of cosmic scale and isolation, Denis Villeneuve aesthetic" \
+  606
+run "$OUT_W" "07_dragon_kingdom" 3104 1312 \
+  "ultrawide cinematic shot of an enormous black dragon flying low over a medieval mountain kingdom at dusk, scales glistening, wings catching the last orange sunlight, peasants in the foreground looking up in awe from a stone bridge, the kingdom castle and snow-capped peaks in the deep background, fantasy epic, Peter Jackson composition, wide-angle lens, painterly atmosphere" \
+  707
+echo "=== creative showcase complete ==="
+ls -la "$OUT_V" "$OUT_W"

scripts/hidream_o1/flow_match.py ADDED Viewed

	@@ -0,0 +1,102 @@

+"""MLX port of FlashFlowMatchEulerDiscreteScheduler from HiDream-O1.
+Reference: HiDream-ai/HiDream-O1-Image @ models/flash_scheduler.py.
+Trimmed to the path the Dev recipe actually uses:
+  - num_train_timesteps=1000, shift=1.0, use_dynamic_shifting=False
+  - timesteps overridden by DEFAULT_TIMESTEPS after construction
+  - karras/exponential/beta sigmas not used
+  - step() with s_churn/s_tmin/s_tmax stripped (always defaults)
+The math is verbatim from upstream — only the framework swap (torch -> mlx).
+"""
+from __future__ import annotations
+import mlx.core as mx
+import numpy as np
+# Verbatim from HiDream-O1 models/pipeline.py
+DEFAULT_TIMESTEPS = [
+    999, 987, 974, 960, 945, 929, 913, 895, 877, 857, 836, 814, 790, 764, 737,
+    707, 675, 640, 602, 560, 515, 464, 409, 347, 278, 199, 110, 8,
+]
+class FlashFlowMatchScheduler:
+    """Euler scheduler for flow matching, with optional noise injection.
+
+    Schedules are held as NumPy float32 arrays (``sigmas_np``, ``timesteps_np``);
+    the ``timesteps`` / ``sigmas`` properties expose them as mx.array. Call
+    set_timesteps() before step(): it appends the trailing 0.0 sigma that
+    step() reads as ``sigma_next`` on the last step.
+    """
+    def __init__(self, num_train_timesteps: int = 1000, shift: float = 1.0):
+        self.num_train_timesteps = num_train_timesteps
+        self.shift = shift
+        # Training-time schedule: sigmas from 1.0 down to 1/num_train_timesteps, then shifted.
+        sigmas = np.linspace(1.0, 1.0 / num_train_timesteps, num_train_timesteps, dtype=np.float32)
+        sigmas = shift * sigmas / (1.0 + (shift - 1.0) * sigmas)
+        self.sigmas_np = sigmas
+        self.timesteps_np = sigmas * num_train_timesteps
+        self.num_inference_steps: int | None = None
+        # Index of the next step(); None until the first step() call resolves it.
+        self._step_index: int | None = None
+    def set_timesteps(self, num_inference_steps: int, custom_timesteps: list[int] | None = None):
+        """Install the inference schedule and reset the step index.
+
+        With ``custom_timesteps`` the sigmas are timesteps / num_train_timesteps
+        (no shift applied) and ``num_inference_steps`` is ignored in favour of
+        the list length. Otherwise timesteps are spaced linearly from
+        num_train_timesteps down to 1.0 and the shift is applied. In both cases
+        a final sigma of 0.0 is appended.
+        """
+        if custom_timesteps is not None:
+            timesteps = np.asarray(custom_timesteps, dtype=np.float32)
+            sigmas = (timesteps / self.num_train_timesteps).astype(np.float32)
+            sigmas = np.append(sigmas, 0.0).astype(np.float32)
+        else:
+            timesteps = np.linspace(self.num_train_timesteps, 1.0, num_inference_steps, dtype=np.float32)
+            sigmas = (timesteps / self.num_train_timesteps).astype(np.float32)
+            sigmas = self.shift * sigmas / (1.0 + (self.shift - 1.0) * sigmas)
+            sigmas = np.append(sigmas, 0.0).astype(np.float32)
+        self.num_inference_steps = len(timesteps)
+        self.timesteps_np = timesteps
+        self.sigmas_np = sigmas
+        self._step_index = None
+    @property
+    def timesteps(self) -> mx.array:
+        """Current timesteps as an mx.array (a new array on every access)."""
+        return mx.array(self.timesteps_np)
+    @property
+    def sigmas(self) -> mx.array:
+        """Current sigmas as an mx.array (a new array on every access)."""
+        return mx.array(self.sigmas_np)
+    def _init_step_index(self, timestep_value: float):
+        """Set ``_step_index`` to the position of ``timestep_value`` in the schedule.
+
+        Matching uses np.isclose with atol=1e-3. If the value occurs more than
+        once the second occurrence is chosen. Raises ValueError when absent.
+        """
+        ts = self.timesteps_np
+        matches = np.where(np.isclose(ts, timestep_value, atol=1e-3))[0]
+        if len(matches) == 0:
+            raise ValueError(f"timestep {timestep_value!r} not in scheduler.timesteps")
+        self._step_index = int(matches[1] if len(matches) > 1 else matches[0])
+    def step(self, model_output, timestep, sample,
+             s_noise=1.0, noise_clip_std=0.0, seed=None):
+        """Advance ``sample`` by one scheduler step and return it in sample's dtype.
+
+        ``timestep`` is only used on the first call, to locate the starting
+        index; later calls advance the internal index by one. The arithmetic is
+        done in float32. When ``seed`` is given, the noise key is seed + step
+        index; ``noise_clip_std`` > 0 clips the noise to that many multiples of
+        its own standard deviation.
+        """
+        if self._step_index is None:
+            self._init_step_index(float(timestep))
+        idx = self._step_index
+        sigma = float(self.sigmas_np[idx])
+        sigma_next = float(self.sigmas_np[idx + 1])
+        sample_f = sample.astype(mx.float32)
+        model_output_f = model_output.astype(mx.float32)
+        # Estimate of the clean sample from the current state and model output.
+        denoised = sample_f - model_output_f * sigma
+        # NOTE(review): after set_timesteps() every index that can read sigmas_np[idx + 1]
+        # satisfies this condition, so the else branch looks unreachable — confirm against upstream.
+        if idx < self.num_inference_steps:
+            if seed is not None:
+                key = mx.random.key(seed + idx)
+                noise = mx.random.normal(model_output_f.shape, key=key)
+            else:
+                noise = mx.random.normal(model_output_f.shape)
+            if noise_clip_std > 0:
+                std = float(mx.std(noise))
+                clip = noise_clip_std * std
+                noise = mx.clip(noise, -clip, clip)
+            # Re-noise to the next sigma; with sigma_next == 0.0 this reduces to ``denoised``.
+            new_sample = sigma_next * noise * s_noise + (1.0 - sigma_next) * denoised
+        else:
+            new_sample = denoised
+        self._step_index += 1
+        return new_sample.astype(sample.dtype)

scripts/hidream_o1/generate_hidream_o1_mlx.py ADDED Viewed

	@@ -0,0 +1,327 @@

+#!/usr/bin/env python3
+"""HiDream-O1-Image-Dev inference on MLX (T2I, Dev recipe only)."""
+from __future__ import annotations
+import argparse
+import sys
+import time
+from pathlib import Path
+import numpy as np
+HERE = Path(__file__).parent
+sys.path.insert(0, str(HERE))
+from pipeline_helpers import (
+    PATCH_SIZE, NOISE_SCALE_DEFAULT, T_EPS,
+    build_attention_mask, find_closest_resolution, patchify, unpatchify,
+)
+def shape_test():
+    """Run weight-free sanity checks on the helper functions and the scheduler.
+    Covers, in order: patchify/unpatchify round trip, attention-mask shape and
+    a few spot values, scheduler length/monotonicity with the default custom
+    timesteps, one scheduler step on random MLX tensors (skipped when MLX is
+    not importable), and resolution snapping. Any failure surfaces as an
+    AssertionError; progress is printed to stdout.
+    """
+    print("=== HiDream-O1 MLX lab — shape sanity test ===")
+    H, W = 512, 512
+    x = np.random.randn(3, H, W).astype(np.float32)
+    p = patchify(x)
+    expected_patches = (H // PATCH_SIZE) * (W // PATCH_SIZE)
+    expected_dim = 3 * PATCH_SIZE * PATCH_SIZE
+    assert p.shape == (expected_patches, expected_dim)
+    # unpatchify must invert patchify exactly (pure reshape/transpose).
+    x2 = unpatchify(p, H // PATCH_SIZE, W // PATCH_SIZE)
+    assert np.allclose(x, x2)
+    print(f"  [ok] patchify roundtrip: {x.shape} -> {p.shape} -> {x2.shape}")
+    txt_seq_len = 12
+    img_seq_len = expected_patches
+    total = txt_seq_len + img_seq_len
+    token_types = np.zeros((1, total), dtype=np.int64)
+    # Last text position (the timestep-token slot) plus every image position is type 1.
+    token_types[0, txt_seq_len - 1: total] = 1
+    DTYPE_MIN = -1e9
+    mask = build_attention_mask(token_types, DTYPE_MIN)
+    assert mask.shape == (1, 1, total, total)
+    # Row 0 is a type-0 token: it may see itself (0) but not position 1 (DTYPE_MIN).
+    assert mask[0, 0, 0, 1] == DTYPE_MIN and mask[0, 0, 0, 0] == 0
+    # A type-1 row is expected to be fully unmasked.
+    assert (mask[0, 0, txt_seq_len + 5] == 0).all()
+    print(f"  [ok] mask: shape={mask.shape}, text rows causal, gen rows bidirectional")
+    from flow_match import FlashFlowMatchScheduler, DEFAULT_TIMESTEPS
+    sched = FlashFlowMatchScheduler(num_train_timesteps=1000, shift=1.0)
+    sched.set_timesteps(28, custom_timesteps=DEFAULT_TIMESTEPS)
+    assert sched.num_inference_steps == 28
+    # One more sigma than timesteps: the extra entry is the terminal value.
+    assert len(sched.sigmas_np) == 29
+    diffs = np.diff(sched.sigmas_np)
+    # Sigmas must be non-increasing (small tolerance for float noise).
+    assert (diffs <= 1e-6).all()
+    print(f"  [ok] scheduler: 28 steps, sigmas {sched.sigmas_np[0]:.4f} -> {sched.sigmas_np[-2]:.4f} -> 0")
+    try:
+        import mlx.core as mx
+    except ImportError:
+        print("  [skip] mlx not available")
+    else:
+        B, N, D = 1, expected_patches, expected_dim
+        z = mx.random.normal((B, N, D))
+        model_output = mx.random.normal((B, N, D))
+        ts0 = float(sched.timesteps_np[0])
+        z2 = sched.step(model_output, ts0, z, s_noise=7.5, noise_clip_std=2.5, seed=42)
+        assert z2.shape == z.shape
+        print(f"  [ok] mlx step: z {z.shape} -> z' {z2.shape}, dtype {z2.dtype}")
+    # 540/960 = 0.5625, which is exactly the 1440x2560 entry's aspect ratio.
+    snapped = find_closest_resolution(540, 960)
+    assert snapped in [(1440, 2560), (1312, 3104)]
+    print(f"  [ok] resolution snap 540x960 -> {snapped}")
+    print("=== all shape tests passed ===")
+def run_inference(args):
+    """Generate one image (T2I, or edit mode when ``args.ref_images`` is set) and save it.
+    Steps: load the mlx-vlm backbone and the custom heads, resolve the output
+    resolution, build the token sample, draw the initial noise, run the
+    denoising loop with ``FlashFlowMatchScheduler``, then unpatchify, optionally
+    blend patch seams, and write ``args.output``.
+    Args:
+        args: the ``argparse.Namespace`` produced by ``main``.
+    Exits via ``sys.exit`` when mlx-vlm is missing, when the custom-head file
+    is missing, or when the requested size is smaller than one patch.
+    """
+    import mlx.core as mx
+    try:
+        from mlx_vlm import load as mlx_vlm_load
+    except ImportError:
+        sys.exit("mlx-vlm not installed. uv pip install 'mlx-vlm>=0.3.3'")
+    from PIL import Image
+    import tqdm
+    from pipeline_helpers import build_t2i_text_sample
+    from flow_match import FlashFlowMatchScheduler, DEFAULT_TIMESTEPS
+    from hidream_model import HiDreamConfig, build_model, forward_generation, precompute_text_embeds_with_vision
+    print(f"loading model from {args.model_path} ...", flush=True)
+    t0 = time.time()
+    backbone, processor = mlx_vlm_load(args.model_path)
+    print(f"  loaded in {time.time() - t0:.1f}s")
+    cfg = HiDreamConfig()
+    model = build_model(cfg, backbone)
+    custom_path = Path(args.model_path) / "extras" / "custom_heads.safetensors"
+    if not custom_path.exists():
+        sys.exit(f"missing {custom_path}; rerun the converter")
+    custom_weights = mx.load(str(custom_path))
+    model.load_weights(list(custom_weights.items()), strict=False)
+    print(f"  loaded {len(custom_weights)} custom-head tensors")
+    width, height = args.width, args.height
+    if not args.no_snap_resolution:
+        sw, sh = find_closest_resolution(width, height)
+        if (sw, sh) != (width, height):
+            print(f"  resolution snapped {width}x{height} -> {sw}x{sh} (trained dim)")
+            width, height = sw, sh
+    # patch-align fallback (HiDream only operates on multiples of PATCH_SIZE)
+    width = (width // PATCH_SIZE) * PATCH_SIZE
+    height = (height // PATCH_SIZE) * PATCH_SIZE
+    h_patches = height // PATCH_SIZE
+    w_patches = width // PATCH_SIZE
+    # With snapping disabled a sub-patch request floors to zero patches; stop
+    # here with a clear message instead of failing deep inside the pipeline.
+    if h_patches <= 0 or w_patches <= 0:
+        sys.exit(f"resolution {args.width}x{args.height} is smaller than one {PATCH_SIZE}px patch")
+    print(f"  using {width}x{height} ({(width//PATCH_SIZE)*(height//PATCH_SIZE)} patches)")
+    tokenizer = processor.tokenizer if hasattr(processor, "tokenizer") else processor
+    # Make sure the special-token attribute names the helpers look up exist.
+    for n in ("boi", "bor", "eor", "bot", "tms"):
+        if not hasattr(tokenizer, f"{n}_token"):
+            setattr(tokenizer, f"{n}_token", f"<|{n}_token|>")
+    refs = list(args.ref_images or [])
+    if refs:
+        from pipeline_helpers import build_edit_text_sample
+        print(f"  edit mode: K={len(refs)} reference image(s)")
+        sample = build_edit_text_sample(
+            args.prompt, refs, height, width, tokenizer, processor, backbone.config,
+        )
+    else:
+        sample = build_t2i_text_sample(args.prompt, height, width, tokenizer, processor, backbone.config)
+    input_ids   = mx.array(sample["input_ids"])
+    position_ids = mx.array(sample["position_ids"])
+    token_types  = mx.array(sample["token_types"])
+    vinput_mask  = sample["vinput_mask"]
+    # Edit-mode extras (None for T2I)
+    pixel_values_mx = mx.array(sample["pixel_values"]).astype(mx.bfloat16) if refs else None
+    image_grid_thw_mx = mx.array(sample["image_grid_thw"]) if refs else None
+    ref_patches_mx = mx.array(sample["ref_patches"]).astype(mx.bfloat16) if refs else None
+    DTYPE_MIN = -1e4
+    mask4d = mx.array(build_attention_mask(sample["token_types"], DTYPE_MIN)).astype(mx.bfloat16)
+    rng_key = mx.random.key(args.seed + 1)
+    noise = args.noise_scale_start * mx.random.normal((1, 3, height, width), key=rng_key)
+    noise_np = np.asarray(noise)
+    z = mx.array(patchify(noise_np[0])[None]).astype(mx.bfloat16)
+    sched = FlashFlowMatchScheduler(num_train_timesteps=1000, shift=1.0)
+    sched.set_timesteps(args.num_inference_steps, custom_timesteps=DEFAULT_TIMESTEPS)
+    # Number of steps the loop will really run; the scheduler is the source of
+    # truth, which may differ from the CLI value when custom timesteps are used.
+    n_steps = len(sched.timesteps_np)
+    noise_scale_schedule = np.linspace(args.noise_scale_start, args.noise_scale_end, n_steps)
+    # First, second, middle and last step. For 28 steps this is (0, 1, 13, 27).
+    diag_steps = {0, 1, n_steps // 2 - 1, n_steps - 1}
+    # Precompute MLX-native indices for the patch slice. Two index sets:
+    # - vinput_idx: positions where the model gets vinputs (tgt + refs in edit mode, tgt only in T2I)
+    # - tgt_idx: positions where the TARGET image patches live (subset of vinput_idx in edit mode)
+    vinput_idx = mx.array(np.where(vinput_mask[0])[0].astype(np.int32))
+    if refs:
+        tgt_mask = sample["vinput_mask_tgt_only"]
+        tgt_idx = mx.array(np.where(tgt_mask[0])[0].astype(np.int32))
+    else:
+        tgt_idx = vinput_idx
+    # Precompute text+vision inputs_embeds — these are constant across denoising
+    # steps (only the vinputs / timestep change), so the vision work runs once.
+    inputs_embeds_pre = precompute_text_embeds_with_vision(
+        model, cfg, input_ids,
+        pixel_values=pixel_values_mx, image_grid_thw=image_grid_thw_mx,
+    )
+    mx.eval(inputs_embeds_pre)
+    print(f"  precomputed inputs_embeds: {inputs_embeds_pre.shape}")
+    t_start = time.time()
+    for step_idx, step_t in enumerate(tqdm.tqdm(sched.timesteps_np, desc="generating")):
+        # Native MLX scalar — no numpy roundtrip
+        t_pixeldit = mx.full([1], 1.0 - float(step_t) / 1000.0, dtype=mx.float32)
+        # Floor sigma at T_EPS so the division below never hits zero.
+        sigma = max(float(step_t) / 1000.0, T_EPS)
+        # Edit mode: vinputs is the target z concatenated with the clean ref patches.
+        # The forward_generation embeds + concatenates these to inputs_embeds; the
+        # mask routes attention so refs are bidirectional too.
+        if refs:
+            vinputs = mx.concatenate([z, ref_patches_mx], axis=1)
+        else:
+            vinputs = z
+        x_pred = forward_generation(
+            model, cfg,
+            inputs_embeds_with_vision=inputs_embeds_pre,
+            position_ids=position_ids,
+            vinputs=vinputs,
+            timestep=t_pixeldit,
+            input_ids=input_ids,
+            token_types=token_types,
+            attention_mask_4d=mask4d,
+        )
+        # Slice the target patches only (excludes refs in edit mode).
+        gen_patches_mx = mx.take(x_pred, tgt_idx, axis=1).astype(mx.float32)
+        # Optional: clamp x_pred to [q, 1-q] quantile per step. Upstream has this
+        # commented out (pipeline.py line 327: `x_pred = clamp_tensor(x_pred, percentage=0.01)`).
+        # The patch-grid artifact comes from per-patch outliers — clamping the per-step
+        # x_pred range trims the worst extremes that show up as 32-pixel grid lines.
+        if args.clamp_x_pred > 0:
+            gp_np = np.asarray(gen_patches_mx)
+            lo = float(np.quantile(gp_np, args.clamp_x_pred))
+            hi = float(np.quantile(gp_np, 1.0 - args.clamp_x_pred))
+            gen_patches_mx = mx.clip(gen_patches_mx, lo, hi)
+        if args.diag and step_idx in diag_steps:
+            zarr = np.asarray(z.astype(mx.float32))
+            gp = np.asarray(gen_patches_mx)
+            print(f"  [diag step {step_idx}] sigma={sigma:.4f} "
+                  f"z(mean={zarr.mean():.3f},std={zarr.std():.3f}) "
+                  f"x_pred(mean={gp.mean():.3f},std={gp.std():.3f},"
+                  f"min={gp.min():.3f},max={gp.max():.3f})")
+        # Convert the x-prediction into the velocity the scheduler expects
+        # (scheduler computes denoised = sample - model_output * sigma).
+        v = (gen_patches_mx - z.astype(mx.float32)) / sigma
+        model_output = -v
+        z = sched.step(model_output, float(step_t), z,
+                       s_noise=float(noise_scale_schedule[step_idx]),
+                       noise_clip_std=args.noise_clip_std,
+                       seed=args.seed)
+        # Force eval per step so timing is honest (otherwise mlx's lazy eval
+        # batches the entire loop into the final save, hiding per-step cost).
+        mx.eval(z)
+    elapsed = time.time() - t_start
+    print(f"  generation: {elapsed:.1f}s  ({elapsed / max(n_steps, 1):.2f}s/step)")
+    # Map [-1, 1] -> [0, 1] in float32; doing this in bfloat16 would add
+    # avoidable rounding right before 8-bit quantisation.
+    img = (z.astype(mx.float32) + 1) / 2
+    img_np = np.asarray(img[0])
+    rgb = unpatchify(img_np, h_patches, w_patches)
+    arr = np.clip(rgb.transpose(1, 2, 0) * 255, 0, 255).astype(np.uint8)
+    if args.blend_seams > 0:
+        arr = _blend_patch_seams(arr, PATCH_SIZE, radius=args.blend_seams)
+    # Create the destination folder so a fresh checkout (or a custom --output
+    # path) does not lose the finished image to a missing directory.
+    Path(args.output).parent.mkdir(parents=True, exist_ok=True)
+    Image.fromarray(arr).save(args.output)
+    print(f"saved -> {args.output}")
+def _blend_patch_seams(rgb: np.ndarray, patch: int, radius: int = 1) -> np.ndarray:
+    """Smooth the patch-grid seams of a generated image.
+    At every multiple of ``patch`` the two pixel lines touching the seam (the
+    line just before it and the line at it) are replaced by a triangular
+    weighted average of their ``2*radius+1`` neighbours. Horizontal seams are
+    processed first, then vertical seams on the result. Each pass reads from a
+    snapshot taken before the pass, so both seam lines are blended from the
+    same unmodified neighbours and the result is symmetric across the seam
+    (previously the second line was averaged with the already-blurred first
+    line). Pixels away from the seams are left untouched.
+    Args:
+        rgb: [H, W, 3] uint8 image.
+        patch: patch side length in pixels (seam spacing).
+        radius: 0 = off (input returned unchanged), 1 = ±1 line blur (subtle),
+            2-3 = visible at HD/QHD, 4+ = noticeable softening of seams.
+    Returns:
+        A new [H, W, 3] uint8 array, or ``rgb`` itself when ``radius <= 0``.
+    """
+    if radius <= 0:
+        return rgb
+    out = rgb.astype(np.float32).copy()
+    H, W, _ = rgb.shape
+    # Triangular kernel weights for (2*radius+1) lines, e.g. radius=1 -> [1, 2, 1] / 4.
+    weights = np.array([radius - abs(i - radius) + 1 for i in range(2 * radius + 1)], dtype=np.float32)
+    weights = weights / weights.sum()
+    # Horizontal seams (vary by y). Read from a snapshot so the two seam rows
+    # do not feed into each other.
+    src = out.copy()
+    for y in range(patch, H, patch):
+        for offset in (-1, 0):  # the rows immediately above and at the seam
+            yy = y + offset
+            if 0 <= yy < H:
+                lo = max(0, yy - radius)
+                hi = min(H, yy + radius + 1)
+                # Trim the kernel to the band that actually fits in the image
+                # and renormalise so the weights still sum to 1.
+                w_slice = weights[radius - (yy - lo): radius + (hi - yy)]
+                w_slice = w_slice / w_slice.sum()
+                band = src[lo:hi]                              # [n, W, 3]
+                out[yy] = (band * w_slice[:, None, None]).sum(axis=0)
+    # Vertical seams, applied on top of the horizontally smoothed image.
+    src = out.copy()
+    for x in range(patch, W, patch):
+        for offset in (-1, 0):
+            xx = x + offset
+            if 0 <= xx < W:
+                lo = max(0, xx - radius)
+                hi = min(W, xx + radius + 1)
+                w_slice = weights[radius - (xx - lo): radius + (hi - xx)]
+                w_slice = w_slice / w_slice.sum()
+                band = src[:, lo:hi]                            # [H, n, 3]
+                out[:, xx] = (band * w_slice[None, :, None]).sum(axis=1)
+    return np.clip(out, 0, 255).astype(np.uint8)
+def main(argv=None):
+    """Command-line entry point.
+    Parses ``argv`` (or ``sys.argv[1:]`` when None). With ``--shape-test`` only
+    the sanity checks run; otherwise the parsed options are handed to
+    ``run_inference``.
+    """
+    parser = argparse.ArgumentParser()
+    add = parser.add_argument
+    add("--shape-test", action="store_true")
+    add("--model-path", default="mlx_models/hidream-o1-dev-q4")
+    add("--prompt", default="a small red mushroom on a bed of moss, soft daylight, macro photo")
+    add("--output", default="sample_outputs/out.png")
+    add("--width", type=int, default=512)
+    add("--height", type=int, default=512)
+    add("--num-inference-steps", type=int, default=28)
+    add("--no-snap-resolution", action="store_true",
+        help="Disable snapping to trained PREDEFINED_RESOLUTIONS list (off-spec dims produce visible patch artifacts)")
+    add("--diag", action="store_true",
+        help="Print stats of z and x_pred at a few key steps")
+    add("--blend-seams", type=int, default=0,
+        help="Post-process: smooth patch-grid seams with this radius (0 = off, 1-2 typical)")
+    add("--clamp-x-pred", type=float, default=0.0,
+        help="Per-step quantile clamp on x_pred (0 = off; 0.01 = upstream's commented-out value)")
+    add("--ref-images", nargs="*", default=[],
+        help="Reference image paths for edit/multi-ref mode (1-3). Empty = pure T2I.")
+    add("--noise-scale-start", type=float, default=NOISE_SCALE_DEFAULT)
+    add("--noise-scale-end", type=float, default=NOISE_SCALE_DEFAULT)
+    add("--noise-clip-std", type=float, default=2.5)
+    add("--seed", type=int, default=32)
+    options = parser.parse_args(argv)
+    if not options.shape_test:
+        run_inference(options)
+    else:
+        shape_test()
+if __name__ == "__main__":
+    main()

scripts/hidream_o1/hidream_model.py ADDED Viewed

	@@ -0,0 +1,175 @@

+"""Standalone MLX model wrapper for HiDream-O1-Image."""
+from __future__ import annotations
+import math
+from dataclasses import dataclass
+from typing import Optional
+import mlx.core as mx
+import mlx.nn as nn
+import numpy as np
+class TimestepEmbedder(nn.Module):
+    """Sinusoidal timestep features followed by a two-layer MLP (Linear, SiLU, Linear)."""
+    def __init__(self, hidden_size: int, frequency_embedding_size: int = 256):
+        super().__init__()
+        self.frequency_embedding_size = frequency_embedding_size
+        self.fc1 = nn.Linear(frequency_embedding_size, hidden_size, bias=True)
+        self.fc2 = nn.Linear(hidden_size, hidden_size, bias=True)
+    @staticmethod
+    def timestep_embedding(t: mx.array, dim: int, max_period: float = 10000.0) -> mx.array:
+        """Return ``[len(t), dim]`` features laid out as ``[cos | sin]`` (zero-padded by one column if ``dim`` is odd)."""
+        half = dim // 2
+        scaled_index = -math.log(max_period) * mx.arange(0, half, dtype=mx.float32)
+        freqs = mx.exp(scaled_index / half)
+        phase = t[:, None].astype(mx.float32) * freqs[None]
+        pieces = [mx.cos(phase), mx.sin(phase)]
+        emb = mx.concatenate(pieces, axis=-1)
+        if dim % 2:
+            pad = mx.zeros_like(emb[:, :1])
+            emb = mx.concatenate([emb, pad], axis=-1)
+        return emb
+    def __call__(self, t: mx.array) -> mx.array:
+        """Embed ``t`` (scaled by 1000 before the sinusoid) into ``hidden_size`` features."""
+        features = self.timestep_embedding(t * 1000.0, self.frequency_embedding_size)
+        # Match the first layer's weight dtype before projecting.
+        features = features.astype(self.fc1.weight.dtype)
+        hidden = nn.silu(self.fc1(features))
+        return self.fc2(hidden)
+class BottleneckPatchEmbed(nn.Module):
+    """Embed flattened pixel patches through a bias-free bottleneck projection and a second linear layer."""
+    def __init__(self, patch_size: int = 32, in_chans: int = 3,
+                 pca_dim: int = 1024, embed_dim: int = 4096):
+        super().__init__()
+        flat_patch_dim = patch_size * patch_size * in_chans
+        self.proj1 = nn.Linear(flat_patch_dim, pca_dim, bias=False)
+        self.proj2 = nn.Linear(pca_dim, embed_dim, bias=True)
+    def __call__(self, x: mx.array) -> mx.array:
+        bottleneck = self.proj1(x)
+        return self.proj2(bottleneck)
+class FinalLayer(nn.Module):
+    """Single linear head mapping hidden states to flattened pixel patches."""
+    def __init__(self, hidden_size: int, patch_size: int = 32, out_channels: int = 3):
+        super().__init__()
+        flat_patch_dim = patch_size * patch_size * out_channels
+        self.linear = nn.Linear(hidden_size, flat_patch_dim, bias=True)
+    def __call__(self, x: mx.array) -> mx.array:
+        patches = self.linear(x)
+        return patches
+# Name translation for the custom-head tensors: keys use "model."-prefixed
+# names (with "mlp.0" / "mlp.2" for the timestep embedder), values are the
+# attribute paths of the modules defined in this file (t_embedder1.fc1/fc2,
+# x_embedder.proj1/proj2, final_layer2.linear).
+# NOTE(review): presumably the keys are the upstream checkpoint names consumed
+# by the converter script — confirm against convert_hidream_o1_to_mlx.py.
+CUSTOM_HEAD_KEY_MAP = {
+    "model.t_embedder1.mlp.0.weight":   "t_embedder1.fc1.weight",
+    "model.t_embedder1.mlp.0.bias":     "t_embedder1.fc1.bias",
+    "model.t_embedder1.mlp.2.weight":   "t_embedder1.fc2.weight",
+    "model.t_embedder1.mlp.2.bias":     "t_embedder1.fc2.bias",
+    "model.x_embedder.proj1.weight":    "x_embedder.proj1.weight",
+    "model.x_embedder.proj2.weight":    "x_embedder.proj2.weight",
+    "model.x_embedder.proj2.bias":      "x_embedder.proj2.bias",
+    "model.final_layer2.linear.weight": "final_layer2.linear.weight",
+    "model.final_layer2.linear.bias":   "final_layer2.linear.bias",
+}
+@dataclass
+class HiDreamConfig:
+    """Static hyper-parameters and special-token ids used to build and run the HiDream wrapper."""
+    # Hidden width shared by the timestep embedder, patch embedder output and final layer input.
+    hidden_size: int = 4096
+    # Side length (pixels) of one square patch fed to x_embedder / produced by final_layer2.
+    patch_size: int = 32
+    # Image channels; also used as the final layer's out_channels in build_model.
+    in_channels: int = 3
+    # Width of the bottleneck inside BottleneckPatchEmbed (its pca_dim).
+    bottleneck_dim: int = 1024
+    # Token id whose positions receive the timestep embedding in forward_generation.
+    tms_token_id: int = 151673
+    # Token id whose positions receive vision-tower features in precompute_text_embeds_with_vision.
+    image_token_id: int = 151655
+    # Not read by the functions in this module; NOTE(review): presumably mirrors the tokenizer's ids — confirm.
+    video_token_id: int = 151656
+    vision_start_token_id: int = 151652
+def build_model(cfg: HiDreamConfig, mlx_vlm_qwen3_vl_model):
+    """Wrap an mlx-vlm Qwen3-VL model together with freshly constructed HiDream heads.
+    The returned module reuses the backbone's ``vision_tower`` (as ``visual``)
+    and ``language_model``, and adds ``t_embedder1``, ``x_embedder`` and
+    ``final_layer2`` sized from ``cfg``. The new heads are not loaded with
+    weights here.
+    """
+    backbone = mlx_vlm_qwen3_vl_model
+    class HiDream(nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.visual = backbone.vision_tower
+            self.language_model = backbone.language_model
+            self.t_embedder1 = TimestepEmbedder(cfg.hidden_size)
+            self.x_embedder = BottleneckPatchEmbed(
+                patch_size=cfg.patch_size,
+                in_chans=cfg.in_channels,
+                pca_dim=cfg.bottleneck_dim,
+                embed_dim=cfg.hidden_size,
+            )
+            self.final_layer2 = FinalLayer(
+                hidden_size=cfg.hidden_size, patch_size=cfg.patch_size, out_channels=cfg.in_channels,
+            )
+    wrapper = HiDream()
+    return wrapper
+def precompute_text_embeds_with_vision(model, cfg, input_ids, pixel_values=None, image_grid_thw=None):
+    """Compute text embeddings + (in edit mode) inject vision features at image_token
+    positions. Returns embeds [B, S_text, hidden]. Call once before the denoising
+    loop — output is constant across timesteps.
+    Args:
+        model: wrapper from ``build_model`` (uses ``language_model.model.embed_tokens`` and ``visual``).
+        cfg: config providing ``image_token_id``.
+        input_ids: integer token ids, shape [B, S_text].
+        pixel_values, image_grid_thw: vision-tower inputs; when either is None
+            the plain token embeddings are returned (T2I path, any batch size).
+    Raises:
+        ValueError: vision injection requested with a batch size other than 1.
+            The scatter below only handles row 0; larger batches would have
+            their image positions silently overwritten with zeros.
+        RuntimeError: the number of vision features does not match the number
+            of ``image_token_id`` positions in ``input_ids[0]``.
+    """
+    embed_tokens = model.language_model.model.embed_tokens
+    inputs_embeds = embed_tokens(input_ids)
+    if pixel_values is None or image_grid_thw is None:
+        return inputs_embeds
+    B, S, H = inputs_embeds.shape
+    if B != 1:
+        raise ValueError(f"vision feature injection supports batch size 1 only, got {B}")
+    vt_out = model.visual(pixel_values, image_grid_thw)
+    # The vision tower may return a tuple (features first) and/or a list of
+    # per-image feature blocks; normalise to one [N, hidden] array.
+    image_features = vt_out[0] if isinstance(vt_out, tuple) else vt_out
+    if isinstance(image_features, (list, tuple)):
+        image_features = mx.concatenate(image_features, axis=0)
+    # Build a [B, S, H] tensor that has image_features at image_token positions
+    # and inputs_embeds everywhere else, via mx.where on a broadcast mask.
+    ids_np = np.asarray(input_ids)
+    img_positions = np.where(ids_np[0] == cfg.image_token_id)[0]
+    if img_positions.shape[0] != image_features.shape[0]:
+        raise RuntimeError(
+            f"image_features {image_features.shape[0]} != "
+            f"image_token_id positions {img_positions.shape[0]} (input_ids was: {ids_np.shape})"
+        )
+    # Build aligned-to-S features: zero everywhere except at image positions.
+    aligned = np.zeros((B, S, H), dtype=np.float32)
+    aligned[0, img_positions] = np.asarray(image_features.astype(mx.float32))
+    aligned_mx = mx.array(aligned).astype(inputs_embeds.dtype)
+    # Mask: True at image positions, False elsewhere
+    mask_2d = (ids_np == cfg.image_token_id).astype(np.bool_)
+    mask_3d = np.broadcast_to(mask_2d[..., None], (B, S, H))
+    # broadcast_to returns a read-only view; copy before handing it to MLX.
+    mask_mx = mx.array(mask_3d.copy())
+    return mx.where(mask_mx, aligned_mx, inputs_embeds)
+def forward_generation(model, cfg, inputs_embeds_with_vision, position_ids, vinputs, timestep,
+                       input_ids, token_types, attention_mask_4d):
+    """Run one denoising forward pass and return x_pred [B, S_total, patch_dim].
+    The precomputed text(+vision) embeddings get the timestep embedding written
+    into every ``cfg.tms_token_id`` position, the embedded ``vinputs`` patches
+    are appended along the sequence axis, and the combined sequence is pushed
+    through the language model followed by ``final_layer2``.
+    ``input_ids`` is needed to locate the tms positions. ``token_types`` is
+    accepted for signature compatibility but not read here. Vision inputs are
+    handled beforehand by ``precompute_text_embeds_with_vision``.
+    """
+    seq_embeds = inputs_embeds_with_vision
+    time_embed = model.t_embedder1(timestep)
+    # Overwrite the tms-token slots with the (broadcast) timestep embedding.
+    is_tms = mx.broadcast_to((input_ids == cfg.tms_token_id)[..., None], seq_embeds.shape)
+    time_embed_full = mx.broadcast_to(time_embed[:, None, :], seq_embeds.shape)
+    seq_embeds = mx.where(is_tms, time_embed_full, seq_embeds)
+    patch_embeds = model.x_embedder(vinputs).astype(seq_embeds.dtype)
+    seq_embeds = mx.concatenate([seq_embeds, patch_embeds], axis=1)
+    # mlx-vlm Qwen3VLModel.__call__ accepts (inputs, inputs_embeds, mask, cache, position_ids, ...).
+    # Pass our 4D additive mask directly; it bypasses the internal causal mask.
+    # `inputs` is required positionally but ignored when inputs_embeds is set
+    # in mlx-vlm's implementation — pass a placeholder of correct shape.
+    dummy_ids = mx.zeros(seq_embeds.shape[:2], dtype=mx.int32)
+    hidden = model.language_model.model(
+        dummy_ids,
+        inputs_embeds=seq_embeds,
+        mask=attention_mask_4d,
+        cache=None,
+        position_ids=position_ids,
+    )
+    # No explicit final norm here: mlx-vlm's Qwen3VLModel applies it internally
+    # and returns the normalised hidden states.
+    return model.final_layer2(hidden)

scripts/hidream_o1/pipeline_helpers.py ADDED Viewed

	@@ -0,0 +1,420 @@

+"""Helpers ported from HiDream-O1 models/pipeline.py + models/utils.py."""
+from __future__ import annotations
+import math
+from typing import Sequence
+import numpy as np
+# Side length in pixels of one square image patch (see patchify / unpatchify).
+PATCH_SIZE = 32
+# Number of timestep tokens appended after the begin-of-image token in the prompt.
+TIMESTEP_TOKEN_NUM = 1
+# Default for the generator's --noise-scale-start / --noise-scale-end options.
+NOISE_SCALE_DEFAULT = 7.5
+# Lower bound applied to sigma in the denoising loop to avoid dividing by zero.
+T_EPS = 0.001
+TMS_TOKEN_ID = 151673   # from qwen3_vl_transformers.py — Qwen3VLModel.tms_token_id
+CONDITION_IMAGE_SIZE = 384   # vision-tower-side size for reference images
+# (width, height) pairs that find_closest_resolution snaps to by aspect ratio.
+PREDEFINED_RESOLUTIONS = [
+    (2048, 2048),
+    (2304, 1728), (1728, 2304),
+    (2560, 1440), (1440, 2560),
+    (2496, 1664), (1664, 2496),
+    (3104, 1312), (1312, 3104),
+    (2304, 1792), (1792, 2304),
+]
+def find_closest_resolution(width: int, height: int) -> tuple[int, int]:
+    """Return the (width, height) entry of PREDEFINED_RESOLUTIONS whose aspect ratio is closest to ``width / height``.
+    Ties go to the entry listed first. Only the ratio matters; the absolute
+    size of the request is ignored.
+    """
+    target_ratio = width / height
+    def ratio_gap(res):
+        res_w, res_h = res
+        return abs(res_w / res_h - target_ratio)
+    return min(PREDEFINED_RESOLUTIONS, key=ratio_gap, default=None)
+def patchify(img_chw: np.ndarray, patch: int = PATCH_SIZE) -> np.ndarray:
+    """Split a [C, H, W] image into row-major patches of shape [N, C*patch*patch].
+    H and W must be multiples of ``patch``. Each output row holds one patch
+    flattened in (channel, row, column) order.
+    """
+    C, H, W = img_chw.shape
+    assert H % patch == 0 and W % patch == 0
+    rows, cols = H // patch, W // patch
+    blocks = img_chw.reshape(C, rows, patch, cols, patch)
+    # -> [rows, cols, C, patch, patch]
+    blocks = np.transpose(blocks, (1, 3, 0, 2, 4))
+    return blocks.reshape(rows * cols, C * patch * patch)
+def unpatchify(patches_nd, h_patches, w_patches, patch=PATCH_SIZE, channels=3):
+    """Inverse of ``patchify``: rebuild a [channels, H, W] image from [N, channels*patch*patch] patches."""
+    grid = patches_nd.reshape(h_patches, w_patches, channels, patch, patch)
+    # -> [channels, h_patches, patch, w_patches, patch]
+    chw_blocks = np.transpose(grid, (2, 0, 3, 1, 4))
+    out_h = h_patches * patch
+    out_w = w_patches * patch
+    return chw_blocks.reshape(channels, out_h, out_w)
+def build_t2i_text_sample(prompt, height, width, tokenizer, processor, model_config):
+    """Build the token ids, 3-axis position ids and masks for text-to-image generation.
+    Args:
+        prompt: user prompt, wrapped in the processor's chat template.
+        height, width: target image size in pixels (integer-divided by PATCH_SIZE).
+        tokenizer: provides ``encode`` and, optionally, ``boi_token`` / ``tms_token``.
+        processor: provides ``apply_chat_template``.
+        model_config: provides ``image_token_id``, ``video_token_id``, ``vision_start_token_id``.
+    Returns a dict with:
+        input_ids    [1, txt_seq_len]  text ids only (template + boi + tms tokens)
+        position_ids as returned by get_rope_index_fix_point for text + image slots
+        token_types  [1, total]        1 for the tms slot and image slots, 0 elsewhere
+        vinput_mask  [1, total]        True only at the image patch slots
+    """
+    image_token_id = model_config.image_token_id
+    video_token_id = model_config.video_token_id
+    vision_start_token_id = model_config.vision_start_token_id
+    image_len = (height // PATCH_SIZE) * (width // PATCH_SIZE)
+    boi_token = getattr(tokenizer, "boi_token", "<|boi_token|>")
+    tms_token = getattr(tokenizer, "tms_token", "<|tms_token|>")
+    messages = [{"role": "user", "content": prompt}]
+    # Chat-formatted prompt followed by begin-of-image and the timestep token(s).
+    template_caption = (
+        processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        + boi_token + tms_token * TIMESTEP_TOKEN_NUM
+    )
+    input_ids = np.asarray(
+        tokenizer.encode(template_caption, add_special_tokens=False),
+        dtype=np.int64,
+    ).reshape(1, -1)
+    image_grid_thw = np.asarray(
+        [[1, height // PATCH_SIZE, width // PATCH_SIZE]], dtype=np.int64
+    )
+    # Placeholder ids for the image slots; the first slot carries the
+    # vision-start id so the rope helper can find where the image begins.
+    vision_tokens = np.full((1, image_len), image_token_id, dtype=input_ids.dtype)
+    vision_tokens[0, 0] = vision_start_token_id
+    # Padded ids are only used to compute positions; the returned input_ids stay text-only.
+    input_ids_pad = np.concatenate([input_ids, vision_tokens], axis=-1)
+    position_ids, _ = get_rope_index_fix_point(
+        spatial_merge_size=1,
+        image_token_id=image_token_id,
+        video_token_id=video_token_id,
+        vision_start_token_id=vision_start_token_id,
+        input_ids=input_ids_pad,
+        image_grid_thw=image_grid_thw,
+        skip_vision_start_token=[1],
+    )
+    txt_seq_len = input_ids.shape[-1]
+    all_seq_len = position_ids.shape[-1]
+    token_types = np.zeros((1, all_seq_len), dtype=np.int64)
+    # Mark the tms slot(s) plus the following image_len slots as type 1.
+    bgn = txt_seq_len - TIMESTEP_TOKEN_NUM
+    token_types[0, bgn: bgn + image_len + TIMESTEP_TOKEN_NUM] = 1
+    # Tag the tms positions distinctly so vinput_mask excludes them — they're
+    # for the timestep embedding, not actual image patches.
+    token_types[0, txt_seq_len - TIMESTEP_TOKEN_NUM: txt_seq_len] = 3
+    vinput_mask = (token_types == 1)
+    # Collapse back to binary (tms tag 3 -> 1) for the attention-mask builder.
+    token_types_bin = (token_types > 0).astype(np.int64)
+    return {
+        "input_ids": input_ids,
+        "position_ids": position_ids,
+        "token_types": token_types_bin,
+        "vinput_mask": vinput_mask,
+    }
+def get_rope_index_fix_point(
+    spatial_merge_size, image_token_id, video_token_id, vision_start_token_id,
+    input_ids, image_grid_thw=None, video_grid_thw=None, attention_mask=None,
+    skip_vision_start_token=None, fix_point=4096,
+):
+    """Compute 3-axis (t, h, w) position ids for a sequence mixing text and vision spans.
+    Text tokens get the same running index on all three axes. Each vision span
+    gets its (t, h, w) grid coordinates plus an offset: for the first span whose
+    ``skip_vision_start_token`` entry is truthy the offset is ``fix_point``;
+    for later such spans it is the running index; for spans whose entry is
+    falsy it is the running index plus the preceding text length.
+    Args:
+        spatial_merge_size: divisor applied to the grid h and w.
+        image_token_id, video_token_id, vision_start_token_id: special token ids.
+        input_ids: int array [B, S]; required.
+        image_grid_thw, video_grid_thw: per-image / per-video (t, h, w) rows.
+        attention_mask: [B, S]; positions equal to 1 are kept. Defaults to all ones.
+        skip_vision_start_token: per-image flags, indexed by image order.
+            NOTE(review): the default None is indexed whenever a vision span is
+            found, so callers must pass a sequence if any vision span exists.
+        fix_point: position offset used for the first flagged vision span.
+    Returns:
+        (position_ids [3, B, S], deltas [B, 1]) where each delta is
+        ``max position + 1 - S``.
+    Raises:
+        ValueError: if ``input_ids`` is None.
+    """
+    if input_ids is None:
+        raise ValueError("input_ids is required")
+    if attention_mask is None:
+        attention_mask = np.ones_like(input_ids)
+    B, S = input_ids.shape
+    position_ids = np.ones((3, B, S), dtype=input_ids.dtype)
+    # Grid-row cursors; they run across the whole batch, not per sample.
+    image_index = 0
+    video_index = 0
+    mrope_position_deltas: list[int] = []
+    for i in range(B):
+        ids_i = input_ids[i][attention_mask[i] == 1]
+        # The token right after each vision-start marker tells image vs. video.
+        vision_start_indices = np.argwhere(ids_i == vision_start_token_id).reshape(-1)
+        vision_tokens = ids_i[vision_start_indices + 1] if len(vision_start_indices) else np.array([], dtype=ids_i.dtype)
+        image_nums = int((vision_tokens == image_token_id).sum())
+        video_nums = int((vision_tokens == video_token_id).sum())
+        toks = ids_i.tolist()
+        llm_pos_ids: list[np.ndarray] = []
+        st = 0
+        remain_images, remain_videos = image_nums, video_nums
+        local_fix_point = fix_point
+        for _ in range(image_nums + video_nums):
+            # Next image / video token at or after st; len(toks) + 1 acts as "none left".
+            ed_image = toks.index(image_token_id, st) if (image_token_id in toks[st:] and remain_images > 0) else len(toks) + 1
+            ed_video = toks.index(video_token_id, st) if (video_token_id in toks[st:] and remain_videos > 0) else len(toks) + 1
+            if ed_image < ed_video:
+                t, h, w = image_grid_thw[image_index]
+                image_index += 1
+                remain_images -= 1
+                ed = ed_image
+            else:
+                t, h, w = video_grid_thw[video_index]
+                video_index += 1
+                remain_videos -= 1
+                ed = ed_video
+            llm_grid_t = int(t)
+            llm_grid_h = int(h) // spatial_merge_size
+            llm_grid_w = int(w) // spatial_merge_size
+            # Text tokens before this vision span, minus one when the
+            # vision-start token is counted as part of the span.
+            # NOTE(review): the flag is indexed by image_index - 1 even on the video branch.
+            text_len = ed - st
+            text_len -= int(skip_vision_start_token[image_index - 1])
+            text_len = max(0, text_len)
+            st_idx = (llm_pos_ids[-1].max() + 1) if llm_pos_ids else 0
+            llm_pos_ids.append(np.broadcast_to(np.arange(text_len) + st_idx, (3, text_len)).copy())
+            # Flattened (t, h, w) coordinates of the vision grid, t-major then h then w.
+            t_index = np.repeat(np.arange(llm_grid_t), llm_grid_h * llm_grid_w)
+            h_index = np.tile(np.repeat(np.arange(llm_grid_h), llm_grid_w), llm_grid_t)
+            w_index = np.tile(np.arange(llm_grid_w), llm_grid_t * llm_grid_h)
+            if int(skip_vision_start_token[image_index - 1]):
+                # First flagged span: (fix_point - st_idx) + st_idx == fix_point.
+                # Afterwards local_fix_point is 0, so the offset is just st_idx.
+                if local_fix_point > 0:
+                    local_fix_point = local_fix_point - st_idx
+                llm_pos_ids.append(np.stack([t_index, h_index, w_index]) + local_fix_point + st_idx)
+                local_fix_point = 0
+            else:
+                llm_pos_ids.append(np.stack([t_index, h_index, w_index]) + text_len + st_idx)
+            st = ed + llm_grid_t * llm_grid_h * llm_grid_w
+        # Trailing text after the last vision span.
+        if st < len(toks):
+            st_idx = (llm_pos_ids[-1].max() + 1) if llm_pos_ids else 0
+            text_len = len(toks) - st
+            llm_pos_ids.append(np.broadcast_to(np.arange(text_len) + st_idx, (3, text_len)).copy())
+        llm_positions = np.concatenate(llm_pos_ids, axis=1).reshape(3, -1)
+        # Masked-out positions keep the initial value of 1.
+        position_ids[..., i, attention_mask[i] == 1] = llm_positions
+        mrope_position_deltas.append(int(llm_positions.max() + 1 - input_ids.shape[1]))
+    deltas = np.asarray(mrope_position_deltas, dtype=np.int64).reshape(-1, 1)
+    return position_ids, deltas
+def resize_pilimage(pil_image, image_size: int, patch_size: int = PATCH_SIZE, resampler=None):
+    """Port of HiDream-O1 utils.py:resize_pilimage.
+    Reduce by 2x box resamples until min dim < 2*image_size, then resize
+    (bicubic by default) + center-crop to the largest patch-aligned size that
+    doesn't exceed image_size**2 area.
+    Args:
+        pil_image: input PIL image.
+        image_size: side length whose square bounds the output area.
+        patch_size: both output dimensions are multiples of this.
+        resampler: PIL resampling filter for the final resize; defaults to ``Image.BICUBIC``.
+    Returns:
+        The resized and center-cropped PIL image.
+    """
+    from PIL import Image
+    if resampler is None:
+        resampler = Image.BICUBIC
+    # Cheap pre-shrink: halve with a box filter while the image is at least 2x too large.
+    while min(pil_image.size) >= 2 * image_size:
+        pil_image = pil_image.resize(tuple(x // 2 for x in pil_image.size), resample=Image.BOX)
+    m = patch_size
+    width, height = pil_image.width, pil_image.height
+    S_max = image_size * image_size
+    # Uniform scale that would make the area exactly S_max.
+    scale = math.sqrt(S_max / (width * height))
+    # Try round/floor combinations of the scaled size, each snapped down to a multiple of m.
+    candidates = [
+        (round(width * scale) // m * m, round(height * scale) // m * m),
+        (round(width * scale) // m * m, math.floor(height * scale) // m * m),
+        (math.floor(width * scale) // m * m, round(height * scale) // m * m),
+        (math.floor(width * scale) // m * m, math.floor(height * scale) // m * m),
+    ]
+    # Largest area first; take the first one within budget, else the smallest candidate.
+    candidates = sorted(candidates, key=lambda x: x[0] * x[1], reverse=True)
+    new_w, new_h = next((c for c in candidates if c[0] * c[1] <= S_max), candidates[-1])
+    s1 = width / new_w
+    s2 = height / new_h
+    # Scale by the smaller factor so the image covers the target, then crop the overflow evenly.
+    if s1 < s2:
+        pil_image = pil_image.resize([new_w, round(height / s1)], resample=resampler)
+        top = (round(height / s1) - new_h) // 2
+        pil_image = pil_image.crop((0, top, new_w, top + new_h))
+    else:
+        pil_image = pil_image.resize([round(width / s2), new_h], resample=resampler)
+        left = (round(width / s2) - new_w) // 2
+        pil_image = pil_image.crop((left, 0, left + new_w, new_h))
+    return pil_image
+def calculate_dimensions(max_size: int, ratio: float) -> tuple[int, int]:
+    """Port of HiDream-O1 utils.py:calculate_dimensions.
+    Return (width, height) with width/height ≈ ``ratio`` and area at most
+    ``max_size**2``, each dimension truncated down to a multiple of 32
+    (PATCH_SIZE).
+    """
+    ideal_w = math.sqrt(max_size * max_size * ratio)
+    ideal_h = ideal_w / ratio
+    snapped_w, snapped_h = (int(side / 32) * 32 for side in (ideal_w, ideal_h))
+    return snapped_w, snapped_h
+def patchify_ref_image(pil_image, patch: int = PATCH_SIZE) -> np.ndarray:
+    """Convert a PIL image (already patch-aligned) into HiDream's diffusion-side
+    patches: [N_patches, 3*patch*patch] with float32 in [-1, 1].
+    Mirrors the upstream `TENSOR_TRANSFORM` (ToTensor + Normalize 0.5/0.5).
+    """
+    arr = np.asarray(pil_image.convert("RGB"), dtype=np.float32) / 255.0  # [H, W, 3] in [0, 1]
+    arr = (arr - 0.5) / 0.5                                                # [-1, 1]
+    arr = arr.transpose(2, 0, 1)                                           # [3, H, W]
+    return patchify(arr, patch=patch)                                      # [N, 3*p*p]
+def build_edit_text_sample(
+    prompt: str,
+    ref_image_paths: Sequence[str],
+    height: int,
+    width: int,
+    tokenizer,
+    processor,
+    model_config,
+) -> dict:
+    """Build the unified token sequence + position_ids + masks for image edit
+    or multi-reference subject-driven generation.
+    Faithful port of the multi-ref branch of HiDream-O1 pipeline.py
+    generate_image. Single-reference (K=1) is the well-tested path.
+    Sequence layout (in order):
+      chat-template text incl. the K conditioning images | boi + tms tokens |
+      target image span | one span per reference image
+    Args:
+      prompt           text instruction placed after the K image slots
+      ref_image_paths  paths of the K reference images (opened with PIL)
+      height, width    target image size in pixels; the target grid is
+                       (height // PATCH_SIZE) x (width // PATCH_SIZE)
+      tokenizer        used to encode the boi/tms suffix
+      processor        applies the chat template and prepares the vision inputs
+      model_config     supplies image/video/vision_start token ids and
+                       vision_config.spatial_merge_size
+    Returns a dict with:
+      input_ids        [1, txt_seq_len]   (text + boi/tms only, no vision spans)
+      position_ids     as returned by get_rope_index_fix_point
+                       (documented upstream as [3, 1, total_seq_len])
+      token_types      [1, total_seq_len]   BINARY: 1 for tms, target and ref
+                       positions, 0 for everything else
+      vinput_mask      [1, total_seq_len]   (True on target + ref spans, NOT on tms)
+      vinput_mask_tgt_only [1, total_seq_len]  (True ONLY for the tgt span; for slicing the prediction)
+      pixel_values     processor pixel values as float32 (vision tower input)
+      image_grid_thw   processor image_grid_thw, unmodified (vision tower grid for refs)
+      ref_patches      [1, sum(N_ref_patches), ...]   (clean ref patches for vinputs cat)
+      tgt_image_len    int                            (number of target patches)
+    """
+    from PIL import Image
+    image_token_id = model_config.image_token_id
+    video_token_id = model_config.video_token_id
+    vision_start_token_id = model_config.vision_start_token_id
+    spatial_merge_size = model_config.vision_config.spatial_merge_size
+    ref_pils = [Image.open(p).convert("RGB") for p in ref_image_paths]
+    K = len(ref_pils)
+    # The more references there are, the smaller each one is resized
+    # (as a fraction of the longer target side).
+    if K == 1:
+        max_size = max(height, width)
+    elif K == 2:
+        max_size = max(height, width) * 48 // 64
+    elif K <= 4:
+        max_size = max(height, width) // 2
+    elif K <= 8:
+        max_size = max(height, width) * 24 // 64
+    else:
+        max_size = max(height, width) // 4
+    # Diffusion-side reference inputs: patch-aligned resize, then patchify.
+    ref_pils_resized: list = []
+    ref_patch_lists: list = []
+    for pil in ref_pils:
+        pil_r = resize_pilimage(pil, max_size, PATCH_SIZE)
+        ref_pils_resized.append(pil_r)
+        ref_patch_lists.append(patchify_ref_image(pil_r))
+    ref_image_lens = [arr.shape[0] for arr in ref_patch_lists]
+    total_ref_len = sum(ref_image_lens)
+    ref_patches = np.concatenate(ref_patch_lists, axis=0)[None]   # [1, sum(N), 3*32*32]
+    tgt_image_len = (height // PATCH_SIZE) * (width // PATCH_SIZE)
+    # Conditioning-image size for the processor also shrinks with K.
+    if K <= 4:
+        cond_img_size = CONDITION_IMAGE_SIZE
+    elif K <= 8:
+        cond_img_size = CONDITION_IMAGE_SIZE * 48 // 64
+    else:
+        cond_img_size = CONDITION_IMAGE_SIZE // 2
+    # Processor-side copies of the references, keeping each resized image's aspect ratio.
+    ref_pils_vlm = []
+    for pil_r in ref_pils_resized:
+        cw, ch = calculate_dimensions(cond_img_size, pil_r.width / pil_r.height)
+        ref_pils_vlm.append(pil_r.resize((cw, ch), resample=Image.LANCZOS))
+    # (t, h, w) patch grids for the target and for each diffusion-side reference.
+    image_grid_thw_tgt = np.asarray([[1, height // PATCH_SIZE, width // PATCH_SIZE]], dtype=np.int64)
+    image_grid_thw_ref = np.zeros((K, 3), dtype=np.int64)
+    for i, pil_r in enumerate(ref_pils_resized):
+        rw, rh = pil_r.size
+        image_grid_thw_ref[i] = [1, rh // PATCH_SIZE, rw // PATCH_SIZE]
+    # Fall back to the literal token strings when the tokenizer has no such attributes.
+    boi_token = getattr(tokenizer, "boi_token", "<|boi_token|>")
+    tms_token = getattr(tokenizer, "tms_token", "<|tms_token|>")
+    # One user message: K image slots followed by the text prompt.
+    content = [{"type": "image"} for _ in range(K)]
+    content.append({"type": "text", "text": prompt})
+    messages = [{"role": "user", "content": content}]
+    template_caption = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    proc = processor(text=[template_caption], images=ref_pils_vlm, padding="longest", return_tensors="pt")
+    # Suffix appended after the templated text: boi followed by TIMESTEP_TOKEN_NUM tms tokens.
+    input_ids_2 = np.asarray(
+        tokenizer.encode(boi_token + tms_token * TIMESTEP_TOKEN_NUM, add_special_tokens=False),
+        dtype=np.int64,
+    ).reshape(1, -1)
+    proc_input_ids = np.asarray(proc.input_ids, dtype=np.int64)
+    input_ids = np.concatenate([proc_input_ids, input_ids_2], axis=-1)
+    # Divide the processor's h/w grid by spatial_merge_size on a copy
+    # (presumably converting vision-tower patches to merged LLM tokens — TODO confirm),
+    # since get_rope_index_fix_point is called with spatial_merge_size=1 below.
+    igthw_cond = np.asarray(proc.image_grid_thw, dtype=np.int64).copy()
+    for i in range(K):
+        igthw_cond[i, 1] //= spatial_merge_size
+        igthw_cond[i, 2] //= spatial_merge_size
+    # Grid rows in sequence order: K conditioning images, the target, K references.
+    igthw_all = np.concatenate([igthw_cond, image_grid_thw_tgt, image_grid_thw_ref], axis=0)
+    # Build the per-image vision-token spans appended after the text:
+    #   tgt span (tgt_image_len tokens, first slot is vision_start)
+    #   then for each ref: span of ref_image_lens[i] tokens, first slot vision_start
+    vt_pieces = []
+    vt_tgt = np.full((1, tgt_image_len), image_token_id, dtype=input_ids.dtype)
+    vt_tgt[0, 0] = vision_start_token_id
+    vt_pieces.append(vt_tgt)
+    for rl in ref_image_lens:
+        vt_ref = np.full((1, rl), image_token_id, dtype=input_ids.dtype)
+        vt_ref[0, 0] = vision_start_token_id
+        vt_pieces.append(vt_ref)
+    vision_tokens = np.concatenate(vt_pieces, axis=1)
+    input_ids_pad = np.concatenate([input_ids, vision_tokens], axis=-1)
+    # skip_vision_start_token has one flag per row of igthw_all:
+    # 0 for the K conditioning images, 1 for the target and for each reference.
+    position_ids, _ = get_rope_index_fix_point(
+        spatial_merge_size=1,
+        image_token_id=image_token_id,
+        video_token_id=video_token_id,
+        vision_start_token_id=vision_start_token_id,
+        input_ids=input_ids_pad,
+        image_grid_thw=igthw_all,
+        video_grid_thw=None,
+        attention_mask=None,
+        skip_vision_start_token=[0] * K + [1] + [1] * K,
+    )
+    txt_seq_len = input_ids.shape[-1]
+    all_seq_len = position_ids.shape[-1]
+    # Raw token types: 0 = text, 1 = target span, 2 = reference spans, 3 = tms tokens.
+    token_types_raw = np.zeros((1, all_seq_len), dtype=np.int64)
+    bgn = txt_seq_len - TIMESTEP_TOKEN_NUM
+    end = bgn + tgt_image_len + TIMESTEP_TOKEN_NUM
+    token_types_raw[0, bgn:end] = 1                 # tms + tgt span; the tms part is overwritten with 3 below
+    token_types_raw[0, end: end + total_ref_len] = 2  # ref spans
+    token_types_raw[0, txt_seq_len - TIMESTEP_TOKEN_NUM: txt_seq_len] = 3  # tms
+    vinput_mask = np.logical_or(token_types_raw == 1, token_types_raw == 2)
+    vinput_mask_tgt_only = (token_types_raw == 1)   # excludes tms (=3) and refs (=2)
+    # Binary view returned to the caller: every non-text position (tms, tgt, ref) becomes 1.
+    token_types_bin = (token_types_raw > 0).astype(np.int64)
+    # Pixel values from the processor are pre-flattened patches of vision-tower size.
+    # Shape (after np conversion) is [num_vision_patches, vision_patch_dim].
+    pixel_values_np = np.asarray(proc.pixel_values, dtype=np.float32)
+    # Unlike igthw_cond above, this grid is passed through without the merge-size division.
+    image_grid_thw_for_visual = np.asarray(proc.image_grid_thw, dtype=np.int64)
+    return {
+        "input_ids": input_ids,
+        "position_ids": position_ids,
+        "token_types": token_types_bin,
+        "vinput_mask": vinput_mask,
+        "vinput_mask_tgt_only": vinput_mask_tgt_only,
+        "pixel_values": pixel_values_np,
+        "image_grid_thw": image_grid_thw_for_visual,
+        "ref_patches": ref_patches,
+        "tgt_image_len": tgt_image_len,
+    }
+def build_attention_mask(token_types_bin: np.ndarray, dtype_min: float) -> np.ndarray:
+    """text rows causal, gen rows bidirectional. Returns [B, 1, S, S] additive."""
+    B, S = token_types_bin.shape
+    mask = np.full((B, 1, S, S), dtype_min, dtype=np.float32)
+    causal_2d = np.triu(np.full((S, S), dtype_min, dtype=np.float32), k=1)
+    for b in range(B):
+        m = causal_2d.copy()
+        gen = token_types_bin[b].astype(bool)
+        m[gen, :] = 0.0
+        mask[b, 0] = m
+    return mask

scripts/hidream_o1/postprocess.py ADDED Viewed

	@@ -0,0 +1,106 @@

+#!/usr/bin/env python3
+"""Standalone post-process: take an existing HiDream PNG and smooth the
+32-pixel patch grid seams. NO model load — just numpy + PIL.
+Usage:
+  postprocess.py <input.png> <output.png> [--radius N] [--strength F]
+Strategy:
+  For each seam line (x, y multiples of PATCH_SIZE), apply a 1D gaussian
+  blur perpendicular to the seam, blended with the original by --strength.
+  The blur is a plain linear filter, so it is not edge-aware: flat regions
+  are left essentially unchanged, while any real edge that crosses a seam is
+  softened (the gaussian's centre weight and --strength limit how much).
+  --radius   blur radius in pixels (default 3)
+  --strength blend weight 0-1 (default 0.7 = 70% blurred + 30% original)
+"""
+from __future__ import annotations
+import argparse
+import sys
+from pathlib import Path
+import numpy as np
+from PIL import Image
+PATCH_SIZE = 32
+def gaussian_kernel_1d(radius: int) -> np.ndarray:
+    """Build a normalised 1D gaussian kernel with sigma=radius/2."""
+    sigma = radius / 2.0
+    x = np.arange(-radius, radius + 1, dtype=np.float32)
+    k = np.exp(-0.5 * (x / sigma) ** 2)
+    return k / k.sum()
+def smooth_seams(rgb: np.ndarray, radius: int = 3, strength: float = 0.7) -> np.ndarray:
+    """Smooth horizontal+vertical patch seams via local gaussian blur.
+    The blur is applied to the SEAM rows/cols only, then alpha-blended back
+    by `strength`. Non-seam pixels are untouched.
+    """
+    out = rgb.astype(np.float32).copy()
+    H, W, C = rgb.shape
+    kernel = gaussian_kernel_1d(radius)  # length 2*radius+1
+    # --- Horizontal seams (rows at y in {patch, 2*patch, ...}) ---
+    # We smooth the 2 rows on each side of each seam (4 rows total per seam).
+    for y in range(PATCH_SIZE, H, PATCH_SIZE):
+        for offset in (-2, -1, 0, 1):
+            yy = y + offset
+            if not (0 <= yy < H):
+                continue
+            lo = max(0, yy - radius)
+            hi = min(H, yy + radius + 1)
+            k_lo = radius - (yy - lo)
+            k_hi = radius + (hi - yy)
+            k = kernel[k_lo:k_hi]
+            k = k / k.sum()
+            band = out[lo:hi]                       # [n, W, C]
+            blurred = (band * k[:, None, None]).sum(axis=0)
+            out[yy] = (1 - strength) * out[yy] + strength * blurred
+    # --- Vertical seams (cols at x in {patch, 2*patch, ...}) ---
+    for x in range(PATCH_SIZE, W, PATCH_SIZE):
+        for offset in (-2, -1, 0, 1):
+            xx = x + offset
+            if not (0 <= xx < W):
+                continue
+            lo = max(0, xx - radius)
+            hi = min(W, xx + radius + 1)
+            k_lo = radius - (xx - lo)
+            k_hi = radius + (hi - xx)
+            k = kernel[k_lo:k_hi]
+            k = k / k.sum()
+            band = out[:, lo:hi]                    # [H, n, C]
+            blurred = (band * k[None, :, None]).sum(axis=1)
+            out[:, xx] = (1 - strength) * out[:, xx] + strength * blurred
+    return np.clip(out, 0, 255).astype(np.uint8)
+def main():
+    ap = argparse.ArgumentParser()
+    ap.add_argument("input")
+    ap.add_argument("output")
+    ap.add_argument("--radius", type=int, default=3)
+    ap.add_argument("--strength", type=float, default=0.7)
+    args = ap.parse_args()
+    inp = Path(args.input)
+    if not inp.exists():
+        sys.exit(f"input not found: {inp}")
+    rgb = np.array(Image.open(inp).convert("RGB"))
+    H, W = rgb.shape[:2]
+    print(f"{inp.name}: {W}x{H}, {(W // PATCH_SIZE) - 1} vertical + {(H // PATCH_SIZE) - 1} horizontal seams")
+    print(f"smoothing with radius={args.radius}, strength={args.strength}...")
+    out = smooth_seams(rgb, radius=args.radius, strength=args.strength)
+    Image.fromarray(out).save(args.output)
+    print(f"saved -> {args.output}")
+if __name__ == "__main__":
+    main()

scripts/hidream_o1/realism_batch.sh ADDED Viewed

	@@ -0,0 +1,51 @@

+#!/usr/bin/env bash
+# Anti-AI realism batch — film stocks, documentary photographers, natural light,
+# skin texture cues. BF16 weights (no quantization).
+set -euo pipefail
+LAB="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
+PY="$LAB/.venv/bin/python"
+MODEL="$LAB/mlx_models/hidream-o1-dev-bf16"
+OUT="$LAB/sample_outputs/showcase_realism"
+mkdir -p "$OUT"
+run() {
+  local name="$1" w="$2" h="$3" prompt="$4" seed="${5:-42}"
+  echo "=== $name ${w}x${h} (seed=$seed) ==="
+  cd "$LAB" && "$PY" scripts/hidream_o1/generate_hidream_o1_mlx.py \
+    --model-path "$MODEL" \
+    --prompt "$prompt" \
+    --width "$w" --height "$h" \
+    --output "$OUT/$name.png" \
+    --seed "$seed" 2>&1 | grep -E "loaded|using|generation:|saved" | tail -3
+  echo ""
+}
+# Each call below is: run <name> <width> <height> <prompt> <seed>.
+# The name becomes the PNG file name under $OUT; the seed is pinned per image.
+# Verticals (1440x2560)
+run "01_barista_morning" 1440 2560 \
+  "candid documentary photograph of a tired thirties barista pulling an espresso shot at 6am, natural skin with visible pores and faint under-eye shadows, slight grease on her apron, hair half-loose from her bun, warm overhead pendant lamp lighting only, shot on Kodak Portra 400 film with visible grain, William Eggleston color palette, no retouching, real and lived-in" \
+  111
+run "02_workshop_oldman" 1440 2560 \
+  "documentary portrait of a weathered seventy year old man in a cluttered bike workshop, fixing a vintage racing bicycle, deeply lined hands streaked with grease, faded blue work shirt, reading glasses low on his nose, a half-drunk mug of coffee on the bench, natural overcast daylight from the open garage door, shot on Kodak Vision3 250D 35mm cinema film, Mary Ellen Mark aesthetic" \
+  222
+run "03_kitchen_morning" 1440 2560 \
+  "candid morning photo of a woman in her late thirties at a wooden kitchen table holding a chipped ceramic coffee mug with both hands, no makeup, hair messy from sleep, freckles and faint laugh lines visible, wearing an oversized grey sweater, soft diffuse light from a north-facing window beside her, slight steam rising from the mug, half-eaten toast on a plate, lived-in apartment in soft focus, Saul Leiter colour mood, Cinestill 800T film grain" \
+  333
+# Wides (3104x1312)
+run "04_bar_friends" 3104 1312 \
+  "ultrawide naturalistic photo of two male friends in their forties slumped in a worn leather booth at a dim Brooklyn dive bar around 1am, half-finished beers and a bowl of stale peanuts on the table between them, one mid-laugh wearing a faded Carhartt jacket, the other listening with a tired smile in a wrinkled flannel, neither looking at the camera, single tungsten bulb above the booth as the only light source, shot on Cinestill 800T film with halation around the bulb, Wim Wenders mood, deep shadows, no airbrushing" \
+  444
+run "05_construction_lunch" 3104 1312 \
+  "ultrawide documentary photo of three construction workers sitting on an unfinished concrete floor of a high-rise during their lunch break, sunburnt necks, dust on their boots and arms, eating from foil-wrapped sandwiches and thermos cups, the city skyline visible through the open building structure behind them, harsh midday sun casting hard shadows, Sebastião Salgado documentary aesthetic, shot on a Leica with Kodak Tri-X black and white film, raw and dignified" \
+  555
+run "06_painter_studio" 3104 1312 \
+  "ultrawide editorial photo of a female painter in her fifties standing in her cluttered Brooklyn warehouse studio, paint smeared on her overalls and forearms, holding a long brush in her right hand and a rag in her left, looking off-frame in thought, a half-finished large abstract canvas leaning behind her, north-facing factory windows providing cool diffuse light, wooden floor stained with decades of dropped pigment, Annie Leibovitz Vanity Fair aesthetic, shot on Hasselblad medium format with natural skin tone retention" \
+  666
+echo "=== batch complete ==="
+# List what was written to the output directory.
+ls -la "$OUT"

scripts/hidream_o1/showcase_batch.sh ADDED Viewed

	@@ -0,0 +1,39 @@

+#!/usr/bin/env bash
+# Showcase battery: diverse prompts to characterise HiDream-O1-Image-Dev at a chosen quantization (default q6; pass q8 as $1 for Q8).
+# Sequential. Each generates a single 1024x1024 PNG.
+set -euo pipefail
+# Repo root: this script sits two directory levels below it.
+LAB="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
+PY="$LAB/.venv/bin/python"
+# Quant comes from $1 (default q6 — sweet spot). Pass q8 to use Q8 for safety-margin runs.
+# It selects both the model directory and the output directory.
+QUANT="${1:-q6}"
+MODEL="$LAB/mlx_models/hidream-o1-dev-${QUANT}"
+OUT="$LAB/sample_outputs/showcase_${QUANT}"
+# Fail fast with a readable message instead of failing inside the first
+# generation call.
+if [[ ! -x "$PY" ]]; then
+  echo "error: python interpreter not found or not executable: $PY" >&2
+  exit 1
+fi
+mkdir -p "$OUT"
+echo "showcase quant=${QUANT}, model=${MODEL}, out=${OUT}"
+run() {
+  local name="$1" prompt="$2" seed="${3:-42}"
+  echo "=== $name (seed=$seed) ==="
+  cd "$LAB" && /usr/bin/time -l "$PY" scripts/hidream_o1/generate_hidream_o1_mlx.py \
+    --model-path "$MODEL" \
+    --prompt "$prompt" \
+    --width 1024 --height 1024 \
+    --output "$OUT/$name.png" \
+    --seed "$seed" 2>&1 | grep -E "loaded|using|generation:|saved|maximum resident" | tail -6
+  echo ""
+}
+run "01_portrait_photo"      "studio photo of an elderly Japanese tea master with a wise smile, holding a ceramic teacup, gentle natural light, shallow depth of field, sharp focus on eyes, 85mm lens" 8
+run "02_anime"               "anime girl with pink hair sitting on the rooftop of a Tokyo skyscraper at dusk, neon city lights below, cherry blossom petals floating, soft watercolor style" 19
+run "03_macro_nature"        "extreme macro photo of a single dewdrop on a spiderweb at dawn, tiny rainbow refractions, blurred leaf background, ultra sharp focus" 31
+run "04_architecture"        "interior of a futuristic library, towering bookshelves, holographic displays, warm golden light streaming through stained glass windows, wide angle" 5
+run "05_surreal"             "a giant blue whale floating in the clouds above a vast desert landscape, magical realism, oil painting style, golden hour" 27
+run "06_food_flatlay"        "overhead flat lay of a rustic italian breakfast, golden croissants, espresso cup, fresh berries, marble surface, soft morning light, food photography" 53
+run "07_action_cinematic"    "samurai warrior mid leap with katana drawn, cherry blossoms swirling around him, mountain backdrop at sunset, dynamic action, cinematic film still" 71
+run "08_fantasy_creature"    "majestic dragon perched on a crystal mountain peak, iridescent scales reflecting aurora borealis, snow swirling around, dramatic dramatic lighting, fantasy art" 88
+run "09_wildlife"            "close-up portrait of a snow leopard staring directly at the camera, falling snow flakes, mountain background, national geographic style, ultra sharp" 17
+run "10_text_render"         "vintage diner neon sign reading BLOOM CAFE in glowing pink letters at night, retro americana 1950s style, rainy street reflection, cinematic" 64
+echo "=== showcase complete ==="
+ls -la "$OUT"