Add MIGRATION_GUIDE.md
MIGRATION_GUIDE.md (ADDED, +292 -0)

# Migration Guide: StreamingVLM → Qwen3-VL on ROCm

## Overview of Changes

This document details every change made to port StreamingVLM from its original
implementation (Qwen2.5-VL-7B + flash-attn + CUDA) to (Qwen3-VL-4B + SDPA + ROCm).

---

## 1. Dependency Changes

### Original (`infer_requirements.txt`)
```
transformers==4.52.4
flash_attn==2.8.0.post2
torch==2.7.1
liger_kernel==0.6.1
qwen-vl-utils==0.0.11
```

### Updated (`requirements.txt`)
```
transformers>=4.57.0 (install from source: pip install git+https://github.com/huggingface/transformers)
torch>=2.4.0 (ROCm or CUDA build)
qwen-vl-utils>=0.0.14
# NO flash-attn required!
```

**Why**: Qwen3-VL requires transformers 4.57.0+ (unreleased, must install from source).
Flash-attn is eliminated entirely and replaced with `torch.nn.functional.scaled_dot_product_attention`.

---

## 2. Model Class Changes

| Original | Updated |
|----------|---------|
| `Qwen2_5_VLForConditionalGeneration` | `Qwen3VLForConditionalGeneration` |
| `from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import ...` | No private imports needed |
| `attn_implementation="flash_attention_2"` | `attn_implementation="sdpa"` |

### Loading Code

```python
# BEFORE (original)
from transformers import Qwen2_5_VLForConditionalGeneration
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_path, torch_dtype="auto", device_map="cuda",
    attn_implementation="flash_attention_2"
)

# AFTER (this port)
import torch
from transformers import Qwen3VLForConditionalGeneration
model = Qwen3VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen3-VL-4B-Instruct",
    torch_dtype=torch.bfloat16,
    device_map="auto",
    attn_implementation="sdpa"  # Works on ROCm + CUDA
)
```

---

## 3. Attention Mechanism Replacement

### Language Model Attention

**Original**: Used `_flash_attention_forward()` (private transformers API, CUDA-only)
```python
from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import _flash_attention_forward
attn_output = _flash_attention_forward(query, key, value, attention_mask, ...)
```

**Updated**: Uses `F.scaled_dot_product_attention()` (PyTorch native, ROCm + CUDA)
```python
import torch.nn.functional as F
attn_output = F.scaled_dot_product_attention(
    query_states, key_states, value_states,
    attn_mask=attention_mask,
    dropout_p=0.0,
    is_causal=(attention_mask is None and q_len > 1),
)
```

### Vision Encoder Attention

**Original**: Used `flash_attn_varlen_func` for packed variable-length sequences
```python
from flash_attn import flash_attn_varlen_func
attn_output = flash_attn_varlen_func(q, k, v, cu_seqlens, cu_seqlens, max_seqlen, max_seqlen)
```

**Updated**: Chunked SDPA that splits at sequence boundaries and applies SDPA per chunk
```python
def _sdpa_varlen(query, key, value, cu_seqlens, max_seqlen):
    # Each [start, end) slice of the packed sequence is attended to independently.
    outputs = []
    for i in range(cu_seqlens.shape[0] - 1):
        start, end = cu_seqlens[i].item(), cu_seqlens[i + 1].item()
        q_i = query[start:end].unsqueeze(0).transpose(1, 2)
        k_i = key[start:end].unsqueeze(0).transpose(1, 2)
        v_i = value[start:end].unsqueeze(0).transpose(1, 2)
        out = F.scaled_dot_product_attention(q_i, k_i, v_i, is_causal=False)
        outputs.append(out.transpose(1, 2).squeeze(0))
    return torch.cat(outputs, dim=0)
```

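For orientation, `cu_seqlens` is the usual flash-attn-style prefix sum of per-chunk lengths. A small usage sketch follows; the shapes are hypothetical and it assumes `_sdpa_varlen` from the block above is in scope:

```python
import torch
import torch.nn.functional as F

# Hypothetical packing: three images of 256, 1024, and 400 patches; 16 heads, head_dim 64.
seq_lens = torch.tensor([256, 1024, 400])
cu_seqlens = F.pad(torch.cumsum(seq_lens, dim=0), (1, 0)).to(torch.int32)  # [0, 256, 1280, 1680]

total = int(seq_lens.sum())
q = torch.randn(total, 16, 64)
k = torch.randn(total, 16, 64)
v = torch.randn(total, 16, 64)

out = _sdpa_varlen(q, k, v, cu_seqlens, max_seqlen=int(seq_lens.max()))
print(out.shape)  # torch.Size([1680, 16, 64]) -- same packed layout as the input
```

Note that the chunked version keeps `max_seqlen` only for signature parity with `flash_attn_varlen_func`; it is not used internally.
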
---

## 4. Architecture Differences (Qwen2.5-VL → Qwen3-VL)

| Feature | Qwen2.5-VL-7B | Qwen3-VL-4B |
|---------|---------------|-------------|
| ViT patch size | 14×14 | **16×16** |
| ViT depth | 32 layers | **24 layers** |
| ViT hidden | 1280 | **1024** |
| LM hidden | 3584 | **2560** |
| LM layers | 28 | **36** |
| KV heads | 4 | **8** |
| Max context | 128K | **256K** |
| RoPE theta | 1M | **5M** |
| MRoPE | standard | **interleaved** |
| QK-Norm | ❌ | **✅** |
| DeepStack | ❌ | **✅ (layers 5, 11, 17)** |
| ViT window attn | Yes | **No (full attn)** |

### Impact on StreamingVLM:
- **QK-Norm**: Added `self.q_norm(query)` and `self.k_norm(key)` before RoPE in attention (see the sketch after this list)
- **DeepStack**: Vision encoder extracts features from intermediate layers for a richer representation
- **Interleaved MRoPE**: `mrope_section=[24,20,20]` (temporal/height/width) with interleaved application
- **No ViT window attention**: Simpler vision encoder; no `fullatt_block_indexes` logic needed

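To make the QK-Norm bullet concrete, here is a minimal ordering sketch. The head counts are hypothetical and `apply_rotary_pos_emb` stands in for whatever RoPE helper the attention forward uses:

```python
import torch
from torch import nn

head_dim = 128
q_norm = nn.RMSNorm(head_dim)  # per-head RMSNorm modules that Qwen3-VL attention layers carry
k_norm = nn.RMSNorm(head_dim)

# Query/key states shaped (batch, heads, seq, head_dim); 32 query heads vs. 8 KV heads is illustrative.
q = torch.randn(1, 32, 8, head_dim)
k = torch.randn(1, 8, 8, head_dim)

# QK-Norm is applied BEFORE the rotary embedding:
q, k = q_norm(q), k_norm(k)
# q, k = apply_rotary_pos_emb(q, k, cos, sin)   # then RoPE, then SDPA as in Section 3
```
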
---

## 5. Monkey-Patch Target Changes

### Original (10 patches on Qwen2.5-VL):
```python
model.generate = streaming_generate
model.prepare_inputs_for_generation = ...
model._sample = ...
model.forward = qwen2_5_vl_forward
model.model.forward = model_forward
model.model.language_model.forward = streaming_language_model_forward
model.model.language_model._update_causal_mask = ...
for layer in model.model.language_model.layers:  # "language_model" submodule
    layer.forward = streaming_text_decoder_layer_forward
    layer.self_attn.forward = streaming_text_flash_attn_forward
model.model.visual.forward = streaming_visual_encoder_forward
```

### Updated (simplified patches on Qwen3-VL):
```python
model.forward = streaming_qwen3_vl_forward
model.model.forward = streaming_model_forward
for layer in model.model.layers:  # directly on model.model (no "language_model"!)
    layer.forward = streaming_text_decoder_layer_forward
    layer.self_attn.forward = streaming_text_sdpa_forward
model.model.visual.forward = streaming_visual_encoder_forward
for blk in model.model.visual.blocks:
    blk.forward = streaming_visual_block_forward
    blk.attn.forward = streaming_visual_attention_forward
model.model.get_rope_index = get_rope_index_streaming
```

**Key structural difference**: In Qwen2.5-VL, text layers are at `model.model.language_model.layers`.
In Qwen3-VL, they are at `model.model.layers` (no intermediate `language_model` wrapper).

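A tiny helper (hypothetical, not part of the repo) captures that difference and keeps patching code readable against either layout:

```python
def get_text_layers(model):
    """Return the text decoder layers for either architecture (illustrative only)."""
    inner = model.model
    # Qwen2.5-VL nests the decoder under an intermediate `language_model` module;
    # Qwen3-VL exposes the layers directly on model.model.
    if hasattr(inner, "language_model"):
        return inner.language_model.layers
    return inner.layers
```
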
---

## 6. KV Cache API Changes

### transformers 4.50-4.52 (original):
```python
cache.key_cache[layer_idx]    # List[Tensor]
cache.value_cache[layer_idx]
```

### transformers 4.57.0+ (updated, the version this port targets):
```python
cache.layers[layer_idx].keys     # DynamicLayer objects
cache.layers[layer_idx].values
```

Our `StreamingCache` handles both via `_get_layer_keys()`/`_set_layer_keys()` compat methods.

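As a rough sketch of what such compat accessors can look like (the actual `StreamingCache` method signatures may differ):

```python
def _get_layer_keys(cache, layer_idx):
    """Read keys from either DynamicCache layout (illustrative only)."""
    if hasattr(cache, "layers"):           # newer transformers: list of DynamicLayer objects
        return cache.layers[layer_idx].keys
    return cache.key_cache[layer_idx]      # older transformers: parallel key/value lists

def _set_layer_keys(cache, layer_idx, keys):
    """Write keys back to whichever layout the cache uses (illustrative only)."""
    if hasattr(cache, "layers"):
        cache.layers[layer_idx].keys = keys
    else:
        cache.key_cache[layer_idx] = keys
```
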
---

## 7. Token IDs (Verified for Qwen3-VL)

| Token | ID | Same as 2.5? |
|-------|-----|--------------|
| `<\|im_start\|>` | 151644 | ✅ |
| `<\|im_end\|>` | 151645 | ✅ |
| `<\|vision_start\|>` | 151652 | ✅ |
| `<\|vision_end\|>` | 151653 | ✅ |
| `<\|image_pad\|>` | 151655 | ✅ |
| `<\|video_pad\|>` | 151656 | ✅ |

Token IDs are identical, so no changes are needed for token-level logic.

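The table can be re-verified against any tokenizer revision with a quick check:

```python
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("Qwen/Qwen3-VL-4B-Instruct")
for tok in ["<|im_start|>", "<|im_end|>", "<|vision_start|>",
            "<|vision_end|>", "<|image_pad|>", "<|video_pad|>"]:
    print(tok, processor.tokenizer.convert_tokens_to_ids(tok))
```
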
---

## 8. ROCm-Specific Configuration

### Performance tiers (recommended):

| Priority | Implementation | Install | Performance | Platform |
|----------|----------------|---------|-------------|----------|
| 1 | `flash_attention_2` + CK backend | Build flash-attn from source | Best | ROCm ≥ 5.7 |
| 2 | `sdpa` (default) | None (PyTorch built-in) | Good | Any |
| 3 | `eager` | None | Slowest | Any |

### For ROCm flash-attn (optional, for maximum throughput):
```bash
git clone https://github.com/Dao-AILab/flash-attention.git
cd flash-attention
pip install .
# Then use: attn_implementation="flash_attention_2"
```

### Environment variables:
```bash
export PYTORCH_ROCM_ARCH="gfx942"    # MI300X
# export PYTORCH_ROCM_ARCH="gfx90a"  # MI250X
# export PYTORCH_ROCM_ARCH="gfx1100" # RX 7900 XTX

# Flash-attn backend selection:
export FLASH_ATTENTION_TRITON_AMD_ENABLE="FALSE"  # CK (default, recommended)
# export FLASH_ATTENTION_TRITON_AMD_ENABLE="TRUE" # Triton (alternative)
```

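A quick sanity check that the ROCm build and the SDPA path are actually in use (assumes a GPU is visible; on ROCm builds the HIP device is exposed through the CUDA API surface):

```python
import torch
import torch.nn.functional as F

print("HIP runtime:", torch.version.hip)          # a version string on ROCm builds, None on CUDA builds
print("GPU visible:", torch.cuda.is_available())  # True on ROCm as well

# One-off SDPA smoke test on the device
q = torch.randn(1, 8, 128, 64, device="cuda", dtype=torch.bfloat16)
out = F.scaled_dot_product_attention(q, q, q)
print("SDPA output shape:", tuple(out.shape))
```
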
---

## 9. Eliminated Private API Dependencies

The original code imported many private symbols from transformers internals.
These are **all eliminated** in this port:

```python
# REMOVED: these broke between transformers versions
from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import (
    _flash_attention_forward,        # Private, CUDA-only
    flash_attn_varlen_func,          # Private re-export of flash_attn
    rotate_half,                     # Reimplemented locally
    repeat_kv,                       # Reimplemented locally
    apply_rotary_pos_emb_flashatt,   # Not needed with SDPA
    StaticCache,                     # Using DynamicCache directly
    SlidingWindowCache,              # Not used
    AttentionMaskConverter,          # Not used
    make_flex_block_causal_mask,     # Not used
    BlockMask,                       # Not used
)

# KEPT: only stable public APIs
from transformers import (
    Qwen3VLForConditionalGeneration,  # Public model class
    AutoProcessor,                    # Public processor
    DynamicCache,                     # Public cache class
)
from transformers.modeling_outputs import (
    BaseModelOutputWithPast,   # Public output type
    CausalLMOutputWithPast,    # Public output type
)
```

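For reference, "reimplemented locally" for `rotate_half` and `repeat_kv` means standalone copies of the standard Llama/Qwen definitions; the sketch below shows that shape, not a byte-for-byte copy of the port's code:

```python
import torch

def rotate_half(x: torch.Tensor) -> torch.Tensor:
    """Swap and negate the halves of the last dim: (x1, x2) -> (-x2, x1), as RoPE expects."""
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)

def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """Expand (batch, kv_heads, seq, head_dim) to (batch, kv_heads * n_rep, seq, head_dim) for GQA."""
    if n_rep == 1:
        return hidden_states
    batch, kv_heads, seq_len, head_dim = hidden_states.shape
    expanded = hidden_states[:, :, None, :, :].expand(batch, kv_heads, n_rep, seq_len, head_dim)
    return expanded.reshape(batch, kv_heads * n_rep, seq_len, head_dim)
```
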
---

## 10. Running the Updated Code

### Inference (streaming commentary):
```bash
python -m streaming_vlm.inference.inference \
    --model_path Qwen/Qwen3-VL-4B-Instruct \
    --video_path match.mp4 \
    --output_path commentary.vtt \
    --fps 2 \
    --attn_implementation sdpa
```

### Training (Stage 1):
```bash
# Download data
# huggingface-cli download mit-han-lab/Inf-Stream-Train --local-dir ./data

bash scripts/sft_stage_1.sh
```

### Tests:
```bash
python test_imports.py
# Expected: 6/6 tests pass with no CUDA/flash-attn dependency
```