Jonna Marie Matthiesen Claude Opus 4.6 (1M context) committed on
Commit ·
e824fce
1
Parent(s): c616588
Migrate to flash-head package: remove custom model files and auto_map
Browse files
Remove bundled modeling/configuration Python files and auto_map from
config.json, and restore base model_type. The flash-head pip package
now handles architecture registration and FlashHead activation via
its vLLM plugin.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
- config.json +2 -6
- configuration_flash_head_qwen3_vl.py +0 -18
- modeling_flash_head_qwen3_vl.py +0 -18
config.json
CHANGED
|
@@ -5,7 +5,7 @@
|
|
| 5 |
"dtype": "bfloat16",
|
| 6 |
"head_dim": 128,
|
| 7 |
"image_token_id": 151655,
|
| 8 |
-
"model_type": "
|
| 9 |
"num_attention_heads": 32,
|
| 10 |
"num_key_value_heads": 8,
|
| 11 |
"quantization_config": {
|
|
@@ -215,9 +215,5 @@
|
|
| 215 |
"vision_end_token_id": 151653,
|
| 216 |
"vision_start_token_id": 151652,
|
| 217 |
"flash_head_cache_dir": "flash_head_assets",
|
| 218 |
-
"flash_head_special_token_ids": []
|
| 219 |
-
"auto_map": {
|
| 220 |
-
"AutoModelForCausalLM": "modeling_flash_head_qwen3_vl.FlashHeadQwen3VLForConditionalGeneration",
|
| 221 |
-
"AutoConfig": "configuration_flash_head_qwen3_vl.FlashHeadQwen3VLConfig"
|
| 222 |
-
}
|
| 223 |
}
|
|
|
|
| 5 |
"dtype": "bfloat16",
|
| 6 |
"head_dim": 128,
|
| 7 |
"image_token_id": 151655,
|
| 8 |
+
"model_type": "qwen3_vl",
|
| 9 |
"num_attention_heads": 32,
|
| 10 |
"num_key_value_heads": 8,
|
| 11 |
"quantization_config": {
|
|
|
|
| 215 |
"vision_end_token_id": 151653,
|
| 216 |
"vision_start_token_id": 151652,
|
| 217 |
"flash_head_cache_dir": "flash_head_assets",
|
| 218 |
+
"flash_head_special_token_ids": []
|
|
|
|
|
|
|
|
|
|
|
|
|
| 219 |
}
|
configuration_flash_head_qwen3_vl.py
DELETED
|
@@ -1,18 +0,0 @@
|
|
| 1 |
-
try:
|
| 2 |
-
from vllm.model_executor.models.qwen3_vl import _SUPPORTS_EMBEDL_FLASHHEAD
|
| 3 |
-
except ImportError:
|
| 4 |
-
raise ImportError(
|
| 5 |
-
"\n\n"
|
| 6 |
-
"===============================================================\n"
|
| 7 |
-
" FlashHead requires the Embedl Docker container to run.\n"
|
| 8 |
-
"\n"
|
| 9 |
-
" Currently, FlashHead inference is supported only via vLLM.\n"
|
| 10 |
-
" Hugging Face Transformers support will be added in a future release.\n"
|
| 11 |
-
" Please use one of the following images:\n"
|
| 12 |
-
"\n"
|
| 13 |
-
" Jetson Orin: embedl/vllm:latest-jetson-orin-flashhead\n"
|
| 14 |
-
" Jetson Thor: embedl/vllm:latest-jetson-thor-flashhead\n"
|
| 15 |
-
"\n"
|
| 16 |
-
" Do NOT install embedl-models via pip directly.\n"
|
| 17 |
-
"===============================================================\n"
|
| 18 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
modeling_flash_head_qwen3_vl.py
DELETED
|
@@ -1,18 +0,0 @@
|
|
| 1 |
-
try:
|
| 2 |
-
from vllm.model_executor.models.qwen3_vl import _SUPPORTS_EMBEDL_FLASHHEAD
|
| 3 |
-
except ImportError:
|
| 4 |
-
raise ImportError(
|
| 5 |
-
"\n\n"
|
| 6 |
-
"===============================================================\n"
|
| 7 |
-
" FlashHead requires the Embedl Docker container to run.\n"
|
| 8 |
-
"\n"
|
| 9 |
-
" Currently, FlashHead inference is supported only via vLLM.\n"
|
| 10 |
-
" Hugging Face Transformers support will be added in a future release.\n"
|
| 11 |
-
" Please use one of the following images:\n"
|
| 12 |
-
"\n"
|
| 13 |
-
" Jetson Orin: embedl/vllm:latest-jetson-orin-flashhead\n"
|
| 14 |
-
" Jetson Thor: embedl/vllm:latest-jetson-thor-flashhead\n"
|
| 15 |
-
"\n"
|
| 16 |
-
" Do NOT install embedl-models via pip directly.\n"
|
| 17 |
-
"===============================================================\n"
|
| 18 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|