Jonna Marie Matthiesen Claude Opus 4.6 (1M context) committed on
Commit ·
e824fce
1
Parent(s): c616588
Migrate to flash-head package: remove custom model files and auto_map
Browse files
Remove bundled modeling/configuration Python files and auto_map from
config.json, and restore base model_type. The flash-head pip package
now handles architecture registration and FlashHead activation via
its vLLM plugin.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
- config.json +2 -6
- configuration_flash_head_qwen3_vl.py +0 -18
- modeling_flash_head_qwen3_vl.py +0 -18
config.json
CHANGED
|
@@ -5,7 +5,7 @@
|
|
| 5 |
"dtype": "bfloat16",
|
| 6 |
"head_dim": 128,
|
| 7 |
"image_token_id": 151655,
|
| 8 |
-
"model_type": "
|
| 9 |
"num_attention_heads": 32,
|
| 10 |
"num_key_value_heads": 8,
|
| 11 |
"quantization_config": {
|
|
@@ -215,9 +215,5 @@
|
|
| 215 |
"vision_end_token_id": 151653,
|
| 216 |
"vision_start_token_id": 151652,
|
| 217 |
"flash_head_cache_dir": "flash_head_assets",
|
| 218 |
-
"flash_head_special_token_ids": []
|
| 219 |
-
"auto_map": {
|
| 220 |
-
"AutoModelForCausalLM": "modeling_flash_head_qwen3_vl.FlashHeadQwen3VLForConditionalGeneration",
|
| 221 |
-
"AutoConfig": "configuration_flash_head_qwen3_vl.FlashHeadQwen3VLConfig"
|
| 222 |
-
}
|
| 223 |
}
|
|
|
|
| 5 |
"dtype": "bfloat16",
|
| 6 |
"head_dim": 128,
|
| 7 |
"image_token_id": 151655,
|
| 8 |
+
"model_type": "qwen3_vl",
|
| 9 |
"num_attention_heads": 32,
|
| 10 |
"num_key_value_heads": 8,
|
| 11 |
"quantization_config": {
|
|
|
|
| 215 |
"vision_end_token_id": 151653,
|
| 216 |
"vision_start_token_id": 151652,
|
| 217 |
"flash_head_cache_dir": "flash_head_assets",
|
| 218 |
+
"flash_head_special_token_ids": []
|
|
|
|
|
|
|
|
|
|
|
|
|
| 219 |
}
|
configuration_flash_head_qwen3_vl.py
DELETED
|
@@ -1,18 +0,0 @@
|
|
| 1 |
-
try:
|
| 2 |
-
from vllm.model_executor.models.qwen3_vl import _SUPPORTS_EMBEDL_FLASHHEAD
|
| 3 |
-
except ImportError:
|
| 4 |
-
raise ImportError(
|
| 5 |
-
"\n\n"
|
| 6 |
-
"===============================================================\n"
|
| 7 |
-
" FlashHead requires the Embedl Docker container to run.\n"
|
| 8 |
-
"\n"
|
| 9 |
-
" Currently, FlashHead inference is supported only via vLLM.\n"
|
| 10 |
-
" Hugging Face Transformers support will be added in a future release.\n"
|
| 11 |
-
" Please use one of the following images:\n"
|
| 12 |
-
"\n"
|
| 13 |
-
" Jetson Orin: embedl/vllm:latest-jetson-orin-flashhead\n"
|
| 14 |
-
" Jetson Thor: embedl/vllm:latest-jetson-thor-flashhead\n"
|
| 15 |
-
"\n"
|
| 16 |
-
" Do NOT install embedl-models via pip directly.\n"
|
| 17 |
-
"===============================================================\n"
|
| 18 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
modeling_flash_head_qwen3_vl.py
DELETED
|
@@ -1,18 +0,0 @@
|
|
| 1 |
-
try:
|
| 2 |
-
from vllm.model_executor.models.qwen3_vl import _SUPPORTS_EMBEDL_FLASHHEAD
|
| 3 |
-
except ImportError:
|
| 4 |
-
raise ImportError(
|
| 5 |
-
"\n\n"
|
| 6 |
-
"===============================================================\n"
|
| 7 |
-
" FlashHead requires the Embedl Docker container to run.\n"
|
| 8 |
-
"\n"
|
| 9 |
-
" Currently, FlashHead inference is supported only via vLLM.\n"
|
| 10 |
-
" Hugging Face Transformers support will be added in a future release.\n"
|
| 11 |
-
" Please use one of the following images:\n"
|
| 12 |
-
"\n"
|
| 13 |
-
" Jetson Orin: embedl/vllm:latest-jetson-orin-flashhead\n"
|
| 14 |
-
" Jetson Thor: embedl/vllm:latest-jetson-thor-flashhead\n"
|
| 15 |
-
"\n"
|
| 16 |
-
" Do NOT install embedl-models via pip directly.\n"
|
| 17 |
-
"===============================================================\n"
|
| 18 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|