Step3-VL-10B is now supported in llama-cpp-python. This project provides a test GGUF file.

llama-cpp-python: https://github.com/JamePeng/llama-cpp-python

Code example:

from llama_cpp import Llama
from llama_cpp.llama_chat_format import Step3VLChatHandler
import base64
import os

# Paths to the quantized model and its multimodal projector file
MODEL_PATH = r"path/to/Step3-VL-10B-Q8_0.gguf"
MMPROJ_PATH = r"path/to/mmproj-Step3-VL-10b-F16.gguf"

# Build the vision chat handler first, then hand it to the model.
_vision_handler = Step3VLChatHandler(
    clip_model_path=MMPROJ_PATH,
    enable_thinking=True,  # True forces chain-of-thought output
    verbose=True,
)

# Load the model with vision support enabled.
llm = Llama(
    model_path=MODEL_PATH,
    chat_handler=_vision_handler,
    n_gpu_layers=-1,  # offload all available layers to the GPU
    n_ctx=8192,       # context window size
    verbose=True,
)

# Comprehensive MIME type mapping (updated as of 2025)
# Based on Pillow 10.x+ "Fully Supported" (Read & Write) formats
# Reference: IANA official media types + common real-world usage
# See: https://pillow.readthedocs.io/en/stable/handbook/image-file-formats.html
_IMAGE_MIME_TYPES = {
    # Most common formats
    '.png':  'image/png',
    '.jpg':  'image/jpeg',
    '.jpeg': 'image/jpeg',
    '.gif':  'image/gif',
    '.webp': 'image/webp',

    # Next-generation formats
    '.avif': 'image/avif',
    '.jp2':  'image/jp2',
    '.j2k':  'image/jp2',
    '.jpx':  'image/jp2',

    # Legacy / Windows formats
    '.bmp':  'image/bmp',
    '.ico':  'image/x-icon',
    '.pcx':  'image/x-pcx',
    '.tga':  'image/x-tga',
    '.icns': 'image/icns',

    # Professional / Scientific imaging
    '.tif':  'image/tiff',
    '.tiff': 'image/tiff',
    '.eps':  'application/postscript',
    '.dds':  'image/vnd-ms.dds',
    '.dib':  'image/dib',
    '.sgi':  'image/sgi',

    # Portable Map formats (PPM/PGM/PBM)
    '.pbm':  'image/x-portable-bitmap',
    '.pgm':  'image/x-portable-graymap',
    '.ppm':  'image/x-portable-pixmap',

    # Miscellaneous / Older formats
    '.xbm':  'image/x-xbitmap',
    '.mpo':  'image/mpo',
    '.msp':  'image/msp',
    '.im':   'image/x-pillow-im',
    '.qoi':  'image/qoi',
}

def image_to_base64_data_uri(
    file_path: str,
    *,
    fallback_mime: str = "application/octet-stream"
) -> str:
    """
    Convert a local image file to a base64-encoded data URI with the correct MIME type.

    The MIME type is resolved from the file extension via the module-level
    _IMAGE_MIME_TYPES table (PNG, JPEG, WebP, AVIF, BMP, ICO, TIFF, and the
    other formats listed there); unknown extensions fall back to
    *fallback_mime* with a printed warning.

    Args:
        file_path: Path to the image file on disk.
        fallback_mime: MIME type used when the file extension is unknown.

    Returns:
        A valid data URI string (e.g., data:image/webp;base64,...).

    Raises:
        FileNotFoundError: If the file does not exist.
        OSError: If reading the file fails.
    """
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"Image file not found: {file_path}")

    extension = os.path.splitext(file_path)[1].lower()

    # Test membership directly: comparing the looked-up value against
    # fallback_mime would also warn when a *known* extension happens to map
    # to the same string the caller chose as the fallback.
    if extension in _IMAGE_MIME_TYPES:
        mime_type = _IMAGE_MIME_TYPES[extension]
    else:
        mime_type = fallback_mime
        print(f"Warning: Unknown extension '{extension}' for '{file_path}'. "
              f"Using fallback MIME type: {fallback_mime}")

    try:
        with open(file_path, "rb") as img_file:
            encoded_data = base64.b64encode(img_file.read()).decode("utf-8")
    except OSError as e:
        raise OSError(f"Failed to read image file '{file_path}': {e}") from e

    return f"data:{mime_type};base64,{encoded_data}"


# ========================
# Main image processing & inference section
# ========================

# Images to analyze (mixed formats are supported).
image_paths = [
    r"6.jpeg",
]

# Assemble the multimodal user message: one entry per image, with the text
# instruction appended after all images.
user_content = [
    {"type": "image_url", "image_url": {"url": image_to_base64_data_uri(path)}}
    for path in image_paths
]
user_content.append({
    "type": "text",
    "text": "Please describe this image."  # You can change the prompt as needed
})

# Run the vision chat completion.
response = llm.create_chat_completion(
    messages=[
        # {"role": "system", "content": "You are a highly accurate vision-language assistant. Provide detailed, precise, and well-structured image descriptions."},
        {"role": "user", "content": user_content}
    ],
    temperature=1.0,
    top_p=0.95,
    top_k=64,
    max_tokens=8192,
)

# Print the model's reply.
print(response["choices"][0]["message"]["content"])
Downloads last month
533
GGUF
Model size
8B params
Architecture
qwen3
Hardware compatibility
Log In to add your hardware

3-bit

4-bit

6-bit

8-bit

16-bit

Inference Providers NEW
This model isn't deployed by any Inference Provider. 🙋 Ask for provider support

Model tree for JamePeng2023/Step3-VL-10B-GGUF

Quantized
(9)
this model