Step3-VL-10B is now supported in llama-cpp-python (JamePeng fork). This repository provides a test GGUF file.
llama-cpp-python: https://github.com/JamePeng/llama-cpp-python
Code example:
# Dependencies: llama-cpp-python (JamePeng fork, which adds Step3-VL support).
from llama_cpp import Llama
from llama_cpp.llama_chat_format import Step3VLChatHandler
import base64
import os

# Model and multimodal projection paths
# NOTE(review): placeholders — point these at the downloaded GGUF files.
# Raw strings keep Windows backslashes intact if absolute paths are used.
MODEL_PATH = r"path/to/Step3-VL-10B-Q8_0.gguf"
MMPROJ_PATH = r"path/to/mmproj-Step3-VL-10b-F16.gguf"

# Initialize the Llama model with vision support.
# The chat handler wires the mmproj (CLIP projector) weights into chat
# completions so that "image_url" content parts are understood.
llm = Llama(
    model_path=MODEL_PATH,
    chat_handler=Step3VLChatHandler(
        clip_model_path=MMPROJ_PATH,
        enable_thinking=True,  # Set to True if you want forced chain-of-thought output
        verbose=True,
    ),
    n_gpu_layers=-1,  # Use all available GPU layers
    n_ctx=8192,  # Context window size
    verbose=True,
)
# Comprehensive MIME type mapping (updated as of 2025)
# Based on Pillow 10.x+ "Fully Supported" (Read & Write) formats
# Reference: IANA official media types + common real-world usage
# See: https://pillow.readthedocs.io/en/stable/handbook/image-file-formats.html
_IMAGE_MIME_TYPES = {
# Most common formats
'.png': 'image/png',
'.jpg': 'image/jpeg',
'.jpeg': 'image/jpeg',
'.gif': 'image/gif',
'.webp': 'image/webp',
# Next-generation formats
'.avif': 'image/avif',
'.jp2': 'image/jp2',
'.j2k': 'image/jp2',
'.jpx': 'image/jp2',
# Legacy / Windows formats
'.bmp': 'image/bmp',
'.ico': 'image/x-icon',
'.pcx': 'image/x-pcx',
'.tga': 'image/x-tga',
'.icns': 'image/icns',
# Professional / Scientific imaging
'.tif': 'image/tiff',
'.tiff': 'image/tiff',
'.eps': 'application/postscript',
'.dds': 'image/vnd-ms.dds',
'.dib': 'image/dib',
'.sgi': 'image/sgi',
# Portable Map formats (PPM/PGM/PBM)
'.pbm': 'image/x-portable-bitmap',
'.pgm': 'image/x-portable-graymap',
'.ppm': 'image/x-portable-pixmap',
# Miscellaneous / Older formats
'.xbm': 'image/x-xbitmap',
'.mpo': 'image/mpo',
'.msp': 'image/msp',
'.im': 'image/x-pillow-im',
'.qoi': 'image/qoi',
}
def image_to_base64_data_uri(
    file_path: str,
    *,
    fallback_mime: str = "application/octet-stream"
) -> str:
    """
    Convert a local image file to a base64-encoded data URI with the correct MIME type.

    The MIME type is resolved from the file extension via ``_IMAGE_MIME_TYPES``
    (PNG, JPEG, WebP, AVIF, BMP, ICO, TIFF, and the other Pillow-supported
    formats listed there). Unknown extensions fall back to *fallback_mime*
    with a printed warning.

    Args:
        file_path: Path to the image file on disk.
        fallback_mime: MIME type used when the file extension is unknown.

    Returns:
        A valid data URI string (e.g., "data:image/webp;base64,...").

    Raises:
        FileNotFoundError: If the file does not exist.
        OSError: If reading the file fails.
    """
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"Image file not found: {file_path}")

    extension = os.path.splitext(file_path)[1].lower()
    # Membership test (not a value comparison against fallback_mime) so a
    # known extension whose MIME happens to equal fallback_mime — e.g. a
    # caller passing fallback_mime="image/png" for a .png file — does not
    # trigger a spurious warning.
    if extension in _IMAGE_MIME_TYPES:
        mime_type = _IMAGE_MIME_TYPES[extension]
    else:
        mime_type = fallback_mime
        print(f"Warning: Unknown extension '{extension}' for '{file_path}'. "
              f"Using fallback MIME type: {fallback_mime}")

    try:
        with open(file_path, "rb") as img_file:
            encoded_data = base64.b64encode(img_file.read()).decode("utf-8")
    except OSError as e:
        # Re-raise with the offending path for easier debugging, preserving
        # the original exception as the cause.
        raise OSError(f"Failed to read image file '{file_path}': {e}") from e

    return f"data:{mime_type};base64,{encoded_data}"
# ========================
# Main image processing & inference section
# ========================

# Images to analyze (mixed formats are fine — resolution happens per extension).
image_paths = [
    r"6.jpeg",
]

# Build the multimodal message content: one image_url part per image,
# followed by the text instruction (text appears after all images).
user_content = [
    {"type": "image_url", "image_url": {"url": image_to_base64_data_uri(p)}}
    for p in image_paths
]
user_content.append({
    "type": "text",
    "text": "Please describe this image."  # You can change the prompt as needed
})

# Run the vision chat completion.
response = llm.create_chat_completion(
    messages=[
        # {"role": "system", "content": "You are a highly accurate vision-language assistant. Provide detailed, precise, and well-structured image descriptions."},
        {"role": "user", "content": user_content}
    ],
    temperature=1.0,
    top_p=0.95,
    top_k=64,
    max_tokens=8192,
)

# Print the model's reply.
print(response["choices"][0]["message"]["content"])
Downloads last month: 533
Hardware compatibility: log in to add your hardware.
Available quantization variants: 3-bit, 4-bit, 6-bit, 8-bit, 16-bit