Image-Text-to-Text
Transformers
Diffusers
Safetensors
qwen3_vl
vision-language-model
image-decomposition
conversational
Instructions for using SynLayers/Bbox-caption-8b with libraries, inference providers, notebooks, and local apps. Use the sections below to get started.
- Libraries
- Transformers
How to use SynLayers/Bbox-caption-8b with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("image-text-to-text", model="SynLayers/Bbox-caption-8b")
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
            {"type": "text", "text": "What animal is on the candy?"},
        ],
    },
]
pipe(text=messages)
```

```python
# Load model directly
from transformers import AutoProcessor, AutoModelForImageTextToText

processor = AutoProcessor.from_pretrained("SynLayers/Bbox-caption-8b")
model = AutoModelForImageTextToText.from_pretrained("SynLayers/Bbox-caption-8b")

messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
            {"type": "text", "text": "What animal is on the candy?"},
        ],
    },
]
inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(processor.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
```

- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use SynLayers/Bbox-caption-8b with vLLM:
Install from pip and serve the model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "SynLayers/Bbox-caption-8b"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "SynLayers/Bbox-caption-8b",
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "Describe this image in one sentence."
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
                        }
                    }
                ]
            }
        ]
    }'
```

Use Docker
```bash
docker model run hf.co/SynLayers/Bbox-caption-8b
```
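Because the server exposes an OpenAI-compatible API, you can also call it from Python instead of curl. This is a minimal sketch using the `openai` client against the vLLM server from the pip snippet above; the local base URL and the placeholder `EMPTY` API key are assumptions for a default local deployment:

```python
# Minimal sketch: query the locally served model through its OpenAI-compatible API.
# Assumes the vLLM server started above is listening on http://localhost:8000.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")  # local server needs no real key

response = client.chat.completions.create(
    model="SynLayers/Bbox-caption-8b",
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this image in one sentence."},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
                    },
                },
            ],
        }
    ],
)
print(response.choices[0].message.content)
```

The same client code works against the SGLang server below by changing the base URL to port 30000.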
- SGLang
How to use SynLayers/Bbox-caption-8b with SGLang:
Install from pip and serve the model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "SynLayers/Bbox-caption-8b" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "SynLayers/Bbox-caption-8b",
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "Describe this image in one sentence."
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
                        }
                    }
                ]
            }
        ]
    }'
```

Use Docker images
```bash
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
    --model-path "SynLayers/Bbox-caption-8b" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "SynLayers/Bbox-caption-8b",
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "Describe this image in one sentence."
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
                        }
                    }
                ]
            }
        ]
    }'
```

- Docker Model Runner
How to use SynLayers/Bbox-caption-8b with Docker Model Runner:
```bash
docker model run hf.co/SynLayers/Bbox-caption-8b
```
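Running the command without arguments opens an interactive chat. As a hedged sketch, recent Docker Model Runner releases also accept an inline prompt as a positional argument, so a one-shot query would look like:

```bash
# One-shot prompt (assumes your Docker Model Runner version supports the positional prompt form)
docker model run hf.co/SynLayers/Bbox-caption-8b "Describe this image in one sentence."
```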
File size: 9,291 Bytes
```python
from __future__ import annotations
import os
import sys
from pathlib import Path
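# `spaces` is provided on Hugging Face ZeroGPU Spaces; when it is missing
# (e.g. when running locally), fall back to a no-op GPU decorator.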
try:
    import spaces
except ImportError:
    class _SpacesCompat:
        @staticmethod
        def GPU(*decorator_args, **decorator_kwargs):
            if decorator_args and callable(decorator_args[0]) and len(decorator_args) == 1 and not decorator_kwargs:
                return decorator_args[0]

            def decorator(fn):
                return fn

            return decorator

    spaces = _SpacesCompat()
import gradio as gr
import torch
CURRENT_FILE = Path(__file__).resolve()
PROJECT_ROOT = CURRENT_FILE.parents[1]
for candidate in (CURRENT_FILE.parent, CURRENT_FILE.parents[1]):
    if (candidate / "infer").exists() and (candidate / "models").exists():
        PROJECT_ROOT = candidate
        break
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))
from demo.real_world_pipeline import (  # noqa: E402
    DEFAULT_BBOX_MODEL,
    DEFAULT_REAL_CONFIG_PATH,
    DEFAULT_RUN_NAME,
    DEFAULT_WORK_DIR,
    run_real_world_pipeline,
)
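# Directory of example images shown in the Gradio UI; override with SYNLAYERS_EXAMPLE_DIR.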
DEFAULT_EXAMPLE_DIR = Path(
    os.environ.get(
        "SYNLAYERS_EXAMPLE_DIR",
        "/project/llmsvgen/share/data/kmw_layered_dataset/real_world_inference/layers_real_test_1024",
    )
)
def read_int_env(name: str, default: int) -> int:
    raw = os.environ.get(name)
    if raw is None:
        return default
    try:
        return int(raw)
    except ValueError:
        return default
ZERO_GPU_SIZE = (os.environ.get("SYNLAYERS_ZERO_GPU_SIZE", "large").strip() or "large").lower()
ZERO_GPU_DURATION = max(60, read_int_env("SYNLAYERS_ZERO_GPU_DURATION", 900))
def list_example_images(limit: int = 6) -> list[list[str]]:
    if not DEFAULT_EXAMPLE_DIR.exists():
        return []
    candidates = []
    for ext in ("*.png", "*.jpg", "*.jpeg", "*.webp"):
        candidates.extend(DEFAULT_EXAMPLE_DIR.glob(ext))
    candidates = sorted(candidates)[:limit]
    return [[str(path)] for path in candidates]
def build_gallery(result: dict) -> list[tuple[str, str]]:
    gallery: list[tuple[str, str]] = []
    if result.get("whole_image_rgba"):
        gallery.append((result["whole_image_rgba"], "Whole RGBA"))
    if result.get("background_rgba"):
        gallery.append((result["background_rgba"], "Background RGBA"))
    for idx, path in enumerate(result.get("layer_images", [])):
        gallery.append((path, f"Layer {idx}"))
    return gallery
def get_gpu_name() -> str:
    if not torch.cuda.is_available():
        return "None"
    try:
        return torch.cuda.get_device_name(torch.cuda.current_device())
    except Exception as exc:  # pragma: no cover - defensive runtime reporting
        return f"Unavailable ({exc})"
def is_zero_gpu_space() -> bool:
    accelerator = os.environ.get("ACCELERATOR", "").lower()
    return (
        os.environ.get("ZEROGPU_V2", "").lower() == "true"
        or os.environ.get("ZERO_GPU_PATCH_TORCH_DEVICE") == "1"
        or accelerator == "zerogpu"
        or accelerator.startswith("zero")
    )
def get_runtime_status_markdown() -> str:
    accelerator = os.environ.get("ACCELERATOR", "unknown")
    space_id = os.environ.get("SPACE_ID", "local")
    model_repo = os.environ.get("SYNLAYERS_MODEL_REPO", "(unset)")
    zero_gpu_enabled = is_zero_gpu_space()
    lines = ["## Runtime Status", f"- `SPACE_ID`: `{space_id}`", f"- `ACCELERATOR`: `{accelerator}`"]
    if zero_gpu_enabled:
        lines.extend(
            [
                "- `ZeroGPU mode`: `True`",
                f"- `Requested GPU size`: `{ZERO_GPU_SIZE}`",
                f"- `Requested max duration`: `{ZERO_GPU_DURATION}` seconds",
                f"- `SYNLAYERS_MODEL_REPO`: `{model_repo}`",
                f"- `CUDA probe outside @spaces.GPU`: `{torch.cuda.is_available()}`",
                "",
                "This Space is configured for Hugging Face ZeroGPU.",
                "A shared H200 GPU is requested on demand when you click `Run Full Pipeline`.",
                "Queueing and quota are managed by Hugging Face ZeroGPU, not by an in-app GPU selector.",
            ]
        )
    else:
        cuda_available = torch.cuda.is_available()
        lines.extend(
            [
                f"- `CUDA available`: `{cuda_available}`",
                f"- `GPU device`: `{get_gpu_name()}`",
                f"- `SYNLAYERS_MODEL_REPO`: `{model_repo}`",
                "",
            ]
        )
        if accelerator == "none" or not cuda_available:
            lines.extend(
                [
                    "This Space is not currently running with a usable CUDA GPU.",
                    "The GPU type must be chosen by the Space owner in Hugging Face `Settings -> Hardware`.",
                    "Visitors cannot switch GPUs from inside the Gradio app.",
                ]
            )
        else:
            lines.append("The CUDA runtime is available and the full SynLayers pipeline can run here.")
    return "\n".join(lines)
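# On ZeroGPU Spaces, @spaces.GPU requests a shared GPU only for the duration of this
# call; with the local fallback shim above, the decorator is a no-op.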
@spaces.GPU(duration=ZERO_GPU_DURATION, size=ZERO_GPU_SIZE)
def run_demo_inference(
    image_path: str,
    sample_name: str,
    max_new_tokens: int,
    seed_value: float,
) -> dict:
    seed = int(seed_value) if seed_value >= 0 else None
    return run_real_world_pipeline(
        image_path=image_path,
        sample_name=sample_name or None,
        work_dir=DEFAULT_WORK_DIR,
        bbox_model=DEFAULT_BBOX_MODEL,
        config_path=DEFAULT_REAL_CONFIG_PATH,
        max_new_tokens=int(max_new_tokens),
        seed=seed,
        run_name=DEFAULT_RUN_NAME,
    )
def run_demo(
    image_path: str,
    sample_name: str,
    max_new_tokens: int,
    seed_value: float,
):
    if not image_path:
        raise gr.Error("Please upload an input image first.")
    try:
        result = run_demo_inference(
            image_path=image_path,
            sample_name=sample_name,
            max_new_tokens=max_new_tokens,
            seed_value=seed_value,
        )
    except Exception as exc:
        raise gr.Error(str(exc)) from exc
    return (
        result["bbox_visualization"],
        result["merged_image"],
        result["bbox_record"].get("whole_caption", ""),
        result["bbox_record"],
        result["metadata"],
        build_gallery(result),
        result["archive_path"],
        result["case_dir"],
    )
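# Gradio UI: the left column collects the input image and generation settings, the
# right column and the rows below show the bbox visualization, merged decomposition,
# caption, JSON metadata, predicted layer gallery, and a downloadable result bundle.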
with gr.Blocks(title="SynLayers Real-World Demo") as demo:
    gr.Markdown(
        """
        # SynLayers Real-World Decomposition
        Upload a single image and run the full pipeline in one step:
        1. VLM for whole-caption + bounding-box detection
        2. SynLayers real-image layer decomposition
        This Space can run either on a dedicated GPU Space or on Hugging Face ZeroGPU.
        The first request may take time while model assets are loaded from Hugging Face.
        In ZeroGPU mode, a shared GPU is requested only while inference is running.
        """
    )
    runtime_status = gr.Markdown(get_runtime_status_markdown())
    refresh_status_button = gr.Button("Refresh Runtime Status")
    with gr.Row():
        with gr.Column(scale=1):
            image_input = gr.Image(type="filepath", label="Input Image")
            sample_name_input = gr.Textbox(
                label="Optional Sample Name",
                placeholder="Leave empty to use the uploaded filename",
            )
            max_new_tokens_input = gr.Slider(
                minimum=128,
                maximum=2048,
                value=1024,
                step=64,
                label="VLM Max New Tokens",
            )
            seed_input = gr.Number(
                value=42,
                precision=0,
                label="Seed (-1 keeps config default)",
            )
            run_button = gr.Button("Run Full Pipeline", variant="primary")
        with gr.Column(scale=1):
            bbox_vis_output = gr.Image(type="filepath", label="Detected Bounding Boxes")
            merged_output = gr.Image(type="filepath", label="Merged Decomposition")
            caption_output = gr.Textbox(label="Whole Caption", lines=6)
    with gr.Row():
        bbox_json_output = gr.JSON(label="BBox JSON")
        meta_json_output = gr.JSON(label="Inference Metadata")
    layer_gallery = gr.Gallery(label="Predicted Layers", columns=4, height="auto")
    with gr.Row():
        archive_output = gr.File(label="Download Result Bundle")
        case_dir_output = gr.Textbox(label="Saved Case Directory")
    examples = list_example_images()
    if examples:
        gr.Examples(examples=examples, inputs=[image_input], label="Example Images")
    refresh_status_button.click(
        fn=get_runtime_status_markdown,
        outputs=runtime_status,
    )
    run_button.click(
        fn=run_demo,
        inputs=[
            image_input,
            sample_name_input,
            max_new_tokens_input,
            seed_input,
        ],
        outputs=[
            bbox_vis_output,
            merged_output,
            caption_output,
            bbox_json_output,
            meta_json_output,
            layer_gallery,
            archive_output,
            case_dir_output,
        ],
    )
if __name__ == "__main__":
    demo.queue().launch(
        server_name="0.0.0.0",
        server_port=int(os.environ.get("PORT", "7860")),
    )
```