Spaces:

usingcolor
/

MambaEye

Running on Zero

App Files Files Community

MambaEye / app.py

usingcolor

chore: initialize git repository and add standard hook templates

ed21522 19 days ago

raw

history blame contribute delete

17.8 kB

	import sys
	import os
	import subprocess
	import time

	# --- RUNTIME BOOTSTRAP ---
	# Make the MambaEye sources and the mamba_ssm / causal_conv1d packages
	# available before the imports further down this file are executed.
	mamba_dir = os.path.join(os.path.dirname(__file__), "MambaEye")
	if not (os.path.exists(mamba_dir) and os.path.exists(os.path.join(mamba_dir, "mambaeye"))):
	    print("Cloning MambaEye repository from GitHub...", flush=True)
	    if os.path.exists(mamba_dir):
	        # The directory exists but lacks the `mambaeye` package: remove it
	        # so that `git clone` is given a fresh target path.
	        import shutil

	        shutil.rmtree(mamba_dir)
	    subprocess.check_call(["git", "clone", "https://github.com/usingcolor/MambaEye.git", mamba_dir])

	try:
	    import mamba_ssm
	    import causal_conv1d
	except ImportError:
	    print("Installing mamba_ssm and causal_conv1d in backend...", flush=True)
	    env = os.environ.copy()
	    # NOTE(review): presumably these flags make the packages' setup scripts
	    # skip compiling CUDA kernels locally — confirm against their docs.
	    env["MAMBA_SKIP_CUDA_BUILD"] = "TRUE"
	    env["CAUSAL_CONV1D_SKIP_CUDA_BUILD"] = "TRUE"
	    subprocess.check_call(
	        [sys.executable, "-m", "pip", "install", "causal-conv1d==1.5.0.post8", "mamba-ssm==2.2.4", "--no-build-isolation"],
	        env=env,
	    )
	    # The packages were installed while this interpreter was already
	    # running; drop the finders' cached directory listings so the imports
	    # below are guaranteed to see the new files.
	    import importlib

	    importlib.invalidate_caches()

	# Expose the cloned checkout so that `import mambaeye...` resolves.
	sys.path.append(mamba_dir)

	import gradio as gr
	import numpy as np
	import torch
	import torch.nn.functional as F
	from PIL import Image, ImageDraw
	import torchvision.transforms as T
	from torchvision.models import ResNet50_Weights
	from huggingface_hub import hf_hub_download
	import spaces

	from mambaeye.model import MambaEye
	from mambaeye.scan import generate_scan_positions
	from mambaeye.positional_encoding import sinusoidal_position_encoding_2d
	from mamba_ssm.utils.generation import InferenceParams

	# Side length, in pixels, of the square patches cut from the canvas.
	PATCH_SIZE = 16
	# Class names, indexed by class id, read from torchvision's ResNet-50 weight metadata.
	CATEGORIES = ResNet50_Weights.IMAGENET1K_V1.meta["categories"]

	# Keyword arguments unpacked into the MambaEye constructor when the model is built.
	# NOTE(review): input_dim=1280 presumably equals 768 patch values (3*16*16)
	# plus a 512-wide move embedding — confirm against the model definition.
	MODEL_CONFIG = dict(
	    num_classes=1000,
	    input_dim=1280,
	    dim=256,
	    depth=48,
	    d_state=64,
	    d_conv=4,
	    expand=2,
	    residual_in_fp32=True,
	)

	# Hugging Face Hub repository and file name of the checkpoint to download.
	MODEL_REPO = "usingcolor/MambaEye-base"
	MODEL_FILENAME = "mambaeye_base_ft.pt"
	# --- EAGER CPU PRE-LOADING ---
	# Download the checkpoint and build the model on the CPU at import time,
	# before the UI is defined, so that `get_model` can hand out the
	# already-loaded instance. Per the original author's note, ZeroGPU workers
	# fork from this process, which is why the weights are loaded up front.
	print(f"Eagerly pre-downloading {MODEL_FILENAME} from {MODEL_REPO} into static CPU RAM...", flush=True)
	try:
	    CHECKPOINT_PATH = hf_hub_download(
	        repo_id=MODEL_REPO,
	        filename=MODEL_FILENAME,
	    )
	    _GLOBAL_CPU_MODEL = MambaEye(**MODEL_CONFIG)
	    _GLOBAL_CPU_MODEL.load_state_dict(
	        torch.load(CHECKPOINT_PATH, map_location="cpu", weights_only=True)
	    )
	    # Switch to evaluation mode; this app only runs inference.
	    _GLOBAL_CPU_MODEL.eval()
	    print("Model perfectly cached conceptually in System RAM! Completely zero-latency disk I/O remaining.")
	except Exception as load_error:
	    # Log the reason, then abort start-up: the app cannot work without the model.
	    print(f"Failed cleanly pre-loading model context: {load_error}")
	    raise


	# --- FALLBACK CSS INJECTION (superseded) ---
	# Forces a crosshair cursor over elements carrying the `gradio-image-hook`
	# class. Per the original author, a CSS cursor override is used because
	# custom HTML overlays did not work on top of Gradio's image component.
	# NOTE(review): this assignment is dead code — `CSS_STYLE` is rebound to a
	# longer stylesheet further down this file before its only visible use in
	# `demo.launch(...)`.
	CSS_STYLE = """
	.gradio-image-hook, .gradio-image-hook * {
	cursor: crosshair !important;
	}
	"""

	# -----------------------------

	def get_model():
	    """Return the preloaded model together with the device it now lives on.

	    CUDA is chosen when ``torch.cuda.is_available()`` is true, otherwise
	    the CPU. The module-level ``_GLOBAL_CPU_MODEL`` is moved with ``.to()``
	    and that same object is returned.

	    Returns:
	        A ``(model, device)`` tuple.
	    """
	    target = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
	    # The return value of .to() is ignored, so this relies on the model
	    # being moved in place (presumably an nn.Module — confirm).
	    _GLOBAL_CPU_MODEL.to(target)
	    return _GLOBAL_CPU_MODEL, target

	# --- APP STYLESHEET ---
	# Stylesheet passed to `demo.launch(css=...)` at the bottom of this file.
	# * `.gradio-image-hook`: forces a crosshair cursor over the element and
	#   all of its descendants (the class is set on the interactive image).
	# * `.big-accordion`: adds a border and enlarges the header of the
	#   accordion that carries this class.
	CSS_STYLE = """
	.gradio-image-hook, .gradio-image-hook * {
	cursor: crosshair !important;
	}
	.big-accordion {
	border: 2px solid #e5e7eb !important;
	}
	.big-accordion button, .big-accordion .label-wrap, .big-accordion summary {
	font-size: 1.3em !important;
	padding: 12px 18px !important;
	font-weight: 600 !important;
	}
	"""

	# --- INFERENCE STATE HELPERS ---

	def transfer_inference_params(params, device):
	    """Move the tensors cached in ``params.key_value_memory_dict`` to ``device``.

	    Handled value shapes, per entry of the cache dict:
	      * a tensor is replaced by its moved copy;
	      * a tuple or list is rebuilt with its tensor members moved
	        (one level deep, other members kept as they are);
	      * a dict is updated in place for every value that has a ``to`` attribute;
	      * anything else is left untouched.

	    Args:
	        params: Object with a ``key_value_memory_dict`` attribute, or ``None``.
	        device: Target passed to every ``.to(...)`` call.

	    Returns:
	        The same ``params`` object (returned unchanged when it is ``None``
	        or has no cache dict).
	    """
	    cache = None if params is None else getattr(params, "key_value_memory_dict", None)
	    if cache is None:
	        return params

	    def _relocate(member):
	        # Only real tensors are moved; every other member passes through.
	        return member.to(device) if isinstance(member, torch.Tensor) else member

	    # Only existing keys are reassigned, so the dict size never changes
	    # while it is being iterated.
	    for key, entry in cache.items():
	        if isinstance(entry, torch.Tensor):
	            cache[key] = entry.to(device)
	        elif isinstance(entry, tuple):
	            cache[key] = tuple(_relocate(member) for member in entry)
	        elif isinstance(entry, list):
	            cache[key] = [_relocate(member) for member in entry]
	        elif isinstance(entry, dict):
	            for inner_key, inner_value in entry.items():
	                if hasattr(inner_value, "to"):
	                    entry[inner_key] = inner_value.to(device)
	    return params

	def format_seq_len(seq_len):
	    """Return the HTML card that displays the number of sequenced patches.

	    ``seq_len`` is interpolated into the markup as-is.
	    """
	    card_open = "<div style='text-align: center; border: 1px solid #e5e7eb; border-radius: 8px; padding: 10px; margin-bottom: 10px; background-color: #f9fafb;'>"
	    caption = "<span style='font-size: 1.1em; color: #6b7280;'>Total Sequenced Patches</span>"
	    counter = f"<span style='font-size: 3em; font-weight: bold; color: #3b82f6;'>{seq_len}</span>"
	    return card_open + caption + "<br>" + counter + "</div>"

	def _compute_move_embedding(patch_location: torch.Tensor, cur_location: torch.Tensor = None) -> torch.Tensor:
	    """Encode the displacement from ``cur_location`` to ``patch_location``.

	    When ``cur_location`` is ``None`` (no previous patch) a zero
	    displacement of shape ``(patch_location.shape[0], 2)`` is encoded
	    instead. The displacement is passed, as float, to
	    ``sinusoidal_position_encoding_2d`` with a second argument of 256.
	    """
	    if cur_location is not None:
	        displacement = (patch_location - cur_location).float()
	    else:
	        displacement = torch.zeros(
	            (patch_location.shape[0], 2),
	            dtype=torch.float32,
	            device=patch_location.device,
	        )
	    return sinusoidal_position_encoding_2d(displacement, 256)

	def format_predictions(probs_np):
	    """Map the five most probable classes to their probabilities.

	    Args:
	        probs_np: 1-D NumPy array of per-class scores, indexed like ``CATEGORIES``.

	    Returns:
	        Dict ordered from most to least probable. Each key is the text
	        before the first comma of the category name, title-cased; each
	        value is a Python float.
	    """
	    # Descending order of score, truncated to the best five indices.
	    best_first = np.argsort(probs_np)[::-1][:5]
	    return {
	        CATEGORIES[class_id].split(",")[0].title(): float(probs_np[class_id])
	        for class_id in best_first
	    }

	def preprocess_image(image_arr):
	    """Centre an image on a square, zero-filled canvas tensor.

	    Args:
	        image_arr: Image array accepted by ``PIL.Image.fromarray``.

	    Returns:
	        ``(canvas, x_offset, y_offset, height, width)``. ``canvas`` is a
	        float32 tensor of shape ``(3, S, S)`` with ``S = max(width, height)``.
	        ``x_offset`` is the offset along tensor dimension 1 and
	        ``y_offset`` the offset along tensor dimension 2.
	    """
	    rgb = Image.fromarray(image_arr).convert("RGB")
	    width, height = rgb.size
	    pixels = T.ToTensor()(rgb)
	    rows, cols = pixels.shape[1], pixels.shape[2]

	    side = max(width, height)
	    canvas = torch.zeros(3, side, side, dtype=torch.float32)

	    # Centre the image; the border that is left stays zero.
	    x_offset = (side - rows) // 2
	    y_offset = (side - cols) // 2
	    canvas[:, x_offset : x_offset + rows, y_offset : y_offset + cols] = pixels

	    return canvas, x_offset, y_offset, height, width

	def extract_patch(canvas_tensor, px, py):
	    """Cut one ``PATCH_SIZE`` x ``PATCH_SIZE`` patch and return it flattened.

	    Args:
	        canvas_tensor: Tensor indexed as ``[channel, dim1, dim2]``; the
	            size of dimension 1 is used as the canvas size for both axes.
	        px: Start index along dimension 1 (clamped into the canvas).
	        py: Start index along dimension 2 (clamped into the canvas).

	    Returns:
	        1-D tensor with ``channels * PATCH_SIZE * PATCH_SIZE`` elements.
	    """
	    canvas_size = canvas_tensor.shape[1]
	    px = max(0, min(px, canvas_size - PATCH_SIZE))
	    py = max(0, min(py, canvas_size - PATCH_SIZE))
	    patch = canvas_tensor[:, px : px + PATCH_SIZE, py : py + PATCH_SIZE]

	    if patch.shape[1] != PATCH_SIZE or patch.shape[2] != PATCH_SIZE:
	        # The canvas is smaller than one patch, so the slice came back
	        # truncated. Zero-pad it (zeros match the canvas background) so the
	        # flattened length stays constant for the model.
	        padded = canvas_tensor.new_zeros((canvas_tensor.shape[0], PATCH_SIZE, PATCH_SIZE))
	        padded[:, : patch.shape[1], : patch.shape[2]] = patch
	        patch = padded

	    return patch.flatten()

	def draw_patches_on_image(image_arr, positions, x_offset, y_offset, h, w):
	    """Render the image greyed-out, with the visited patches shown in colour.

	    Args:
	        image_arr: Original image as an array.
	        positions: Iterable of ``(px, py)`` patch origins in canvas coordinates.
	        x_offset: Canvas offset subtracted from every ``px``.
	        y_offset: Canvas offset subtracted from every ``py``.
	        h: Unused; kept for call compatibility.
	        w: Unused; kept for call compatibility.

	    Returns:
	        ``(rendered_array, positions)`` — the composed image as a NumPy
	        array and the ``positions`` argument unchanged.
	    """
	    img = np.array(image_arr)

	    # Create the greyed-out ambient background
	    grey_base = Image.fromarray(img).convert("L").convert("RGB")
	    grey_base_np = (np.array(grey_base).astype(float) * 0.4 + 160).clip(0, 255).astype(np.uint8)

	    temp_img = Image.fromarray(grey_base_np)
	    orig_pil = Image.fromarray(img)

	    for px, py in positions:
	        # Translate canvas coordinates back into image coordinates.
	        orig_y = py - y_offset
	        orig_x = px - x_offset

	        # The box is (left, upper, right, lower): py supplies the
	        # horizontal coordinate and px the vertical one.
	        box = (int(orig_y), int(orig_x), int(orig_y + PATCH_SIZE), int(orig_x + PATCH_SIZE))

	        # Paste original color into the highlighted region
	        temp_img.paste(orig_pil.crop(box), box)

	    return np.array(temp_img), positions

	def init_state_for_image(image):
	    """Build a fresh session-state dict for ``image``.

	    The image is preprocessed once and its canvas plus geometry are
	    stored next to empty inference bookkeeping (no inference params, no
	    current location, no drawn positions, sequence length 0).
	    """
	    canvas, off_x, off_y, img_h, img_w = preprocess_image(image)
	    return dict(
	        inference_params=None,
	        cur_location=None,
	        canvas_tensor=canvas.cpu(),
	        x_offset=off_x,
	        y_offset=off_y,
	        h=img_h,
	        w=img_w,
	        original_image=image,
	        drawn_positions=[],
	        sequence_length=0,
	    )

	@spaces.GPU
	def run_auto_scan(image, scan_pattern, sequence_length):
	    """Generate a scan path over ``image`` and classify the whole sequence.

	    Args:
	        image: Original image array, or ``None`` when nothing is uploaded.
	        scan_pattern: Pattern name forwarded to ``generate_scan_positions``.
	        sequence_length: Number of patches requested from the generator.

	    Returns:
	        A 5-tuple matching the five outputs this handler is wired to:
	        ``(display_image, predictions, state, status_text, seq_len_html)``.
	    """
	    if image is None:
	        # Return one value per wired output component (five), like the
	        # success path does.
	        return None, {"Upload Image": 1.0}, None, "Upload Image", format_seq_len(0)

	    model, device = get_model()

	    state = init_state_for_image(image)

	    # Exclusive end of the image region inside the canvas, at least one
	    # pixel past the offset.
	    x_end = max(state['x_offset'] + 1, state['x_offset'] + state['h'])
	    y_end = max(state['y_offset'] + 1, state['y_offset'] + state['w'])

	    import random
	    rng = random.Random(42)  # fixed seed: identical inputs give identical paths

	    positions_xy = generate_scan_positions(
	        x_start=state['y_offset'], x_stop=y_end,
	        y_start=state['x_offset'], y_stop=x_end,
	        patch_size=PATCH_SIZE, sequence_length=sequence_length,
	        scan_pattern=scan_pattern, rng=rng
	    )
	    # The scan coordinate generator effectively expects x=cols and y=rows.
	    # We transpose them back to (px=row, py=col) to match the rest of this file.
	    positions = [(py, px) for px, py in positions_xy]

	    # NOTE(review): max_seqlen is 4000 while the UI slider allows up to
	    # 4096 patches — confirm whether the model enforces this limit.
	    inference_params = InferenceParams(max_seqlen=4000, max_batch_size=1)

	    patches_list = []
	    moves_list = []
	    cur_location = None

	    for px, py in positions:
	        loc_tensor = torch.tensor([[px, py]], dtype=torch.long, device=device)
	        # The first patch has no predecessor, so cur_location is None there.
	        move_emb = _compute_move_embedding(loc_tensor, cur_location)
	        cur_location = loc_tensor

	        patch = extract_patch(state['canvas_tensor'], px, py).to(device)
	        patches_list.append(patch)
	        moves_list.append(move_emb.squeeze(0))

	    img_seq = torch.stack(patches_list, dim=0).unsqueeze(0)  # (1, L, 768)
	    move_seq = torch.stack(moves_list, dim=0).unsqueeze(0)  # (1, L, 512)

	    with torch.no_grad():
	        out = model(img_seq, move_seq, inference_params=inference_params)
	        # Only the prediction after the last patch is reported.
	        final_probs = F.softmax(out[0, -1], dim=-1).cpu().numpy()
	    inference_params.seqlen_offset += img_seq.shape[1]

	    state['cur_location'] = cur_location.cpu()
	    state['drawn_positions'] = positions
	    state['sequence_length'] = sequence_length

	    # Keep the stored state on the CPU between calls; the canvas tensor
	    # was created on the CPU and is never moved.
	    state['inference_params'] = transfer_inference_params(inference_params, torch.device('cpu'))

	    img_display, _ = draw_patches_on_image(
	        state['original_image'], state['drawn_positions'],
	        state['x_offset'], state['y_offset'], state['h'], state['w']
	    )

	    return img_display, format_predictions(final_probs), state, f"Auto Scan Complete. Extracted {sequence_length} patches. Click to add more!", format_seq_len(sequence_length)

	@spaces.GPU
	def process_click_inference(x_orig, y_orig, original_image, state):
	    """Feed the patch centred on a click to the model and update the state.

	    Args:
	        x_orig: First element of the click index (used as the column).
	        y_orig: Second element of the click index (used as the row).
	        original_image: Unmodified uploaded image array, or ``None``.
	        state: Session-state dict from a previous call, or ``None``.

	    Returns:
	        A 5-tuple matching the five outputs this handler is wired to:
	        ``(display_image, predictions, state, status_text, seq_len_html)``.
	    """
	    if original_image is None:
	        # Return one value per wired output component (five), like the
	        # success path does; keep showing the current patch count.
	        shown_len = state.get('sequence_length', 0) if state else 0
	        return None, {"Upload Image": 1.0}, state, "Upload Image", format_seq_len(shown_len)

	    model, device = get_model()

	    if state is None or state.get('inference_params') is None:
	        # First click on this image (or right after a clear): start over.
	        state = init_state_for_image(original_image)
	        state['inference_params'] = InferenceParams(max_seqlen=4000, max_batch_size=1)

	    # The state is stored on the CPU between calls; move it to the device.
	    state['inference_params'] = transfer_inference_params(state['inference_params'], device)

	    orig_h, orig_w = state['original_image'].shape[:2]
	    canvas_size = max(orig_h, orig_w)

	    # Image coordinates -> canvas coordinates (x_* names index rows here).
	    canvas_y = int(x_orig) + state['y_offset']
	    canvas_x = int(y_orig) + state['x_offset']

	    # Centre the patch on the click, clamped so that it stays inside the canvas.
	    px = max(0, min(int(canvas_x - PATCH_SIZE / 2), canvas_size - PATCH_SIZE))
	    py = max(0, min(int(canvas_y - PATCH_SIZE / 2), canvas_size - PATCH_SIZE))

	    cur_loc = state['cur_location'].to(device) if state['cur_location'] is not None else None
	    loc_tensor = torch.tensor([[px, py]], dtype=torch.long, device=device)
	    move_emb = _compute_move_embedding(loc_tensor, cur_loc)

	    patch = extract_patch(state['canvas_tensor'], px, py).to(device)

	    # Add batch and sequence dimensions for a one-step forward pass.
	    img_seq = patch.unsqueeze(0).unsqueeze(0)
	    move_seq = move_emb.unsqueeze(0)

	    with torch.no_grad():
	        out = model(img_seq, move_seq, inference_params=state['inference_params'])
	        final_probs = F.softmax(out[0, -1], dim=-1).cpu().numpy()
	    state['inference_params'].seqlen_offset += 1

	    state['cur_location'] = loc_tensor.cpu()
	    state['drawn_positions'].append((px, py))
	    state['sequence_length'] += 1

	    # Store the state back on the CPU.
	    state['inference_params'] = transfer_inference_params(state['inference_params'], torch.device('cpu'))

	    img_display, _ = draw_patches_on_image(
	        state['original_image'], state['drawn_positions'],
	        state['x_offset'], state['y_offset'], state['h'], state['w']
	    )

	    status_msg = f"🔍 Revealed patch #{state['sequence_length']}! The model is analyzing... Keep clicking to give it more clues!"
	    return img_display, format_predictions(final_probs), state, status_msg, format_seq_len(state['sequence_length'])

	def on_click(evt: gr.SelectData, original_image, state):
	    """Handle a select event on the image by running one inference step.

	    ``evt.index`` is unpacked into two click coordinates, which are
	    forwarded together with the image and the session state to
	    ``process_click_inference``; its result is returned unchanged.
	    """
	    click_x, click_y = evt.index
	    return process_click_inference(click_x, click_y, original_image, state)

	def on_upload(image):
	    """Reset the UI for a newly uploaded (or removed) image.

	    Args:
	        image: Uploaded image array, or ``None``.

	    Returns:
	        A 6-tuple: ``(canvas_image, original_image, predictions, state,
	        status_text, seq_len_html)``. The state is always reset to ``None``.
	    """
	    if image is None:
	        # Use the same HTML card as every other path instead of a bare 0.
	        return None, None, {"Waiting...": 1.0}, None, "Upload Image", format_seq_len(0)

	    # Pre-render the grey background immediately on upload
	    grey_base = Image.fromarray(image).convert("L").convert("RGB")
	    grey_base_np = (np.array(grey_base).astype(float) * 0.4 + 160).clip(0, 255).astype(np.uint8)

	    return grey_base_np, image, {"Click an interesting object in the photo": 1.0}, None, "✨ Image loaded! The model is currently blind. Click anywhere on the grey canvas to reveal the first patch and let the model guess!", format_seq_len(0)

	def on_clear(original_image):
	    """Discard all revealed patches and show the grey canvas again.

	    Args:
	        original_image: Unmodified uploaded image array, or ``None``.

	    Returns:
	        A 5-tuple: ``(canvas_image, predictions, state, status_text,
	        seq_len_html)``. With an image present, the state is a fresh one
	        built by ``init_state_for_image``; otherwise it is ``None``.
	    """
	    if original_image is None:
	        # Use the same HTML card as every other path instead of a bare 0.
	        return None, {"Cleared": 1.0}, None, "Cleared", format_seq_len(0)

	    grey_base = Image.fromarray(original_image).convert("L").convert("RGB")
	    grey_base_np = (np.array(grey_base).astype(float) * 0.4 + 160).clip(0, 255).astype(np.uint8)

	    return grey_base_np, {"Cleared": 1.0}, init_state_for_image(original_image), "🧹 Selections cleared! The canvas is blank. Where will you click next?", format_seq_len(0)

	# --- UI DEFINITION ---
	# Builds the Gradio Blocks app bound to `demo` and wires the handlers
	# defined above to its components.
	# NOTE(review): the nesting indentation of the `with` containers is not
	# visible in this copy of the file; the comments below describe statement
	# order only — confirm the layout nesting against the original source.
	with gr.Blocks(title="MambaEye Interactive Demo") as demo:
	# Page header with project links and a start-up latency note.
	gr.Markdown(
	"# MambaEye Interactive Inference Demo\n"
	"🔗 [Project Page](https://usingcolor.github.io/MambaEye) • 💻 [GitHub Repository](https://github.com/usingcolor/MambaEye)\n\n"
	"This interface incorporates the full MambaEye-base-ft model natively.\n\n"
	"Note: The first inference or Auto Scan may take 1~2 minutes to compile CUDA kernels and build hardware cache. Subsequent patch clicks will be dramatically faster!"
	)

	# Per-session values: `state` carries the dict produced by the handlers,
	# `original_image_state` keeps the unmodified upload (the visible image
	# component is overwritten with the greyed-out rendering).
	state = gr.State(None)
	original_image_state = gr.State(None)

	# Output components are created with render=False so that they can be
	# referenced as outputs before being placed with .render() further down.
	seq_len_display = gr.HTML(value=format_seq_len(0), render=False)
	model_output_label = gr.Label(label="MambaEye Output Predictions", num_top_classes=5, render=False)
	status_text = gr.Markdown("Status: Waiting for image upload...", render=False)

	with gr.Row():
	with gr.Column(scale=2):
	gr.Markdown("### 🎯 Challenge: See how few clicks the model needs to guess your image!\nClick directly on the most informative parts of the grey image to reveal patches to the model.")
	# The `gradio-image-hook` class is the one targeted by CSS_STYLE.
	input_image = gr.Image(type="numpy", label="👆 Interactive Canvas: Click here to extract patches!", interactive=True, elem_classes="gradio-image-hook")

	clear_btn = gr.Button("🗑️ Clear Selections & Start Over", variant="secondary")

	# Auto-scan controls; `big-accordion` is styled by CSS_STYLE.
	with gr.Accordion("🤖 Advanced: Auto-Scan Features", open=False, elem_classes="big-accordion"):
	gr.Markdown("### ✨ Let the model automatically scan a sequence of patches!")
	with gr.Row():
	scan_pattern = gr.Dropdown(
	choices=["random", "spiral", "diagonal", "golden", "horizontal_raster", "horizontal_zigzag", "column_major", "column_snake"],
	value="random",
	label="Scan Pattern"
	)
	seq_length = gr.Slider(minimum=1, maximum=4096, step=1, value=256, label="Auto Sequence Length")
	auto_btn = gr.Button("Auto Generate Path & Infer", variant="primary")

	# Example images run through `on_upload`, the same handler as a manual upload.
	gr.Examples(
	examples=[
	"assets/dog.jpg",
	"assets/leo.jpg",
	"assets/green_mamba.jpg",
	],
	inputs=input_image,
	outputs=[input_image, original_image_state, model_output_label, state, status_text, seq_len_display],
	fn=on_upload,
	run_on_click=True,
	cache_examples=False,
	label="Try an Example Image"
	)

	# Place the output components that were created with render=False above.
	with gr.Column(scale=1):
	seq_len_display.render()
	model_output_label.render()
	status_text.render()

	# --- Event wiring ---
	# Upload: six outputs, matching the 6-tuple returned by `on_upload`.
	input_image.upload(
	fn=on_upload,
	inputs=[input_image],
	outputs=[input_image, original_image_state, model_output_label, state, status_text, seq_len_display]
	)

	# Auto scan reads the stored original image, not the displayed canvas.
	auto_btn.click(
	fn=run_auto_scan,
	inputs=[original_image_state, scan_pattern, seq_length],
	outputs=[input_image, model_output_label, state, status_text, seq_len_display]
	)

	# Click on the image: `on_click` also receives the select event data
	# through its `evt: gr.SelectData` parameter.
	input_image.select(
	fn=on_click,
	inputs=[original_image_state, state],
	outputs=[input_image, model_output_label, state, status_text, seq_len_display]
	)

	# Clear: resets the canvas and replaces the session state.
	clear_btn.click(
	fn=on_clear,
	inputs=[original_image_state],
	outputs=[input_image, model_output_label, state, status_text, seq_len_display]
	)


	if __name__ == "__main__":
	    # Start the app only when this file is executed as a script.
	    # NOTE(review): `theme` and `css` are passed to launch() here rather
	    # than to gr.Blocks(); confirm this matches the installed Gradio version.
	    demo.launch(
	        theme=gr.themes.Soft(),
	        ssr_mode=False,
	        css=CSS_STYLE,
	    )