Spaces:

usingcolor
/

MambaEye

Sleeping

App Files Files Community

usingcolor committed 22 days ago

Commit

93331e9

1 Parent(s): b468d75

chore: initialize git repository and remove Dockerfile

Browse files

Files changed (4) hide show

Dockerfile +0 -42
README.md +3 -1
app.py +48 -25
requirements.txt +1 -0

Dockerfile DELETED Viewed

@@ -1,42 +0,0 @@
-FROM pytorch/pytorch:2.6.0-cuda12.4-cudnn9-devel
-ENV DEBIAN_FRONTEND=noninteractive
-# Install system dependencies
-RUN apt-get update && apt-get install -y \
-    git ffmpeg libsm6 libxext6 cmake rsync libgl1 curl \
-    && rm -rf /var/lib/apt/lists/*
-# Install uv for fast python package installations
-RUN pip install uv
-# Create non-root user required by HuggingFace Spaces
-RUN useradd -m -u 1000 user
-USER user
-ENV HOME=/home/user \
-    PATH=/home/user/.local/bin:$PATH \
-    PYTHONWARNINGS=ignore \
-    GRADIO_SERVER_NAME="0.0.0.0" \
-    GRADIO_SERVER_PORT="7860"
-WORKDIR $HOME/app
-# Copy requirements and install base dependencies via uv
-COPY --chown=user requirements.txt $HOME/app/
-RUN uv pip install --system --upgrade pip
-RUN uv pip install --system -r requirements.txt
-# Specify CUDA architectures to compile for various GPUs commonly found on HF Spaces
-# 7.5: T4, 8.0: A100, 8.6: A10G/RTX3090, 8.9: L4, 9.0: H100
-ENV TORCH_CUDA_ARCH_LIST="7.5 8.0 8.6 8.9 9.0+PTX"
-# Install causal-conv1d and mamba-ssm requiring compilation
-# Since this uses the `devel` image, `nvcc` is available safely!
-RUN uv pip install --system causal-conv1d==1.5.0.post8 --no-build-isolation
-RUN uv pip install --system mamba-ssm==2.2.4 --no-build-isolation
-# Copy the rest of the application code
-COPY --chown=user . $HOME/app/
-# Run the Gradio app
-CMD ["python", "app.py"]

README.md CHANGED Viewed

@@ -3,7 +3,9 @@ title: MambaEye
 emoji: 👁️
 colorFrom: blue
 colorTo: green
-sdk: docker
 pinned: false
 ---

 emoji: 👁️
 colorFrom: blue
 colorTo: green
+sdk: gradio
+sdk_version: 6.10.0
+app_file: app.py
 pinned: false
 ---

app.py CHANGED Viewed

@@ -1,7 +1,26 @@
 import sys
 import os
 import time
 # Add the cloned MambaEye repository to the Python path
 sys.path.append(os.path.join(os.path.dirname(__file__), "MambaEye"))
@@ -13,6 +32,7 @@ from PIL import Image, ImageDraw
 import torchvision.transforms as T
 from torchvision.models import ResNet50_Weights
 from huggingface_hub import hf_hub_download
 # MambaEye Imports
 from mambaeye.model import MambaEye
@@ -52,8 +72,10 @@ def get_model():
         try:
             checkpoint_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILENAME)
             model = MambaEye(**MODEL_CONFIG)
-            model.load_state_dict(torch.load(checkpoint_path, map_location=DEVICE))
-            model.to(DEVICE)
             model.eval()
             _GLOBAL_MODEL = model
             print("Model loaded successfully.")
@@ -99,8 +121,6 @@ def preprocess_image(image_arr):
     return canvas, x_offset, y_offset, new_h, new_w
 def extract_patch(canvas_tensor, px, py):
-    # px, py are coordinates on the canvas
-    # Bound them
     px = max(0, min(px, TARGET_CANVAS_SIZE - PATCH_SIZE))
     py = max(0, min(py, TARGET_CANVAS_SIZE - PATCH_SIZE))
     patch = canvas_tensor[:, px : px + PATCH_SIZE, py : py + PATCH_SIZE]
@@ -114,7 +134,6 @@ def draw_patches_on_image(image_arr, positions, x_offset, y_offset, h, w):
     ratio = min(TARGET_CANVAS_SIZE / orig_w, TARGET_CANVAS_SIZE / orig_h)
     for i, (px, py) in enumerate(positions):
-        # Map back to original image coordinates
         orig_y = (py - y_offset) / ratio
         orig_x = (px - x_offset) / ratio
         orig_px_size = PATCH_SIZE / ratio
@@ -138,7 +157,7 @@ def init_state_for_image(image):
     return {
         'inference_params': None,
         'cur_location': None,
-        'canvas_tensor': canvas_tensor,
         'x_offset': x_offset,
         'y_offset': y_offset,
         'h': h,
@@ -148,11 +167,14 @@ def init_state_for_image(image):
         'sequence_length': 0
     }
 def run_auto_scan(image, scan_pattern, sequence_length):
     if image is None:
         return None, {"Upload Image": 1.0}, None, "Upload Image"
     model = get_model()
     state = init_state_for_image(image)
     x_end = max(state['x_offset'] + 1, state['x_offset'] + state['h'])
@@ -168,7 +190,6 @@ def run_auto_scan(image, scan_pattern, sequence_length):
         scan_pattern=scan_pattern, rng=rng
     )
-    # We allow up to Max Seq Length (say 4000) for ongoing clicks later.
     inference_params = InferenceParams(max_seqlen=4000, max_batch_size=1)
     state['inference_params'] = inference_params
@@ -181,21 +202,23 @@ def run_auto_scan(image, scan_pattern, sequence_length):
         move_emb = _compute_move_embedding(loc_tensor, cur_location)
         cur_location = loc_tensor
-        patch = extract_patch(state['canvas_tensor'], px, py)
         patches_list.append(patch)
         moves_list.append(move_emb.squeeze(0))
-    img_seq = torch.stack(patches_list, dim=0).unsqueeze(0).to(DEVICE) # (1, L, 768)
-    move_seq = torch.stack(moves_list, dim=0).unsqueeze(0).to(DEVICE) # (1, L, 512)
     with torch.no_grad():
         out = model(img_seq, move_seq, inference_params=inference_params)
         final_probs = F.softmax(out[0, -1], dim=-1).cpu().numpy()
         inference_params.seqlen_offset += img_seq.shape[1]
-    state['cur_location'] = cur_location
     state['drawn_positions'] = positions
     state['sequence_length'] = sequence_length
     img_display, _ = draw_patches_on_image(
         state['original_image'], state['drawn_positions'],
@@ -204,16 +227,18 @@ def run_auto_scan(image, scan_pattern, sequence_length):
     return img_display, format_predictions(final_probs), state, f"Auto Scan Complete. Extracted {sequence_length} patches. Click to add more!"
 def on_click(evt: gr.SelectData, original_image, state):
     if original_image is None:
         return None, {"Upload Image": 1.0}, state, "Upload Image"
     if state is None or state.get('inference_params') is None:
         # Initialize state to begin a new purely user-guided sequence
         state = init_state_for_image(original_image)
         state['inference_params'] = InferenceParams(max_seqlen=4000, max_batch_size=1)
-    model = get_model()
     x_orig, y_orig = evt.index
     orig_h, orig_w = state['original_image'].shape[:2]
@@ -225,20 +250,21 @@ def on_click(evt: gr.SelectData, original_image, state):
     px = (canvas_x // PATCH_SIZE) * PATCH_SIZE
     py = (canvas_y // PATCH_SIZE) * PATCH_SIZE
     loc_tensor = torch.tensor([[px, py]], dtype=torch.long, device=DEVICE)
-    move_emb = _compute_move_embedding(loc_tensor, state['cur_location'])
-    patch = extract_patch(state['canvas_tensor'], px, py)
-    img_seq = patch.unsqueeze(0).unsqueeze(0).to(DEVICE) # (1, 1, 768)
-    move_seq = move_emb.unsqueeze(0).to(DEVICE) # (1, 1, 512)
     with torch.no_grad():
         out = model(img_seq, move_seq, inference_params=state['inference_params'])
         final_probs = F.softmax(out[0, -1], dim=-1).cpu().numpy()
         state['inference_params'].seqlen_offset += 1
-    state['cur_location'] = loc_tensor
     state['drawn_positions'].append((px, py))
     state['sequence_length'] += 1
@@ -247,13 +273,12 @@ def on_click(evt: gr.SelectData, original_image, state):
         state['x_offset'], state['y_offset'], state['h'], state['w']
     )
-    return img_display, format_predictions(final_probs), state, f"Added patch {state['sequence_length']} (Total {state['inference_params'].seqlen_offset} inference steps done)."
 def on_upload(image):
     if image is None:
         return None, {"Waiting...": 1.0}, None, "Upload Image"
-    # Pre-warm model in background
-    get_model()
     return image, {"Click Auto Scan or click the image": 1.0}, None, "Ready. You can Auto Scan or click."
 def on_clear(original_image):
@@ -261,11 +286,10 @@ def on_clear(original_image):
         return None, {"Cleared": 1.0}, None, "Cleared"
     return original_image, {"Cleared": 1.0}, init_state_for_image(original_image), "Selections cleared. Ready for new patch sequence."
 # Build the Gradio App Blocks
 with gr.Blocks(title="MambaEye Interactive Demo", theme=gr.themes.Soft()) as demo:
-    gr.Markdown("# MambaEye Interactive Inference Demo")
-    gr.Markdown("This interface incorporates the full **MambaEye-base** model inference in real-time.")
     with gr.Row():
         with gr.Column(scale=2):
@@ -297,7 +321,6 @@ with gr.Blocks(title="MambaEye Interactive Demo", theme=gr.themes.Soft()) as dem
         inputs=[input_image],
         outputs=[input_image, model_output_label, state, status_text]
     ).then(
-        # Save original image separately for redraw and clearing
         fn=lambda img: img, inputs=[input_image], outputs=[original_image_state]
     )

 import sys
 import os
+import subprocess
 import time
+# --- Dynamic Dependency Injection for HuggingFace Spaces ---
+# HuggingFace ZeroGPU builder environments lack `nvcc`.
+# If mamba_ssm or causal_conv1d cannot be imported, pip-install them at startup with the
+# *_SKIP_CUDA_BUILD variables set, so the CUDA extensions are skipped and nvcc is not needed.
+try:
+    import mamba_ssm
+    import causal_conv1d
+except ImportError:
+    print("Installing mamba_ssm and causal_conv1d in backend...", flush=True)
+    env = os.environ.copy()
+    # Bypass CUDA extensions because we don't have nvcc locally or in standard Hub build container
+    env["MAMBA_SKIP_CUDA_BUILD"] = "TRUE"
+    env["CAUSAL_CONV1D_SKIP_CUDA_BUILD"] = "TRUE"
+    subprocess.check_call(
+        [sys.executable, "-m", "pip", "install", "causal-conv1d==1.5.0.post8", "mamba-ssm==2.2.4", "--no-build-isolation"],
+        env=env
+    )
 # Add the cloned MambaEye repository to the Python path
 sys.path.append(os.path.join(os.path.dirname(__file__), "MambaEye"))
 import torchvision.transforms as T
 from torchvision.models import ResNet50_Weights
 from huggingface_hub import hf_hub_download
+import spaces
 # MambaEye Imports
 from mambaeye.model import MambaEye
         try:
             checkpoint_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILENAME)
             model = MambaEye(**MODEL_CONFIG)
+            # On ZeroGPU, load the weights onto the CPU first; the handlers move the model to DEVICE later
+            map_loc = torch.device('cpu')
+            model.load_state_dict(torch.load(checkpoint_path, map_location=map_loc))
             model.eval()
             _GLOBAL_MODEL = model
             print("Model loaded successfully.")
     return canvas, x_offset, y_offset, new_h, new_w
 def extract_patch(canvas_tensor, px, py):
     px = max(0, min(px, TARGET_CANVAS_SIZE - PATCH_SIZE))
     py = max(0, min(py, TARGET_CANVAS_SIZE - PATCH_SIZE))
     patch = canvas_tensor[:, px : px + PATCH_SIZE, py : py + PATCH_SIZE]
     ratio = min(TARGET_CANVAS_SIZE / orig_w, TARGET_CANVAS_SIZE / orig_h)
     for i, (px, py) in enumerate(positions):
         orig_y = (py - y_offset) / ratio
         orig_x = (px - x_offset) / ratio
         orig_px_size = PATCH_SIZE / ratio
     return {
         'inference_params': None,
         'cur_location': None,
+        'canvas_tensor': canvas_tensor.cpu(),
         'x_offset': x_offset,
         'y_offset': y_offset,
         'h': h,
         'sequence_length': 0
     }
+@spaces.GPU
 def run_auto_scan(image, scan_pattern, sequence_length):
     if image is None:
         return None, {"Upload Image": 1.0}, None, "Upload Image"
     model = get_model()
+    model.to(DEVICE)
     state = init_state_for_image(image)
     x_end = max(state['x_offset'] + 1, state['x_offset'] + state['h'])
         scan_pattern=scan_pattern, rng=rng
     )
     inference_params = InferenceParams(max_seqlen=4000, max_batch_size=1)
     state['inference_params'] = inference_params
         move_emb = _compute_move_embedding(loc_tensor, cur_location)
         cur_location = loc_tensor
+        patch = extract_patch(state['canvas_tensor'], px, py).to(DEVICE)
         patches_list.append(patch)
         moves_list.append(move_emb.squeeze(0))
+    img_seq = torch.stack(patches_list, dim=0).unsqueeze(0) # (1, L, 768)
+    move_seq = torch.stack(moves_list, dim=0).unsqueeze(0) # (1, L, 512)
     with torch.no_grad():
         out = model(img_seq, move_seq, inference_params=inference_params)
         final_probs = F.softmax(out[0, -1], dim=-1).cpu().numpy()
         inference_params.seqlen_offset += img_seq.shape[1]
+    state['cur_location'] = cur_location.cpu()
     state['drawn_positions'] = positions
     state['sequence_length'] = sequence_length
+    # On ZeroGPU Spaces, move tensors back to the CPU before storing them in the state
+    state['canvas_tensor'] = state['canvas_tensor'].cpu()
     img_display, _ = draw_patches_on_image(
         state['original_image'], state['drawn_positions'],
     return img_display, format_predictions(final_probs), state, f"Auto Scan Complete. Extracted {sequence_length} patches. Click to add more!"
+@spaces.GPU
 def on_click(evt: gr.SelectData, original_image, state):
     if original_image is None:
         return None, {"Upload Image": 1.0}, state, "Upload Image"
+    model = get_model()
+    model.to(DEVICE)
     if state is None or state.get('inference_params') is None:
         # Initialize state to begin a new purely user-guided sequence
         state = init_state_for_image(original_image)
         state['inference_params'] = InferenceParams(max_seqlen=4000, max_batch_size=1)
     x_orig, y_orig = evt.index
     orig_h, orig_w = state['original_image'].shape[:2]
     px = (canvas_x // PATCH_SIZE) * PATCH_SIZE
     py = (canvas_y // PATCH_SIZE) * PATCH_SIZE
+    cur_loc = state['cur_location'].to(DEVICE) if state['cur_location'] is not None else None
     loc_tensor = torch.tensor([[px, py]], dtype=torch.long, device=DEVICE)
+    move_emb = _compute_move_embedding(loc_tensor, cur_loc)
+    patch = extract_patch(state['canvas_tensor'], px, py).to(DEVICE)
+    img_seq = patch.unsqueeze(0).unsqueeze(0) # (1, 1, 768)
+    move_seq = move_emb.unsqueeze(0) # (1, 1, 512)
     with torch.no_grad():
         out = model(img_seq, move_seq, inference_params=state['inference_params'])
         final_probs = F.softmax(out[0, -1], dim=-1).cpu().numpy()
         state['inference_params'].seqlen_offset += 1
+    state['cur_location'] = loc_tensor.cpu()
     state['drawn_positions'].append((px, py))
     state['sequence_length'] += 1
         state['x_offset'], state['y_offset'], state['h'], state['w']
     )
+    return img_display, format_predictions(final_probs), state, f"Added patch {state['sequence_length']} (Total {state['inference_params'].seqlen_offset} inference steps)."
 def on_upload(image):
     if image is None:
         return None, {"Waiting...": 1.0}, None, "Upload Image"
+    # Defer model loading until the first Auto Scan or click instead of pre-warming it on upload
     return image, {"Click Auto Scan or click the image": 1.0}, None, "Ready. You can Auto Scan or click."
 def on_clear(original_image):
         return None, {"Cleared": 1.0}, None, "Cleared"
     return original_image, {"Cleared": 1.0}, init_state_for_image(original_image), "Selections cleared. Ready for new patch sequence."
 # Build the Gradio App Blocks
 with gr.Blocks(title="MambaEye Interactive Demo", theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# MambaEye Interactive inference Demo")
+    gr.Markdown("This interface incorporates the full **MambaEye-base** model inference natively. Using **ZeroGPU** inference via PyTorch equivalents.")
     with gr.Row():
         with gr.Column(scale=2):
         inputs=[input_image],
         outputs=[input_image, model_output_label, state, status_text]
     ).then(
         fn=lambda img: img, inputs=[input_image], outputs=[original_image_state]
     )

requirements.txt CHANGED Viewed

@@ -6,3 +6,4 @@ torchvision==0.21.0
 lightning==2.6.1
 huggingface_hub
 omegaconf==2.3.0

 lightning==2.6.1
 huggingface_hub
 omegaconf==2.3.0
+spaces