Commit Β·
77b7c2e
1
Parent(s): d6e7745
integrate MatAnyone 1 & 2
Browse files- README.md +1 -1
- hugging_face/app.py +159 -49
- hugging_face/{matanyone_wrapper.py β matanyone2_wrapper.py} +10 -6
- matanyone/utils/__init__.py +0 -0
- matanyone2/__init__.py +2 -0
- {matanyone β matanyone2/config}/__init__.py +0 -0
- {matanyone β matanyone2}/config/eval_matanyone_config.yaml +1 -1
- {matanyone β matanyone2}/config/hydra/job_logging/custom-no-rank.yaml +0 -0
- {matanyone β matanyone2}/config/hydra/job_logging/custom.yaml +0 -0
- {matanyone β matanyone2}/config/model/base.yaml +0 -0
- {matanyone/config β matanyone2/inference}/__init__.py +0 -0
- {matanyone β matanyone2}/inference/image_feature_store.py +2 -2
- {matanyone β matanyone2}/inference/inference_core.py +155 -11
- {matanyone β matanyone2}/inference/kv_memory_store.py +0 -0
- {matanyone β matanyone2}/inference/memory_manager.py +6 -6
- {matanyone β matanyone2}/inference/object_info.py +0 -0
- {matanyone β matanyone2}/inference/object_manager.py +1 -1
- {matanyone/inference β matanyone2/inference/utils}/__init__.py +0 -0
- {matanyone β matanyone2}/inference/utils/args_utils.py +0 -0
- {matanyone/inference/utils β matanyone2/model}/__init__.py +0 -0
- {matanyone β matanyone2}/model/aux_modules.py +2 -2
- {matanyone β matanyone2}/model/big_modules.py +5 -4
- {matanyone β matanyone2}/model/channel_attn.py +0 -0
- {matanyone β matanyone2}/model/group_modules.py +1 -1
- matanyone/model/matanyone.py β matanyone2/model/matanyone2.py +22 -17
- {matanyone β matanyone2}/model/modules.py +6 -5
- {matanyone/model β matanyone2/model/transformer}/__init__.py +0 -0
- {matanyone β matanyone2}/model/transformer/object_summarizer.py +4 -3
- {matanyone β matanyone2}/model/transformer/object_transformer.py +4 -4
- {matanyone β matanyone2}/model/transformer/positional_encoding.py +4 -2
- {matanyone β matanyone2}/model/transformer/transformer_layers.py +1 -1
- {matanyone/model/transformer β matanyone2/model/utils}/__init__.py +0 -0
- {matanyone β matanyone2}/model/utils/memory_utils.py +0 -0
- {matanyone β matanyone2}/model/utils/parameter_groups.py +0 -0
- {matanyone β matanyone2}/model/utils/resnet.py +0 -0
- {matanyone/model β matanyone2}/utils/__init__.py +0 -0
- matanyone2/utils/device.py +33 -0
- {matanyone β matanyone2}/utils/get_default_model.py +6 -6
- matanyone2/utils/inference_utils.py +54 -0
- {matanyone β matanyone2}/utils/tensor_utils.py +3 -3
README.md
CHANGED
|
@@ -8,7 +8,7 @@ sdk_version: 5.16.0
|
|
| 8 |
app_file: hugging_face/app.py
|
| 9 |
pinned: false
|
| 10 |
license: other
|
| 11 |
-
short_description: Gradio demo for MatAnyone
|
| 12 |
---
|
| 13 |
|
| 14 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
| 8 |
app_file: hugging_face/app.py
|
| 9 |
pinned: false
|
| 10 |
license: other
|
| 11 |
+
short_description: Gradio demo for MatAnyone 1 & 2
|
| 12 |
---
|
| 13 |
|
| 14 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
hugging_face/app.py
CHANGED
|
@@ -21,9 +21,13 @@ from tools.interact_tools import SamControler
|
|
| 21 |
from tools.misc import get_device
|
| 22 |
from tools.download_util import load_file_from_url
|
| 23 |
|
| 24 |
-
from
|
| 25 |
-
from
|
| 26 |
-
from
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
|
| 28 |
def parse_augment():
|
| 29 |
parser = argparse.ArgumentParser()
|
|
@@ -121,7 +125,6 @@ def get_frames_from_video(video_input, video_state):
|
|
| 121 |
except Exception as e:
|
| 122 |
print(f"Audio extraction error: {str(e)}")
|
| 123 |
audio_path = "" # Set to "" if extraction fails
|
| 124 |
-
# print(f'audio_path: {audio_path}')
|
| 125 |
|
| 126 |
# extract frames
|
| 127 |
try:
|
|
@@ -140,8 +143,8 @@ def get_frames_from_video(video_input, video_state):
|
|
| 140 |
print("read_frame_source:{} error. {}\n".format(video_path, str(e)))
|
| 141 |
image_size = (frames[0].shape[0],frames[0].shape[1])
|
| 142 |
|
| 143 |
-
# resize if resolution too big
|
| 144 |
-
if image_size[0]>=
|
| 145 |
scale = 1080 / min(image_size)
|
| 146 |
new_w = int(image_size[1] * scale)
|
| 147 |
new_h = int(image_size[0] * scale)
|
|
@@ -165,8 +168,7 @@ def get_frames_from_video(video_input, video_state):
|
|
| 165 |
video_info = "Video Name: {},\nFPS: {},\nTotal Frames: {},\nImage Size:{}".format(video_state["video_name"], round(video_state["fps"], 0), len(frames), image_size)
|
| 166 |
model.samcontroler.sam_controler.reset_image()
|
| 167 |
model.samcontroler.sam_controler.set_image(video_state["origin_images"][0])
|
| 168 |
-
return video_state, video_info, video_state["origin_images"][0], \
|
| 169 |
-
gr.update(visible=True, maximum=len(frames), value=1), gr.update(visible=False, maximum=len(frames), value=len(frames)), \
|
| 170 |
gr.update(visible=True), gr.update(visible=True), \
|
| 171 |
gr.update(visible=True), gr.update(visible=True),\
|
| 172 |
gr.update(visible=True), gr.update(visible=True), \
|
|
@@ -267,8 +269,18 @@ def show_mask(video_state, interactive_state, mask_dropdown):
|
|
| 267 |
return select_frame
|
| 268 |
|
| 269 |
# image matting
|
| 270 |
-
def image_matting(video_state, interactive_state, mask_dropdown, erode_kernel_size, dilate_kernel_size, refine_iter):
|
| 271 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 272 |
if interactive_state["track_end_number"]:
|
| 273 |
following_frames = video_state["origin_images"][video_state["select_frame_number"]:interactive_state["track_end_number"]]
|
| 274 |
else:
|
|
@@ -289,14 +301,25 @@ def image_matting(video_state, interactive_state, mask_dropdown, erode_kernel_si
|
|
| 289 |
# operation error
|
| 290 |
if len(np.unique(template_mask))==1:
|
| 291 |
template_mask[0][0]=1
|
| 292 |
-
foreground, alpha =
|
| 293 |
foreground_output = Image.fromarray(foreground[-1])
|
| 294 |
alpha_output = Image.fromarray(alpha[-1][:,:,0])
|
|
|
|
| 295 |
return foreground_output, alpha_output
|
| 296 |
|
| 297 |
# video matting
|
| 298 |
-
def video_matting(video_state, interactive_state, mask_dropdown, erode_kernel_size, dilate_kernel_size):
|
| 299 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 300 |
if interactive_state["track_end_number"]:
|
| 301 |
following_frames = video_state["origin_images"][video_state["select_frame_number"]:interactive_state["track_end_number"]]
|
| 302 |
else:
|
|
@@ -320,11 +343,11 @@ def video_matting(video_state, interactive_state, mask_dropdown, erode_kernel_si
|
|
| 320 |
# operation error
|
| 321 |
if len(np.unique(template_mask))==1:
|
| 322 |
template_mask[0][0]=1
|
| 323 |
-
foreground, alpha =
|
| 324 |
|
| 325 |
foreground_output = generate_video_from_frames(foreground, output_path="./results/{}_fg.mp4".format(video_state["video_name"]), fps=fps, audio_path=audio_path) # import video_input to name the output video
|
| 326 |
alpha_output = generate_video_from_frames(alpha, output_path="./results/{}_alpha.mp4".format(video_state["video_name"]), fps=fps, gray2rgb=True, audio_path=audio_path) # import video_input to name the output video
|
| 327 |
-
|
| 328 |
return foreground_output, alpha_output
|
| 329 |
|
| 330 |
|
|
@@ -415,47 +438,113 @@ sam_checkpoint = load_file_from_url(sam_checkpoint_url_dict[args.sam_model_type]
|
|
| 415 |
# initialize sams
|
| 416 |
model = MaskGenerator(sam_checkpoint, args)
|
| 417 |
|
| 418 |
-
# initialize matanyone
|
| 419 |
-
#
|
| 420 |
-
|
| 421 |
-
|
| 422 |
-
|
| 423 |
-
|
| 424 |
-
|
| 425 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 426 |
|
| 427 |
-
|
| 428 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 429 |
|
| 430 |
# download test samples
|
| 431 |
-
media_url = "https://github.com/pq-yang/MatAnyone/releases/download/media/"
|
| 432 |
test_sample_path = os.path.join('/home/user/app/hugging_face/', "test_sample/")
|
| 433 |
-
load_file_from_url(
|
| 434 |
-
load_file_from_url(
|
| 435 |
-
load_file_from_url(
|
| 436 |
-
load_file_from_url(
|
| 437 |
-
load_file_from_url(
|
| 438 |
-
load_file_from_url(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 439 |
|
| 440 |
# download assets
|
| 441 |
assets_path = os.path.join('/home/user/app/hugging_face/', "assets/")
|
| 442 |
-
load_file_from_url(
|
| 443 |
-
load_file_from_url(
|
| 444 |
|
| 445 |
# documents
|
| 446 |
-
title = r"""<div class="multi-layer" align="center"><span>MatAnyone</span></div>
|
| 447 |
"""
|
| 448 |
description = r"""
|
| 449 |
-
<b>Official Gradio demo</b> for <a href='https://github.com/pq-yang/
|
| 450 |
-
π₯ MatAnyone
|
| 451 |
-
|
|
|
|
| 452 |
|
| 453 |
*Note: Due to the online GPU memory constraints, any input with too big resolution will be resized to 1080p.<br>*
|
| 454 |
-
π <b> If you encounter any issue (e.g., frozen video output) or wish to run on higher resolution inputs, please consider
|
| 455 |
-
|
| 456 |
"""
|
| 457 |
article = r"""<h3>
|
| 458 |
-
<b>If
|
| 459 |
|
| 460 |
---
|
| 461 |
|
|
@@ -463,6 +552,13 @@ article = r"""<h3>
|
|
| 463 |
<br>
|
| 464 |
If our work is useful for your research, please consider citing:
|
| 465 |
```bibtex
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 466 |
@InProceedings{yang2025matanyone,
|
| 467 |
title = {{MatAnyone}: Stable Video Matting with Consistent Memory Propagation},
|
| 468 |
author = {Yang, Peiqing and Zhou, Shangchen and Zhao, Jixin and Tao, Qingyi and Loy, Chen Change},
|
|
@@ -558,10 +654,10 @@ with gr.Blocks(theme=gr.themes.Monochrome(), css=my_custom_css) as demo:
|
|
| 558 |
<div class="title-container">
|
| 559 |
<h1 class="title is-2 publication-title"
|
| 560 |
style="font-size:50px; font-family: 'Sarpanch', serif;
|
| 561 |
-
background: linear-gradient(to right, #
|
| 562 |
display: inline-block; -webkit-background-clip: text;
|
| 563 |
-webkit-text-fill-color: transparent;">
|
| 564 |
-
MatAnyone
|
| 565 |
</h1>
|
| 566 |
</div>
|
| 567 |
''')
|
|
@@ -614,7 +710,14 @@ with gr.Blocks(theme=gr.themes.Monochrome(), css=my_custom_css) as demo:
|
|
| 614 |
|
| 615 |
with gr.Group(elem_classes="gr-monochrome-group", visible=True):
|
| 616 |
with gr.Row():
|
| 617 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 618 |
with gr.Row():
|
| 619 |
erode_kernel_size = gr.Slider(label='Erode Kernel Size',
|
| 620 |
minimum=0,
|
|
@@ -722,7 +825,7 @@ with gr.Blocks(theme=gr.themes.Monochrome(), css=my_custom_css) as demo:
|
|
| 722 |
# video matting
|
| 723 |
matting_button.click(
|
| 724 |
fn=video_matting,
|
| 725 |
-
inputs=[video_state, interactive_state, mask_dropdown, erode_kernel_size, dilate_kernel_size],
|
| 726 |
outputs=[foreground_video_output, alpha_video_output]
|
| 727 |
)
|
| 728 |
|
|
@@ -775,7 +878,7 @@ with gr.Blocks(theme=gr.themes.Monochrome(), css=my_custom_css) as demo:
|
|
| 775 |
gr.Markdown("---")
|
| 776 |
gr.Markdown("## Examples")
|
| 777 |
gr.Examples(
|
| 778 |
-
examples=[os.path.join(os.path.dirname(__file__), "./test_sample/", test_sample) for test_sample in ["test-
|
| 779 |
inputs=[video_input],
|
| 780 |
)
|
| 781 |
|
|
@@ -811,7 +914,14 @@ with gr.Blocks(theme=gr.themes.Monochrome(), css=my_custom_css) as demo:
|
|
| 811 |
|
| 812 |
with gr.Group(elem_classes="gr-monochrome-group", visible=True):
|
| 813 |
with gr.Row():
|
| 814 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 815 |
with gr.Row():
|
| 816 |
erode_kernel_size = gr.Slider(label='Erode Kernel Size',
|
| 817 |
minimum=0,
|
|
@@ -918,7 +1028,7 @@ with gr.Blocks(theme=gr.themes.Monochrome(), css=my_custom_css) as demo:
|
|
| 918 |
# image matting
|
| 919 |
matting_button.click(
|
| 920 |
fn=image_matting,
|
| 921 |
-
inputs=[image_state, interactive_state, mask_dropdown, erode_kernel_size, dilate_kernel_size, image_selection_slider],
|
| 922 |
outputs=[foreground_image_output, alpha_image_output]
|
| 923 |
)
|
| 924 |
|
|
@@ -971,7 +1081,7 @@ with gr.Blocks(theme=gr.themes.Monochrome(), css=my_custom_css) as demo:
|
|
| 971 |
gr.Markdown("---")
|
| 972 |
gr.Markdown("## Examples")
|
| 973 |
gr.Examples(
|
| 974 |
-
examples=[os.path.join(os.path.dirname(__file__), "./test_sample/", test_sample) for test_sample in ["test-
|
| 975 |
inputs=[image_input],
|
| 976 |
)
|
| 977 |
|
|
|
|
| 21 |
from tools.misc import get_device
|
| 22 |
from tools.download_util import load_file_from_url
|
| 23 |
|
| 24 |
+
from matanyone2_wrapper import matanyone2
|
| 25 |
+
from matanyone2.utils.get_default_model import get_matanyone2_model
|
| 26 |
+
from matanyone2.inference.inference_core import InferenceCore
|
| 27 |
+
from hydra.core.global_hydra import GlobalHydra
|
| 28 |
+
|
| 29 |
+
import warnings
|
| 30 |
+
warnings.filterwarnings("ignore")
|
| 31 |
|
| 32 |
def parse_augment():
|
| 33 |
parser = argparse.ArgumentParser()
|
|
|
|
| 125 |
except Exception as e:
|
| 126 |
print(f"Audio extraction error: {str(e)}")
|
| 127 |
audio_path = "" # Set to "" if extraction fails
|
|
|
|
| 128 |
|
| 129 |
# extract frames
|
| 130 |
try:
|
|
|
|
| 143 |
print("read_frame_source:{} error. {}\n".format(video_path, str(e)))
|
| 144 |
image_size = (frames[0].shape[0],frames[0].shape[1])
|
| 145 |
|
| 146 |
+
# [remove for local demo] resize if resolution too big
|
| 147 |
+
if image_size[0]>=1080 and image_size[0]>=1080:
|
| 148 |
scale = 1080 / min(image_size)
|
| 149 |
new_w = int(image_size[1] * scale)
|
| 150 |
new_h = int(image_size[0] * scale)
|
|
|
|
| 168 |
video_info = "Video Name: {},\nFPS: {},\nTotal Frames: {},\nImage Size:{}".format(video_state["video_name"], round(video_state["fps"], 0), len(frames), image_size)
|
| 169 |
model.samcontroler.sam_controler.reset_image()
|
| 170 |
model.samcontroler.sam_controler.set_image(video_state["origin_images"][0])
|
| 171 |
+
return video_state, video_info, video_state["origin_images"][0], gr.update(visible=True, maximum=len(frames), value=1), gr.update(visible=False, maximum=len(frames), value=len(frames)), \
|
|
|
|
| 172 |
gr.update(visible=True), gr.update(visible=True), \
|
| 173 |
gr.update(visible=True), gr.update(visible=True),\
|
| 174 |
gr.update(visible=True), gr.update(visible=True), \
|
|
|
|
| 269 |
return select_frame
|
| 270 |
|
| 271 |
# image matting
|
| 272 |
+
def image_matting(video_state, interactive_state, mask_dropdown, erode_kernel_size, dilate_kernel_size, refine_iter, model_selection):
|
| 273 |
+
# Load model if not already loaded
|
| 274 |
+
try:
|
| 275 |
+
selected_model = load_model(model_selection)
|
| 276 |
+
except (FileNotFoundError, ValueError) as e:
|
| 277 |
+
# Fallback to first available model
|
| 278 |
+
if available_models:
|
| 279 |
+
print(f"Warning: {str(e)}. Using {available_models[0]} instead.")
|
| 280 |
+
selected_model = load_model(available_models[0])
|
| 281 |
+
else:
|
| 282 |
+
raise ValueError("No models are available! Please check if the model files exist.")
|
| 283 |
+
matanyone_processor = InferenceCore(selected_model, cfg=selected_model.cfg)
|
| 284 |
if interactive_state["track_end_number"]:
|
| 285 |
following_frames = video_state["origin_images"][video_state["select_frame_number"]:interactive_state["track_end_number"]]
|
| 286 |
else:
|
|
|
|
| 301 |
# operation error
|
| 302 |
if len(np.unique(template_mask))==1:
|
| 303 |
template_mask[0][0]=1
|
| 304 |
+
foreground, alpha = matanyone2(matanyone_processor, following_frames, template_mask*255, r_erode=erode_kernel_size, r_dilate=dilate_kernel_size, n_warmup=refine_iter)
|
| 305 |
foreground_output = Image.fromarray(foreground[-1])
|
| 306 |
alpha_output = Image.fromarray(alpha[-1][:,:,0])
|
| 307 |
+
|
| 308 |
return foreground_output, alpha_output
|
| 309 |
|
| 310 |
# video matting
|
| 311 |
+
def video_matting(video_state, interactive_state, mask_dropdown, erode_kernel_size, dilate_kernel_size, model_selection):
|
| 312 |
+
# Load model if not already loaded
|
| 313 |
+
try:
|
| 314 |
+
selected_model = load_model(model_selection)
|
| 315 |
+
except (FileNotFoundError, ValueError) as e:
|
| 316 |
+
# Fallback to first available model
|
| 317 |
+
if available_models:
|
| 318 |
+
print(f"Warning: {str(e)}. Using {available_models[0]} instead.")
|
| 319 |
+
selected_model = load_model(available_models[0])
|
| 320 |
+
else:
|
| 321 |
+
raise ValueError("No models are available! Please check if the model files exist.")
|
| 322 |
+
matanyone_processor = InferenceCore(selected_model, cfg=selected_model.cfg)
|
| 323 |
if interactive_state["track_end_number"]:
|
| 324 |
following_frames = video_state["origin_images"][video_state["select_frame_number"]:interactive_state["track_end_number"]]
|
| 325 |
else:
|
|
|
|
| 343 |
# operation error
|
| 344 |
if len(np.unique(template_mask))==1:
|
| 345 |
template_mask[0][0]=1
|
| 346 |
+
foreground, alpha = matanyone2(matanyone_processor, following_frames, template_mask*255, r_erode=erode_kernel_size, r_dilate=dilate_kernel_size)
|
| 347 |
|
| 348 |
foreground_output = generate_video_from_frames(foreground, output_path="./results/{}_fg.mp4".format(video_state["video_name"]), fps=fps, audio_path=audio_path) # import video_input to name the output video
|
| 349 |
alpha_output = generate_video_from_frames(alpha, output_path="./results/{}_alpha.mp4".format(video_state["video_name"]), fps=fps, gray2rgb=True, audio_path=audio_path) # import video_input to name the output video
|
| 350 |
+
|
| 351 |
return foreground_output, alpha_output
|
| 352 |
|
| 353 |
|
|
|
|
| 438 |
# initialize sams
|
| 439 |
model = MaskGenerator(sam_checkpoint, args)
|
| 440 |
|
| 441 |
+
# initialize matanyone - lazy loading
|
| 442 |
+
# Model display names to file names mapping
|
| 443 |
+
model_display_to_file = {
|
| 444 |
+
"MatAnyone": "matanyone.pth",
|
| 445 |
+
"MatAnyone 2": "matanyone2.pth"
|
| 446 |
+
}
|
| 447 |
+
|
| 448 |
+
# Model URLs
|
| 449 |
+
model_urls = {
|
| 450 |
+
"matanyone.pth": "https://github.com/pq-yang/MatAnyone/releases/download/v1.0.0/matanyone.pth",
|
| 451 |
+
"matanyone2.pth": "https://github.com/pq-yang/MatAnyone2/releases/download/v1.0.0/matanyone2.pth"
|
| 452 |
+
}
|
| 453 |
+
|
| 454 |
+
# Model paths - download models using load_file_from_url
|
| 455 |
+
model_paths = {
|
| 456 |
+
"matanyone.pth": load_file_from_url(model_urls["matanyone.pth"], checkpoint_folder),
|
| 457 |
+
"matanyone2.pth": load_file_from_url(model_urls["matanyone2.pth"], checkpoint_folder)
|
| 458 |
+
}
|
| 459 |
|
| 460 |
+
# Cache for loaded models (lazy loading)
|
| 461 |
+
loaded_models = {}
|
| 462 |
+
|
| 463 |
+
def load_model(display_name):
|
| 464 |
+
"""Load a model if not already loaded"""
|
| 465 |
+
# Convert display name to file name
|
| 466 |
+
if display_name in model_display_to_file:
|
| 467 |
+
model_file = model_display_to_file[display_name]
|
| 468 |
+
elif display_name in model_paths:
|
| 469 |
+
# Also support direct file name for backward compatibility
|
| 470 |
+
model_file = display_name
|
| 471 |
+
else:
|
| 472 |
+
raise ValueError(f"Unknown model: {display_name}")
|
| 473 |
+
|
| 474 |
+
if model_file in loaded_models:
|
| 475 |
+
return loaded_models[model_file]
|
| 476 |
+
|
| 477 |
+
if model_file not in model_paths:
|
| 478 |
+
raise ValueError(f"Unknown model file: {model_file}")
|
| 479 |
+
|
| 480 |
+
ckpt_path = model_paths[model_file]
|
| 481 |
+
if not os.path.exists(ckpt_path):
|
| 482 |
+
raise FileNotFoundError(f"Model file not found: {ckpt_path}")
|
| 483 |
+
|
| 484 |
+
# Clear Hydra instance if already initialized (to allow loading different models)
|
| 485 |
+
try:
|
| 486 |
+
GlobalHydra.instance().clear()
|
| 487 |
+
except:
|
| 488 |
+
pass # If Hydra is not initialized, this is fine
|
| 489 |
+
|
| 490 |
+
print(f"Loading model: {display_name} ({model_file})...")
|
| 491 |
+
model = get_matanyone2_model(ckpt_path, args.device)
|
| 492 |
+
model = model.to(args.device).eval()
|
| 493 |
+
loaded_models[model_file] = model
|
| 494 |
+
print(f"Model {display_name} loaded successfully.")
|
| 495 |
+
return model
|
| 496 |
+
|
| 497 |
+
# Get available model choices for the UI (check if files exist)
|
| 498 |
+
# Order: MatAnyone 2 first, then MatAnyone
|
| 499 |
+
available_models = []
|
| 500 |
+
# Check MatAnyone 2 first
|
| 501 |
+
if "MatAnyone 2" in model_display_to_file:
|
| 502 |
+
file_name = model_display_to_file["MatAnyone 2"]
|
| 503 |
+
if file_name in model_paths and os.path.exists(model_paths[file_name]):
|
| 504 |
+
available_models.append("MatAnyone 2")
|
| 505 |
+
# Then check MatAnyone
|
| 506 |
+
if "MatAnyone" in model_display_to_file:
|
| 507 |
+
file_name = model_display_to_file["MatAnyone"]
|
| 508 |
+
if file_name in model_paths and os.path.exists(model_paths[file_name]):
|
| 509 |
+
available_models.append("MatAnyone")
|
| 510 |
+
|
| 511 |
+
if not available_models:
|
| 512 |
+
raise RuntimeError("No models are available! Please ensure at least one model file exists in ../pretrained_models/")
|
| 513 |
+
default_model = "MatAnyone 2" if "MatAnyone 2" in available_models else available_models[0]
|
| 514 |
|
| 515 |
# download test samples
|
|
|
|
| 516 |
test_sample_path = os.path.join('/home/user/app/hugging_face/', "test_sample/")
|
| 517 |
+
load_file_from_url('https://github.com/pq-yang/MatAnyone2/releases/download/media/test-sample-0-1080p.mp4', test_sample_path)
|
| 518 |
+
load_file_from_url('https://github.com/pq-yang/MatAnyone2/releases/download/media/test-sample-1-1080p.mp4', test_sample_path)
|
| 519 |
+
load_file_from_url('https://github.com/pq-yang/MatAnyone2/releases/download/media/test-sample-2-720p.mp4', test_sample_path)
|
| 520 |
+
load_file_from_url('https://github.com/pq-yang/MatAnyone2/releases/download/media/test-sample-3-720p.mp4', test_sample_path)
|
| 521 |
+
load_file_from_url('https://github.com/pq-yang/MatAnyone2/releases/download/media/test-sample-4-720p.mp4', test_sample_path)
|
| 522 |
+
load_file_from_url('https://github.com/pq-yang/MatAnyone2/releases/download/media/test-sample-5-720p.mp4', test_sample_path)
|
| 523 |
+
load_file_from_url('https://github.com/pq-yang/MatAnyone2/releases/download/media/test-sample-0.jpg', test_sample_path)
|
| 524 |
+
load_file_from_url('https://github.com/pq-yang/MatAnyone2/releases/download/media/test-sample-1.jpg', test_sample_path)
|
| 525 |
+
load_file_from_url('https://github.com/pq-yang/MatAnyone2/releases/download/media/test-sample-2.jpg', test_sample_path)
|
| 526 |
+
load_file_from_url('https://github.com/pq-yang/MatAnyone2/releases/download/media/test-sample-3.jpg', test_sample_path)
|
| 527 |
|
| 528 |
# download assets
|
| 529 |
assets_path = os.path.join('/home/user/app/hugging_face/', "assets/")
|
| 530 |
+
load_file_from_url('https://github.com/pq-yang/MatAnyone/releases/download/media/tutorial_single_target.mp4', assets_path)
|
| 531 |
+
load_file_from_url('https://github.com/pq-yang/MatAnyone/releases/download/media/tutorial_multi_targets.mp4', assets_path)
|
| 532 |
|
| 533 |
# documents
|
| 534 |
+
title = r"""<div class="multi-layer" align="center"><span>MatAnyone Series</span></div>
|
| 535 |
"""
|
| 536 |
description = r"""
|
| 537 |
+
<b>Official Gradio demo</b> for <a href='https://github.com/pq-yang/MatAnyone2' target='_blank'><b>MatAnyone 2</b></a> and <a href='https://github.com/pq-yang/MatAnyone' target='_blank'><b>MatAnyone</b></a>.<br>
|
| 538 |
+
π₯ MatAnyone series provide practical human video matting framework supporting target assignment.<br>
|
| 539 |
+
π§ <b>We use <u>MatAnyone 2</u> as the default model. You can also choose <u>MatAnyone</u> in "Model Selection".</b><br>
|
| 540 |
+
πͺ Try to drop your video/image, assign the target masks with a few clicks, and get the the matting results!<br>
|
| 541 |
|
| 542 |
*Note: Due to the online GPU memory constraints, any input with too big resolution will be resized to 1080p.<br>*
|
| 543 |
+
π <b> If you encounter any issue (e.g., frozen video output) or wish to run on higher resolution inputs, please consider duplicating this space or
|
| 544 |
+
launching the demo locally following the <a href='https://github.com/pq-yang/MatAnyone2?tab=readme-ov-file#-interactive-demo' target='_blank'>GitHub instructions</a>.</b>
|
| 545 |
"""
|
| 546 |
article = r"""<h3>
|
| 547 |
+
<b>If our projects are helpful, please help to π the Github Repo for <a href='https://github.com/pq-yang/MatAnyone2' target='_blank'>MatAnyone 2</a> and <a href='https://github.com/pq-yang/MatAnyone' target='_blank'>MatAnyone</a>. Thanks!</b></h3>
|
| 548 |
|
| 549 |
---
|
| 550 |
|
|
|
|
| 552 |
<br>
|
| 553 |
If our work is useful for your research, please consider citing:
|
| 554 |
```bibtex
|
| 555 |
+
@InProceedings{yang2026matanyone2,
|
| 556 |
+
title = {{MatAnyone 2}: Scaling Video Matting via a Learned Quality Evaluator},
|
| 557 |
+
author = {Yang, Peiqing and Zhou, Shangchen and Hao, Kai and Tao, Qingyi},
|
| 558 |
+
booktitle = {CVPR},
|
| 559 |
+
year = {2026}
|
| 560 |
+
}
|
| 561 |
+
|
| 562 |
@InProceedings{yang2025matanyone,
|
| 563 |
title = {{MatAnyone}: Stable Video Matting with Consistent Memory Propagation},
|
| 564 |
author = {Yang, Peiqing and Zhou, Shangchen and Zhao, Jixin and Tao, Qingyi and Loy, Chen Change},
|
|
|
|
| 654 |
<div class="title-container">
|
| 655 |
<h1 class="title is-2 publication-title"
|
| 656 |
style="font-size:50px; font-family: 'Sarpanch', serif;
|
| 657 |
+
background: linear-gradient(to right, #000000, #2dc464);
|
| 658 |
display: inline-block; -webkit-background-clip: text;
|
| 659 |
-webkit-text-fill-color: transparent;">
|
| 660 |
+
MatAnyone Series
|
| 661 |
</h1>
|
| 662 |
</div>
|
| 663 |
''')
|
|
|
|
| 710 |
|
| 711 |
with gr.Group(elem_classes="gr-monochrome-group", visible=True):
|
| 712 |
with gr.Row():
|
| 713 |
+
model_selection = gr.Radio(
|
| 714 |
+
choices=available_models,
|
| 715 |
+
value=default_model,
|
| 716 |
+
label="Model Selection",
|
| 717 |
+
info="Choose the model to use for matting",
|
| 718 |
+
interactive=True)
|
| 719 |
+
with gr.Row():
|
| 720 |
+
with gr.Accordion('Model Settings (click to expand)', open=False):
|
| 721 |
with gr.Row():
|
| 722 |
erode_kernel_size = gr.Slider(label='Erode Kernel Size',
|
| 723 |
minimum=0,
|
|
|
|
| 825 |
# video matting
|
| 826 |
matting_button.click(
|
| 827 |
fn=video_matting,
|
| 828 |
+
inputs=[video_state, interactive_state, mask_dropdown, erode_kernel_size, dilate_kernel_size, model_selection],
|
| 829 |
outputs=[foreground_video_output, alpha_video_output]
|
| 830 |
)
|
| 831 |
|
|
|
|
| 878 |
gr.Markdown("---")
|
| 879 |
gr.Markdown("## Examples")
|
| 880 |
gr.Examples(
|
| 881 |
+
examples=[os.path.join(os.path.dirname(__file__), "./test_sample/", test_sample) for test_sample in ["test-sample-0-1080p.mp4", "test-sample-1-1080p.mp4", "test-sample-2-720p.mp4", "test-sample-3-720p.mp4", "test-sample-4-720p.mp4", "test-sample-5-720p.mp4"]],
|
| 882 |
inputs=[video_input],
|
| 883 |
)
|
| 884 |
|
|
|
|
| 914 |
|
| 915 |
with gr.Group(elem_classes="gr-monochrome-group", visible=True):
|
| 916 |
with gr.Row():
|
| 917 |
+
model_selection = gr.Radio(
|
| 918 |
+
choices=available_models,
|
| 919 |
+
value=default_model,
|
| 920 |
+
label="Model Selection",
|
| 921 |
+
info="Choose the model to use for matting",
|
| 922 |
+
interactive=True)
|
| 923 |
+
with gr.Row():
|
| 924 |
+
with gr.Accordion('Model Settings (click to expand)', open=False):
|
| 925 |
with gr.Row():
|
| 926 |
erode_kernel_size = gr.Slider(label='Erode Kernel Size',
|
| 927 |
minimum=0,
|
|
|
|
| 1028 |
# image matting
|
| 1029 |
matting_button.click(
|
| 1030 |
fn=image_matting,
|
| 1031 |
+
inputs=[image_state, interactive_state, mask_dropdown, erode_kernel_size, dilate_kernel_size, image_selection_slider, model_selection],
|
| 1032 |
outputs=[foreground_image_output, alpha_image_output]
|
| 1033 |
)
|
| 1034 |
|
|
|
|
| 1081 |
gr.Markdown("---")
|
| 1082 |
gr.Markdown("## Examples")
|
| 1083 |
gr.Examples(
|
| 1084 |
+
examples=[os.path.join(os.path.dirname(__file__), "./test_sample/", test_sample) for test_sample in ["test-sample-0.jpg", "test-sample-1.jpg", "test-sample-2.jpg", "test-sample-3.jpg"]],
|
| 1085 |
inputs=[image_input],
|
| 1086 |
)
|
| 1087 |
|
hugging_face/{matanyone_wrapper.py β matanyone2_wrapper.py}
RENAMED
|
@@ -1,9 +1,13 @@
|
|
|
|
|
| 1 |
import tqdm
|
| 2 |
import torch
|
| 3 |
from torchvision.transforms.functional import to_tensor
|
| 4 |
import numpy as np
|
| 5 |
import random
|
| 6 |
import cv2
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
def gen_dilate(alpha, min_kernel_size, max_kernel_size):
|
| 9 |
kernel_size = random.randint(min_kernel_size, max_kernel_size)
|
|
@@ -20,8 +24,8 @@ def gen_erosion(alpha, min_kernel_size, max_kernel_size):
|
|
| 20 |
return erode.astype(np.float32)
|
| 21 |
|
| 22 |
@torch.inference_mode()
|
| 23 |
-
@
|
| 24 |
-
def
|
| 25 |
"""
|
| 26 |
Args:
|
| 27 |
frames_np: [(H,W,C)]*n, uint8
|
|
@@ -41,14 +45,14 @@ def matanyone(processor, frames_np, mask, r_erode=0, r_dilate=0, n_warmup=10):
|
|
| 41 |
if r_erode > 0:
|
| 42 |
mask = gen_erosion(mask, r_erode, r_erode)
|
| 43 |
|
| 44 |
-
mask = torch.from_numpy(mask).
|
| 45 |
|
| 46 |
frames_np = [frames_np[0]]* n_warmup + frames_np
|
| 47 |
|
| 48 |
frames = []
|
| 49 |
phas = []
|
| 50 |
for ti, frame_single in tqdm.tqdm(enumerate(frames_np)):
|
| 51 |
-
image = to_tensor(frame_single).
|
| 52 |
|
| 53 |
if ti == 0:
|
| 54 |
output_prob = processor.step(image, mask, objects=objects) # encode given mask
|
|
@@ -62,7 +66,7 @@ def matanyone(processor, frames_np, mask, r_erode=0, r_dilate=0, n_warmup=10):
|
|
| 62 |
# convert output probabilities to an object mask
|
| 63 |
mask = processor.output_prob_to_mask(output_prob)
|
| 64 |
|
| 65 |
-
pha = mask.unsqueeze(2).
|
| 66 |
com_np = frame_single / 255. * pha + bgr * (1 - pha)
|
| 67 |
|
| 68 |
# DONOT save the warmup frames
|
|
@@ -70,4 +74,4 @@ def matanyone(processor, frames_np, mask, r_erode=0, r_dilate=0, n_warmup=10):
|
|
| 70 |
frames.append((com_np*255).astype(np.uint8))
|
| 71 |
phas.append((pha*255).astype(np.uint8))
|
| 72 |
|
| 73 |
-
return frames, phas
|
|
|
|
| 1 |
+
|
| 2 |
import tqdm
|
| 3 |
import torch
|
| 4 |
from torchvision.transforms.functional import to_tensor
|
| 5 |
import numpy as np
|
| 6 |
import random
|
| 7 |
import cv2
|
| 8 |
+
from matanyone2.utils.device import get_default_device, safe_autocast_decorator
|
| 9 |
+
|
| 10 |
+
device = get_default_device()
|
| 11 |
|
| 12 |
def gen_dilate(alpha, min_kernel_size, max_kernel_size):
|
| 13 |
kernel_size = random.randint(min_kernel_size, max_kernel_size)
|
|
|
|
| 24 |
return erode.astype(np.float32)
|
| 25 |
|
| 26 |
@torch.inference_mode()
|
| 27 |
+
@safe_autocast_decorator()
|
| 28 |
+
def matanyone2(processor, frames_np, mask, r_erode=0, r_dilate=0, n_warmup=10):
|
| 29 |
"""
|
| 30 |
Args:
|
| 31 |
frames_np: [(H,W,C)]*n, uint8
|
|
|
|
| 45 |
if r_erode > 0:
|
| 46 |
mask = gen_erosion(mask, r_erode, r_erode)
|
| 47 |
|
| 48 |
+
mask = torch.from_numpy(mask).to(device)
|
| 49 |
|
| 50 |
frames_np = [frames_np[0]]* n_warmup + frames_np
|
| 51 |
|
| 52 |
frames = []
|
| 53 |
phas = []
|
| 54 |
for ti, frame_single in tqdm.tqdm(enumerate(frames_np)):
|
| 55 |
+
image = to_tensor(frame_single).float().to(device)
|
| 56 |
|
| 57 |
if ti == 0:
|
| 58 |
output_prob = processor.step(image, mask, objects=objects) # encode given mask
|
|
|
|
| 66 |
# convert output probabilities to an object mask
|
| 67 |
mask = processor.output_prob_to_mask(output_prob)
|
| 68 |
|
| 69 |
+
pha = mask.unsqueeze(2).detach().to("cpu").numpy()
|
| 70 |
com_np = frame_single / 255. * pha + bgr * (1 - pha)
|
| 71 |
|
| 72 |
# DONOT save the warmup frames
|
|
|
|
| 74 |
frames.append((com_np*255).astype(np.uint8))
|
| 75 |
phas.append((pha*255).astype(np.uint8))
|
| 76 |
|
| 77 |
+
return frames, phas
|
matanyone/utils/__init__.py
DELETED
|
File without changes
|
matanyone2/__init__.py
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from matanyone2.inference.inference_core import InferenceCore
|
| 2 |
+
from matanyone2.model.matanyone2 import MatAnyone2
|
{matanyone β matanyone2/config}/__init__.py
RENAMED
|
File without changes
|
{matanyone β matanyone2}/config/eval_matanyone_config.yaml
RENAMED
|
@@ -9,7 +9,7 @@ hydra:
|
|
| 9 |
output_subdir: ${now:%Y-%m-%d_%H-%M-%S}-hydra
|
| 10 |
|
| 11 |
amp: False
|
| 12 |
-
weights: pretrained_models/
|
| 13 |
output_dir: null # defaults to run_dir; specify this to override
|
| 14 |
flip_aug: False
|
| 15 |
|
|
|
|
| 9 |
output_subdir: ${now:%Y-%m-%d_%H-%M-%S}-hydra
|
| 10 |
|
| 11 |
amp: False
|
| 12 |
+
weights: pretrained_models/matanyone2.pth # default (can be modified from outside)
|
| 13 |
output_dir: null # defaults to run_dir; specify this to override
|
| 14 |
flip_aug: False
|
| 15 |
|
{matanyone β matanyone2}/config/hydra/job_logging/custom-no-rank.yaml
RENAMED
|
File without changes
|
{matanyone β matanyone2}/config/hydra/job_logging/custom.yaml
RENAMED
|
File without changes
|
{matanyone β matanyone2}/config/model/base.yaml
RENAMED
|
File without changes
|
{matanyone/config β matanyone2/inference}/__init__.py
RENAMED
|
File without changes
|
{matanyone β matanyone2}/inference/image_feature_store.py
RENAMED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
import warnings
|
| 2 |
from typing import Iterable
|
| 3 |
import torch
|
| 4 |
-
from
|
| 5 |
|
| 6 |
|
| 7 |
class ImageFeatureStore:
|
|
@@ -13,7 +13,7 @@ class ImageFeatureStore:
|
|
| 13 |
|
| 14 |
Feature of a frame should be associated with a unique index -- typically the frame id.
|
| 15 |
"""
|
| 16 |
-
def __init__(self, network:
|
| 17 |
self.network = network
|
| 18 |
self._store = {}
|
| 19 |
self.no_warning = no_warning
|
|
|
|
| 1 |
import warnings
|
| 2 |
from typing import Iterable
|
| 3 |
import torch
|
| 4 |
+
from matanyone2.model.matanyone2 import MatAnyone2
|
| 5 |
|
| 6 |
|
| 7 |
class ImageFeatureStore:
|
|
|
|
| 13 |
|
| 14 |
Feature of a frame should be associated with a unique index -- typically the frame id.
|
| 15 |
"""
|
| 16 |
+
def __init__(self, network: MatAnyone2, no_warning: bool = False):
|
| 17 |
self.network = network
|
| 18 |
self._store = {}
|
| 19 |
self.no_warning = no_warning
|
{matanyone β matanyone2}/inference/inference_core.py
RENAMED
|
@@ -1,16 +1,25 @@
|
|
| 1 |
-
from typing import List, Optional, Iterable
|
| 2 |
import logging
|
| 3 |
from omegaconf import DictConfig
|
|
|
|
| 4 |
|
| 5 |
-
import
|
|
|
|
| 6 |
import torch
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
import torch.nn.functional as F
|
| 8 |
|
| 9 |
-
from
|
| 10 |
-
from
|
| 11 |
-
from
|
| 12 |
-
from
|
| 13 |
-
from
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
log = logging.getLogger()
|
| 16 |
|
|
@@ -18,11 +27,21 @@ log = logging.getLogger()
|
|
| 18 |
class InferenceCore:
|
| 19 |
|
| 20 |
def __init__(self,
|
| 21 |
-
network:
|
| 22 |
-
cfg: DictConfig,
|
| 23 |
*,
|
| 24 |
-
image_feature_store: ImageFeatureStore = None
|
| 25 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
self.cfg = cfg
|
| 27 |
self.mem_every = cfg.mem_every
|
| 28 |
stagger_updates = cfg.stagger_updates
|
|
@@ -404,3 +423,128 @@ class InferenceCore:
|
|
| 404 |
new_mask[mask == tmp_id] = obj.id
|
| 405 |
|
| 406 |
return new_mask
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import logging
|
| 2 |
from omegaconf import DictConfig
|
| 3 |
+
from typing import List, Optional, Iterable, Union,Tuple
|
| 4 |
|
| 5 |
+
import os
|
| 6 |
+
import cv2
|
| 7 |
import torch
|
| 8 |
+
import imageio
|
| 9 |
+
import tempfile
|
| 10 |
+
import numpy as np
|
| 11 |
+
from tqdm import tqdm
|
| 12 |
+
from PIL import Image
|
| 13 |
import torch.nn.functional as F
|
| 14 |
|
| 15 |
+
from matanyone2.inference.memory_manager import MemoryManager
|
| 16 |
+
from matanyone2.inference.object_manager import ObjectManager
|
| 17 |
+
from matanyone2.inference.image_feature_store import ImageFeatureStore
|
| 18 |
+
from matanyone2.model.matanyone2 import MatAnyone2
|
| 19 |
+
from matanyone2.utils.tensor_utils import pad_divide_by, unpad, aggregate
|
| 20 |
+
from matanyone2.utils.inference_utils import gen_dilate, gen_erosion, read_frame_from_videos
|
| 21 |
+
from matanyone2.utils.device import get_default_device, safe_autocast
|
| 22 |
+
|
| 23 |
|
| 24 |
log = logging.getLogger()
|
| 25 |
|
|
|
|
| 27 |
class InferenceCore:
|
| 28 |
|
| 29 |
def __init__(self,
|
| 30 |
+
network: Union[MatAnyone2,str],
|
| 31 |
+
cfg: DictConfig = None,
|
| 32 |
*,
|
| 33 |
+
image_feature_store: ImageFeatureStore = None,
|
| 34 |
+
device: Optional[Union[str, torch.device]] = None
|
| 35 |
+
):
|
| 36 |
+
if device is None:
|
| 37 |
+
device = get_default_device()
|
| 38 |
+
self.device = device
|
| 39 |
+
if isinstance(network, str):
|
| 40 |
+
network = MatAnyone2.from_pretrained(network)
|
| 41 |
+
network.to(device)
|
| 42 |
+
network.eval()
|
| 43 |
+
self.network = network
|
| 44 |
+
cfg = cfg if cfg is not None else network.cfg
|
| 45 |
self.cfg = cfg
|
| 46 |
self.mem_every = cfg.mem_every
|
| 47 |
stagger_updates = cfg.stagger_updates
|
|
|
|
| 423 |
new_mask[mask == tmp_id] = obj.id
|
| 424 |
|
| 425 |
return new_mask
|
| 426 |
+
|
| 427 |
+
@torch.inference_mode()
|
| 428 |
+
@safe_autocast()
|
| 429 |
+
def process_video(
|
| 430 |
+
self,
|
| 431 |
+
input_path: str,
|
| 432 |
+
mask_path: str,
|
| 433 |
+
output_path: str = None,
|
| 434 |
+
n_warmup: int = 10,
|
| 435 |
+
r_erode: int = 10,
|
| 436 |
+
r_dilate: int = 10,
|
| 437 |
+
suffix: str = "",
|
| 438 |
+
save_image: bool = False,
|
| 439 |
+
max_size: int = -1,
|
| 440 |
+
) -> Tuple:
|
| 441 |
+
"""
|
| 442 |
+
Process a video for object segmentation and matting.
|
| 443 |
+
This method processes a video file by performing object segmentation and matting on each frame.
|
| 444 |
+
It supports warmup frames, mask erosion/dilation, and various output options.
|
| 445 |
+
Args:
|
| 446 |
+
input_path (str): Path to the input video file
|
| 447 |
+
mask_path (str): Path to the mask image file used for initial segmentation
|
| 448 |
+
output_path (str, optional): Directory path where output files will be saved. Defaults to a temporary directory
|
| 449 |
+
n_warmup (int, optional): Number of warmup frames to use. Defaults to 10
|
| 450 |
+
r_erode (int, optional): Erosion radius for mask processing. Defaults to 10
|
| 451 |
+
r_dilate (int, optional): Dilation radius for mask processing. Defaults to 10
|
| 452 |
+
suffix (str, optional): Suffix to append to output filename. Defaults to ""
|
| 453 |
+
save_image (bool, optional): Whether to save individual frames. Defaults to False
|
| 454 |
+
max_size (int, optional): Maximum size for frame dimension. Use -1 for no limit. Defaults to -1
|
| 455 |
+
Returns:
|
| 456 |
+
Tuple[str, str]: A tuple containing:
|
| 457 |
+
- Path to the output foreground video file (str)
|
| 458 |
+
- Path to the output alpha matte video file (str)
|
| 459 |
+
Output:
|
| 460 |
+
- Saves processed video files with foreground (_fgr) and alpha matte (_pha)
|
| 461 |
+
- If save_image=True, saves individual frames in separate directories
|
| 462 |
+
"""
|
| 463 |
+
output_path = output_path if output_path is not None else tempfile.TemporaryDirectory().name
|
| 464 |
+
r_erode = int(r_erode)
|
| 465 |
+
r_dilate = int(r_dilate)
|
| 466 |
+
n_warmup = int(n_warmup)
|
| 467 |
+
max_size = int(max_size)
|
| 468 |
+
|
| 469 |
+
vframes, fps, length, video_name = read_frame_from_videos(input_path)
|
| 470 |
+
repeated_frames = vframes[0].unsqueeze(0).repeat(n_warmup, 1, 1, 1)
|
| 471 |
+
vframes = torch.cat([repeated_frames, vframes], dim=0).float()
|
| 472 |
+
length += n_warmup
|
| 473 |
+
|
| 474 |
+
new_h, new_w = vframes.shape[-2:]
|
| 475 |
+
if max_size > 0:
|
| 476 |
+
h, w = new_h, new_w
|
| 477 |
+
min_side = min(h, w)
|
| 478 |
+
if min_side > max_size:
|
| 479 |
+
new_h = int(h / min_side * max_size)
|
| 480 |
+
new_w = int(w / min_side * max_size)
|
| 481 |
+
vframes = F.interpolate(vframes, size=(new_h, new_w), mode="area")
|
| 482 |
+
|
| 483 |
+
os.makedirs(output_path, exist_ok=True)
|
| 484 |
+
if suffix:
|
| 485 |
+
video_name = f"{video_name}_{suffix}"
|
| 486 |
+
if save_image:
|
| 487 |
+
os.makedirs(f"{output_path}/{video_name}", exist_ok=True)
|
| 488 |
+
os.makedirs(f"{output_path}/{video_name}/pha", exist_ok=True)
|
| 489 |
+
os.makedirs(f"{output_path}/{video_name}/fgr", exist_ok=True)
|
| 490 |
+
|
| 491 |
+
mask = np.array(Image.open(mask_path).convert("L"))
|
| 492 |
+
if r_dilate > 0:
|
| 493 |
+
mask = gen_dilate(mask, r_dilate, r_dilate)
|
| 494 |
+
if r_erode > 0:
|
| 495 |
+
mask = gen_erosion(mask, r_erode, r_erode)
|
| 496 |
+
|
| 497 |
+
mask = torch.from_numpy(mask).float().to(self.device)
|
| 498 |
+
if max_size > 0:
|
| 499 |
+
mask = F.interpolate(
|
| 500 |
+
mask.unsqueeze(0).unsqueeze(0), size=(new_h, new_w), mode="nearest"
|
| 501 |
+
)[0, 0]
|
| 502 |
+
|
| 503 |
+
bgr = (np.array([120, 255, 155], dtype=np.float32) / 255).reshape((1, 1, 3))
|
| 504 |
+
objects = [1]
|
| 505 |
+
|
| 506 |
+
phas = []
|
| 507 |
+
fgrs = []
|
| 508 |
+
for ti in tqdm(range(length)):
|
| 509 |
+
image = vframes[ti]
|
| 510 |
+
image_np = np.array(image.permute(1, 2, 0))
|
| 511 |
+
image = (image / 255.0).float().to(self.device)
|
| 512 |
+
|
| 513 |
+
if ti == 0:
|
| 514 |
+
output_prob = self.step(image, mask, objects=objects)
|
| 515 |
+
output_prob = self.step(image, first_frame_pred=True)
|
| 516 |
+
else:
|
| 517 |
+
if ti <= n_warmup:
|
| 518 |
+
output_prob = self.step(image, first_frame_pred=True)
|
| 519 |
+
else:
|
| 520 |
+
output_prob = self.step(image)
|
| 521 |
+
|
| 522 |
+
mask = self.output_prob_to_mask(output_prob)
|
| 523 |
+
pha = mask.unsqueeze(2).cpu().numpy()
|
| 524 |
+
com_np = image_np / 255.0 * pha + bgr * (1 - pha)
|
| 525 |
+
|
| 526 |
+
if ti > (n_warmup - 1):
|
| 527 |
+
com_np = (com_np * 255).astype(np.uint8)
|
| 528 |
+
pha = (pha * 255).astype(np.uint8)
|
| 529 |
+
fgrs.append(com_np)
|
| 530 |
+
phas.append(pha)
|
| 531 |
+
if save_image:
|
| 532 |
+
cv2.imwrite(
|
| 533 |
+
f"{output_path}/{video_name}/pha/{str(ti - n_warmup).zfill(5)}.png",
|
| 534 |
+
pha,
|
| 535 |
+
)
|
| 536 |
+
cv2.imwrite(
|
| 537 |
+
f"{output_path}/{video_name}/fgr/{str(ti - n_warmup).zfill(5)}.png",
|
| 538 |
+
com_np[..., [2, 1, 0]],
|
| 539 |
+
)
|
| 540 |
+
|
| 541 |
+
fgrs = np.array(fgrs)
|
| 542 |
+
phas = np.array(phas)
|
| 543 |
+
|
| 544 |
+
fgr_filename = f"{output_path}/{video_name}_fgr.mp4"
|
| 545 |
+
alpha_filename = f"{output_path}/{video_name}_pha.mp4"
|
| 546 |
+
|
| 547 |
+
imageio.mimwrite(fgr_filename, fgrs, fps=fps, quality=7)
|
| 548 |
+
imageio.mimwrite(alpha_filename, phas, fps=fps, quality=7)
|
| 549 |
+
|
| 550 |
+
return (fgr_filename,alpha_filename)
|
{matanyone β matanyone2}/inference/kv_memory_store.py
RENAMED
|
File without changes
|
{matanyone β matanyone2}/inference/memory_manager.py
RENAMED
|
@@ -3,10 +3,10 @@ from omegaconf import DictConfig
|
|
| 3 |
from typing import List, Dict
|
| 4 |
import torch
|
| 5 |
|
| 6 |
-
from
|
| 7 |
-
from
|
| 8 |
-
from
|
| 9 |
-
from
|
| 10 |
|
| 11 |
log = logging.getLogger()
|
| 12 |
|
|
@@ -113,7 +113,7 @@ class MemoryManager:
|
|
| 113 |
return value
|
| 114 |
|
| 115 |
def read_first_frame(self, last_msk_value, pix_feat: torch.Tensor,
|
| 116 |
-
last_mask: torch.Tensor, network:
|
| 117 |
"""
|
| 118 |
Read from all memory stores and returns a single memory readout tensor for each object
|
| 119 |
|
|
@@ -166,7 +166,7 @@ class MemoryManager:
|
|
| 166 |
return all_readout_mem
|
| 167 |
|
| 168 |
def read(self, pix_feat: torch.Tensor, query_key: torch.Tensor, selection: torch.Tensor,
|
| 169 |
-
last_mask: torch.Tensor, network:
|
| 170 |
last_pix_feat=None, last_pred_mask=None) -> Dict[int, torch.Tensor]:
|
| 171 |
"""
|
| 172 |
Read from all memory stores and returns a single memory readout tensor for each object
|
|
|
|
| 3 |
from typing import List, Dict
|
| 4 |
import torch
|
| 5 |
|
| 6 |
+
from matanyone2.inference.object_manager import ObjectManager
|
| 7 |
+
from matanyone2.inference.kv_memory_store import KeyValueMemoryStore
|
| 8 |
+
from matanyone2.model.matanyone2 import MatAnyone2
|
| 9 |
+
from matanyone2.model.utils.memory_utils import get_similarity, do_softmax
|
| 10 |
|
| 11 |
log = logging.getLogger()
|
| 12 |
|
|
|
|
| 113 |
return value
|
| 114 |
|
| 115 |
def read_first_frame(self, last_msk_value, pix_feat: torch.Tensor,
|
| 116 |
+
last_mask: torch.Tensor, network: MatAnyone2, uncert_output=None) -> Dict[int, torch.Tensor]:
|
| 117 |
"""
|
| 118 |
Read from all memory stores and returns a single memory readout tensor for each object
|
| 119 |
|
|
|
|
| 166 |
return all_readout_mem
|
| 167 |
|
| 168 |
def read(self, pix_feat: torch.Tensor, query_key: torch.Tensor, selection: torch.Tensor,
|
| 169 |
+
last_mask: torch.Tensor, network: MatAnyone2, uncert_output=None, last_msk_value=None, ti=None,
|
| 170 |
last_pix_feat=None, last_pred_mask=None) -> Dict[int, torch.Tensor]:
|
| 171 |
"""
|
| 172 |
Read from all memory stores and returns a single memory readout tensor for each object
|
{matanyone β matanyone2}/inference/object_info.py
RENAMED
|
File without changes
|
{matanyone β matanyone2}/inference/object_manager.py
RENAMED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
from typing import Union, List, Dict
|
| 2 |
|
| 3 |
import torch
|
| 4 |
-
from
|
| 5 |
|
| 6 |
|
| 7 |
class ObjectManager:
|
|
|
|
| 1 |
from typing import Union, List, Dict
|
| 2 |
|
| 3 |
import torch
|
| 4 |
+
from matanyone2.inference.object_info import ObjectInfo
|
| 5 |
|
| 6 |
|
| 7 |
class ObjectManager:
|
{matanyone/inference β matanyone2/inference/utils}/__init__.py
RENAMED
|
File without changes
|
{matanyone β matanyone2}/inference/utils/args_utils.py
RENAMED
|
File without changes
|
{matanyone/inference/utils β matanyone2/model}/__init__.py
RENAMED
|
File without changes
|
{matanyone β matanyone2}/model/aux_modules.py
RENAMED
|
@@ -6,8 +6,8 @@ from omegaconf import DictConfig
|
|
| 6 |
import torch
|
| 7 |
import torch.nn as nn
|
| 8 |
|
| 9 |
-
from
|
| 10 |
-
from
|
| 11 |
|
| 12 |
|
| 13 |
class LinearPredictor(nn.Module):
|
|
|
|
| 6 |
import torch
|
| 7 |
import torch.nn as nn
|
| 8 |
|
| 9 |
+
from matanyone2.model.group_modules import GConv2d
|
| 10 |
+
from matanyone2.utils.tensor_utils import aggregate
|
| 11 |
|
| 12 |
|
| 13 |
class LinearPredictor(nn.Module):
|
{matanyone β matanyone2}/model/big_modules.py
RENAMED
|
@@ -14,9 +14,10 @@ import torch
|
|
| 14 |
import torch.nn as nn
|
| 15 |
import torch.nn.functional as F
|
| 16 |
|
| 17 |
-
from
|
| 18 |
-
from
|
| 19 |
-
from
|
|
|
|
| 20 |
|
| 21 |
class UncertPred(nn.Module):
|
| 22 |
def __init__(self, model_cfg: DictConfig):
|
|
@@ -330,7 +331,7 @@ class MaskDecoder(nn.Module):
|
|
| 330 |
p4 = self.up_8_4(p8, f4)
|
| 331 |
p2 = self.up_4_2(p4, f2)
|
| 332 |
p1 = self.up_2_1(p2, f1)
|
| 333 |
-
with
|
| 334 |
if seg_pass:
|
| 335 |
if last_mask is not None:
|
| 336 |
res = self.pred_seg(F.relu(p1.flatten(start_dim=0, end_dim=1).float()))
|
|
|
|
| 14 |
import torch.nn as nn
|
| 15 |
import torch.nn.functional as F
|
| 16 |
|
| 17 |
+
from matanyone2.model.group_modules import MainToGroupDistributor, GroupFeatureFusionBlock, GConv2d
|
| 18 |
+
from matanyone2.model.utils import resnet
|
| 19 |
+
from matanyone2.model.modules import SensoryDeepUpdater, SensoryUpdater_fullscale, DecoderFeatureProcessor, MaskUpsampleBlock
|
| 20 |
+
from matanyone2.utils.device import safe_autocast
|
| 21 |
|
| 22 |
class UncertPred(nn.Module):
|
| 23 |
def __init__(self, model_cfg: DictConfig):
|
|
|
|
| 331 |
p4 = self.up_8_4(p8, f4)
|
| 332 |
p2 = self.up_4_2(p4, f2)
|
| 333 |
p1 = self.up_2_1(p2, f1)
|
| 334 |
+
with safe_autocast(enabled=False):
|
| 335 |
if seg_pass:
|
| 336 |
if last_mask is not None:
|
| 337 |
res = self.pred_seg(F.relu(p1.flatten(start_dim=0, end_dim=1).float()))
|
{matanyone β matanyone2}/model/channel_attn.py
RENAMED
|
File without changes
|
{matanyone β matanyone2}/model/group_modules.py
RENAMED
|
@@ -2,7 +2,7 @@ from typing import Optional
|
|
| 2 |
import torch
|
| 3 |
import torch.nn as nn
|
| 4 |
import torch.nn.functional as F
|
| 5 |
-
from
|
| 6 |
|
| 7 |
def interpolate_groups(g: torch.Tensor, ratio: float, mode: str,
|
| 8 |
align_corners: bool) -> torch.Tensor:
|
|
|
|
| 2 |
import torch
|
| 3 |
import torch.nn as nn
|
| 4 |
import torch.nn.functional as F
|
| 5 |
+
from matanyone2.model.channel_attn import CAResBlock
|
| 6 |
|
| 7 |
def interpolate_groups(g: torch.Tensor, ratio: float, mode: str,
|
| 8 |
align_corners: bool) -> torch.Tensor:
|
matanyone/model/matanyone.py β matanyone2/model/matanyone2.py
RENAMED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
from typing import List, Dict, Iterable
|
| 2 |
import logging
|
| 3 |
from omegaconf import DictConfig
|
| 4 |
import torch
|
|
@@ -7,18 +7,21 @@ import torch.nn.functional as F
|
|
| 7 |
from omegaconf import OmegaConf
|
| 8 |
from huggingface_hub import PyTorchModelHubMixin
|
| 9 |
|
| 10 |
-
from
|
| 11 |
-
from
|
| 12 |
-
from
|
| 13 |
-
from
|
| 14 |
-
from
|
| 15 |
-
from
|
|
|
|
|
|
|
|
|
|
| 16 |
|
| 17 |
log = logging.getLogger()
|
| 18 |
-
class
|
| 19 |
PyTorchModelHubMixin,
|
| 20 |
-
library_name="
|
| 21 |
-
repo_url="https://github.com/pq-yang/
|
| 22 |
coders={
|
| 23 |
DictConfig: (
|
| 24 |
lambda x: OmegaConf.to_container(x),
|
|
@@ -83,6 +86,8 @@ class MatAnyone(nn.Module,
|
|
| 83 |
return uncert_output
|
| 84 |
|
| 85 |
def encode_image(self, image: torch.Tensor, seq_length=None, last_feats=None) -> (Iterable[torch.Tensor], torch.Tensor): # type: ignore
|
|
|
|
|
|
|
| 86 |
image = (image - self.pixel_mean) / self.pixel_std
|
| 87 |
ms_image_feat = self.pixel_encoder(image, seq_length) # f16, f8, f4, f2, f1
|
| 88 |
return ms_image_feat, self.pix_feat_proj(ms_image_feat[0])
|
|
@@ -96,7 +101,7 @@ class MatAnyone(nn.Module,
|
|
| 96 |
*,
|
| 97 |
deep_update: bool = True,
|
| 98 |
chunk_size: int = -1,
|
| 99 |
-
need_weights: bool = False) ->
|
| 100 |
image = (image - self.pixel_mean) / self.pixel_std
|
| 101 |
others = self._get_others(masks)
|
| 102 |
mask_value, new_sensory = self.mask_encoder(image,
|
|
@@ -113,7 +118,7 @@ class MatAnyone(nn.Module,
|
|
| 113 |
final_pix_feat: torch.Tensor,
|
| 114 |
*,
|
| 115 |
need_sk: bool = True,
|
| 116 |
-
need_ek: bool = True) ->
|
| 117 |
key, shrinkage, selection = self.key_proj(final_pix_feat, need_s=need_sk, need_e=need_ek)
|
| 118 |
return key, shrinkage, selection
|
| 119 |
|
|
@@ -124,7 +129,7 @@ class MatAnyone(nn.Module,
|
|
| 124 |
msk_value: torch.Tensor, obj_memory: torch.Tensor, pix_feat: torch.Tensor,
|
| 125 |
sensory: torch.Tensor, last_mask: torch.Tensor,
|
| 126 |
selector: torch.Tensor, uncert_output=None, seg_pass=False,
|
| 127 |
-
last_pix_feat=None, last_pred_mask=None) ->
|
| 128 |
"""
|
| 129 |
query_key : B * CK * H * W
|
| 130 |
query_selection : B * CK * H * W
|
|
@@ -139,7 +144,7 @@ class MatAnyone(nn.Module,
|
|
| 139 |
uncert_mask = uncert_output["mask"] if uncert_output is not None else None
|
| 140 |
|
| 141 |
# read using visual attention
|
| 142 |
-
with
|
| 143 |
affinity = get_affinity(memory_key.float(), memory_shrinkage.float(), query_key.float(),
|
| 144 |
query_selection.float(), uncert_mask=uncert_mask)
|
| 145 |
|
|
@@ -171,7 +176,7 @@ class MatAnyone(nn.Module,
|
|
| 171 |
def read_first_frame_memory(self, pixel_readout,
|
| 172 |
obj_memory: torch.Tensor, pix_feat: torch.Tensor,
|
| 173 |
sensory: torch.Tensor, last_mask: torch.Tensor,
|
| 174 |
-
selector: torch.Tensor, seg_pass=False) ->
|
| 175 |
"""
|
| 176 |
query_key : B * CK * H * W
|
| 177 |
query_selection : B * CK * H * W
|
|
@@ -218,7 +223,7 @@ class MatAnyone(nn.Module,
|
|
| 218 |
*,
|
| 219 |
selector=None,
|
| 220 |
need_weights=False,
|
| 221 |
-
seg_pass=False) ->
|
| 222 |
return self.object_transformer(pixel_readout,
|
| 223 |
obj_memory,
|
| 224 |
selector=selector,
|
|
@@ -237,7 +242,7 @@ class MatAnyone(nn.Module,
|
|
| 237 |
clamp_mat: bool = True,
|
| 238 |
last_mask=None,
|
| 239 |
sigmoid_residual=False,
|
| 240 |
-
seg_mat=False) ->
|
| 241 |
"""
|
| 242 |
multi_scale_features is from the key encoder for skip-connection
|
| 243 |
memory_readout is from working/long-term memory
|
|
|
|
| 1 |
+
from typing import List, Dict, Iterable, Tuple
|
| 2 |
import logging
|
| 3 |
from omegaconf import DictConfig
|
| 4 |
import torch
|
|
|
|
| 7 |
from omegaconf import OmegaConf
|
| 8 |
from huggingface_hub import PyTorchModelHubMixin
|
| 9 |
|
| 10 |
+
from matanyone2.model.big_modules import PixelEncoder, UncertPred, KeyProjection, MaskEncoder, PixelFeatureFuser, MaskDecoder
|
| 11 |
+
from matanyone2.model.aux_modules import AuxComputer
|
| 12 |
+
from matanyone2.model.utils.memory_utils import get_affinity, readout
|
| 13 |
+
from matanyone2.model.transformer.object_transformer import QueryTransformer
|
| 14 |
+
from matanyone2.model.transformer.object_summarizer import ObjectSummarizer
|
| 15 |
+
from matanyone2.utils.tensor_utils import aggregate
|
| 16 |
+
from matanyone2.utils.device import get_default_device, safe_autocast
|
| 17 |
+
|
| 18 |
+
device = get_default_device()
|
| 19 |
|
| 20 |
log = logging.getLogger()
|
| 21 |
+
class MatAnyone2(nn.Module,
|
| 22 |
PyTorchModelHubMixin,
|
| 23 |
+
library_name="matanyone2",
|
| 24 |
+
repo_url="https://github.com/pq-yang/MatAnyone2",
|
| 25 |
coders={
|
| 26 |
DictConfig: (
|
| 27 |
lambda x: OmegaConf.to_container(x),
|
|
|
|
| 86 |
return uncert_output
|
| 87 |
|
| 88 |
def encode_image(self, image: torch.Tensor, seq_length=None, last_feats=None) -> (Iterable[torch.Tensor], torch.Tensor): # type: ignore
|
| 89 |
+
self.pixel_mean = self.pixel_mean.to(device)
|
| 90 |
+
self.pixel_std = self.pixel_std.to(device)
|
| 91 |
image = (image - self.pixel_mean) / self.pixel_std
|
| 92 |
ms_image_feat = self.pixel_encoder(image, seq_length) # f16, f8, f4, f2, f1
|
| 93 |
return ms_image_feat, self.pix_feat_proj(ms_image_feat[0])
|
|
|
|
| 101 |
*,
|
| 102 |
deep_update: bool = True,
|
| 103 |
chunk_size: int = -1,
|
| 104 |
+
need_weights: bool = False) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
|
| 105 |
image = (image - self.pixel_mean) / self.pixel_std
|
| 106 |
others = self._get_others(masks)
|
| 107 |
mask_value, new_sensory = self.mask_encoder(image,
|
|
|
|
| 118 |
final_pix_feat: torch.Tensor,
|
| 119 |
*,
|
| 120 |
need_sk: bool = True,
|
| 121 |
+
need_ek: bool = True) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
|
| 122 |
key, shrinkage, selection = self.key_proj(final_pix_feat, need_s=need_sk, need_e=need_ek)
|
| 123 |
return key, shrinkage, selection
|
| 124 |
|
|
|
|
| 129 |
msk_value: torch.Tensor, obj_memory: torch.Tensor, pix_feat: torch.Tensor,
|
| 130 |
sensory: torch.Tensor, last_mask: torch.Tensor,
|
| 131 |
selector: torch.Tensor, uncert_output=None, seg_pass=False,
|
| 132 |
+
last_pix_feat=None, last_pred_mask=None) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
|
| 133 |
"""
|
| 134 |
query_key : B * CK * H * W
|
| 135 |
query_selection : B * CK * H * W
|
|
|
|
| 144 |
uncert_mask = uncert_output["mask"] if uncert_output is not None else None
|
| 145 |
|
| 146 |
# read using visual attention
|
| 147 |
+
with safe_autocast(enabled=False):
|
| 148 |
affinity = get_affinity(memory_key.float(), memory_shrinkage.float(), query_key.float(),
|
| 149 |
query_selection.float(), uncert_mask=uncert_mask)
|
| 150 |
|
|
|
|
| 176 |
def read_first_frame_memory(self, pixel_readout,
|
| 177 |
obj_memory: torch.Tensor, pix_feat: torch.Tensor,
|
| 178 |
sensory: torch.Tensor, last_mask: torch.Tensor,
|
| 179 |
+
selector: torch.Tensor, seg_pass=False) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
|
| 180 |
"""
|
| 181 |
query_key : B * CK * H * W
|
| 182 |
query_selection : B * CK * H * W
|
|
|
|
| 223 |
*,
|
| 224 |
selector=None,
|
| 225 |
need_weights=False,
|
| 226 |
+
seg_pass=False) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
|
| 227 |
return self.object_transformer(pixel_readout,
|
| 228 |
obj_memory,
|
| 229 |
selector=selector,
|
|
|
|
| 242 |
clamp_mat: bool = True,
|
| 243 |
last_mask=None,
|
| 244 |
sigmoid_residual=False,
|
| 245 |
+
seg_mat=False) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
|
| 246 |
"""
|
| 247 |
multi_scale_features is from the key encoder for skip-connection
|
| 248 |
memory_readout is from working/long-term memory
|
{matanyone β matanyone2}/model/modules.py
RENAMED
|
@@ -3,7 +3,8 @@ import torch
|
|
| 3 |
import torch.nn as nn
|
| 4 |
import torch.nn.functional as F
|
| 5 |
|
| 6 |
-
from
|
|
|
|
| 7 |
|
| 8 |
|
| 9 |
class UpsampleBlock(nn.Module):
|
|
@@ -78,7 +79,7 @@ class SensoryUpdater_fullscale(nn.Module):
|
|
| 78 |
self.g2_conv(downsample_groups(g[3], ratio=1/8)) + \
|
| 79 |
self.g1_conv(downsample_groups(g[4], ratio=1/16))
|
| 80 |
|
| 81 |
-
with
|
| 82 |
g = g.float()
|
| 83 |
h = h.float()
|
| 84 |
values = self.transform(torch.cat([g, h], dim=2))
|
|
@@ -102,7 +103,7 @@ class SensoryUpdater(nn.Module):
|
|
| 102 |
g = self.g16_conv(g[0]) + self.g8_conv(downsample_groups(g[1], ratio=1/2)) + \
|
| 103 |
self.g4_conv(downsample_groups(g[2], ratio=1/4))
|
| 104 |
|
| 105 |
-
with
|
| 106 |
g = g.float()
|
| 107 |
h = h.float()
|
| 108 |
values = self.transform(torch.cat([g, h], dim=2))
|
|
@@ -119,7 +120,7 @@ class SensoryDeepUpdater(nn.Module):
|
|
| 119 |
nn.init.xavier_normal_(self.transform.weight)
|
| 120 |
|
| 121 |
def forward(self, g: torch.Tensor, h: torch.Tensor) -> torch.Tensor:
|
| 122 |
-
with
|
| 123 |
g = g.float()
|
| 124 |
h = h.float()
|
| 125 |
values = self.transform(torch.cat([g, h], dim=2))
|
|
@@ -146,4 +147,4 @@ class ResBlock(nn.Module):
|
|
| 146 |
|
| 147 |
g = self.downsample(g)
|
| 148 |
|
| 149 |
-
return out_g + g
|
|
|
|
| 3 |
import torch.nn as nn
|
| 4 |
import torch.nn.functional as F
|
| 5 |
|
| 6 |
+
from matanyone2.model.group_modules import MainToGroupDistributor, GroupResBlock, upsample_groups, GConv2d, downsample_groups
|
| 7 |
+
from matanyone2.utils.device import safe_autocast
|
| 8 |
|
| 9 |
|
| 10 |
class UpsampleBlock(nn.Module):
|
|
|
|
| 79 |
self.g2_conv(downsample_groups(g[3], ratio=1/8)) + \
|
| 80 |
self.g1_conv(downsample_groups(g[4], ratio=1/16))
|
| 81 |
|
| 82 |
+
with safe_autocast(enabled=False):
|
| 83 |
g = g.float()
|
| 84 |
h = h.float()
|
| 85 |
values = self.transform(torch.cat([g, h], dim=2))
|
|
|
|
| 103 |
g = self.g16_conv(g[0]) + self.g8_conv(downsample_groups(g[1], ratio=1/2)) + \
|
| 104 |
self.g4_conv(downsample_groups(g[2], ratio=1/4))
|
| 105 |
|
| 106 |
+
with safe_autocast(enabled=False):
|
| 107 |
g = g.float()
|
| 108 |
h = h.float()
|
| 109 |
values = self.transform(torch.cat([g, h], dim=2))
|
|
|
|
| 120 |
nn.init.xavier_normal_(self.transform.weight)
|
| 121 |
|
| 122 |
def forward(self, g: torch.Tensor, h: torch.Tensor) -> torch.Tensor:
|
| 123 |
+
with safe_autocast(enabled=False):
|
| 124 |
g = g.float()
|
| 125 |
h = h.float()
|
| 126 |
values = self.transform(torch.cat([g, h], dim=2))
|
|
|
|
| 147 |
|
| 148 |
g = self.downsample(g)
|
| 149 |
|
| 150 |
+
return out_g + g
|
{matanyone/model β matanyone2/model/transformer}/__init__.py
RENAMED
|
File without changes
|
{matanyone β matanyone2}/model/transformer/object_summarizer.py
RENAMED
|
@@ -4,7 +4,8 @@ from omegaconf import DictConfig
|
|
| 4 |
import torch
|
| 5 |
import torch.nn as nn
|
| 6 |
import torch.nn.functional as F
|
| 7 |
-
from
|
|
|
|
| 8 |
|
| 9 |
|
| 10 |
# @torch.jit.script
|
|
@@ -75,7 +76,7 @@ class ObjectSummarizer(nn.Module):
|
|
| 75 |
pe = self.pos_enc(value)
|
| 76 |
value = value + pe
|
| 77 |
|
| 78 |
-
with
|
| 79 |
value = value.float()
|
| 80 |
feature = self.feature_pred(value)
|
| 81 |
logits = self.weights_pred(value)
|
|
@@ -86,4 +87,4 @@ class ObjectSummarizer(nn.Module):
|
|
| 86 |
if need_weights:
|
| 87 |
return summaries, logits
|
| 88 |
else:
|
| 89 |
-
return summaries, None
|
|
|
|
| 4 |
import torch
|
| 5 |
import torch.nn as nn
|
| 6 |
import torch.nn.functional as F
|
| 7 |
+
from matanyone2.model.transformer.positional_encoding import PositionalEncoding
|
| 8 |
+
from matanyone2.utils.device import safe_autocast
|
| 9 |
|
| 10 |
|
| 11 |
# @torch.jit.script
|
|
|
|
| 76 |
pe = self.pos_enc(value)
|
| 77 |
value = value + pe
|
| 78 |
|
| 79 |
+
with safe_autocast(enabled=False): # autocast disabled intentionally
|
| 80 |
value = value.float()
|
| 81 |
feature = self.feature_pred(value)
|
| 82 |
logits = self.weights_pred(value)
|
|
|
|
| 87 |
if need_weights:
|
| 88 |
return summaries, logits
|
| 89 |
else:
|
| 90 |
+
return summaries, None
|
{matanyone β matanyone2}/model/transformer/object_transformer.py
RENAMED
|
@@ -3,10 +3,10 @@ from omegaconf import DictConfig
|
|
| 3 |
|
| 4 |
import torch
|
| 5 |
import torch.nn as nn
|
| 6 |
-
from
|
| 7 |
-
from
|
| 8 |
-
from
|
| 9 |
-
from
|
| 10 |
|
| 11 |
|
| 12 |
class QueryTransformerBlock(nn.Module):
|
|
|
|
| 3 |
|
| 4 |
import torch
|
| 5 |
import torch.nn as nn
|
| 6 |
+
from matanyone2.model.group_modules import GConv2d
|
| 7 |
+
from matanyone2.utils.tensor_utils import aggregate
|
| 8 |
+
from matanyone2.model.transformer.positional_encoding import PositionalEncoding
|
| 9 |
+
from matanyone2.model.transformer.transformer_layers import CrossAttention, SelfAttention, FFN, PixelFFN
|
| 10 |
|
| 11 |
|
| 12 |
class QueryTransformerBlock(nn.Module):
|
{matanyone β matanyone2}/model/transformer/positional_encoding.py
RENAMED
|
@@ -7,6 +7,7 @@ import math
|
|
| 7 |
import numpy as np
|
| 8 |
import torch
|
| 9 |
from torch import nn
|
|
|
|
| 10 |
|
| 11 |
|
| 12 |
def get_emb(sin_inp: torch.Tensor) -> torch.Tensor:
|
|
@@ -98,8 +99,9 @@ class PositionalEncoding(nn.Module):
|
|
| 98 |
|
| 99 |
|
| 100 |
if __name__ == '__main__':
|
| 101 |
-
|
| 102 |
-
|
|
|
|
| 103 |
output = pe(input)
|
| 104 |
# print(output)
|
| 105 |
print(output[0, :, 0, 0])
|
|
|
|
| 7 |
import numpy as np
|
| 8 |
import torch
|
| 9 |
from torch import nn
|
| 10 |
+
from matanyone2.utils.device import get_default_device
|
| 11 |
|
| 12 |
|
| 13 |
def get_emb(sin_inp: torch.Tensor) -> torch.Tensor:
|
|
|
|
| 99 |
|
| 100 |
|
| 101 |
if __name__ == '__main__':
|
| 102 |
+
device = get_default_device()
|
| 103 |
+
pe = PositionalEncoding(8).to(device)
|
| 104 |
+
input = torch.ones((1, 8, 8, 8), device=device)
|
| 105 |
output = pe(input)
|
| 106 |
# print(output)
|
| 107 |
print(output[0, :, 0, 0])
|
{matanyone β matanyone2}/model/transformer/transformer_layers.py
RENAMED
|
@@ -6,7 +6,7 @@ import torch
|
|
| 6 |
from torch import Tensor
|
| 7 |
import torch.nn as nn
|
| 8 |
import torch.nn.functional as F
|
| 9 |
-
from
|
| 10 |
|
| 11 |
|
| 12 |
class SelfAttention(nn.Module):
|
|
|
|
| 6 |
from torch import Tensor
|
| 7 |
import torch.nn as nn
|
| 8 |
import torch.nn.functional as F
|
| 9 |
+
from matanyone2.model.channel_attn import CAResBlock
|
| 10 |
|
| 11 |
|
| 12 |
class SelfAttention(nn.Module):
|
{matanyone/model/transformer β matanyone2/model/utils}/__init__.py
RENAMED
|
File without changes
|
{matanyone β matanyone2}/model/utils/memory_utils.py
RENAMED
|
File without changes
|
{matanyone β matanyone2}/model/utils/parameter_groups.py
RENAMED
|
File without changes
|
{matanyone β matanyone2}/model/utils/resnet.py
RENAMED
|
File without changes
|
{matanyone/model β matanyone2}/utils/__init__.py
RENAMED
|
File without changes
|
matanyone2/utils/device.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import contextlib
import functools

import torch
|
| 3 |
+
|
| 4 |
+
def get_default_device():
    """Pick the best available torch device: CUDA, then Apple MPS, then CPU."""
    if torch.cuda.is_available():
        return torch.device("cuda")
    mps_ok = torch.backends.mps.is_built() and torch.backends.mps.is_available()
    return torch.device("mps" if mps_ok else "cpu")
| 11 |
+
|
| 12 |
+
def safe_autocast_decorator(enabled=True):
    """Build a decorator that runs the wrapped function under torch.amp.autocast.

    Autocast is entered only on backends that support it ("cuda" / "cpu");
    on MPS and any other backend the function is called unchanged.

    Args:
        enabled: forwarded to ``torch.amp.autocast`` when autocast is used.
    """
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            dev_type = get_default_device().type
            if dev_type not in ("cuda", "cpu"):
                # torch.amp.autocast does not support this backend (e.g. MPS)
                return func(*args, **kwargs)
            with torch.amp.autocast(device_type=dev_type, enabled=enabled):
                return func(*args, **kwargs)
        return wrapper
    return decorator
| 24 |
+
|
| 25 |
+
@contextlib.contextmanager
def safe_autocast(enabled=True):
    """Context manager around torch.amp.autocast that is safe on all backends.

    On CUDA and CPU the body executes inside ``torch.amp.autocast``; on MPS
    and other backends, where autocast is unsupported, the body runs as-is.

    Args:
        enabled: forwarded to ``torch.amp.autocast`` when autocast is used.
    """
    # NOTE: `import contextlib` previously sat here, mid-file; it now lives in
    # the top-of-file import block per PEP 8.
    device = get_default_device()
    if device.type in ("cuda", "cpu"):
        with torch.amp.autocast(device_type=device.type, enabled=enabled):
            yield
    else:
        yield  # MPS or other unsupported backends skip autocast
|
{matanyone β matanyone2}/utils/get_default_model.py
RENAMED
|
@@ -5,9 +5,9 @@ from omegaconf import open_dict
|
|
| 5 |
from hydra import compose, initialize
|
| 6 |
|
| 7 |
import torch
|
| 8 |
-
from
|
| 9 |
|
| 10 |
-
def
|
| 11 |
initialize(version_base='1.3.2', config_path="../config", job_name="eval_our_config")
|
| 12 |
cfg = compose(config_name="eval_matanyone_config")
|
| 13 |
|
|
@@ -16,12 +16,12 @@ def get_matanyone_model(ckpt_path, device=None) -> MatAnyone:
|
|
| 16 |
|
| 17 |
# Load the network weights
|
| 18 |
if device is not None:
|
| 19 |
-
|
| 20 |
model_weights = torch.load(cfg.weights, map_location=device)
|
| 21 |
else: # if device is not specified, `.cuda()` by default
|
| 22 |
-
|
| 23 |
model_weights = torch.load(cfg.weights)
|
| 24 |
|
| 25 |
-
|
| 26 |
|
| 27 |
-
return
|
|
|
|
| 5 |
from hydra import compose, initialize
|
| 6 |
|
| 7 |
import torch
|
| 8 |
+
from matanyone2.model.matanyone2 import MatAnyone2
|
| 9 |
|
| 10 |
+
def get_matanyone2_model(ckpt_path, device=None) -> MatAnyone2:
|
| 11 |
initialize(version_base='1.3.2', config_path="../config", job_name="eval_our_config")
|
| 12 |
cfg = compose(config_name="eval_matanyone_config")
|
| 13 |
|
|
|
|
| 16 |
|
| 17 |
# Load the network weights
|
| 18 |
if device is not None:
|
| 19 |
+
matanyone2 = MatAnyone2(cfg, single_object=True).to(device).eval()
|
| 20 |
model_weights = torch.load(cfg.weights, map_location=device)
|
| 21 |
else: # if device is not specified, `.cuda()` by default
|
| 22 |
+
matanyone2 = MatAnyone2(cfg, single_object=True).cuda().eval()
|
| 23 |
model_weights = torch.load(cfg.weights)
|
| 24 |
|
| 25 |
+
matanyone2.load_weights(model_weights)
|
| 26 |
|
| 27 |
+
return matanyone2
|
matanyone2/utils/inference_utils.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import cv2
|
| 3 |
+
import random
|
| 4 |
+
import numpy as np
|
| 5 |
+
|
| 6 |
+
import torch
|
| 7 |
+
import torchvision
|
| 8 |
+
|
| 9 |
+
# File extensions treated as still images (lower- and upper-case variants).
# NOTE(review): not referenced by the functions visible in this module —
# presumably used by callers elsewhere; confirm before removing.
IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.JPG', '.JPEG', '.PNG')
# File extensions treated as video containers (lower- and upper-case variants).
VIDEO_EXTENSIONS = ('.mp4', '.mov', '.avi', '.MP4', '.MOV', '.AVI')
|
| 11 |
+
|
| 12 |
+
def read_frame_from_videos(frame_root):
    """Load RGB frames from a video file or a directory of per-frame images.

    Args:
        frame_root: path to a video file (one of VIDEO_EXTENSIONS) or to a
            directory containing the frames as individual image files.

    Returns:
        Tuple ``(frames, fps, length, video_name)``:
        - frames: float tensor in TCHW layout, RGB channel order
        - fps: source frame rate (24 is assumed for image directories)
        - length: number of frames
        - video_name: basename (video extension stripped for video files)

    Raises:
        ValueError: if an image in the frame directory cannot be decoded.
    """
    if frame_root.endswith(VIDEO_EXTENSIONS):  # Video file path
        # All listed video extensions are 4 chars (".mp4" etc.), so [:-4] strips them.
        video_name = os.path.basename(frame_root)[:-4]
        frames, _, info = torchvision.io.read_video(filename=frame_root, pts_unit='sec', output_format='TCHW')  # RGB
        fps = info['video_fps']
    else:
        video_name = os.path.basename(frame_root)
        frames = []
        fr_lst = sorted(os.listdir(frame_root))
        for fr in fr_lst:
            frame_path = os.path.join(frame_root, fr)
            frame = cv2.imread(frame_path)
            # cv2.imread returns None for unreadable/non-image files; fail with
            # a clear message instead of an opaque TypeError on indexing.
            if frame is None:
                raise ValueError(f'Could not read frame image: {frame_path}')
            frames.append(frame[..., [2, 1, 0]])  # BGR -> RGB, HWC
        fps = 24  # default
        frames = torch.Tensor(np.array(frames)).permute(0, 3, 1, 2).contiguous()  # TCHW

    length = frames.shape[0]

    return frames, fps, length, video_name
|
| 30 |
+
|
| 31 |
+
def get_video_paths(input_root):
    """Recursively collect every video file under input_root, sorted by path."""
    found = [
        os.path.join(dirpath, name)
        for dirpath, _, names in os.walk(input_root)
        for name in names
        if name.lower().endswith(VIDEO_EXTENSIONS)
    ]
    return sorted(found)
|
| 38 |
+
|
| 39 |
+
def str_to_list(value):
    """Parse a comma-separated string of integers into a list of ints."""
    return [int(item) for item in value.split(',')]
|
| 41 |
+
|
| 42 |
+
def gen_dilate(alpha, min_kernel_size, max_kernel_size):
    """Dilate the non-background region of an alpha matte.

    An elliptical kernel of random size in [min_kernel_size, max_kernel_size]
    is applied to the mask of pixels where alpha != 0 (foreground + unknown);
    the dilated mask is returned scaled to {0, 255} as float32.
    """
    size = random.randint(min_kernel_size, max_kernel_size)
    kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (size, size))
    fg_and_unknown = np.not_equal(alpha, 0).astype(np.float32)
    dilated = cv2.dilate(fg_and_unknown, kernel, iterations=1) * 255
    return dilated.astype(np.float32)
|
| 48 |
+
|
| 49 |
+
def gen_erosion(alpha, min_kernel_size, max_kernel_size):
    """Erode the definite-foreground region of an alpha matte.

    An elliptical kernel of random size in [min_kernel_size, max_kernel_size]
    is applied to the mask of pixels where alpha == 255 (certain foreground);
    the eroded mask is returned scaled to {0, 255} as float32.
    """
    size = random.randint(min_kernel_size, max_kernel_size)
    kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (size, size))
    fg = np.equal(alpha, 255).astype(np.float32)
    eroded = cv2.erode(fg, kernel, iterations=1) * 255
    return eroded.astype(np.float32)
|
{matanyone β matanyone2}/utils/tensor_utils.py
RENAMED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
from typing import List, Iterable
|
| 2 |
import torch
|
| 3 |
import torch.nn.functional as F
|
| 4 |
-
|
| 5 |
|
| 6 |
# STM
|
| 7 |
def pad_divide_by(in_img: torch.Tensor, d: int) -> (torch.Tensor, Iterable[int]):
|
|
@@ -45,7 +45,7 @@ def unpad(img: torch.Tensor, pad: Iterable[int]) -> torch.Tensor:
|
|
| 45 |
|
| 46 |
# @torch.jit.script
|
| 47 |
def aggregate(prob: torch.Tensor, dim: int) -> torch.Tensor:
|
| 48 |
-
with
|
| 49 |
prob = prob.float()
|
| 50 |
new_prob = torch.cat([torch.prod(1 - prob, dim=dim, keepdim=True), prob],
|
| 51 |
dim).clamp(1e-7, 1 - 1e-7)
|
|
@@ -59,4 +59,4 @@ def cls_to_one_hot(cls_gt: torch.Tensor, num_objects: int) -> torch.Tensor:
|
|
| 59 |
# cls_gt: B*1*H*W
|
| 60 |
B, _, H, W = cls_gt.shape
|
| 61 |
one_hot = torch.zeros(B, num_objects + 1, H, W, device=cls_gt.device).scatter_(1, cls_gt, 1)
|
| 62 |
-
return one_hot
|
|
|
|
| 1 |
from typing import List, Iterable
|
| 2 |
import torch
|
| 3 |
import torch.nn.functional as F
|
| 4 |
+
from matanyone2.utils.device import safe_autocast
|
| 5 |
|
| 6 |
# STM
|
| 7 |
def pad_divide_by(in_img: torch.Tensor, d: int) -> (torch.Tensor, Iterable[int]):
|
|
|
|
| 45 |
|
| 46 |
# @torch.jit.script
|
| 47 |
def aggregate(prob: torch.Tensor, dim: int) -> torch.Tensor:
|
| 48 |
+
with safe_autocast(enabled=False):
|
| 49 |
prob = prob.float()
|
| 50 |
new_prob = torch.cat([torch.prod(1 - prob, dim=dim, keepdim=True), prob],
|
| 51 |
dim).clamp(1e-7, 1 - 1e-7)
|
|
|
|
| 59 |
# cls_gt: B*1*H*W
|
| 60 |
B, _, H, W = cls_gt.shape
|
| 61 |
one_hot = torch.zeros(B, num_objects + 1, H, W, device=cls_gt.device).scatter_(1, cls_gt, 1)
|
| 62 |
+
return one_hot
|