Instructions to use szang18/spatialvla-liftpeg-new with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- PEFT
How to use szang18/spatialvla-liftpeg-new with PEFT:
from peft import PeftModel from transformers import AutoModelForCausalLM base_model = AutoModelForCausalLM.from_pretrained("IPEC-COMMUNITY/spatialvla-4b-224-pt") model = PeftModel.from_pretrained(base_model, "szang18/spatialvla-liftpeg-new") - Notebooks
- Google Colab
- Kaggle
| # coding=utf-8 | |
| # Copyright 2024 Microsoft Research & University of Wisconsin-Madison and the HuggingFace Inc. team. All rights reserved. | |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| # See the License for the specific language governing permissions and | |
| # limitations under the License. | |
| import warnings | |
| from transformers.configuration_utils import PretrainedConfig | |
| from transformers.utils import logging | |
| from transformers import CONFIG_MAPPING, AutoConfig | |
| logger = logging.get_logger(__name__) | |
| class SpatialVLAConfig(PretrainedConfig): | |
| model_type = "spatialvla" | |
| sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig, "vision_zoe_config": AutoConfig} | |
| def __init__( | |
| self, | |
| vision_config=None, | |
| text_config=None, | |
| ignore_index=-100, | |
| image_token_index=256000, | |
| vocab_size=257152, | |
| projection_dim=2048, | |
| hidden_size=2048, | |
| vision_zoe_config=None, | |
| action_token_begin_idx=None, | |
| spatial_token_num=259, | |
| use_spatial_token=False, | |
| ego3d_patch_reso=4, | |
| n_freqs=8, | |
| use_vision_zoe=True, | |
| **kwargs, | |
| ): | |
| self._ignore_index = ignore_index | |
| self.image_token_index = image_token_index | |
| self._vocab_size = vocab_size | |
| self.projection_dim = projection_dim | |
| self.hidden_size = hidden_size | |
| self.vision_config = vision_config | |
| self.is_encoder_decoder = False | |
| if isinstance(self.vision_config, dict): | |
| vision_config["model_type"] = ( | |
| vision_config["model_type"] if "model_type" in vision_config else "siglip_vision_model" | |
| ) | |
| self.vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config) | |
| elif vision_config is None: | |
| self.vision_config = CONFIG_MAPPING["siglip_vision_model"]( | |
| intermediate_size=4096, | |
| hidden_size=1152, | |
| patch_size=14, | |
| image_size=224, | |
| num_hidden_layers=27, | |
| num_attention_heads=16, | |
| vocab_size=257152, | |
| vision_use_head=False, | |
| ) | |
| self.text_config = text_config | |
| if isinstance(self.text_config, dict): | |
| text_config["model_type"] = text_config["model_type"] if "model_type" in text_config else "gemma2" | |
| self.text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) | |
| elif text_config is None: | |
| self.text_config = CONFIG_MAPPING["gemma2"]( | |
| hidden_size=2048, | |
| num_hidden_layers=18, | |
| intermediate_size=16384, | |
| num_attention_heads=8, | |
| num_key_value_heads=1, | |
| is_encoder_decoder=False, | |
| vocab_size=vocab_size, | |
| ) | |
| self.text_config.num_image_tokens = (self.vision_config.image_size // self.vision_config.patch_size) ** 2 | |
| self.vision_config.projection_dim = projection_dim | |
| # vision zoe config | |
| self.vision_zoe_config = vision_zoe_config | |
| if isinstance(self.vision_zoe_config, dict): | |
| vision_zoe_config["model_type"] = vision_zoe_config["model_type"] if "model_type" in vision_zoe_config else "zoedepth" | |
| self.vision_zoe_config = CONFIG_MAPPING[vision_zoe_config["model_type"]](**vision_zoe_config) | |
| else: | |
| pass | |
| # additional attributes | |
| self.action_token_begin_idx = action_token_begin_idx | |
| self.spatial_token_num = spatial_token_num | |
| self.use_spatial_token = use_spatial_token | |
| self.ego3d_patch_reso = ego3d_patch_reso | |
| self.n_freqs = n_freqs | |
| self.use_vision_zoe = use_vision_zoe | |
| super().__init__(**kwargs) | |
| def ignore_index(self): | |
| warnings.warn( | |
| "The `ignore_index` attribute is deprecated and will be removed in v4.47.", | |
| FutureWarning, | |
| ) | |
| return self._ignore_index | |
| def ignore_index(self, value): | |
| self._ignore_index = value | |
| def to_dict(self): | |
| output = super().to_dict() | |
| output.pop("_ignore_index", None) | |
| return output |