| """ |
| Oculus Configuration |
| |
| HuggingFace-compatible configuration for the unified Oculus model. |
| """ |
|
|
| from typing import Optional, Dict, Any, List |
| from transformers import PretrainedConfig |
|
|
|
|
class OculusConfig(PretrainedConfig):
    """
    Configuration class for the Oculus vision-language model.

    The constructor only stores the values it is given; nothing is validated
    here. Unrecognised keyword arguments are forwarded to
    ``PretrainedConfig.__init__``.

    Args:
        dinov3_model_id: Model identifier stored for the DINO vision encoder.
        siglip_model_id: Model identifier stored for the SigLIP vision encoder.
        dinov3_hidden_size: Hidden size recorded for the DINO encoder.
        siglip_hidden_size: Hidden size recorded for the SigLIP encoder.
        projector_hidden_dim: Hidden dimension of the vision-to-language
            projector.
        num_vision_tokens: Number of vision tokens.
        text_model_id: Model identifier stored for the language model.
        lm_hidden_size: Hidden size recorded for the language model.
        vocab_size: Vocabulary size.
        max_position_embeddings: Maximum number of position embeddings.
        reasoning_enabled: Whether to enable thinking traces.
        thinking_token: Token string that opens a thinking trace.
        thinking_end_token: Token string that closes a thinking trace.
        max_thinking_tokens: Limit recorded for thinking-trace length.
        output_mode: Default output mode ("text", "point", "box", "polygon").
            The value is stored as given and is not checked against this list.
        num_detection_classes: Number of detection classes.
        num_segmentation_classes: Number of segmentation classes.
        max_new_tokens: Default generation length limit.
        temperature: Default sampling temperature.
        top_p: Default nucleus-sampling threshold.
        enable_focus: Whether the focus feature is enabled.
        focus_token: Token string that opens a focus span.
        focus_end_token: Token string that closes a focus span.
        **kwargs: Passed through to ``PretrainedConfig.__init__``.

    Attributes:
        fused_vision_dim: Derived value, always
            ``dinov3_hidden_size + siglip_hidden_size``. It is recomputed on
            every construction, so a value supplied through ``kwargs`` (for
            example from a serialized config) is overwritten.
    """

    model_type = "oculus"

    def __init__(
        self,
        # Vision encoders.
        # NOTE(review): the default id names a DINOv2 checkpoint although the
        # parameter is called ``dinov3_*`` -- confirm the checkpoint and that
        # ``dinov3_hidden_size`` matches it.
        dinov3_model_id: str = "facebook/dinov2-large",
        siglip_model_id: str = "google/siglip-base-patch16-224",
        dinov3_hidden_size: int = 1280,
        siglip_hidden_size: int = 768,
        # Vision-to-language projector.
        projector_hidden_dim: int = 2048,
        num_vision_tokens: int = 64,
        # Language model.
        text_model_id: str = "Salesforce/blip-image-captioning-base",
        lm_hidden_size: int = 1536,
        vocab_size: int = 131072,
        max_position_embeddings: int = 32768,
        # Reasoning / thinking traces.
        reasoning_enabled: bool = True,
        thinking_token: str = "<think>",
        thinking_end_token: str = "</think>",
        max_thinking_tokens: int = 256,
        # Output heads.
        output_mode: str = "text",
        num_detection_classes: int = 80,
        num_segmentation_classes: int = 150,
        # Generation defaults.
        max_new_tokens: int = 512,
        temperature: float = 0.7,
        top_p: float = 0.95,
        # Focus feature.
        enable_focus: bool = True,
        focus_token: str = "<focus>",
        focus_end_token: str = "</focus>",
        **kwargs
    ):
        # The base initializer runs first so that the explicit assignments
        # below take precedence over anything it sets from ``kwargs``.
        super().__init__(**kwargs)

        # Vision encoders
        self.dinov3_model_id = dinov3_model_id
        self.siglip_model_id = siglip_model_id
        self.dinov3_hidden_size = dinov3_hidden_size
        self.siglip_hidden_size = siglip_hidden_size
        # Derived: sum of the two encoder hidden sizes.
        self.fused_vision_dim = dinov3_hidden_size + siglip_hidden_size

        # Projector
        self.projector_hidden_dim = projector_hidden_dim
        self.num_vision_tokens = num_vision_tokens

        # Language model
        self.text_model_id = text_model_id
        self.lm_hidden_size = lm_hidden_size
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings

        # Reasoning
        self.reasoning_enabled = reasoning_enabled
        self.thinking_token = thinking_token
        self.thinking_end_token = thinking_end_token
        self.max_thinking_tokens = max_thinking_tokens

        # Output heads
        self.output_mode = output_mode
        self.num_detection_classes = num_detection_classes
        self.num_segmentation_classes = num_segmentation_classes

        # Generation defaults
        self.max_new_tokens = max_new_tokens
        self.temperature = temperature
        self.top_p = top_p

        # Focus feature
        self.enable_focus = enable_focus
        self.focus_token = focus_token
        self.focus_end_token = focus_end_token

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        """Load a config from a pretrained model name or path.

        Delegates to ``PretrainedConfig.from_pretrained`` instead of calling
        ``get_config_dict`` / ``from_dict`` by hand, so that any additional
        handling performed by the installed base class is not bypassed.

        Args:
            pretrained_model_name_or_path: Model identifier or path passed to
                the base-class loader.
            **kwargs: Forwarded unchanged to the base-class loader.

        Returns:
            Whatever the base-class loader returns for this class.
        """
        return super().from_pretrained(pretrained_model_name_or_path, **kwargs)

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the config to a dictionary via the base-class ``to_dict``."""
        return super().to_dict()
|
|
|
|
| |
# Import-time side effect: registers OculusConfig for use with the auto classes.
# NOTE(review): called with no argument, so presumably the library's default
# auto class (AutoConfig) is used -- confirm against the installed transformers.
| OculusConfig.register_for_auto_class() |
|
|