"""
GeoMotionGPT Configuration

This module contains the configuration class for GeoMotionGPT, a motion-to-text model
that combines a VQ-VAE motion tokenizer with a fine-tuned GPT-2 language model.
"""
|
|
from typing import Optional

from transformers import PretrainedConfig
|
|
|
|
class GeoMotionGPTConfig(PretrainedConfig):
    """
    Configuration class for GeoMotionGPT model.

    GeoMotionGPT consists of two components:
    1. Motion Tokenizer (DVQ-GSST): Converts 263-dim HumanML3D motion features to discrete tokens
    2. Language Model (GPT-2): Generates text descriptions from motion tokens

    Args:
        motion_vocab_size (`int`, *optional*, defaults to 512):
            Size of the motion codebook vocabulary.
        motion_input_dim (`int`, *optional*, defaults to 263):
            Input dimension of motion features (HumanML3D format).
        motion_hidden_dim (`int`, *optional*, defaults to 512):
            Hidden dimension for motion encoder.
        motion_down_t (`int`, *optional*, defaults to 3):
            Number of temporal downsampling layers.
        motion_depth (`int`, *optional*, defaults to 3):
            Depth of ResNet blocks in encoder.
        motion_dilation_growth_rate (`int`, *optional*, defaults to 3):
            Dilation growth rate for the motion encoder's convolutions.
        text_vocab_size (`int`, *optional*, defaults to 50257):
            Size of the text vocabulary (GPT-2).
        n_positions (`int`, *optional*, defaults to 1024):
            Maximum sequence length.
        n_embd (`int`, *optional*, defaults to 768):
            Embedding dimension for GPT-2.
        n_layer (`int`, *optional*, defaults to 12):
            Number of transformer layers.
        n_head (`int`, *optional*, defaults to 12):
            Number of attention heads.
        n_inner (`int`, *optional*, defaults to `None`):
            Dimension of the inner feed-forward layers. `None` selects the
            GPT-2 default of 4 * n_embd.
        activation_function (`str`, *optional*, defaults to `"gelu_new"`):
            Activation function used in the transformer MLP blocks.
        resid_pdrop (`float`, *optional*, defaults to 0.1):
            Dropout probability for residual connections.
        embd_pdrop (`float`, *optional*, defaults to 0.1):
            Dropout probability for embeddings.
        attn_pdrop (`float`, *optional*, defaults to 0.1):
            Dropout probability for attention weights.
        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
            Epsilon used by the layer normalization layers.
        initializer_range (`float`, *optional*, defaults to 0.02):
            Standard deviation for weight initialization.
        mot_factor (`float`, *optional*, defaults to 1.0):
            Factor for motion embedding dimension.
        attention_mode (`str`, *optional*, defaults to "all"):
            Cross-modal attention mode.
        lambda_ortho (`float`, *optional*, defaults to 0.01):
            Orthogonality regularization weight.
        bos_token_id (`int`, *optional*, defaults to 50256):
            Beginning-of-sequence token id (GPT-2's `<|endoftext|>`).
        eos_token_id (`int`, *optional*, defaults to 50256):
            End-of-sequence token id.
        pad_token_id (`int`, *optional*, defaults to 50256):
            Padding token id.

    Example:
        ```python
        from transformers import AutoConfig

        config = AutoConfig.from_pretrained("zy22b/GeoMotionGPT", trust_remote_code=True)
        print(config.motion_vocab_size)  # 512
        ```
    """

    model_type = "geomotiongpt"

    def __init__(
        self,
        # --- Motion tokenizer (DVQ-GSST) ---
        motion_vocab_size: int = 512,
        motion_input_dim: int = 263,
        motion_hidden_dim: int = 512,
        motion_down_t: int = 3,
        motion_depth: int = 3,
        motion_dilation_growth_rate: int = 3,
        # --- Language model (GPT-2) ---
        text_vocab_size: int = 50257,
        n_positions: int = 1024,
        n_embd: int = 768,
        n_layer: int = 12,
        n_head: int = 12,
        n_inner: Optional[int] = None,
        activation_function: str = "gelu_new",
        resid_pdrop: float = 0.1,
        embd_pdrop: float = 0.1,
        attn_pdrop: float = 0.1,
        layer_norm_epsilon: float = 1e-5,
        initializer_range: float = 0.02,
        # --- Cross-modal fusion ---
        mot_factor: float = 1.0,
        attention_mode: str = "all",
        lambda_ortho: float = 0.01,
        # --- Special token ids (GPT-2 uses 50256 for all three) ---
        bos_token_id: int = 50256,
        eos_token_id: int = 50256,
        pad_token_id: int = 50256,
        **kwargs,
    ):
        # Motion tokenizer hyperparameters.
        self.motion_vocab_size = motion_vocab_size
        self.motion_input_dim = motion_input_dim
        self.motion_hidden_dim = motion_hidden_dim
        self.motion_down_t = motion_down_t
        self.motion_depth = motion_depth
        self.motion_dilation_growth_rate = motion_dilation_growth_rate

        # Language model hyperparameters.
        self.text_vocab_size = text_vocab_size
        # Combined embedding table: text tokens + motion tokens + 3 extra
        # tokens (presumably special motion boundary/padding markers — the
        # modeling code defines their exact meaning; TODO confirm there).
        self.vocab_size = text_vocab_size + motion_vocab_size + 3
        self.n_positions = n_positions
        self.n_embd = n_embd
        self.n_layer = n_layer
        self.n_head = n_head
        self.n_inner = n_inner
        self.activation_function = activation_function
        self.resid_pdrop = resid_pdrop
        self.embd_pdrop = embd_pdrop
        self.attn_pdrop = attn_pdrop
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_range = initializer_range

        # Cross-modal fusion hyperparameters.
        self.mot_factor = mot_factor
        self.attention_mode = attention_mode
        self.lambda_ortho = lambda_ortho

        # PretrainedConfig handles special token ids and any remaining
        # kwargs (e.g. architectures, auto_map) last, matching the
        # original attribute-then-super ordering.
        super().__init__(
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            pad_token_id=pad_token_id,
            **kwargs,
        )
|
|