| """HELM-BERT configuration.""" |
|
|
| from transformers import PretrainedConfig |
|
|
|
|
class HELMBertConfig(PretrainedConfig):
    """Configuration class for the HELM-BERT model.

    This configuration class stores all the parameters needed to instantiate a HELM-BERT model.
    It inherits from PretrainedConfig and can be used with HuggingFace's from_pretrained and
    save_pretrained methods.

    Args:
        vocab_size: Size of the vocabulary (default: 78 for the HELM character vocabulary)
        hidden_size: Dimensionality of the encoder layers (default: 768)
        num_hidden_layers: Number of transformer layers (default: 6)
        num_attention_heads: Number of attention heads (default: 12)
        intermediate_size: Dimensionality of the feed-forward layer (default: 3072)
        hidden_dropout_prob: Dropout probability for hidden layers (default: 0.1)
        attention_probs_dropout_prob: Dropout probability for attention (default: 0.1)
        max_position_embeddings: Maximum sequence length (default: 512)
        max_relative_positions: Maximum relative position distance (default: 512)
        position_buckets: Number of position buckets for log-bucketing (default: 256)
        pos_att_type: Position attention types, pipe-separated (default: "c2p|p2c")
        share_att_key: Whether to share attention key projections (default: False)
        ngie_kernel_size: Kernel size for the nGiE convolution (default: 3)
        ngie_dropout: Dropout for the nGiE layer (default: 0.1)
        pad_token_id: ID of the padding token (default: 0)
        bos_token_id: ID of the beginning-of-sequence token (default: 1)
        eos_token_id: ID of the end-of-sequence token (default: 2)
        sep_token_id: ID of the separator token (default: 2)
        mask_token_id: ID of the mask token (default: 4)
        num_labels: Number of labels for the classification head (default: 2)
        problem_type: Problem type used by the classification head (default: None)
        classifier_num_layers: Number of hidden layers in the classification head (default: 0)
        classifier_dropout: Dropout probability for the classification head (default: 0.1)

    Example:
        >>> from helmbert import HELMBertConfig, HELMBertModel
        >>> config = HELMBertConfig(hidden_size=768, num_hidden_layers=6)
        >>> model = HELMBertModel(config)
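        >>> # Illustrative only: classification-head parameters can be set the same way
        >>> # (example values, not recommended defaults).
        >>> clf_config = HELMBertConfig(num_labels=3, classifier_num_layers=1, classifier_dropout=0.2)
        >>> clf_config.num_labels
        3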
| """ |

    model_type = "helmbert"

    def __init__(
        self,
        vocab_size: int = 78,
        hidden_size: int = 768,
        num_hidden_layers: int = 6,
        num_attention_heads: int = 12,
        intermediate_size: int = 3072,
        hidden_dropout_prob: float = 0.1,
        attention_probs_dropout_prob: float = 0.1,
        max_position_embeddings: int = 512,
        # Relative position attention
        max_relative_positions: int = 512,
        position_buckets: int = 256,
        pos_att_type: str = "c2p|p2c",
        share_att_key: bool = False,
        # nGiE convolution
        ngie_kernel_size: int = 3,
        ngie_dropout: float = 0.1,
        # Special token IDs
        pad_token_id: int = 0,
        bos_token_id: int = 1,
        eos_token_id: int = 2,
        sep_token_id: int = 2,
        mask_token_id: int = 4,
        # Classification head
        num_labels: int = 2,
        problem_type: Optional[str] = None,
        classifier_num_layers: int = 0,
        classifier_dropout: float = 0.1,
        **kwargs,
    ):
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            **kwargs,
        )

        # Encoder architecture
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings

        # Relative position attention
        self.max_relative_positions = max_relative_positions
        self.position_buckets = position_buckets
        self.pos_att_type = pos_att_type
        self.share_att_key = share_att_key

        # nGiE convolution
        self.ngie_kernel_size = ngie_kernel_size
        self.ngie_dropout = ngie_dropout

        # Additional special token IDs
        self.sep_token_id = sep_token_id
        self.mask_token_id = mask_token_id

        # Classification head
        self.num_labels = num_labels
        self.problem_type = problem_type
        self.classifier_num_layers = classifier_num_layers
        self.classifier_dropout = classifier_dropout
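

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not part of the package API). It
# assumes nothing beyond this module and the `transformers` dependency, and
# exercises the save_pretrained/from_pretrained round-trip mentioned in the
# class docstring.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import tempfile

    config = HELMBertConfig(num_hidden_layers=4, classifier_num_layers=1)

    with tempfile.TemporaryDirectory() as tmp_dir:
        config.save_pretrained(tmp_dir)  # writes config.json into tmp_dir
        reloaded = HELMBertConfig.from_pretrained(tmp_dir)

    assert reloaded.model_type == "helmbert"
    assert reloaded.num_hidden_layers == 4
    assert reloaded.classifier_num_layers == 1
    print(reloaded.to_json_string())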