'''
@license: (C) Copyright 2025, Hey.
@author: Hey
@email: sanyuan.hy@alibaba-inc.com
@tel: 137****6540
@datetime: 2025/12/30 11:34
@project: lucaone
@file: configuration_lucaone
@desc: configuration class for LucaOne (LucaGPLM)
'''


from typing import Literal, Optional
from transformers import PretrainedConfig


class LucaGPLMConfig(PretrainedConfig):
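    """
    Configuration for the LucaOne (LucaGPLM) model.

    Stores the vocabulary, embedding, transformer-encoder, and
    classifier-head hyperparameters. Defaults match the ``__init__``
    signature below (e.g. 20 hidden layers, 40 attention heads, hidden
    size 2560, FFN dim 10240, and the joint "gene_prot" alphabet).
    """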
    model_type = "lucaone"

    def __init__(
        self,
        vocab_size: int = 39,
        pad_token_id: int = 0,
        unk_token_id: int = 1,
        bos_token_id: int = 2,
        eos_token_id: int = 3,
        sep_token_id: int = 3,
        mask_token_id: int = 4,
        hidden_act: str = "gelu",
        max_position_embeddings: int = 4096,
        type_vocab_size: int = 2,
        num_hidden_layers: int = 20,
        num_attention_heads: int = 40,
        hidden_size: int = 2560,
        ffn_dim: int = 10240,
        no_position_embeddings: bool = True,
        no_token_type_embeddings: bool = False,
        alphabet: str = "gene_prot",
        token_dropout: bool = False,
        attention_probs_dropout_prob: float = 0.0,
        hidden_dropout_prob: float = 0.0,
        use_embed_layer_norm: bool = False,
        use_last_layer_norm: bool = True,
        embed_scale: float = 1.0,
        ignore_index: int = -100,
        layer_norm_eps: float = 1e-12,
        initializer_range: float = 0.02,
        task_level: Literal["seq_level", "token_level"] = "seq_level",
        task_type: Literal["embedding", "mlm", "multi_class", "binary_class", "regression", "multi_label"] = "embedding",
        classifier_num_labels: int = -1,
        classifier_dropout_prob: float = 0.1,
        classifier_pooling_type: Literal["cls", "value_attention", "context_attention", "mean"] = "value_attention",
        classifier_loss_type: Literal["binary_cross_entropy", "cross_entropy", "mse", "mae"] = "cross_entropy",
        classifier_loss_reduction: Literal["mean", "sum", "none"] = "mean",
        classifier_pos_weight: float = 1.0,
        classifier_weight: Optional[list] = None,
        tie_word_embeddings: bool = True,
        **kwargs
    ):
        super().__init__(
            tie_word_embeddings=tie_word_embeddings,
            pad_token_id=pad_token_id,
            **kwargs
        )
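        # vocabulary, embedding, and encoder architecture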
        self.alphabet = alphabet
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.no_token_type_embeddings = no_token_type_embeddings
        self.no_position_embeddings = no_position_embeddings
        self.num_hidden_layers = num_hidden_layers
        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads
        self.ffn_dim = ffn_dim
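        # dropout probabilities and the label index ignored by the loss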
        self.token_dropout = token_dropout
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.hidden_dropout_prob = hidden_dropout_prob
        self.classifier_dropout_prob = classifier_dropout_prob
        self.ignore_index = ignore_index
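        # layer normalization, embedding scale, and weight initialization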
        self.use_embed_layer_norm = use_embed_layer_norm
        self.use_last_layer_norm = use_last_layer_norm
        self.embed_scale = embed_scale
        self.layer_norm_eps = layer_norm_eps
        self.initializer_range = initializer_range
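        # special-token ids and hidden activation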
        self.unk_token_id = unk_token_id
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
        self.sep_token_id = sep_token_id
        self.mask_token_id = mask_token_id
        self.hidden_act = hidden_act
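        # task and classifier-head settings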
        self.classifier_num_labels = classifier_num_labels
        self.classifier_pooling_type = classifier_pooling_type
        self.task_level = task_level
        self.task_type = task_type
        self.classifier_loss_type = classifier_loss_type
        self.classifier_loss_reduction = classifier_loss_reduction
        self.classifier_pos_weight = classifier_pos_weight
        self.classifier_weight = classifier_weight


__all__ = ["LucaGPLMConfig"]
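# A minimal usage sketch (illustrative, not part of the released API):
# building a config for a sequence-level binary classification head.
# The field values below are assumptions chosen for the example, not
# settings from a published checkpoint.
if __name__ == "__main__":
    config = LucaGPLMConfig(
        task_level="seq_level",
        task_type="binary_class",
        classifier_num_labels=2,
        classifier_pooling_type="value_attention",
        classifier_loss_type="binary_cross_entropy",
    )
    print(config.model_type, config.hidden_size, config.num_hidden_layers)
    # Configs round-trip through the standard Hugging Face helpers:
    # config.save_pretrained("./lucaone-config")
    # reloaded = LucaGPLMConfig.from_pretrained("./lucaone-config")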