Tom Aarsen committed on
Commit
1a5fa14
·
1 Parent(s): e21cde3

Integrate with upcoming Sentence Transformers v5.5.0

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,6 +1,6 @@
1
  ---
2
  license: apache-2.0
3
- library_name: pytorch
4
  pipeline_tag: sentence-similarity
5
  tags:
6
  - sentence-transformers
@@ -84,15 +84,70 @@ Training characteristics:
84
 
85
  ## How To Use It
86
 
87
- ## Installation
 
 
88
 
89
  ```bash
90
- pip install torch sentence-transformers transformers accelerate safetensors pillow librosa soundfile huggingface_hub
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
  ```
92
 
93
- ## Python Usage
 
 
 
 
 
 
 
 
 
 
 
 
 
94
 
95
- The simplest way to use the model is to download the repository snapshot, load the packaged source code, and then encode one or more modality-tagged items.
96
 
97
  ```python
98
  import json
 
1
  ---
2
  license: apache-2.0
3
+ library_name: sentence-transformers
4
  pipeline_tag: sentence-similarity
5
  tags:
6
  - sentence-transformers
 
84
 
85
  ## How To Use It
86
 
87
+ ### Using Sentence Transformers
88
+
89
+ Install Sentence Transformers with the audio and image extras:
90
 
91
  ```bash
92
+ pip install "sentence_transformers[image,audio]"
93
+ ```
94
+
95
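+ Then load the model directly. `trust_remote_code=True` is required because the image and audio modules are custom classes shipped in this repository's `modeling_multimodal_embed.py`. Modality is inferred automatically from the input (plain strings -> `text`, image paths/URLs/PIL images -> `image`, audio paths/URLs/NumPy arrays -> `audio`):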
96
+
97
+ ```python
98
+ from sentence_transformers import SentenceTransformer
99
+
100
+ model = SentenceTransformer("llm-semantic-router/multi-modal-embed-large", trust_remote_code=True)
101
+
102
+ text_embeddings = model.encode(
103
+ [
104
+ "Martin Luther King Jr. delivering his I have a dream speech",
105
+ "two cats sleeping side by side on a pink couch",
106
+ ]
107
+ )
108
+ image_embeddings = model.encode(
109
+ [
110
+ "http://images.cocodataset.org/val2017/000000039769.jpg", # two cats on a pink couch
111
+ "http://images.cocodataset.org/val2017/000000000139.jpg", # distractor
112
+ ]
113
+ )
114
+ audio_embeddings = model.encode(
115
+ [
116
+ "https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac", # MLK speech
117
+ "https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/i-know-kung-fu.mp3", # distractor
118
+ ]
119
+ )
120
+
121
+ print(text_embeddings.shape, image_embeddings.shape, audio_embeddings.shape)
122
+ # (2, 768) (2, 768) (2, 768)
123
+
124
+ # Each row is a text query, each column a media candidate; the highest score per row is the
125
+ # correct cross-modal match.
126
+ print(model.similarity(text_embeddings, image_embeddings))
127
+ # tensor([[0.0704, 0.0121], # MLK text: neither image matches
128
+ # [0.5532, 0.3070]]) # cats text: the cats photo wins
129
+
130
+ print(model.similarity(text_embeddings, audio_embeddings))
131
+ # tensor([[ 0.2186, 0.1428], # MLK text: the MLK audio wins
132
+ # [-0.0625, 0.0667]]) # cats text: neither audio matches
133
  ```
134
 
135
+ Each modality routes through the matching sub-module pipeline:
136
+
137
+ - `text` -> `Transformer(mmbert) -> Pooling(mean) -> Normalize`
138
+ - `image` -> `SiglipVisionTransformer -> Pooling(mean) -> Dense(1152, 768) -> Normalize`
139
+ - `audio` -> `WhisperEncoderTransformer -> Pooling(mean) -> Dense(1024, 768) -> Normalize`
140
+
141
+
142
+ ### Using the packaged `hf_st_mm` source code
143
+
144
+ The original packaged inference path remains available alongside the Sentence Transformers integration. Install the dependencies:
145
+
146
+ ```bash
147
+ pip install torch sentence-transformers transformers accelerate safetensors pillow librosa soundfile huggingface_hub
148
+ ```
149
 
150
+ Then download the repository snapshot, load the packaged source code, and encode modality-tagged items:
151
 
152
  ```python
153
  import json
audio_0_WhisperEncoderTransformer/config.json ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_dropout": 0.0,
3
+ "activation_function": "gelu",
4
+ "apply_spec_augment": false,
5
+ "architectures": [
6
+ "WhisperEncoder"
7
+ ],
8
+ "attention_dropout": 0.0,
9
+ "begin_suppress_tokens": [
10
+ 220,
11
+ 50257
12
+ ],
13
+ "bos_token_id": 50257,
14
+ "classifier_proj_size": 256,
15
+ "d_model": 1024,
16
+ "decoder_attention_heads": 16,
17
+ "decoder_ffn_dim": 4096,
18
+ "decoder_layerdrop": 0.0,
19
+ "decoder_layers": 24,
20
+ "decoder_start_token_id": 50258,
21
+ "dropout": 0.0,
22
+ "dtype": "float32",
23
+ "encoder_attention_heads": 16,
24
+ "encoder_ffn_dim": 4096,
25
+ "encoder_layerdrop": 0.0,
26
+ "encoder_layers": 24,
27
+ "eos_token_id": 50257,
28
+ "forced_decoder_ids": [
29
+ [
30
+ 1,
31
+ 50259
32
+ ],
33
+ [
34
+ 2,
35
+ 50359
36
+ ],
37
+ [
38
+ 3,
39
+ 50363
40
+ ]
41
+ ],
42
+ "init_std": 0.02,
43
+ "is_encoder_decoder": true,
44
+ "mask_feature_length": 10,
45
+ "mask_feature_min_masks": 0,
46
+ "mask_feature_prob": 0.0,
47
+ "mask_time_length": 10,
48
+ "mask_time_min_masks": 2,
49
+ "mask_time_prob": 0.05,
50
+ "max_source_positions": 1500,
51
+ "max_target_positions": 448,
52
+ "median_filter_width": 7,
53
+ "model_type": "whisper",
54
+ "num_mel_bins": 80,
55
+ "pad_token_id": 50257,
56
+ "scale_embedding": false,
57
+ "suppress_tokens": [
58
+ 1,
59
+ 2,
60
+ 7,
61
+ 8,
62
+ 9,
63
+ 10,
64
+ 14,
65
+ 25,
66
+ 26,
67
+ 27,
68
+ 28,
69
+ 29,
70
+ 31,
71
+ 58,
72
+ 59,
73
+ 60,
74
+ 61,
75
+ 62,
76
+ 63,
77
+ 90,
78
+ 91,
79
+ 92,
80
+ 93,
81
+ 359,
82
+ 503,
83
+ 522,
84
+ 542,
85
+ 873,
86
+ 893,
87
+ 902,
88
+ 918,
89
+ 922,
90
+ 931,
91
+ 1350,
92
+ 1853,
93
+ 1982,
94
+ 2460,
95
+ 2627,
96
+ 3246,
97
+ 3253,
98
+ 3268,
99
+ 3536,
100
+ 3846,
101
+ 3961,
102
+ 4183,
103
+ 4667,
104
+ 6585,
105
+ 6647,
106
+ 7273,
107
+ 9061,
108
+ 9383,
109
+ 10428,
110
+ 10929,
111
+ 11938,
112
+ 12033,
113
+ 12331,
114
+ 12562,
115
+ 13793,
116
+ 14157,
117
+ 14635,
118
+ 15265,
119
+ 15618,
120
+ 16553,
121
+ 16604,
122
+ 18362,
123
+ 18956,
124
+ 20075,
125
+ 21675,
126
+ 22520,
127
+ 26130,
128
+ 26161,
129
+ 26435,
130
+ 28279,
131
+ 29464,
132
+ 31650,
133
+ 32302,
134
+ 32470,
135
+ 36865,
136
+ 42863,
137
+ 47425,
138
+ 49870,
139
+ 50254,
140
+ 50258,
141
+ 50358,
142
+ 50359,
143
+ 50360,
144
+ 50361,
145
+ 50362
146
+ ],
147
+ "tie_word_embeddings": true,
148
+ "transformers_version": "5.6.2",
149
+ "use_cache": true,
150
+ "use_weighted_layer_sum": false,
151
+ "vocab_size": 51865
152
+ }
audio_0_WhisperEncoderTransformer/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e1159b2a18a2298f88a5a6951ade86ccbe6d6dae520fbcfe21c2336a58bcf92
3
+ size 1228902976
audio_0_WhisperEncoderTransformer/processor_config.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "feature_extractor": {
3
+ "chunk_length": 30,
4
+ "dither": 0.0,
5
+ "feature_extractor_type": "WhisperFeatureExtractor",
6
+ "feature_size": 80,
7
+ "hop_length": 160,
8
+ "n_fft": 400,
9
+ "n_samples": 480000,
10
+ "nb_max_frames": 3000,
11
+ "padding_side": "right",
12
+ "padding_value": 0.0,
13
+ "return_attention_mask": false,
14
+ "sampling_rate": 16000
15
+ },
16
+ "processor_class": "WhisperProcessor"
17
+ }
audio_0_WhisperEncoderTransformer/sentence_bert_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "transformer_task": "feature-extraction",
3
+ "modality_config": {
4
+ "audio": {
5
+ "method": "forward",
6
+ "method_output_name": "last_hidden_state"
7
+ },
8
+ "audio+text": {
9
+ "method": "forward",
10
+ "method_output_name": "last_hidden_state"
11
+ }
12
+ },
13
+ "module_output_name": "token_embeddings"
14
+ }
audio_0_WhisperEncoderTransformer/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c0be0f47d27cf2a0de40266711d8ca68ce58bafedab73fc457719fe437d2bfa
3
+ size 4195347
audio_0_WhisperEncoderTransformer/tokenizer_config.json ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "backend": "tokenizers",
4
+ "bos_token": "<|endoftext|>",
5
+ "clean_up_tokenization_spaces": true,
6
+ "eos_token": "<|endoftext|>",
7
+ "errors": "replace",
8
+ "extra_special_tokens": [
9
+ "<|endoftext|>",
10
+ "<|startoftranscript|>",
11
+ "<|en|>",
12
+ "<|zh|>",
13
+ "<|de|>",
14
+ "<|es|>",
15
+ "<|ru|>",
16
+ "<|ko|>",
17
+ "<|fr|>",
18
+ "<|ja|>",
19
+ "<|pt|>",
20
+ "<|tr|>",
21
+ "<|pl|>",
22
+ "<|ca|>",
23
+ "<|nl|>",
24
+ "<|ar|>",
25
+ "<|sv|>",
26
+ "<|it|>",
27
+ "<|id|>",
28
+ "<|hi|>",
29
+ "<|fi|>",
30
+ "<|vi|>",
31
+ "<|he|>",
32
+ "<|uk|>",
33
+ "<|el|>",
34
+ "<|ms|>",
35
+ "<|cs|>",
36
+ "<|ro|>",
37
+ "<|da|>",
38
+ "<|hu|>",
39
+ "<|ta|>",
40
+ "<|no|>",
41
+ "<|th|>",
42
+ "<|ur|>",
43
+ "<|hr|>",
44
+ "<|bg|>",
45
+ "<|lt|>",
46
+ "<|la|>",
47
+ "<|mi|>",
48
+ "<|ml|>",
49
+ "<|cy|>",
50
+ "<|sk|>",
51
+ "<|te|>",
52
+ "<|fa|>",
53
+ "<|lv|>",
54
+ "<|bn|>",
55
+ "<|sr|>",
56
+ "<|az|>",
57
+ "<|sl|>",
58
+ "<|kn|>",
59
+ "<|et|>",
60
+ "<|mk|>",
61
+ "<|br|>",
62
+ "<|eu|>",
63
+ "<|is|>",
64
+ "<|hy|>",
65
+ "<|ne|>",
66
+ "<|mn|>",
67
+ "<|bs|>",
68
+ "<|kk|>",
69
+ "<|sq|>",
70
+ "<|sw|>",
71
+ "<|gl|>",
72
+ "<|mr|>",
73
+ "<|pa|>",
74
+ "<|si|>",
75
+ "<|km|>",
76
+ "<|sn|>",
77
+ "<|yo|>",
78
+ "<|so|>",
79
+ "<|af|>",
80
+ "<|oc|>",
81
+ "<|ka|>",
82
+ "<|be|>",
83
+ "<|tg|>",
84
+ "<|sd|>",
85
+ "<|gu|>",
86
+ "<|am|>",
87
+ "<|yi|>",
88
+ "<|lo|>",
89
+ "<|uz|>",
90
+ "<|fo|>",
91
+ "<|ht|>",
92
+ "<|ps|>",
93
+ "<|tk|>",
94
+ "<|nn|>",
95
+ "<|mt|>",
96
+ "<|sa|>",
97
+ "<|lb|>",
98
+ "<|my|>",
99
+ "<|bo|>",
100
+ "<|tl|>",
101
+ "<|mg|>",
102
+ "<|as|>",
103
+ "<|tt|>",
104
+ "<|haw|>",
105
+ "<|ln|>",
106
+ "<|ha|>",
107
+ "<|ba|>",
108
+ "<|jw|>",
109
+ "<|su|>",
110
+ "<|translate|>",
111
+ "<|transcribe|>",
112
+ "<|startoflm|>",
113
+ "<|startofprev|>",
114
+ "<|nocaptions|>",
115
+ "<|notimestamps|>"
116
+ ],
117
+ "is_local": false,
118
+ "language": null,
119
+ "local_files_only": false,
120
+ "model_max_length": 1024,
121
+ "pad_token": "<|endoftext|>",
122
+ "predict_timestamps": false,
123
+ "processor_class": "WhisperProcessor",
124
+ "return_attention_mask": false,
125
+ "task": null,
126
+ "tokenizer_class": "WhisperTokenizer",
127
+ "unk_token": "<|endoftext|>"
128
+ }
audio_1_Pooling/config.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "embedding_dimension": 1024,
3
+ "pooling_mode": "mean",
4
+ "include_prompt": true
5
+ }
audio_2_Dense/config.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "in_features": 1024,
3
+ "out_features": 768,
4
+ "bias": true,
5
+ "activation_function": "torch.nn.modules.linear.Identity",
6
+ "module_input_name": "sentence_embedding",
7
+ "module_output_name": "sentence_embedding"
8
+ }
audio_2_Dense/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:722c64f37968e8a783ec7765d752b0e51d8194865a9f705812bca038f87bd7dc
3
+ size 3148960
config_sentence_transformers.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "__version__": {
3
+ "pytorch": "2.10.0+cu128",
4
+ "sentence_transformers": "5.5.0",
5
+ "transformers": "5.6.2"
6
+ },
7
+ "default_prompt_name": null,
8
+ "model_type": "SentenceTransformer",
9
+ "prompts": {},
10
+ "similarity_fn_name": "cosine"
11
+ }
image_0_SiglipVisionTransformer/config.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "SiglipVisionModel"
4
+ ],
5
+ "attention_dropout": 0.0,
6
+ "dtype": "float32",
7
+ "hidden_act": "gelu_pytorch_tanh",
8
+ "hidden_size": 1152,
9
+ "image_size": 384,
10
+ "intermediate_size": 4304,
11
+ "layer_norm_eps": 1e-06,
12
+ "model_type": "siglip_vision_model",
13
+ "num_attention_heads": 16,
14
+ "num_channels": 3,
15
+ "num_hidden_layers": 27,
16
+ "patch_size": 14,
17
+ "transformers_version": "5.6.2"
18
+ }
image_0_SiglipVisionTransformer/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c441e33eea6623e252f9ec9915cf9e75b2fdd8245309456884316676d0229ac
3
+ size 1712951472
image_0_SiglipVisionTransformer/preprocessor_config.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_convert_rgb": null,
3
+ "do_normalize": true,
4
+ "do_rescale": true,
5
+ "do_resize": true,
6
+ "image_mean": [
7
+ 0.5,
8
+ 0.5,
9
+ 0.5
10
+ ],
11
+ "image_processor_type": "SiglipImageProcessor",
12
+ "image_std": [
13
+ 0.5,
14
+ 0.5,
15
+ 0.5
16
+ ],
17
+ "resample": 2,
18
+ "rescale_factor": 0.00392156862745098,
19
+ "size": {
20
+ "height": 384,
21
+ "width": 384
22
+ }
23
+ }
image_0_SiglipVisionTransformer/sentence_bert_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "transformer_task": "feature-extraction",
3
+ "modality_config": {
4
+ "image": {
5
+ "method": "forward",
6
+ "method_output_name": "last_hidden_state"
7
+ }
8
+ },
9
+ "module_output_name": "token_embeddings"
10
+ }
image_1_Pooling/config.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "embedding_dimension": 1152,
3
+ "pooling_mode": "mean",
4
+ "include_prompt": true
5
+ }
image_2_Dense/config.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "in_features": 1152,
3
+ "out_features": 768,
4
+ "bias": true,
5
+ "activation_function": "torch.nn.modules.linear.Identity",
6
+ "module_input_name": "sentence_embedding",
7
+ "module_output_name": "sentence_embedding"
8
+ }
image_2_Dense/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16e7e4136c4d0a4f19bec7e7d22afd16ffe1755f7143046f2b80f89d00719654
3
+ size 3542176
modeling_multimodal_embed.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import Any
4
+
5
+ from torch import Tensor
6
+ from transformers import PretrainedConfig, PreTrainedModel
7
+
8
+ from sentence_transformers.base.modules.transformer import Transformer
9
+
10
+
11
+ class SiglipVisionTransformer(Transformer):
12
+ """Drop-in :class:`Transformer` subclass that exposes only the SigLIP vision tower."""
13
+
14
+ def __init__(self, model_name_or_path: str, **kwargs: Any) -> None:
15
+ super().__init__(model_name_or_path, **kwargs)
16
+ # If the loaded processor wraps an image processor, keep only that; this drops the unused SigLIP text tokenizer (~17 MB) from the saved layout.
17
+ if hasattr(self.processor, "image_processor"):
18
+ self.processor = self.processor.image_processor
19
+
20
+ def _load_model(
21
+ self,
22
+ model_name_or_path: str,
23
+ transformer_task: str,
24
+ config: PretrainedConfig,
25
+ backend: str,
26
+ is_peft_model: bool,
27
+ **model_kwargs: Any,
28
+ ) -> PreTrainedModel:
29
+ full_model = super()._load_model(
30
+ model_name_or_path, transformer_task, config, backend, is_peft_model, **model_kwargs
31
+ )
32
+ # Fresh init: the loaded model has a `vision_model` attribute, so keep just that tower. On reload it's already SiglipVisionModel, so it is returned as-is.
33
+ return getattr(full_model, "vision_model", full_model)
34
+
35
+ def forward(self, features: dict[str, Tensor], **kwargs: Any) -> dict[str, Tensor]:
36
+ features = super().forward(features, **kwargs)
37
+ # Drop index 0 along dim 1 of token_embeddings (the first patch token) to match training-time pooling.
38
+ features["token_embeddings"] = features["token_embeddings"][:, 1:]
39
+ return features
40
+
41
+
42
+ class WhisperEncoderTransformer(Transformer):
43
+ """Drop-in :class:`Transformer` subclass that decodes string inputs (audio file paths/URLs) into waveforms via ``load_audio``; non-string inputs are passed through unchanged."""
44
+
45
+ def preprocess(
46
+ self, inputs: list[Any], prompt: str | None = None, **kwargs: Any
47
+ ) -> dict[str, Tensor]:
48
+ from transformers.audio_utils import load_audio
49
+
50
+ loaded = [load_audio(item) if isinstance(item, str) else item for item in inputs]
51
+ return super().preprocess(loaded, prompt=prompt, **kwargs)
modules.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "idx": 0,
4
+ "name": "0",
5
+ "path": "",
6
+ "type": "sentence_transformers.base.modules.router.Router"
7
+ }
8
+ ]
router_config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "types": {
3
+ "text_0_Transformer": "sentence_transformers.base.modules.transformer.Transformer",
4
+ "text_1_Pooling": "sentence_transformers.sentence_transformer.modules.pooling.Pooling",
5
+ "text_2_Normalize": "sentence_transformers.sentence_transformer.modules.normalize.Normalize",
6
+ "image_0_SiglipVisionTransformer": "modeling_multimodal_embed.SiglipVisionTransformer",
7
+ "image_1_Pooling": "sentence_transformers.sentence_transformer.modules.pooling.Pooling",
8
+ "image_2_Dense": "sentence_transformers.base.modules.dense.Dense",
9
+ "image_3_Normalize": "sentence_transformers.sentence_transformer.modules.normalize.Normalize",
10
+ "audio_0_WhisperEncoderTransformer": "modeling_multimodal_embed.WhisperEncoderTransformer",
11
+ "audio_1_Pooling": "sentence_transformers.sentence_transformer.modules.pooling.Pooling",
12
+ "audio_2_Dense": "sentence_transformers.base.modules.dense.Dense",
13
+ "audio_3_Normalize": "sentence_transformers.sentence_transformer.modules.normalize.Normalize"
14
+ },
15
+ "structure": {
16
+ "text": [
17
+ "text_0_Transformer",
18
+ "text_1_Pooling",
19
+ "text_2_Normalize"
20
+ ],
21
+ "image": [
22
+ "image_0_SiglipVisionTransformer",
23
+ "image_1_Pooling",
24
+ "image_2_Dense",
25
+ "image_3_Normalize"
26
+ ],
27
+ "audio": [
28
+ "audio_0_WhisperEncoderTransformer",
29
+ "audio_1_Pooling",
30
+ "audio_2_Dense",
31
+ "audio_3_Normalize"
32
+ ]
33
+ },
34
+ "parameters": {
35
+ "default_route": "text",
36
+ "allow_empty_key": true,
37
+ "route_mappings": {}
38
+ }
39
+ }
text_0_Transformer/config.json ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "ModernBertModel"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 2,
8
+ "classifier_activation": "gelu",
9
+ "classifier_bias": false,
10
+ "classifier_dropout": 0.0,
11
+ "classifier_pooling": "mean",
12
+ "cls_token_id": 1,
13
+ "decoder_bias": true,
14
+ "deterministic_flash_attn": false,
15
+ "dtype": "bfloat16",
16
+ "embedding_dropout": 0.0,
17
+ "eos_token_id": 1,
18
+ "global_attn_every_n_layers": 3,
19
+ "gradient_checkpointing": false,
20
+ "hidden_activation": "gelu",
21
+ "hidden_size": 768,
22
+ "initializer_cutoff_factor": 2.0,
23
+ "initializer_range": 0.02,
24
+ "intermediate_size": 1152,
25
+ "layer_norm_eps": 1e-05,
26
+ "layer_types": [
27
+ "full_attention",
28
+ "sliding_attention",
29
+ "sliding_attention",
30
+ "full_attention",
31
+ "sliding_attention",
32
+ "sliding_attention",
33
+ "full_attention",
34
+ "sliding_attention",
35
+ "sliding_attention",
36
+ "full_attention",
37
+ "sliding_attention",
38
+ "sliding_attention",
39
+ "full_attention",
40
+ "sliding_attention",
41
+ "sliding_attention",
42
+ "full_attention",
43
+ "sliding_attention",
44
+ "sliding_attention",
45
+ "full_attention",
46
+ "sliding_attention",
47
+ "sliding_attention",
48
+ "full_attention"
49
+ ],
50
+ "local_attention": 128,
51
+ "mask_token_id": 4,
52
+ "max_position_embeddings": 32768,
53
+ "mlp_bias": false,
54
+ "mlp_dropout": 0.0,
55
+ "model_type": "modernbert",
56
+ "norm_bias": false,
57
+ "norm_eps": 1e-05,
58
+ "num_attention_heads": 12,
59
+ "num_hidden_layers": 22,
60
+ "pad_token_id": 0,
61
+ "position_embedding_type": "sans_pos",
62
+ "repad_logits_with_grad": false,
63
+ "rope_parameters": {
64
+ "full_attention": {
65
+ "rope_theta": 160000,
66
+ "rope_type": "default"
67
+ },
68
+ "sliding_attention": {
69
+ "rope_theta": 160000,
70
+ "rope_type": "default"
71
+ }
72
+ },
73
+ "sep_token_id": 1,
74
+ "sparse_pred_ignore_index": -100,
75
+ "sparse_prediction": false,
76
+ "tie_word_embeddings": true,
77
+ "transformers_version": "5.6.2",
78
+ "vocab_size": 256000
79
+ }
text_0_Transformer/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:45714faf6758d630a8d7b4a57bd288632ec40172d92490e89534e404207857da
3
+ size 613892480
text_0_Transformer/sentence_bert_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "transformer_task": "feature-extraction",
3
+ "modality_config": {
4
+ "text": {
5
+ "method": "forward",
6
+ "method_output_name": "last_hidden_state"
7
+ }
8
+ },
9
+ "module_output_name": "token_embeddings"
10
+ }
text_0_Transformer/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c925c6b3dbd208644702bb1856672fa7315c6158a753d629470fcf3724ad284c
3
+ size 36944238
text_0_Transformer/tokenizer_config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "bos_token": "<bos>",
4
+ "clean_up_tokenization_spaces": false,
5
+ "cls_token": "<bos>",
6
+ "eos_token": "<eos>",
7
+ "is_local": false,
8
+ "local_files_only": false,
9
+ "mask_token": "<mask>",
10
+ "max_length": 32768,
11
+ "model_input_names": [
12
+ "input_ids",
13
+ "attention_mask"
14
+ ],
15
+ "model_max_length": 32768,
16
+ "pad_to_multiple_of": null,
17
+ "pad_token": "<pad>",
18
+ "pad_token_type_id": 0,
19
+ "padding_side": "right",
20
+ "sep_token": "<eos>",
21
+ "spaces_between_special_tokens": false,
22
+ "stride": 0,
23
+ "tokenizer_class": "TokenizersBackend",
24
+ "truncation_side": "right",
25
+ "truncation_strategy": "longest_first",
26
+ "unk_token": "<unk>"
27
+ }
text_1_Pooling/config.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "embedding_dimension": 768,
3
+ "pooling_mode": "mean",
4
+ "include_prompt": true
5
+ }