Tom Aarsen committed
Commit: 8d65917
Parent(s): 36bbcd9

Integrate with Sentence Transformers
Files changed:
- 1_Pooling/config.json  +5 -0
- README.md  +67 -1
- additional_chat_templates/sentence_transformers.jinja  +48 -0
- chat_template.jinja  +7 -0
- chat_template.json  +0 -3
- config.json  +4 -0
- config_sentence_transformers.json  +11 -0
- modeling_lco_omni.py  +8 -0
- modules.json  +20 -0
- sentence_bert_config.json  +48 -0
1_Pooling/config.json ADDED
@@ -0,0 +1,5 @@
+{
+  "embedding_dimension": 2048,
+  "pooling_mode": "lasttoken",
+  "include_prompt": true
+}
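For intuition, here is a minimal sketch (not the library's implementation, and assuming right-padded inputs) of what `"pooling_mode": "lasttoken"` computes: the hidden state of each sequence's last non-padding token becomes the 2048-dimensional embedding.

```python
import torch

def last_token_pool(token_embeddings: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    # token_embeddings: (batch, seq_len, hidden); attention_mask: (batch, seq_len) of 0/1
    last = attention_mask.sum(dim=1) - 1           # index of each sequence's final real token
    rows = torch.arange(token_embeddings.size(0))
    return token_embeddings[rows, last]            # (batch, hidden) == (batch, 2048) here
```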
README.md CHANGED
@@ -1,7 +1,12 @@
 ---
 license: apache-2.0
 pipeline_tag: feature-extraction
-library_name: transformers
+library_name: sentence-transformers
+tags:
+- transformers
+- sentence-transformers
+- feature-extraction
+- multimodal-embedding
 ---
 
 # LCO-Embedding: Scaling Language-Centric Omnimodal Representation Learning
@@ -19,6 +24,67 @@ This model implements the framework presented in the paper [Scaling Language-Cen
 
 Note: We are only using the `thinker` component of Qwen2.5 Omni and dropping the `talker` component.
 
+### Using Sentence Transformers
+
+Install Sentence Transformers:
+```bash
+pip install "sentence_transformers[image]"
+```
+
+```python
+import torch
+from sentence_transformers import SentenceTransformer
+
+model = SentenceTransformer(
+    "LCO-Embedding/LCO-Embedding-Omni-3B",
+    trust_remote_code=True,
+    model_kwargs={"dtype": torch.bfloat16},
+)
+
+# The same "Summarize the above <modality> in one word:" instruction used in
+# the paper is baked into the chat template, so encode() takes plain text or
+# multimodal dicts directly.
+texts = [
+    "The capital of France is Paris.",
+    "Paris is the capital city of France.",
+    "The Eiffel Tower is located in Paris.",
+    "Berlin is the capital of Germany.",
+]
+text_embeddings = model.encode(texts)
+print(text_embeddings.shape)
+# (4, 2048)
+
+text_similarities = model.similarity(text_embeddings, text_embeddings)
+print(text_similarities)
+# tensor([[1.0000, 0.9538, 0.6566, 0.5988],
+#         [0.9538, 1.0000, 0.7059, 0.5932],
+#         [0.6566, 0.7059, 1.0000, 0.4198],
+#         [0.5988, 0.5932, 0.4198, 1.0000]])
+
+# Encoding images (text, audio, and video also work, individually or combined using a dict input):
+image_embeddings = model.encode([
+    "path/to/image_1.png",
+    "path/to/image_2.png",
+])
+print(image_embeddings.shape)
+# (2, 2048)
+
+# Multimodal inputs can mix modalities via dicts (text + image + audio + video):
+queries = ["A diagram of the Qwen2.5-Omni architecture"]
+documents = [
+    {"image": "path/to/qwen_diagram.png"},
+    {"text": "Llama 4 architecture overview", "image": "path/to/llama_diagram.png"},
+]
+query_embeddings = model.encode(queries)
+document_embeddings = model.encode(documents)
+
+similarities = model.similarity(query_embeddings, document_embeddings)
+print(similarities.shape)
+# torch.Size([1, 2])
+```
+
+### Using Transformers
+
 ```python
 from transformers import Qwen2_5OmniThinkerForConditionalGeneration, Qwen2_5OmniProcessor
 from qwen_omni_utils import process_mm_info
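The README's comments state that audio and video follow the same dict convention as the image examples; a hedged sketch of that path (the file names are placeholders):

```python
import torch
from sentence_transformers import SentenceTransformer

model = SentenceTransformer(
    "LCO-Embedding/LCO-Embedding-Omni-3B",
    trust_remote_code=True,
    model_kwargs={"dtype": torch.bfloat16},
)

# Audio and video inputs via the dict convention from the README above;
# paths are placeholders for real local files.
audio_embeddings = model.encode([{"audio": "path/to/clip.wav"}])
video_embeddings = model.encode([{"video": "path/to/clip.mp4"}])
print(audio_embeddings.shape, video_embeddings.shape)
# (1, 2048) (1, 2048)
```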
additional_chat_templates/sentence_transformers.jinja ADDED
@@ -0,0 +1,48 @@
+{%- set audio_count = namespace(value=0) -%}
+{%- set image_count = namespace(value=0) -%}
+{%- set video_count = namespace(value=0) -%}
+{%- for message in messages -%}
+{%- if loop.first and message['role'] != 'system' -%}
+<|im_start|>system
+You are a helpful assistant.<|im_end|>
+{% endif -%}
+<|im_start|>{{ message['role'] }}
+{% if message['content'] is string -%}
+{{- message['content'] -}}<|im_end|>
+{% else -%}
+{%- set seen = namespace(image=false, audio=false, video=false) -%}
+{%- for content in message['content'] -%}
+{%- if content['type'] == 'image' or 'image' in content or 'image_url' in content -%}
+{%- set image_count.value = image_count.value + 1 -%}
+{%- set seen.image = true -%}
+{%- if add_vision_id -%}Picture {{ image_count.value }}: {% endif -%}
+<|vision_bos|><|IMAGE|><|vision_eos|>
+{%- elif content['type'] == 'audio' or 'audio' in content or 'audio_url' in content -%}
+{%- set audio_count.value = audio_count.value + 1 -%}
+{%- set seen.audio = true -%}
+{%- if add_audio_id -%}Audio {{ audio_count.value }}: {% endif -%}
+<|audio_bos|><|AUDIO|><|audio_eos|>
+{%- elif content['type'] == 'video' or 'video' in content -%}
+{%- set video_count.value = video_count.value + 1 -%}
+{%- set seen.video = true -%}
+{%- if add_vision_id -%}Video {{ video_count.value }}: {% endif -%}
+<|vision_bos|><|VIDEO|><|vision_eos|>
+{%- elif 'text' in content -%}
+{{- content['text'] -}}
+{%- endif -%}
+{%- endfor -%}
+{%- if seen.image -%}
+{{ '\n' }}Summarize the above image in one word:
+{%- elif seen.video -%}
+{{ '\n' }}Summarize the above video in one word:
+{%- elif seen.audio -%}
+{{ '\n' }}Summarize the above audio in one word:
+{%- else -%}
+{{ '\n' }}Summarize the above text in one word:
+{%- endif -%}
+<|im_end|>
+{% endif -%}
+{%- endfor -%}
+{%- if add_generation_prompt -%}
+<|im_start|>assistant
+{% endif -%}
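To see the exact prompt this template wraps around an input, one can render it directly with jinja2 (a sketch, run from the repository root; the real path goes through the processor's chat templating, and `namespace` requires jinja2 ≥ 2.10):

```python
from jinja2 import Template

with open("additional_chat_templates/sentence_transformers.jinja") as f:
    template = Template(f.read())

# Render a single-image user message to inspect the embedded instruction.
prompt = template.render(
    messages=[{"role": "user", "content": [{"type": "image"}]}],
    add_generation_prompt=True,
)
print(prompt)
# Prints roughly:
# <|im_start|>system
# You are a helpful assistant.<|im_end|>
# <|im_start|>user
# <|vision_bos|><|IMAGE|><|vision_eos|>
# Summarize the above image in one word:<|im_end|>
# <|im_start|>assistant
```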
chat_template.jinja ADDED
@@ -0,0 +1,7 @@
+{% set audio_count = namespace(value=0) %}{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system
+You are a helpful assistant.<|im_end|>
+{% endif %}<|im_start|>{{ message['role'] }}
+{% if message['content'] is string %}{{ message['content'] }}<|im_end|>
+{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_bos|><|IMAGE|><|vision_eos|>{% elif content['type'] == 'audio' or 'audio' in content or 'audio_url' in content %}{% set audio_count.value = audio_count.value + 1 %}{% if add_audio_id %}Audio {{ audio_count.value }}: {% endif %}<|audio_bos|><|AUDIO|><|audio_eos|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_bos|><|VIDEO|><|vision_eos|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>
+{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant
+{% endif %}
chat_template.json DELETED
@@ -1,3 +0,0 @@
-{
-  "chat_template": "{% set audio_count = namespace(value=0) %}{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_bos|><|IMAGE|><|vision_eos|>{% elif content['type'] == 'audio' or 'audio' in content or 'audio_url' in content %}{% set audio_count.value = audio_count.value + 1 %}{% if add_audio_id %}Audio {{ audio_count.value }}: {% endif %}<|audio_bos|><|AUDIO|><|audio_eos|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_bos|><|VIDEO|><|vision_eos|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
-}
config.json CHANGED
@@ -3,6 +3,10 @@
   "architectures": [
     "Qwen2_5OmniThinkerForConditionalGeneration"
   ],
+  "auto_map": {
+    "AutoConfig": "modeling_lco_omni.Qwen2_5OmniThinkerConfig",
+    "AutoModel": "modeling_lco_omni.Qwen2_5OmniThinkerForConditionalGeneration"
+  },
   "audio_config": {
     "_attn_implementation_autoset": true,
     "activation_dropout": 0.0,
config_sentence_transformers.json ADDED
@@ -0,0 +1,11 @@
+{
+  "__version__": {
+    "pytorch": "2.10.0+cu128",
+    "sentence_transformers": "5.5.0.dev0",
+    "transformers": "5.5.0.dev0"
+  },
+  "default_prompt_name": null,
+  "model_type": "SentenceTransformer",
+  "prompts": {},
+  "similarity_fn_name": "cosine"
+}
modeling_lco_omni.py ADDED
@@ -0,0 +1,8 @@
+# Re-exported so `auto_map` in config.json can resolve the Thinker classes;
+# `qwen2_5_omni_thinker` is shipped by transformers but not in `AutoConfig`.
+from transformers import Qwen2_5OmniThinkerConfig, Qwen2_5OmniThinkerForConditionalGeneration
+
+__all__ = [
+    "Qwen2_5OmniThinkerConfig",
+    "Qwen2_5OmniThinkerForConditionalGeneration",
+]
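With the `auto_map` entries added to config.json above pointing at these re-exports, loading through the Auto classes should work roughly as follows (a sketch; the repository id is the one used in the README):

```python
from transformers import AutoConfig, AutoModel

# trust_remote_code lets the Auto classes follow auto_map into modeling_lco_omni.py
config = AutoConfig.from_pretrained("LCO-Embedding/LCO-Embedding-Omni-3B", trust_remote_code=True)
model = AutoModel.from_pretrained("LCO-Embedding/LCO-Embedding-Omni-3B", trust_remote_code=True)
print(type(model).__name__)
# Qwen2_5OmniThinkerForConditionalGeneration
```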
modules.json ADDED
@@ -0,0 +1,20 @@
+[
+  {
+    "idx": 0,
+    "name": "0",
+    "path": "",
+    "type": "sentence_transformers.base.modules.transformer.Transformer"
+  },
+  {
+    "idx": 1,
+    "name": "1",
+    "path": "1_Pooling",
+    "type": "sentence_transformers.sentence_transformer.modules.pooling.Pooling"
+  },
+  {
+    "idx": 2,
+    "name": "2",
+    "path": "2_Normalize",
+    "type": "sentence_transformers.sentence_transformer.modules.normalize.Normalize"
+  }
+]
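Because the module pipeline ends in `2_Normalize` and config_sentence_transformers.json sets `similarity_fn_name` to cosine, cosine similarity over the final embeddings reduces to a plain dot product; a small illustrative sketch with stand-in tensors:

```python
import torch
import torch.nn.functional as F

pooled = torch.randn(4, 2048)          # stand-in for the Pooling module's output
normed = F.normalize(pooled, dim=-1)   # the 2_Normalize step: unit-length rows
cosine = normed @ normed.T             # dot product of unit vectors == cosine similarity
print(torch.allclose(cosine, F.cosine_similarity(pooled.unsqueeze(1), pooled.unsqueeze(0), dim=-1)))
# True
```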
sentence_bert_config.json ADDED
@@ -0,0 +1,48 @@
+{
+  "transformer_task": "feature-extraction",
+  "modality_config": {
+    "text": {
+      "method": "forward",
+      "method_output_name": [
+        "hidden_states",
+        -1
+      ]
+    },
+    "image": {
+      "method": "forward",
+      "method_output_name": [
+        "hidden_states",
+        -1
+      ]
+    },
+    "audio": {
+      "method": "forward",
+      "method_output_name": [
+        "hidden_states",
+        -1
+      ]
+    },
+    "video": {
+      "method": "forward",
+      "method_output_name": [
+        "hidden_states",
+        -1
+      ]
+    },
+    "message": {
+      "method": "forward",
+      "method_output_name": [
+        "hidden_states",
+        -1
+      ],
+      "format": "structured"
+    }
+  },
+  "module_output_name": "token_embeddings",
+  "processing_kwargs": {
+    "chat_template": {
+      "chat_template": "sentence_transformers",
+      "add_generation_prompt": true
+    }
+  }
+}
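Spelled out, this config says every modality is embedded by running the thinker's `forward`, taking the last entry of `hidden_states` as `token_embeddings` (which then feeds the Pooling and Normalize modules), after formatting inputs with the `sentence_transformers` chat template and a generation prompt. A hedged sketch of that flow using plain transformers; it assumes the processor resolves the named template saved under additional_chat_templates/:

```python
import torch
from transformers import AutoModel, AutoProcessor

repo = "LCO-Embedding/LCO-Embedding-Omni-3B"
model = AutoModel.from_pretrained(repo, trust_remote_code=True, dtype=torch.bfloat16)
processor = AutoProcessor.from_pretrained(repo, trust_remote_code=True)

messages = [{"role": "user", "content": [{"type": "text", "text": "The capital of France is Paris."}]}]
# processing_kwargs above: render with the "sentence_transformers" template plus a generation prompt.
text = processor.apply_chat_template(
    messages, chat_template="sentence_transformers", add_generation_prompt=True, tokenize=False
)
inputs = processor(text=[text], return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs, output_hidden_states=True)
token_embeddings = outputs.hidden_states[-1]  # "method_output_name": ["hidden_states", -1]
```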