Tom Aarsen committed
Commit 8d65917 · 1 Parent(s): 36bbcd9

Integrate with Sentence Transformers

1_Pooling/config.json ADDED
@@ -0,0 +1,5 @@
+{
+  "embedding_dimension": 2048,
+  "pooling_mode": "lasttoken",
+  "include_prompt": true
+}
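The pooling config above means each embedding is the final hidden state of the last token of the rendered prompt (2048 dimensions), and `include_prompt: true` keeps the instruction tokens in the pooled sequence rather than excluding them. A minimal sketch of last-token pooling in plain PyTorch (dummy tensors; not the Sentence Transformers implementation itself):

```python
import torch

def last_token_pool(token_embeddings: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    """Pick the hidden state of the last non-padding token in each sequence."""
    last_idx = attention_mask.sum(dim=1) - 1            # index of the last attended token per row
    batch_idx = torch.arange(token_embeddings.size(0))
    return token_embeddings[batch_idx, last_idx]        # (batch, hidden)

# Dummy batch: 2 sequences, 5 tokens, 2048-dim hidden states as in the config above.
hidden = torch.randn(2, 5, 2048)
mask = torch.tensor([[1, 1, 1, 0, 0], [1, 1, 1, 1, 1]])
print(last_token_pool(hidden, mask).shape)  # torch.Size([2, 2048])
```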
README.md CHANGED
@@ -1,7 +1,12 @@
 ---
 license: apache-2.0
 pipeline_tag: feature-extraction
-library_name: transformers
+library_name: sentence-transformers
+tags:
+- transformers
+- sentence-transformers
+- feature-extraction
+- multimodal-embedding
 ---

 # LCO-Embedding: Scaling Language-Centric Omnimodal Representation Learning
@@ -19,6 +24,67 @@ This model implements the framework presented in the paper [Scaling Language-Cen

 Note: We only use the `thinker` component of Qwen2.5 Omni and drop the `talker` component.

+### Using Sentence Transformers
+
+Install Sentence Transformers:
+```bash
+pip install "sentence_transformers[image]"
+```
+
+```python
+import torch
+from sentence_transformers import SentenceTransformer
+
+model = SentenceTransformer(
+    "LCO-Embedding/LCO-Embedding-Omni-3B",
+    trust_remote_code=True,
+    model_kwargs={"dtype": torch.bfloat16},
+)
+
+# The same "Summarize the above <modality> in one word:" instruction used in
+# the paper is baked into the chat template, so encode() takes plain text or
+# multimodal dicts directly.
+texts = [
+    "The capital of France is Paris.",
+    "Paris is the capital city of France.",
+    "The Eiffel Tower is located in Paris.",
+    "Berlin is the capital of Germany.",
+]
+text_embeddings = model.encode(texts)
+print(text_embeddings.shape)
+# (4, 2048)
+
+text_similarities = model.similarity(text_embeddings, text_embeddings)
+print(text_similarities)
+# tensor([[1.0000, 0.9538, 0.6566, 0.5988],
+#         [0.9538, 1.0000, 0.7059, 0.5932],
+#         [0.6566, 0.7059, 1.0000, 0.4198],
+#         [0.5988, 0.5932, 0.4198, 1.0000]])
+
+# Encoding images (text, audio, and video also work, individually or combined using a dict input):
+image_embeddings = model.encode([
+    "path/to/image_1.png",
+    "path/to/image_2.png",
+])
+print(image_embeddings.shape)
+# (2, 2048)
+
+# Multimodal inputs can mix modalities via dicts (text + image + audio + video):
+queries = ["A diagram of the Qwen2.5-Omni architecture"]
+documents = [
+    {"image": "path/to/qwen_diagram.png"},
+    {"text": "Llama 4 architecture overview", "image": "path/to/llama_diagram.png"},
+]
+query_embeddings = model.encode(queries)
+document_embeddings = model.encode(documents)
+
+similarities = model.similarity(query_embeddings, document_embeddings)
+print(similarities.shape)
+# torch.Size([1, 2])
+```
+
+### Using Transformers
+
 ```python
 from transformers import Qwen2_5OmniThinkerForConditionalGeneration, Qwen2_5OmniProcessor
 from qwen_omni_utils import process_mm_info
additional_chat_templates/sentence_transformers.jinja ADDED
@@ -0,0 +1,48 @@
+{%- set audio_count = namespace(value=0) -%}
+{%- set image_count = namespace(value=0) -%}
+{%- set video_count = namespace(value=0) -%}
+{%- for message in messages -%}
+{%- if loop.first and message['role'] != 'system' -%}
+<|im_start|>system
+You are a helpful assistant.<|im_end|>
+{% endif -%}
+<|im_start|>{{ message['role'] }}
+{% if message['content'] is string -%}
+{{- message['content'] -}}<|im_end|>
+{% else -%}
+{%- set seen = namespace(image=false, audio=false, video=false) -%}
+{%- for content in message['content'] -%}
+{%- if content['type'] == 'image' or 'image' in content or 'image_url' in content -%}
+{%- set image_count.value = image_count.value + 1 -%}
+{%- set seen.image = true -%}
+{%- if add_vision_id -%}Picture {{ image_count.value }}: {% endif -%}
+<|vision_bos|><|IMAGE|><|vision_eos|>
+{%- elif content['type'] == 'audio' or 'audio' in content or 'audio_url' in content -%}
+{%- set audio_count.value = audio_count.value + 1 -%}
+{%- set seen.audio = true -%}
+{%- if add_audio_id -%}Audio {{ audio_count.value }}: {% endif -%}
+<|audio_bos|><|AUDIO|><|audio_eos|>
+{%- elif content['type'] == 'video' or 'video' in content -%}
+{%- set video_count.value = video_count.value + 1 -%}
+{%- set seen.video = true -%}
+{%- if add_vision_id -%}Video {{ video_count.value }}: {% endif -%}
+<|vision_bos|><|VIDEO|><|vision_eos|>
+{%- elif 'text' in content -%}
+{{- content['text'] -}}
+{%- endif -%}
+{%- endfor -%}
+{%- if seen.image -%}
+{{ '\n' }}Summarize the above image in one word:
+{%- elif seen.video -%}
+{{ '\n' }}Summarize the above video in one word:
+{%- elif seen.audio -%}
+{{ '\n' }}Summarize the above audio in one word:
+{%- else -%}
+{{ '\n' }}Summarize the above text in one word:
+{%- endif -%}
+<|im_end|>
+{% endif -%}
+{%- endfor -%}
+{%- if add_generation_prompt -%}
+<|im_start|>assistant
+{% endif -%}
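Unlike the default generation template in the next file, this template appends the paper's "Summarize the above <modality> in one word:" instruction after the user content. A rough sketch of rendering it by hand, assuming a recent transformers build that loads `additional_chat_templates/*.jinja` as named templates selectable via `chat_template=` (output shown approximately):

```python
from transformers import Qwen2_5OmniProcessor

processor = Qwen2_5OmniProcessor.from_pretrained("LCO-Embedding/LCO-Embedding-Omni-3B")

messages = [{"role": "user", "content": [{"type": "text", "text": "The capital of France is Paris."}]}]
prompt = processor.apply_chat_template(
    messages,
    chat_template="sentence_transformers",  # the template file above, selected by name
    add_generation_prompt=True,
    tokenize=False,
)
print(prompt)
# <|im_start|>system
# You are a helpful assistant.<|im_end|>
# <|im_start|>user
# The capital of France is Paris.
# Summarize the above text in one word:<|im_end|>
# <|im_start|>assistant
```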
chat_template.jinja ADDED
@@ -0,0 +1,7 @@
+{% set audio_count = namespace(value=0) %}{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system
+You are a helpful assistant.<|im_end|>
+{% endif %}<|im_start|>{{ message['role'] }}
+{% if message['content'] is string %}{{ message['content'] }}<|im_end|>
+{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_bos|><|IMAGE|><|vision_eos|>{% elif content['type'] == 'audio' or 'audio' in content or 'audio_url' in content %}{% set audio_count.value = audio_count.value + 1 %}{% if add_audio_id %}Audio {{ audio_count.value }}: {% endif %}<|audio_bos|><|AUDIO|><|audio_eos|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_bos|><|VIDEO|><|vision_eos|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>
+{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant
+{% endif %}
chat_template.json DELETED
@@ -1,3 +0,0 @@
-{
-  "chat_template": "{% set audio_count = namespace(value=0) %}{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_bos|><|IMAGE|><|vision_eos|>{% elif content['type'] == 'audio' or 'audio' in content or 'audio_url' in content %}{% set audio_count.value = audio_count.value + 1 %}{% if add_audio_id %}Audio {{ audio_count.value }}: {% endif %}<|audio_bos|><|AUDIO|><|audio_eos|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_bos|><|VIDEO|><|vision_eos|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
-}
config.json CHANGED
@@ -3,6 +3,10 @@
   "architectures": [
     "Qwen2_5OmniThinkerForConditionalGeneration"
   ],
+  "auto_map": {
+    "AutoConfig": "modeling_lco_omni.Qwen2_5OmniThinkerConfig",
+    "AutoModel": "modeling_lco_omni.Qwen2_5OmniThinkerForConditionalGeneration"
+  },
   "audio_config": {
     "_attn_implementation_autoset": true,
     "activation_dropout": 0.0,
config_sentence_transformers.json ADDED
@@ -0,0 +1,11 @@
+{
+  "__version__": {
+    "pytorch": "2.10.0+cu128",
+    "sentence_transformers": "5.5.0.dev0",
+    "transformers": "5.5.0.dev0"
+  },
+  "default_prompt_name": null,
+  "model_type": "SentenceTransformer",
+  "prompts": {},
+  "similarity_fn_name": "cosine"
+}
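`similarity_fn_name: "cosine"` is what `model.similarity()` in the README uses; because the pipeline ends in a Normalize module (see `modules.json` below), cosine similarity coincides with a plain dot product on the returned embeddings. A quick stand-in check with unit-norm dummy vectors:

```python
import torch
import torch.nn.functional as F

# Stand-in for model.encode(texts, convert_to_tensor=True), which is already L2-normalized.
emb = F.normalize(torch.randn(4, 2048), dim=-1)

cosine = F.cosine_similarity(emb.unsqueeze(1), emb.unsqueeze(0), dim=-1)
dot = emb @ emb.T
print(torch.allclose(cosine, dot, atol=1e-5))  # True: cosine == dot product for unit-norm vectors
```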
modeling_lco_omni.py ADDED
@@ -0,0 +1,8 @@
+# Re-exported so `auto_map` in config.json can resolve the Thinker classes;
+# `qwen2_5_omni_thinker` is shipped by transformers but not in `AutoConfig`.
+from transformers import Qwen2_5OmniThinkerConfig, Qwen2_5OmniThinkerForConditionalGeneration
+
+__all__ = [
+    "Qwen2_5OmniThinkerConfig",
+    "Qwen2_5OmniThinkerForConditionalGeneration",
+]
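Together with the `auto_map` entries added to `config.json` above, this shim lets the generic Auto classes resolve the Thinker when `trust_remote_code=True` is passed, which is also how the Sentence Transformers wrapper instantiates the backbone. A small sketch:

```python
from transformers import AutoConfig, AutoModel

repo = "LCO-Embedding/LCO-Embedding-Omni-3B"

# auto_map routes both Auto classes to the re-exports in modeling_lco_omni.py.
config = AutoConfig.from_pretrained(repo, trust_remote_code=True)
model = AutoModel.from_pretrained(repo, trust_remote_code=True)

print(type(config).__name__)  # Qwen2_5OmniThinkerConfig
print(type(model).__name__)   # Qwen2_5OmniThinkerForConditionalGeneration
```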
modules.json ADDED
@@ -0,0 +1,20 @@
+[
+  {
+    "idx": 0,
+    "name": "0",
+    "path": "",
+    "type": "sentence_transformers.base.modules.transformer.Transformer"
+  },
+  {
+    "idx": 1,
+    "name": "1",
+    "path": "1_Pooling",
+    "type": "sentence_transformers.sentence_transformer.modules.pooling.Pooling"
+  },
+  {
+    "idx": 2,
+    "name": "2",
+    "path": "2_Normalize",
+    "type": "sentence_transformers.sentence_transformer.modules.normalize.Normalize"
+  }
+]
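`modules.json` wires the three stages of the pipeline: the Transformer backbone at the repo root, the last-token Pooling from `1_Pooling/`, and a trailing Normalize. After loading the model as in the README, the composition can be inspected; a sketch (the exact class reprs depend on the dev Sentence Transformers build listed in `config_sentence_transformers.json`):

```python
import torch
from sentence_transformers import SentenceTransformer

model = SentenceTransformer(
    "LCO-Embedding/LCO-Embedding-Omni-3B",
    trust_remote_code=True,
    model_kwargs={"dtype": torch.bfloat16},
)

# The three modules.json entries appear as children 0, 1, 2 of the pipeline.
for idx, module in enumerate(model):
    print(idx, type(module).__name__)
# e.g. 0 Transformer / 1 Pooling / 2 Normalize

print(model.get_sentence_embedding_dimension())  # 2048, matching 1_Pooling/config.json
```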
sentence_bert_config.json ADDED
@@ -0,0 +1,48 @@
+{
+  "transformer_task": "feature-extraction",
+  "modality_config": {
+    "text": {
+      "method": "forward",
+      "method_output_name": [
+        "hidden_states",
+        -1
+      ]
+    },
+    "image": {
+      "method": "forward",
+      "method_output_name": [
+        "hidden_states",
+        -1
+      ]
+    },
+    "audio": {
+      "method": "forward",
+      "method_output_name": [
+        "hidden_states",
+        -1
+      ]
+    },
+    "video": {
+      "method": "forward",
+      "method_output_name": [
+        "hidden_states",
+        -1
+      ]
+    },
+    "message": {
+      "method": "forward",
+      "method_output_name": [
+        "hidden_states",
+        -1
+      ],
+      "format": "structured"
+    }
+  },
+  "module_output_name": "token_embeddings",
+  "processing_kwargs": {
+    "chat_template": {
+      "chat_template": "sentence_transformers",
+      "add_generation_prompt": true
+    }
+  }
+}
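Every `modality_config` entry points at `hidden_states[-1]` of a plain `forward` call, and `processing_kwargs` applies the `sentence_transformers` chat template with a generation prompt. A hedged, text-only sketch of the same path in raw transformers, mirroring this config plus the Pooling and Normalize stages (not guaranteed to match `encode()` bit-for-bit):

```python
import torch
import torch.nn.functional as F
from transformers import Qwen2_5OmniProcessor, Qwen2_5OmniThinkerForConditionalGeneration

repo = "LCO-Embedding/LCO-Embedding-Omni-3B"
processor = Qwen2_5OmniProcessor.from_pretrained(repo)
model = Qwen2_5OmniThinkerForConditionalGeneration.from_pretrained(repo, torch_dtype=torch.bfloat16)

# Render the embedding prompt via the sentence_transformers chat template.
messages = [{"role": "user", "content": [{"type": "text", "text": "The capital of France is Paris."}]}]
prompt = processor.apply_chat_template(
    messages, chat_template="sentence_transformers", add_generation_prompt=True, tokenize=False
)
inputs = processor(text=[prompt], return_tensors="pt", padding=True)

with torch.no_grad():
    out = model(**inputs, output_hidden_states=True)

# modality_config -> hidden_states[-1]; then last-token pooling (1_Pooling) and L2 norm (2_Normalize).
token_embeddings = out.hidden_states[-1]
last_idx = inputs["attention_mask"].sum(dim=1) - 1
embedding = token_embeddings[torch.arange(token_embeddings.size(0)), last_idx]
embedding = F.normalize(embedding, p=2, dim=-1)
print(embedding.shape)  # torch.Size([1, 2048])
```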