Integrate with Sentence Transformers v5.4

#2
by tomaarsen HF Staff - opened
1_Pooling/config.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ {"word_embedding_dimension": 4096,}
3
+ "pooling_mode": "lasttoken",
4
+ "include_prompt": true
5
+ }
README.md CHANGED
@@ -1,6 +1,8 @@
1
  ---
2
- library_name: transformers
3
- tags: []
 
 
4
  ---
5
 
6
  # [E5-V: Universal Embeddings with Multimodal Large Language Models](https://arxiv.org/abs/2407.12580)
@@ -14,7 +16,52 @@ More details can be found in https://github.com/kongds/E5-V
14
 
15
 
16
 
17
- ## Example
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  ``` python
19
  import torch
20
  import torch.nn.functional as F
@@ -31,11 +78,15 @@ model = LlavaNextForConditionalGeneration.from_pretrained('royokong/e5-v', torch
31
  img_prompt = llama3_template.format('<image>\nSummary above image in one word: ')
32
  text_prompt = llama3_template.format('<sent>\nSummary above sentence in one word: ')
33
 
34
- urls = ['https://upload.wikimedia.org/wikipedia/commons/thumb/4/47/American_Eskimo_Dog.jpg/360px-American_Eskimo_Dog.jpg',
35
- 'https://upload.wikimedia.org/wikipedia/commons/thumb/b/b6/Felis_catus-cat_on_snow.jpg/179px-Felis_catus-cat_on_snow.jpg']
 
 
36
  images = [Image.open(requests.get(url, stream=True).raw) for url in urls]
37
 
38
  texts = ['A dog sitting in the grass.',
 
 
39
  'A cat standing in the snow.']
40
 
41
  text_inputs = processor([text_prompt.replace('<sent>', text) for text in texts], return_tensors="pt", padding=True).to('cuda')
@@ -49,6 +100,10 @@ with torch.no_grad():
49
  img_embs = F.normalize(img_embs, dim=-1)
50
 
51
  print(text_embs @ img_embs.t())
 
 
 
 
52
  ```
53
 
54
 
 
1
  ---
2
+ library_name: sentence-transformers
3
+ tags:
4
+ - sentence-transformers
5
+ pipeline_tag: sentence-similarity
6
  ---
7
 
8
  # [E5-V: Universal Embeddings with Multimodal Large Language Models](https://arxiv.org/abs/2407.12580)
 
16
 
17
 
18
 
19
+ ## Usage
20
+
21
+ ### Using Sentence Transformers
22
+
23
+ Install Sentence Transformers:
24
+ ```bash
25
+ pip install "sentence_transformers[image]"
26
+ ```
27
+
28
+ ```python
29
+ from sentence_transformers import SentenceTransformer
30
+
31
+ model = SentenceTransformer("royokong/e5-v")
32
+
33
+ # Encode text inputs
34
+ texts = [
35
+ "A dog sitting in the grass.",
36
+ "A dog standing in the snow.",
37
+ "A cat sitting in the grass.",
38
+ "A cat standing in the snow.",
39
+ ]
40
+ text_embeddings = model.encode(texts)
41
+ print(text_embeddings.shape)
42
+ # (4, 4096)
43
+
44
+ # Encode image inputs
45
+ images = [
46
+ "https://huggingface.co/royokong/e5-v/resolve/main/assets/dog.jpg",
47
+ "https://huggingface.co/royokong/e5-v/resolve/main/assets/cat.jpg",
48
+ ]
49
+ image_embeddings = model.encode(images)
50
+ print(image_embeddings.shape)
51
+ # (2, 4096)
52
+
53
+ # Compute text-image similarities
54
+ similarities = model.similarity(text_embeddings, image_embeddings)
55
+ print(similarities)
56
+ # tensor([[0.7183, 0.3579],
57
+ # [0.5806, 0.5522],
58
+ # [0.4714, 0.6479],
59
+ # [0.4150, 0.8081]])
60
+ ```
61
+
62
+ The model uses a custom chat template that automatically wraps text inputs with the instruction "Summary above sentence in one word:" and image inputs with "Summary above image in one word:".
63
+
64
+ ### Using transformers
65
  ``` python
66
  import torch
67
  import torch.nn.functional as F
 
78
  img_prompt = llama3_template.format('<image>\nSummary above image in one word: ')
79
  text_prompt = llama3_template.format('<sent>\nSummary above sentence in one word: ')
80
 
81
+ urls = [
82
+ 'https://huggingface.co/royokong/e5-v/resolve/main/assets/dog.jpg',
83
+ 'https://huggingface.co/royokong/e5-v/resolve/main/assets/cat.jpg',
84
+ ]
85
  images = [Image.open(requests.get(url, stream=True).raw) for url in urls]
86
 
87
  texts = ['A dog sitting in the grass.',
88
+ 'A dog standing in the snow.',
89
+ 'A cat sitting in the grass.',
90
  'A cat standing in the snow.']
91
 
92
  text_inputs = processor([text_prompt.replace('<sent>', text) for text in texts], return_tensors="pt", padding=True).to('cuda')
 
100
  img_embs = F.normalize(img_embs, dim=-1)
101
 
102
  print(text_embs @ img_embs.t())
103
+ # tensor([[0.7275, 0.3630],
104
+ # [0.5957, 0.5522],
105
+ # [0.4709, 0.6406],
106
+ # [0.4202, 0.7974]])
107
  ```
108
 
109
 
assets/cat.jpg ADDED
assets/dog.jpg ADDED
chat_template.jinja ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- for message in messages %}
2
+ {%- if message['role'] == 'system' %}
3
+ {%- elif message['role'] == 'user' %}
4
+ <|start_header_id|>user<|end_header_id|>
5
+
6
+ {% if message['content'] is string %}
7
+ {{- message['content'] }}{{ '\nSummary above sentence in one word: ' }}
8
+ {%- else %}
9
+ {%- set has_image = message['content'] | selectattr('type', 'equalto', 'image') | list | length > 0 %}
10
+ {%- for item in message['content'] %}
11
+ {%- if item['type'] == 'image' %}
12
+ {{- '<image>' }}
13
+ {%- elif item['type'] == 'text' %}
14
+ {{- item['text'] }}
15
+ {%- endif %}
16
+ {%- endfor %}
17
+ {%- if has_image %}
18
+ {{- '\nSummary above image in one word: ' }}
19
+ {%- else %}
20
+ {{- '\nSummary above sentence in one word: ' }}
21
+ {%- endif %}
22
+ {%- endif %}
23
+ {{- '<|eot_id|>' }}
24
+ {%- endif %}
25
+ {%- endfor %}
26
+ {%- if add_generation_prompt %}
27
+ <|start_header_id|>assistant<|end_header_id|>{{ '\n\n \n' }}
28
+ {%- endif %}
config_sentence_transformers.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "__version__": {
3
+ "pytorch": "2.10.0+cu128",
4
+ "sentence_transformers": "5.4.0",
5
+ "transformers": "5.5.0"
6
+ },
7
+ "default_prompt_name": null,
8
+ "model_type": "SentenceTransformer",
9
+ "prompts": {},
10
+ "similarity_fn_name": "cosine"
11
+ }
modules.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "idx": 0,
4
+ "name": "0",
5
+ "path": "",
6
+ "type": "sentence_transformers.models.Transformer"
7
+ },
8
+ {
9
+ "idx": 1,
10
+ "name": "1",
11
+ "path": "1_Pooling",
12
+ "type": "sentence_transformers.models.Pooling"
13
+ },
14
+ {
15
+ "idx": 2,
16
+ "name": "2",
17
+ "path": "2_Normalize",
18
+ "type": "sentence_transformers.models.Normalize"
19
+ }
20
+ ]
processor_config.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "patch_size": 14,
3
+ "processor_class": "LlavaNextProcessor",
4
+ "vision_feature_select_strategy": "full"
5
+ }
sentence_bert_config.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "transformer_task": "feature-extraction",
3
+ "modality_config": {
4
+ "text": {
5
+ "method": "forward",
6
+ "method_output_name": "last_hidden_state"
7
+ },
8
+ "image": {
9
+ "method": "forward",
10
+ "method_output_name": "last_hidden_state"
11
+ },
12
+ "message": {
13
+ "method": "forward",
14
+ "method_output_name": "last_hidden_state",
15
+ "format": "structured"
16
+ }
17
+ },
18
+ "module_output_name": "token_embeddings",
19
+ "processing_kwargs": {
20
+ "chat_template": {
21
+ "add_generation_prompt": true
22
+ }
23
+ }
24
+ }