Integrate with Sentence Transformers v5.4

#2
by tomaarsen HF Staff - opened
1_Pooling/config.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ {"word_embedding_dimension": 4096,}
3
+ "pooling_mode": "lasttoken",
4
+ "include_prompt": true
5
+ }
README.md CHANGED
@@ -1,6 +1,8 @@
1
  ---
2
- library_name: transformers
3
- tags: []
 
 
4
  ---
5
 
6
  # [E5-V: Universal Embeddings with Multimodal Large Language Models](https://arxiv.org/abs/2407.12580)
@@ -14,7 +16,52 @@ More details can be found in https://github.com/kongds/E5-V
14
 
15
 
16
 
17
- ## Example
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  ``` python
19
  import torch
20
  import torch.nn.functional as F
@@ -31,11 +78,15 @@ model = LlavaNextForConditionalGeneration.from_pretrained('royokong/e5-v', torch
31
  img_prompt = llama3_template.format('<image>\nSummary above image in one word: ')
32
  text_prompt = llama3_template.format('<sent>\nSummary above sentence in one word: ')
33
 
34
- urls = ['https://upload.wikimedia.org/wikipedia/commons/thumb/4/47/American_Eskimo_Dog.jpg/360px-American_Eskimo_Dog.jpg',
35
- 'https://upload.wikimedia.org/wikipedia/commons/thumb/b/b6/Felis_catus-cat_on_snow.jpg/179px-Felis_catus-cat_on_snow.jpg']
 
 
36
  images = [Image.open(requests.get(url, stream=True).raw) for url in urls]
37
 
38
  texts = ['A dog sitting in the grass.',
 
 
39
  'A cat standing in the snow.']
40
 
41
  text_inputs = processor([text_prompt.replace('<sent>', text) for text in texts], return_tensors="pt", padding=True).to('cuda')
@@ -49,6 +100,10 @@ with torch.no_grad():
49
  img_embs = F.normalize(img_embs, dim=-1)
50
 
51
  print(text_embs @ img_embs.t())
 
 
 
 
52
  ```
53
 
54
 
 
1
  ---
2
+ library_name: sentence-transformers
3
+ tags:
4
+ - sentence-transformers
5
+ pipeline_tag: sentence-similarity
6
  ---
7
 
8
  # [E5-V: Universal Embeddings with Multimodal Large Language Models](https://arxiv.org/abs/2407.12580)
 
16
 
17
 
18
 
19
+ ## Usage
20
+
21
+ ### Using Sentence Transformers
22
+
23
+ Install Sentence Transformers:
24
+ ```bash
25
+ pip install "sentence_transformers[image]"
26
+ ```
27
+
28
+ ```python
29
+ from sentence_transformers import SentenceTransformer
30
+
31
+ model = SentenceTransformer("royokong/e5-v")
32
+
33
+ # Encode text inputs
34
+ texts = [
35
+ "A dog sitting in the grass.",
36
+ "A dog standing in the snow.",
37
+ "A cat sitting in the grass.",
38
+ "A cat standing in the snow.",
39
+ ]
40
+ text_embeddings = model.encode(texts)
41
+ print(text_embeddings.shape)
42
+ # (4, 4096)
43
+
44
+ # Encode image inputs
45
+ images = [
46
+ "https://huggingface.co/royokong/e5-v/resolve/main/assets/dog.jpg",
47
+ "https://huggingface.co/royokong/e5-v/resolve/main/assets/cat.jpg",
48
+ ]
49
+ image_embeddings = model.encode(images)
50
+ print(image_embeddings.shape)
51
+ # (2, 4096)
52
+
53
+ # Compute text-image similarities
54
+ similarities = model.similarity(text_embeddings, image_embeddings)
55
+ print(similarities)
56
+ # tensor([[0.7183, 0.3579],
57
+ # [0.5806, 0.5522],
58
+ # [0.4714, 0.6479],
59
+ # [0.4150, 0.8081]])
60
+ ```
61
+
62
+ The model uses a custom chat template that automatically wraps text inputs with the instruction "Summary above sentence in one word:" and image inputs with "Summary above image in one word:".
63
+
64
+ ### Using transformers
65
  ``` python
66
  import torch
67
  import torch.nn.functional as F
 
78
  img_prompt = llama3_template.format('<image>\nSummary above image in one word: ')
79
  text_prompt = llama3_template.format('<sent>\nSummary above sentence in one word: ')
80
 
81
+ urls = [
82
+ 'https://huggingface.co/royokong/e5-v/resolve/main/assets/dog.jpg',
83
+ 'https://huggingface.co/royokong/e5-v/resolve/main/assets/cat.jpg',
84
+ ]
85
  images = [Image.open(requests.get(url, stream=True).raw) for url in urls]
86
 
87
  texts = ['A dog sitting in the grass.',
88
+ 'A dog standing in the snow.',
89
+ 'A cat sitting in the grass.',
90
  'A cat standing in the snow.']
91
 
92
  text_inputs = processor([text_prompt.replace('<sent>', text) for text in texts], return_tensors="pt", padding=True).to('cuda')
 
100
  img_embs = F.normalize(img_embs, dim=-1)
101
 
102
  print(text_embs @ img_embs.t())
103
+ # tensor([[0.7275, 0.3630],
104
+ # [0.5957, 0.5522],
105
+ # [0.4709, 0.6406],
106
+ # [0.4202, 0.7974]])
107
  ```
108
 
109
 
assets/cat.jpg ADDED
assets/dog.jpg ADDED
chat_template.jinja ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- for message in messages %}
2
+ {%- if message['role'] == 'system' %}
3
+ {%- elif message['role'] == 'user' %}
4
+ <|start_header_id|>user<|end_header_id|>
5
+
6
+ {% if message['content'] is string %}
7
+ {{- message['content'] }}{{ '\nSummary above sentence in one word: ' }}
8
+ {%- else %}
9
+ {%- set has_image = message['content'] | selectattr('type', 'equalto', 'image') | list | length > 0 %}
10
+ {%- for item in message['content'] %}
11
+ {%- if item['type'] == 'image' %}
12
+ {{- '<image>' }}
13
+ {%- elif item['type'] == 'text' %}
14
+ {{- item['text'] }}
15
+ {%- endif %}
16
+ {%- endfor %}
17
+ {%- if has_image %}
18
+ {{- '\nSummary above image in one word: ' }}
19
+ {%- else %}
20
+ {{- '\nSummary above sentence in one word: ' }}
21
+ {%- endif %}
22
+ {%- endif %}
23
+ {{- '<|eot_id|>' }}
24
+ {%- endif %}
25
+ {%- endfor %}
26
+ {%- if add_generation_prompt %}
27
+ <|start_header_id|>assistant<|end_header_id|>{{ '\n\n \n' }}
28
+ {%- endif %}
config_sentence_transformers.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "__version__": {
3
+ "pytorch": "2.10.0+cu128",
4
+ "sentence_transformers": "5.4.0",
5
+ "transformers": "5.5.0"
6
+ },
7
+ "default_prompt_name": null,
8
+ "model_type": "SentenceTransformer",
9
+ "prompts": {},
10
+ "similarity_fn_name": "cosine"
11
+ }
modules.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "idx": 0,
4
+ "name": "0",
5
+ "path": "",
6
+ "type": "sentence_transformers.models.Transformer"
7
+ },
8
+ {
9
+ "idx": 1,
10
+ "name": "1",
11
+ "path": "1_Pooling",
12
+ "type": "sentence_transformers.models.Pooling"
13
+ },
14
+ {
15
+ "idx": 2,
16
+ "name": "2",
17
+ "path": "2_Normalize",
18
+ "type": "sentence_transformers.models.Normalize"
19
+ }
20
+ ]
processor_config.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "patch_size": 14,
3
+ "processor_class": "LlavaNextProcessor",
4
+ "vision_feature_select_strategy": "full"
5
+ }
sentence_bert_config.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "transformer_task": "feature-extraction",
3
+ "modality_config": {
4
+ "text": {
5
+ "method": "forward",
6
+ "method_output_name": "last_hidden_state"
7
+ },
8
+ "image": {
9
+ "method": "forward",
10
+ "method_output_name": "last_hidden_state"
11
+ },
12
+ "message": {
13
+ "method": "forward",
14
+ "method_output_name": "last_hidden_state",
15
+ "format": "structured"
16
+ }
17
+ },
18
+ "module_output_name": "token_embeddings",
19
+ "processing_kwargs": {
20
+ "chat_template": {
21
+ "add_generation_prompt": true
22
+ }
23
+ }
24
+ }