---
language: en
license: apache-2.0
tags:
- human-behavior
- multimodal
- qwen2.5-omni
- sarcasm-detection
- sarcasm
datasets:
- keentomato/human_behavior_atlas
---
# OmniSapiens BAM — Sarcasm Detection
Fine-tuned [Qwen2.5-Omni-7B](https://huggingface.co/Qwen/Qwen2.5-Omni-7B) for multimodal sarcasm detection on the MUStARD/MMSD benchmark. Uses LoRA adapters merged into the backbone and a lightweight classification head.
## Benchmark
Evaluated on [keentomato/human_behavior_atlas](https://huggingface.co/datasets/keentomato/human_behavior_atlas).
## Usage
### Installation
```bash
# accelerate is required by device_map="auto" in the loading code below
pip install transformers torch huggingface_hub accelerate
```
### Classification
```python
import json, torch
from huggingface_hub import hf_hub_download
from transformers import Qwen2_5OmniThinkerForConditionalGeneration, AutoProcessor
MODEL_ID = "keentomato/omnisapiens_bam_sarcasm_detection"

# 1. Load backbone and processor
model = Qwen2_5OmniThinkerForConditionalGeneration.from_pretrained(
    MODEL_ID, torch_dtype=torch.float16, device_map="auto"
)
processor = AutoProcessor.from_pretrained(MODEL_ID)

# 2. Load classification heads and label scheme
heads_path = hf_hub_download(MODEL_ID, "heads.bin")
label_path = hf_hub_download(MODEL_ID, "label_scheme.json")
# heads.bin is a pickle-based checkpoint fetched from the Hub; weights_only=True
# limits unpickling to tensors and plain containers instead of arbitrary objects.
heads_sd = torch.load(heads_path, map_location="cpu", weights_only=True)
with open(label_path, encoding="utf-8") as f:
    label_scheme = json.load(f)

# 3. Reconstruct domain heads
global_classes = label_scheme["meta"]["global_classes"]  # {domain: [{index, label}, ...]}
hidden_size = model.config.hidden_size
domain_names = list(global_classes.keys())
domain_heads = torch.nn.ModuleList([
    torch.nn.Linear(hidden_size, len(global_classes[d])) for d in domain_names
])
domain_heads.load_state_dict({k.replace("heads.", ""): v for k, v in heads_sd.items()})
# Keep the heads in float32: the pooled features computed in step 5 are float32
# (the float mask promotes the fp16 hidden states), and nn.Linear raises a dtype
# mismatch error if its weights are fp16 while its input is fp32.
domain_heads.eval().to(model.device).float()
domain_to_id = {d: i for i, d in enumerate(domain_names)}

# 4. Prepare multimodal inputs
#    video_tensor:   [T, C, H, W] tensor or list of PIL images
#    audio_waveform: 1-D numpy array / tensor at 16 kHz
#    (both must be defined by the caller before running this step)
domain = "sarcasm"
messages = [{"role": "user", "content": [
    {"type": "video"},
    {"type": "audio"},
    {"type": "text", "text": "Classify the human behavior expressed."},
]}]
text = processor.apply_chat_template(messages, add_generation_prompt=False, tokenize=False)
inputs = processor(text=[text], videos=[video_tensor], audio=[audio_waveform], return_tensors="pt")
# Move to the model's device and cast floating-point features (pixels, audio
# features) to the model dtype; integer tensors such as input_ids and
# attention_mask keep their dtype.
inputs = inputs.to(model.device).to(model.dtype)

# 5. Forward pass — pool penultimate hidden layer, route through domain head
with torch.no_grad():
    out = model(**inputs, output_hidden_states=True, use_cache=False)
    h = out.hidden_states[-2]  # [B, T, H]
    mask = inputs["attention_mask"].unsqueeze(-1).float()
    pooled = (h * mask).sum(1) / mask.sum(1)  # [B, H], masked mean over tokens
    logits = domain_heads[domain_to_id[domain]](pooled.float())  # [B, K_d]

pred_idx = logits.argmax(dim=-1).item()  # .item() assumes a batch of one
label_name = global_classes[domain][pred_idx]["label"]
print(f"Predicted {domain}: {label_name}")
```
### Behavioral Descriptors (BAM Adapters)
Because `adapters.bin` is present in the repo, the model also supports side-channel
behavioral descriptors extracted with OpenPose (video) and OpenSmile (audio).
These pre-computed behavioral feature vectors are passed through lightweight MLP
adapters and added to the pooled backbone representation before the classification
head; as the snippet below shows, the raw video/audio inputs are still fed to the backbone.
**Video — OpenPose keypoints**
OpenPose produces a dict per clip with keys `pose`, `face`, `left_hand`, `right_hand`,
each a `[T, K, 2]` or `[T, K, 3]` tensor (T frames, K keypoints, x/y plus an optional confidence value).
```python
def prepare_video_feats(openpose_dict, temporal_mode="meanstd"):
    """OpenPose dict → pooled feature vector [D_v_pooled].

    Args:
        openpose_dict: Mapping that may contain "pose", "face", "left_hand"
            and "right_hand". Each present value is converted to a tensor and
            treated as [T, K, 2or3]; parts that are missing or None are skipped.
        temporal_mode: "meanstd" concatenates the per-feature mean and standard
            deviation over frames ([D_v*2]); any other value returns the
            per-feature mean only ([D_v]).

    Returns:
        A 1-D float tensor.

    Note:
        At least one part must be present; torch.cat is called on the collected
        parts and does not accept an empty list.
    """
    parts = []
    for key in ("pose", "face", "left_hand", "right_hand"):
        t = openpose_dict.get(key)  # [T, K, 2or3]
        if t is None:
            continue
        t = torch.as_tensor(t).float()[..., :2]  # drop confidence, keep x/y
        parts.append(t.reshape(t.shape[0], -1))  # [T, K*2]
    seq = torch.cat(parts, dim=-1)  # [T, D_v]
    mean = seq.mean(0)
    if temporal_mode != "meanstd":
        return mean  # [D_v]
    # The sample standard deviation of a single frame is NaN, which would
    # poison every downstream value; a one-frame clip has zero spread.
    std = seq.std(0) if seq.shape[0] > 1 else torch.zeros_like(mean)
    return torch.cat([mean, std])  # [D_v*2]
# Pool the clip, then add a leading batch axis for the adapter.
pooled_video = prepare_video_feats(openpose_dict)
video_feats = torch.unsqueeze(pooled_video, dim=0)  # [1, D_v_pooled]
```
**Audio — OpenSmile features**
OpenSmile produces a dict with key `features` → `[T, D_a]` or `[D_a]`.
```python
def prepare_audio_feats(opensmile_dict):
    """OpenSmile dict → L2-normalised feature vector [D_a].

    Args:
        opensmile_dict: Mapping whose "features" entry is converted to a float
            tensor of shape [D_a] or [T, D_a].

    Returns:
        A 1-D float tensor scaled to unit L2 norm. The norm is clamped to at
        least 1e-6, so an all-zero input is returned as zeros rather than NaN.
    """
    x = torch.as_tensor(opensmile_dict["features"]).float()
    if x.ndim == 2:
        # A [1, D_a] input collapses to its single frame (same values as
        # squeeze(0)). For T > 1, squeeze(0) would be a no-op and the function
        # would return a matrix, so frames are averaged to keep the [D_a] contract.
        # NOTE(review): confirm that time-averaging matches how multi-frame
        # features were pooled when the adapters were trained.
        x = x.mean(0)
    return x / x.norm(p=2).clamp_min(1e-6)
# Normalise the descriptor, then add a leading batch axis for the adapter.
normalised_audio = prepare_audio_feats(opensmile_dict)
audio_feats = torch.unsqueeze(normalised_audio, dim=0)  # [1, D_a]
```
**Loading and applying the adapters**
```python
import torch, torch.nn as nn
from huggingface_hub import hf_hub_download
# adapters.bin is a pickle-based checkpoint fetched from the Hub; weights_only=True
# limits unpickling to tensors and plain containers instead of arbitrary objects.
adapters_path = hf_hub_download(MODEL_ID, "adapters.bin")
adapters_sd = torch.load(adapters_path, map_location="cpu", weights_only=True)
# Infer architecture from saved weight shapes — no config needed
def _make_adapter(prefix, sd):
    """Rebuild one adapter (Linear → ReLU → Linear, scaled by ``alpha``) from ``sd``.

    Args:
        prefix: Name of the adapter inside ``sd``; its entries are the keys
            starting with ``"<prefix>."``.
        sd: State dict holding at least ``<prefix>.mlp.0.weight``,
            ``<prefix>.mlp.2.weight`` and ``<prefix>.alpha``.

    Returns:
        An ``nn.Module`` in eval mode whose forward is ``mlp(x) * alpha``.

    Raises:
        KeyError: If one of the required entries is missing.
    """
    scope = f"{prefix}."
    # Match on "prefix." (with the dot) so keys of a similarly named adapter can
    # never be re-mapped onto this adapter's parameters.
    local_sd = {k[len(scope):]: v for k, v in sd.items() if k.startswith(scope)}

    w0 = local_sd["mlp.0.weight"]  # [hidden, feat_dim]
    w2 = local_sd["mlp.2.weight"]  # [out_dim, hidden]
    feat_dim, hidden, out_dim = w0.shape[1], w0.shape[0], w2.shape[0]

    class _Adapter(nn.Module):
        def __init__(self):
            super().__init__()
            # Only create a bias when the checkpoint has one: with strict=False a
            # missing bias would otherwise silently keep its random initialisation.
            self.mlp = nn.Sequential(
                nn.Linear(feat_dim, hidden, bias="mlp.0.bias" in local_sd),
                nn.ReLU(),
                nn.Linear(hidden, out_dim, bias="mlp.2.bias" in local_sd),
            )
            # Clone so the parameter does not share storage with the state dict.
            self.alpha = nn.Parameter(local_sd["alpha"].detach().clone())

        def forward(self, x):
            return self.mlp(x) * self.alpha

    m = _Adapter()
    # strict=False: entries this two-layer reconstruction does not model are ignored.
    m.load_state_dict(local_sd, strict=False)
    return m.eval()
video_adapter = _make_adapter("video_adapter", adapters_sd).to(model.device).half()
audio_adapter = _make_adapter("audio_adapter", adapters_sd).to(model.device).half()

# Augment pooled repr with BAM deltas before the classification head
head = domain_heads[domain_to_id[domain]]
with torch.no_grad():
    out = model(**inputs, output_hidden_states=True, use_cache=False)
    h = out.hidden_states[-2]
    mask = inputs["attention_mask"].unsqueeze(-1).float()
    pooled = (h * mask).sum(1) / mask.sum(1)  # [B, H], masked mean over tokens
    pooled = pooled + video_adapter(video_feats.to(model.device).half())
    pooled = pooled + audio_adapter(audio_feats.to(model.device).half())
    # pooled is float32 here (the float mask promotes the fp16 hidden states);
    # cast it to the head's own dtype because nn.Linear rejects an input whose
    # dtype differs from its weights.
    logits = head(pooled.to(head.weight.dtype))

pred_idx = logits.argmax(dim=-1).item()  # .item() assumes a batch of one
label_name = global_classes[domain][pred_idx]["label"]
print(f"Predicted {domain}: {label_name}")
```