upload supp files

Browse files

Files changed (12) hide show

bert_modeling_bert_self_attn_patch.py +93 -0
loralib/__init__.py +2 -0
loralib/__pycache__/__init__.cpython-310.pyc +0 -0
loralib/__pycache__/layers.cpython-310.pyc +0 -0
loralib/__pycache__/utils.cpython-310.pyc +0 -0
loralib/easymultiheadattention.py +124 -0
loralib/layers.py +937 -0
loralib/utils.py +213 -0
loss.py +170 -0
open_clip_patch.py +37 -0
prompt_templates.py +4 -0
timm_vit_return_attn_patch.py +65 -0

bert_modeling_bert_self_attn_patch.py ADDED Viewed

	@@ -0,0 +1,93 @@

+import torch
+from transformers.models.bert import modeling_bert
+from typing import Optional, Tuple
+import torch.nn as nn
+import math
+def patch_bert_self_attn():
+    """Monkey-patch ``BertSelfAttention.forward`` so it can return raw attention scores.
+
+    After this function is called, ``modeling_bert.BertSelfAttention.forward`` is
+    replaced for every instance of that class. The replacement differs from a
+    regular attention forward in one respect: when ``output_attentions`` is
+    true, the second element of the returned tuple is ``attn_scores`` -- the
+    scaled ``Q @ K^T`` scores (plus any relative-position terms) captured
+    *before* the attention mask is added and before softmax/dropout.
+
+    NOTE(review): the patched body relies on attributes such as
+    ``transpose_for_scores`` and on the ``past_key_value`` calling convention;
+    confirm these still exist in the installed ``transformers`` version.
+    """
+    def bert_self_attn_forward_patched(self,
+                            hidden_states: torch.Tensor,
+                            attention_mask: Optional[torch.FloatTensor] = None,
+                            head_mask: Optional[torch.FloatTensor] = None,
+                            encoder_hidden_states: Optional[torch.FloatTensor] = None,
+                            encoder_attention_mask: Optional[torch.FloatTensor] = None,
+                            past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+                            output_attentions: Optional[bool] = False):
+        """Compute self/cross attention; see the enclosing function for the return-value change.
+
+        Returns a tuple ``(context_layer,)``, extended with ``attn_scores`` when
+        ``output_attentions`` is true and with the ``(key, value)`` cache when
+        ``self.is_decoder`` is true.
+        """
+        mixed_query_layer = self.query(hidden_states)
+        # Cross-attention is selected purely by the presence of encoder states.
+        is_cross_attention = encoder_hidden_states is not None
+        if is_cross_attention and past_key_value is not None:
+            # Cross-attention with a cache: reuse the cached keys/values as-is.
+            key_layer = past_key_value[0]
+            value_layer = past_key_value[1]
+            attention_mask = encoder_attention_mask
+        elif is_cross_attention:
+            # Cross-attention without a cache: project the encoder states.
+            key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
+            value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
+            attention_mask = encoder_attention_mask
+        elif past_key_value is not None:
+            # Self-attention with a cache: append the new keys/values along dim 2.
+            key_layer = self.transpose_for_scores(self.key(hidden_states))
+            value_layer = self.transpose_for_scores(self.value(hidden_states))
+            key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
+            value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
+        else:
+            # Plain self-attention.
+            key_layer = self.transpose_for_scores(self.key(hidden_states))
+            value_layer = self.transpose_for_scores(self.value(hidden_states))
+        query_layer = self.transpose_for_scores(mixed_query_layer)
+        # Must be read before past_key_value is rebound to the new cache below.
+        use_cache = past_key_value is not None
+        if self.is_decoder:
+            past_key_value = (key_layer, value_layer)
+        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
+            query_length, key_length = query_layer.shape[2], key_layer.shape[2]
+            if use_cache:
+                # With a cache only one query position is used: the last key index.
+                position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view(
+                    -1, 1
+                )
+            else:
+                position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
+            position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
+            distance = position_ids_l - position_ids_r
+            # Shift distances so they are non-negative indices into the embedding table.
+            positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
+            positional_embedding = positional_embedding.to(dtype=query_layer.dtype)  # fp16 compatibility
+            if self.position_embedding_type == "relative_key":
+                relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
+                attention_scores = attention_scores + relative_position_scores
+            elif self.position_embedding_type == "relative_key_query":
+                relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
+                relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
+                attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key
+        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+        # Snapshot taken after scaling but before masking/softmax; this is what
+        # gets returned when output_attentions is true.
+        attn_scores = attention_scores
+        if attention_mask is not None:
+            # The mask is added to the scores, so it is expected to be additive.
+            attention_scores = attention_scores + attention_mask
+        attention_probs = nn.functional.softmax(attention_scores, dim=-1)
+        attention_probs = self.dropout(attention_probs)
+        if head_mask is not None:
+            attention_probs = attention_probs * head_mask
+        context_layer = torch.matmul(attention_probs, value_layer)
+        # Move the head axis next to the per-head feature axis and flatten both
+        # into a single axis of size all_head_size.
+        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+        context_layer = context_layer.view(new_context_layer_shape)
+        outputs = (context_layer, attn_scores) if output_attentions else (context_layer,)
+        if self.is_decoder:
+            outputs = outputs + (past_key_value,)
+        return outputs
+    # Class-level replacement: affects all existing and future BertSelfAttention instances.
+    modeling_bert.BertSelfAttention.forward = bert_self_attn_forward_patched

loralib/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ from .layers import *
2	+ from .utils import *

loralib/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (195 Bytes). View file

loralib/__pycache__/layers.cpython-310.pyc ADDED Viewed

Binary file (21.8 kB). View file

loralib/__pycache__/utils.cpython-310.pyc ADDED Viewed

Binary file (5.48 kB). View file

loralib/easymultiheadattention.py ADDED Viewed

	@@ -0,0 +1,124 @@

+import torch.nn as nn
+import torch
+import torch.nn.functional as F
+"""
+Source : https://github.com/KyanChen/MakeMultiHeadNaive/blob/master/main.py
+"""
+class PlainMultiHeadAttention(nn.Module):
+    def __init__(
+            self,
+            existing_mha: nn.MultiheadAttention):
+        super().__init__()
+        self.dropout = 0 # this module is not used to retrain the main block
+        self.embed_dim = existing_mha.embed_dim
+        self.kdim = existing_mha.kdim
+        self.vdim = existing_mha.vdim
+        self._qkv_same_embed_dim = existing_mha._qkv_same_embed_dim
+        self.num_heads = existing_mha.num_heads
+        self.batch_first = existing_mha.batch_first
+        self.head_dim = existing_mha.head_dim
+        self.qkv = nn.Linear(self.embed_dim, self.embed_dim * 3, bias=existing_mha.in_proj_bias is not None)
+        self.proj = nn.Linear(self.embed_dim, self.embed_dim, bias=existing_mha.out_proj.bias is not None)
+        # Initialize parameters
+        with torch.no_grad():
+            self.qkv.weight.data.copy_(existing_mha.in_proj_weight.data)
+            if self.qkv.bias is not None:
+                self.qkv.bias.data.copy_(existing_mha.in_proj_bias.data)
+            self.proj.weight.data.copy_(existing_mha.out_proj.weight.data)
+            if self.proj.bias is not None:
+                self.proj.bias.data.copy_(existing_mha.out_proj.bias.data)
+        self.scaled_dot_product_attention = F.scaled_dot_product_attention
+    def forward(
+            self,
+            query,
+            key,
+            value,
+            key_padding_mask=None,
+            need_weights=True,
+            attn_mask=None,
+            average_attn_weights=True,
+            is_causal=False):
+        if attn_mask is not None and is_causal:
+            raise AssertionError("Only allow causal mask or attn_mask")
+        is_batched = query.dim() == 3
+        key_padding_mask = F._canonical_mask(
+            mask=key_padding_mask,
+            mask_name="key_padding_mask",
+            other_type=F._none_or_dtype(attn_mask),
+            other_name="attn_mask",
+            target_type=query.dtype
+        )
+        if self.batch_first and is_batched:
+            if key is value:
+                if query is key:
+                    query = key = value = query.transpose(1, 0)
+                else:
+                    query, key = [x.transpose(1, 0) for x in (query, key)]
+                    value = key
+            else:
+                query, key, value = [x.transpose(1, 0) for x in (query, key, value)]
+        tgt_len, bsz, embed_dim = query.shape
+        src_len, _, _ = key.shape
+        E = query.size(-1)
+        qkv = self.qkv(query)
+        qkv = qkv.unflatten(-1, (3, E)).unsqueeze(0).transpose(0, -2).squeeze(-2).contiguous()
+        q, k, v = qkv[0], qkv[1], qkv[2]
+        attn_mask = F._canonical_mask(
+            mask=attn_mask,
+            mask_name="attn_mask",
+            other_type=F._none_or_dtype(key_padding_mask),
+            other_name="key_padding_mask",
+            target_type=q.dtype,
+            check_other=False,
+        )
+        if attn_mask is not None:
+            # ensure attn_mask's dim is 3
+            if attn_mask.dim() == 2:
+                correct_2d_size = (tgt_len, src_len)
+                if attn_mask.shape != correct_2d_size:
+                    raise RuntimeError(
+                        f"The shape of the 2D attn_mask is {attn_mask.shape}, but should be {correct_2d_size}.")
+                attn_mask = attn_mask.unsqueeze(0)
+            elif attn_mask.dim() == 3:
+                correct_3d_size = (bsz * self.num_heads, tgt_len, src_len)
+                if attn_mask.shape != correct_3d_size:
+                    raise RuntimeError(
+                        f"The shape of the 3D attn_mask is {attn_mask.shape}, but should be {correct_3d_size}.")
+            else:
+                raise RuntimeError(f"attn_mask's dimension {attn_mask.dim()} is not supported")
+        if attn_mask is not None:
+            if attn_mask.size(0) == 1 and attn_mask.dim() == 3:
+                attn_mask = attn_mask.unsqueeze(0)
+            else:
+                attn_mask = attn_mask.view(bsz, self.num_heads, -1, src_len)
+        dropout_p = self.dropout if self.training else 0.
+        q = q.view(tgt_len, bsz * self.num_heads, self.head_dim).transpose(0, 1)
+        k = k.view(src_len, bsz * self.num_heads, self.head_dim).transpose(0, 1)
+        v = v.view(src_len, bsz * self.num_heads, self.head_dim).transpose(0, 1)
+        src_len = k.size(1)
+        q = q.view(bsz, self.num_heads, tgt_len, self.head_dim)
+        k = k.view(bsz, self.num_heads, src_len, self.head_dim)
+        v = v.view(bsz, self.num_heads, src_len, self.head_dim)
+        attn_output = self.scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)
+        attn_output = attn_output.permute(2, 0, 1, 3).contiguous().view(bsz * tgt_len, embed_dim)
+        attn_output = self.proj(attn_output)
+        attn_output = attn_output.view(tgt_len, bsz, attn_output.size(1))
+        if self.batch_first and is_batched:
+            return attn_output.transpose(1, 0), None
+        return attn_output, None

loralib/layers.py ADDED Viewed

	@@ -0,0 +1,937 @@

+#  ------------------------------------------------------------------------------------------
+#  This code is reconstructed based on loralib (https://github.com/microsoft/LoRA) by Baijiong Lin.
+#  ------------------------------------------------------------------------------------------
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import math
+from typing import Optional, List
+from torch.jit import Final
+from timm.layers import use_fused_attn
+from timm.models.vision_transformer import Attention
+from transformers.models.bert.modeling_bert import BertAttention
+from typing import Optional, Tuple
+def set_param(curr_mod, name, param=None, mode='update'):
+    r"""Refer to https://github.com/Baijiong-Lin/MOML/blob/main/MTL/utils.py"""
+    if '.' in name:
+        n = name.split('.')
+        module_name = n[0]
+        rest = '.'.join(n[1:])
+        for name, mod in curr_mod.named_children():
+            if module_name == name:
+                return set_param(mod, rest, param, mode=mode)
+    else:
+        if mode == 'update':
+            delattr(curr_mod, name)
+            setattr(curr_mod, name, param)
+        elif mode == 'get':
+            if hasattr(curr_mod, name):
+                p = getattr(curr_mod, name)
+                return p
+class LoRALayer():
+    def __init__(
+        self,
+        r: int,
+        lora_alpha: int,
+        fan_in_fan_out: bool = False,
+        dropout_rate:float = 0,
+    ):
+        self.r = r
+        self.lora_alpha = lora_alpha
+        self.dropout_rate = dropout_rate
+        if self.r > 0:
+            #self.scaling = self.lora_alpha / self.r
+            self.scaling = self.lora_alpha/math.sqrt(self.r) #
+        # Mark the weight as unmerged
+        self.merged = False
+        # Set this to True if the layer to replace stores weight like (fan_in, fan_out)
+        self.fan_in_fan_out = fan_in_fan_out
+        # define params that require LoRA {'param_name': 'lora_name'}
+        self.params_with_lora = {}
+    def register_lora_param(self):
+        r"""Register LoRA matrix"""
+        for param_name, lora_name in self.params_with_lora.items():
+            assert len(eval(f'self.{param_name}').size()) == 2
+            self.register_parameter(f'{lora_name}_lora_A',
+                nn.Parameter(eval(f'self.{param_name}').new_zeros((self.r, eval(f'self.{param_name}').size()[1])))
+                )
+            self.register_parameter(f'{lora_name}_lora_B',
+                nn.Parameter(eval(f'self.{param_name}').new_zeros((eval(f'self.{param_name}').size()[0], self.r)))
+                )
+            eval(f'self.{param_name}').requires_grad = False
+    def init_lora_param(self):
+        for param_name, lora_name in self.params_with_lora.items():
+            if hasattr(self, f'{lora_name}_lora_A'):
+                # initialize A the same way as the default for nn.Linear and B to zero
+                nn.init.kaiming_uniform_(eval(f'self.{lora_name}_lora_A'), a=math.sqrt(5))
+                nn.init.zeros_(eval(f'self.{lora_name}_lora_B'))
+    def transpose(self, w: torch.Tensor):
+        return w.transpose(0, 1) if self.fan_in_fan_out else w
+    def merge_BA(self, param_name: str):
+        lora_name = self.params_with_lora[param_name]
+        return self.transpose((eval(f'self.{lora_name}_lora_B') @ eval(f'self.{lora_name}_lora_A')).view(eval(f'self.{param_name}').shape))
+    def merge_lora_param(self):
+        r"""p_new = p + scaling * B @ A and keep differentiable to A and B"""
+        for param_name, lora_name in self.params_with_lora.items():
+            p = set_param(self, param_name, mode='get')
+            # detach() is very important here
+            p_new = p.detach() + self.merge_BA(param_name) * self.scaling
+            set_param(self, param_name, param=p_new, mode='update')
+    def add_lora_data(self):
+        r"""NOT differentiable"""
+        for param_name, lora_name in self.params_with_lora.items():
+            eval(f'self.{param_name}').data += self.merge_BA(param_name) * self.scaling
+    def sub_lora_data(self):
+        r"""NOT differentiable"""
+        for param_name, lora_name in self.params_with_lora.items():
+            eval(f'self.{param_name}').data -= self.merge_BA(param_name) * self.scaling
+    def lora_train(self, mode: bool = True):
+        if mode:
+            if self.merged and self.r > 0:
+            # Make sure that the weights are not merged
+                self.sub_lora_data()
+            self.merged = False
+        else:
+            if not self.merged and self.r > 0:
+            # Merge the weights and mark it
+                self.add_lora_data()
+            self.merged = True
+class Embedding(nn.Embedding, LoRALayer):
+    # LoRA implemented in a Embedding layer
+    def __init__(
+        self,
+        num_embeddings: int,
+        embedding_dim: int,
+        r: int = 0,
+        lora_alpha: int = 1,
+        **kwargs
+    ):
+        nn.Embedding.__init__(self, num_embeddings, embedding_dim, **kwargs)
+        LoRALayer.__init__(self, r=r, lora_alpha=lora_alpha)
+        self.params_with_lora = {'weight': 'w'}
+        if r > 0:
+            self.register_lora_param()
+        nn.Embedding.reset_parameters(self)
+        self.init_lora_param()
+    def init_lora_param(self):
+        if hasattr(self, 'w_lora_A'):
+            # initialize A the same way as the default for nn.Linear and B to zero
+            nn.init.zeros_(self.w_lora_A)
+            nn.init.normal_(self.w_lora_B)
+    def train(self, mode: bool = True):
+        nn.Embedding.train(self, mode)
+        self.lora_train(mode)
+    def forward(self, x: torch.Tensor, **kwargs):
+        if self.r > 0 and not self.merged:
+            self.merge_lora_param()
+            result = nn.Embedding.forward(self, x, **kwargs)
+            self.sub_lora_data()
+            return result
+        else:
+            return nn.Embedding.forward(self, x, **kwargs)
+class LinearLoRA(nn.Linear, LoRALayer):
+    # LoRA implemented in a Linear layer
+    def __init__(
+        self,
+        existing_linear: nn.Linear,
+        r: int = 0,
+        lora_alpha: int = 1,
+        fan_in_fan_out: bool = False,
+        dropout_rate = 0.,
+        seed: int = 1,
+        **kwargs
+    ):
+        super().__init__(
+            in_features=existing_linear.in_features,
+            out_features=existing_linear.out_features)
+        self.load_state_dict(existing_linear.state_dict())
+        LoRALayer.__init__(self, r=r, lora_alpha=lora_alpha, fan_in_fan_out=fan_in_fan_out)
+        # Actual trainable parameters
+        self.params_with_lora = {'weight': 'w'}
+        if r > 0:
+            self.register_lora_param()
+        self.init_lora_param()
+        self.weight.data = self.transpose(self.weight.data)
+        if dropout_rate > 0:
+            self.dropout = nn.Dropout(dropout_rate)
+        else:
+            self.dropout = None
+    def train(self, mode: bool = True):
+        super().train(mode)
+        self.lora_train(mode)
+    def forward(self, x: torch.Tensor, **kwargs):
+        if self.dropout is None: # do as before
+            if self.r > 0 and not self.merged:
+                self.merge_lora_param()
+                result = nn.Linear.forward(self, x, **kwargs)
+                self.sub_lora_data()
+                return result
+            else:
+                return nn.Linear.forward(self, x, **kwargs)
+        # Compute the original linear transformation
+        original_output = nn.Linear.forward(self, x)
+        if self.training and self.dropout.p > 0:
+            x = self.dropout(x)
+        if self.r > 0 and not self.merged:
+            lora_adjustment = torch.matmul(x,self.merge_BA('weight').transpose(0, 1)) * self.scaling
+            result = original_output + lora_adjustment
+        else:
+            result = original_output
+        return result
+class Conv1d(nn.Conv1d, LoRALayer):
+    # LoRA implemented in a Conv1d layer
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        r: int = 0,
+        lora_alpha: int = 1,
+        **kwargs
+    ):
+        nn.Conv1d.__init__(self, in_channels, out_channels, kernel_size, **kwargs)
+        LoRALayer.__init__(self, r=r, lora_alpha=lora_alpha)
+        assert type(kernel_size) is int
+        # Actual trainable parameters
+        self.params_with_lora = {'weight': 'w'}
+        if r > 0:
+            self.w_lora_A = nn.Parameter(
+                self.weight.new_zeros((r*kernel_size, in_channels*kernel_size))
+            )
+            self.w_lora_B = nn.Parameter(
+                self.weight.new_zeros((out_channels//self.groups*kernel_size, r*kernel_size))
+            )
+            # Freezing the pre-trained weight matrix
+            self.weight.requires_grad = False
+        nn.Conv1d.reset_parameters(self)
+        self.init_lora_param()
+    def train(self, mode: bool = True):
+        nn.Conv1d.train(self, mode)
+        self.lora_train(mode)
+    def forward(self, x: torch.Tensor, **kwargs):
+        if self.r > 0 and not self.merged:
+            self.merge_lora_param()
+            result = nn.Conv1d.forward(self, x, **kwargs)
+            self.sub_lora_data()
+            return result
+        else:
+            return nn.Conv1d.forward(self, x, **kwargs)
+class Conv2d(nn.Conv2d, LoRALayer):
+    # LoRA implemented in a Conv2d layer
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        r: int = 0,
+        lora_alpha: int = 1,
+        **kwargs
+    ):
+        nn.Conv2d.__init__(self, in_channels, out_channels, kernel_size, **kwargs)
+        LoRALayer.__init__(self, r=r, lora_alpha=lora_alpha)
+        assert type(kernel_size) is int
+        # Actual trainable parameters
+        self.params_with_lora = {'weight': 'w'}
+        if r > 0:
+            self.w_lora_A = nn.Parameter(
+                self.weight.new_zeros((r*kernel_size, in_channels*kernel_size))
+            )
+            self.w_lora_B = nn.Parameter(
+                self.weight.new_zeros((out_channels//self.groups*kernel_size, r*kernel_size))
+            )
+            # Freezing the pre-trained weight matrix
+            self.weight.requires_grad = False
+        nn.Conv2d.reset_parameters(self)
+        self.init_lora_param()
+    def train(self, mode: bool = True):
+        nn.Conv2d.train(self, mode)
+        self.lora_train(mode)
+    def forward(self, x: torch.Tensor, **kwargs):
+        if self.r > 0 and not self.merged:
+            self.merge_lora_param()
+            result = nn.Conv2d.forward(self, x, **kwargs)
+            self.sub_lora_data()
+            return result
+        else:
+            return nn.Conv2d.forward(self, x, **kwargs)
+class Conv3d(nn.Conv3d, LoRALayer):
+    # LoRA implemented in a Conv3d layer
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        r: int = 0,
+        lora_alpha: int = 1,
+        **kwargs
+    ):
+        nn.Conv3d.__init__(self, in_channels, out_channels, kernel_size, **kwargs)
+        LoRALayer.__init__(self, r=r, lora_alpha=lora_alpha)
+        assert type(kernel_size) is int
+        # Actual trainable parameters
+        self.params_with_lora = {'weight': 'w'}
+        if r > 0:
+            self.w_lora_A = nn.Parameter(
+                self.weight.new_zeros((r*kernel_size, in_channels*kernel_size))
+            )
+            self.w_lora_B = nn.Parameter(
+                self.weight.new_zeros((out_channels//self.groups*kernel_size, r*kernel_size))
+            )
+            # Freezing the pre-trained weight matrix
+            self.weight.requires_grad = False
+        nn.Conv3d.reset_parameters(self)
+        self.init_lora_param()
+    def train(self, mode: bool = True):
+        nn.Conv3d.train(self, mode)
+        self.lora_train(mode)
+    def forward(self, x: torch.Tensor, **kwargs):
+        if self.r > 0 and not self.merged:
+            self.merge_lora_param()
+            result = nn.Conv3d.forward(self, x, **kwargs)
+            self.sub_lora_data()
+            return result
+        else:
+            return nn.Conv3d.forward(self, x, **kwargs)
+class PlainMultiheadAttentionLoRA(nn.Module):
+    def __init__(
+            self,
+            existing_mha: nn.MultiheadAttention,
+            enable_lora: list = ['q', 'k', 'v', 'o'],
+            r: int = 0,
+            lora_alpha: int = 1,
+            dropout_rate:float = 0.,
+            **kwargs
+        ):
+        super().__init__()
+        self.dropout = 0 # this module is not used to retrain the main block
+        self.embed_dim = existing_mha.embed_dim
+        self.kdim = existing_mha.kdim
+        self.vdim = existing_mha.vdim
+        self._qkv_same_embed_dim = existing_mha._qkv_same_embed_dim
+        self.num_heads = existing_mha.num_heads
+        self.batch_first = existing_mha.batch_first
+        self.head_dim = existing_mha.head_dim
+        #self.qkv = nn.Linear(self.embed_dim, self.embed_dim * 3, bias=existing_mha.in_proj_bias is not None)
+        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=existing_mha.in_proj_bias is not None)
+        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=existing_mha.in_proj_bias is not None)
+        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=existing_mha.in_proj_bias is not None)
+        self.proj = nn.Linear(self.embed_dim, self.embed_dim, bias=existing_mha.out_proj.bias is not None)
+        # Initialize parameters
+        with torch.no_grad():
+            # Extract the existing weights and biases
+            existing_weight = existing_mha.in_proj_weight.data
+            existing_bias = existing_mha.in_proj_bias.data if existing_mha.in_proj_bias is not None else None
+            # Initialize q_proj
+            self.q_proj.weight.data.copy_(existing_weight[:self.embed_dim, :])
+            if existing_bias is not None:
+                self.q_proj.bias.data.copy_(existing_bias[:self.embed_dim])
+            # Initialize k_proj
+            self.k_proj.weight.data.copy_(existing_weight[self.embed_dim:2*self.embed_dim, :])
+            if existing_bias is not None:
+                self.k_proj.bias.data.copy_(existing_bias[self.embed_dim:2*self.embed_dim])
+            # Initialize v_proj
+            self.v_proj.weight.data.copy_(existing_weight[2*self.embed_dim:, :])
+            if existing_bias is not None:
+                self.v_proj.bias.data.copy_(existing_bias[2*self.embed_dim:])
+            # Initialize proj
+            self.proj.weight.data.copy_(existing_mha.out_proj.weight.data)
+            if self.proj.bias is not None:
+                self.proj.bias.data.copy_(existing_mha.out_proj.bias.data)
+        self.scaled_dot_product_attention = F.scaled_dot_product_attention
+        LoRALayer.__init__(self, r=r, lora_alpha=lora_alpha, dropout_rate=dropout_rate)
+        # Init qkv as a new lora linear layer
+        for item in enable_lora:
+            if item == 'q':
+                self.q_proj = LinearLoRA(self.q_proj,
+                                         r=r,
+                                         lora_alpha=lora_alpha,
+                                         fan_in_fan_out=False,
+                                         dropout_rate = dropout_rate)
+            elif item == 'k':
+                self.k_proj = LinearLoRA(self.k_proj,
+                                         r=r,
+                                         lora_alpha=lora_alpha,
+                                         fan_in_fan_out=False,
+                                         dropout_rate = dropout_rate)
+            elif item == 'v':
+                self.v_proj = LinearLoRA(self.v_proj,
+                                         r=r,
+                                         lora_alpha=lora_alpha,
+                                         fan_in_fan_out=False,
+                                         dropout_rate = dropout_rate)
+            elif item == 'o':
+                self.proj = LinearLoRA(self.proj,
+                                         r=r,
+                                         lora_alpha=lora_alpha,
+                                         fan_in_fan_out=False,
+                                         dropout_rate = dropout_rate)
+    def forward_module(
+            self,
+            query,
+            key,
+            value,
+            key_padding_mask=None,
+            need_weights=True,
+            attn_mask=None,
+            average_attn_weights=True,
+            is_causal=False):
+        if attn_mask is not None and is_causal:
+            raise AssertionError("Only allow causal mask or attn_mask")
+        is_batched = query.dim() == 3
+        key_padding_mask = F._canonical_mask(
+            mask=key_padding_mask,
+            mask_name="key_padding_mask",
+            other_type=F._none_or_dtype(attn_mask),
+            other_name="attn_mask",
+            target_type=query.dtype
+        )
+        if self.batch_first and is_batched:
+            if key is value:
+                if query is key:
+                    query = key = value = query.transpose(1, 0)
+                else:
+                    query, key = [x.transpose(1, 0) for x in (query, key)]
+                    value = key
+            else:
+                query, key, value = [x.transpose(1, 0) for x in (query, key, value)]
+        tgt_len, bsz, embed_dim = query.shape
+        src_len, _, _ = key.shape
+        """
+        E = query.size(-1)
+        qkv = self.qkv(query)
+        qkv = qkv.unflatten(-1, (3, E)).unsqueeze(0).transpose(0, -2).squeeze(-2).contiguous()
+        q, k, v = qkv[0], qkv[1], qkv[2]
+        """
+        q = self.q_proj(query)
+        k = self.k_proj(key)
+        v = self.v_proj(value)
+        attn_mask = F._canonical_mask(
+            mask=attn_mask,
+            mask_name="attn_mask",
+            other_type=F._none_or_dtype(key_padding_mask),
+            other_name="key_padding_mask",
+            target_type=q.dtype,
+            check_other=False,
+        )
+        if attn_mask is not None:
+            # ensure attn_mask's dim is 3
+            if attn_mask.dim() == 2:
+                correct_2d_size = (tgt_len, src_len)
+                if attn_mask.shape != correct_2d_size:
+                    raise RuntimeError(
+                        f"The shape of the 2D attn_mask is {attn_mask.shape}, but should be {correct_2d_size}.")
+                attn_mask = attn_mask.unsqueeze(0)
+            elif attn_mask.dim() == 3:
+                correct_3d_size = (bsz * self.num_heads, tgt_len, src_len)
+                if attn_mask.shape != correct_3d_size:
+                    raise RuntimeError(
+                        f"The shape of the 3D attn_mask is {attn_mask.shape}, but should be {correct_3d_size}.")
+            else:
+                raise RuntimeError(f"attn_mask's dimension {attn_mask.dim()} is not supported")
+        if attn_mask is not None:
+            if attn_mask.size(0) == 1 and attn_mask.dim() == 3:
+                attn_mask = attn_mask.unsqueeze(0)
+            else:
+                attn_mask = attn_mask.view(bsz, self.num_heads, -1, src_len)
+        dropout_p = self.dropout if self.training else 0.
+        q = q.view(tgt_len, bsz * self.num_heads, self.head_dim).transpose(0, 1)
+        k = k.view(src_len, bsz * self.num_heads, self.head_dim).transpose(0, 1)
+        v = v.view(src_len, bsz * self.num_heads, self.head_dim).transpose(0, 1)
+        src_len = k.size(1)
+        q = q.view(bsz, self.num_heads, tgt_len, self.head_dim)
+        k = k.view(bsz, self.num_heads, src_len, self.head_dim)
+        v = v.view(bsz, self.num_heads, src_len, self.head_dim)
+        attn_output = self.scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)
+        attn_output = attn_output.permute(2, 0, 1, 3).contiguous().view(bsz * tgt_len, embed_dim)
+        attn_output = self.proj(attn_output)
+        attn_output = attn_output.view(tgt_len, bsz, attn_output.size(1))
+        if self.batch_first and is_batched:
+            return attn_output.transpose(1, 0), None
+        return attn_output, None
+    def train(self, mode: bool = True):
+        super().train(mode)
+        #self.lora_train(mode)
+    def forward(self,
+            query: torch.Tensor,
+            key: torch.Tensor,
+            value: torch.Tensor,
+            **kwargs):
+        return self.forward_module(query, key, value, **kwargs)
+class AttentionLoRA(nn.Module):
+    fused_attn: Final[bool]
+    def __init__(
+            self,
+            existing_mha: Attention,
+            enable_lora: list = ['q', 'k', 'v', 'o'],
+            r: int = 0,
+            lora_alpha: int = 1,
+            dropout_rate: float = 0.,
+            seed: int = 1,
+    ) -> None:
+        super().__init__()
+        torch.manual_seed(seed)
+        torch.cuda.manual_seed(seed)
+        self.embed_dim = existing_mha.proj.in_features
+        self.num_heads = existing_mha.num_heads
+        self.head_dim = existing_mha.head_dim
+        assert self.embed_dim % self.num_heads == 0, 'dim should be divisible by num_heads'
+        self.scale = self.head_dim ** -0.5
+        self.fused_attn = use_fused_attn()
+        self.dropout = 0
+        self.q_norm = existing_mha.q_norm
+        self.k_norm = existing_mha.k_norm
+        self.attn_drop = nn.Dropout(self.dropout)
+        self.proj_drop = nn.Dropout(self.dropout)
+        self.r = r
+        self.lora_alpha = lora_alpha
+        self.dropout_rate = dropout_rate
+        self.enable_lora = enable_lora
+        self.seed = seed
+        LoRALayer.__init__(self, r=r, lora_alpha=lora_alpha, dropout_rate=dropout_rate)
+        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=existing_mha.qkv.bias is not None)
+        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=existing_mha.qkv.bias is not None)
+        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=existing_mha.qkv.bias is not None)
+        self.proj = nn.Linear(self.embed_dim, self.embed_dim, bias=existing_mha.proj.bias is not None)
+        # Initialize parameters
+        with torch.no_grad():
+            existing_weight = existing_mha.qkv.weight.data
+            existing_bias = existing_mha.qkv.bias.data
+            self.q_proj.weight.data.copy_(existing_weight[:self.embed_dim, :])
+            if existing_bias is not None:
+                self.q_proj.bias.data.copy_(existing_bias[:self.embed_dim])
+            self.k_proj.weight.data.copy_(existing_weight[self.embed_dim:2*self.embed_dim, :])
+            if existing_bias is not None:
+                self.k_proj.bias.data.copy_(existing_bias[self.embed_dim:2*self.embed_dim])
+            self.v_proj.weight.data.copy_(existing_weight[2*self.embed_dim:, :])
+            if existing_bias is not None:
+                self.v_proj.bias.data.copy_(existing_bias[2*self.embed_dim:])
+            self.proj.weight.data.copy_(existing_mha.proj.weight.data)
+            if self.proj.bias is not None:
+                self.proj.bias.data.copy_(existing_mha.proj.bias.data)
+        self.q_proj, self.k_proj, self.v_proj, self.proj = self.inject_lora(self.q_proj, self.k_proj, self.v_proj, self.proj)
+    def inject_lora(self, q, k, v, proj):
+        for item in self.enable_lora:
+            if item == 'q':
+                q = LinearLoRA(q,
+                                r=self.r,
+                                lora_alpha=self.lora_alpha,
+                                fan_in_fan_out=False,
+                                dropout_rate = self.dropout_rate,
+                                seed=self.seed)
+            elif item == 'k':
+                k = LinearLoRA(k,
+                                r=self.r,
+                                lora_alpha=self.lora_alpha,
+                                fan_in_fan_out=False,
+                                dropout_rate = self.dropout_rate,
+                                seed=self.seed)
+            elif item == 'v':
+                v = LinearLoRA(v,
+                                r=self.r,
+                                lora_alpha=self.lora_alpha,
+                                fan_in_fan_out=False,
+                                dropout_rate = self.dropout_rate,
+                                seed=self.seed)
+            elif item == 'o':
+                proj = LinearLoRA(proj,
+                                r=self.r,
+                                lora_alpha=self.lora_alpha,
+                                fan_in_fan_out=False,
+                                dropout_rate = self.dropout_rate,
+                                seed=self.seed)
+        return q, k, v, proj
+    def forward(self, x: torch.Tensor, return_attn_scores=False) -> torch.Tensor:
+        B, N, C = x.shape
+        q = self.q_proj(x).reshape(B, N, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
+        k = self.k_proj(x).reshape(B, N, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
+        v = self.v_proj(x).reshape(B, N, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
+        q, k = self.q_norm(q), self.k_norm(k)
+        if return_attn_scores:
+            q = q * self.scale
+            attn_scores = q @ k.transpose(-2, -1)
+            attn = attn_scores.softmax(dim=-1)
+            attn = self.attn_drop(attn)
+            x = attn @ v
+            x = x.transpose(1, 2).reshape(B, N, C)
+            x = self.proj(x)
+            x = self.proj_drop(x)
+            return (x, attn_scores)
+        if self.fused_attn:
+            x = F.scaled_dot_product_attention(
+                q, k, v,
+                dropout_p=self.attn_drop.p if self.training else 0.,
+            )
+        else:
+            q = q * self.scale
+            attn = q @ k.transpose(-2, -1)
+            attn = attn.softmax(dim=-1)
+            attn = self.attn_drop(attn)
+            x = attn @ v
+        x = x.transpose(1, 2).reshape(B, N, C)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+class BertAttentionLoRA(nn.Module):
+    def __init__(self,
+                 existing_mha: BertAttention,
+                 enable_lora: list = ['q', 'k', 'v', 'o'],
+                 r: int = 0,
+                 lora_alpha: int = 1,
+                 dropout_rate: float = 0.,
+                 seed:int = 1,):
+        super().__init__()
+        torch.manual_seed(seed)
+        torch.cuda.manual_seed(seed)
+        self.self_attn = existing_mha.self
+        self.output = existing_mha.output
+        self.num_attention_heads = self.self_attn.num_attention_heads
+        self.attention_head_size = self.self_attn.attention_head_size
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+        self.hidden_size = self.self_attn.query.in_features
+        self.q_proj = nn.Linear(self.hidden_size, self.all_head_size)
+        self.k_proj = nn.Linear(self.hidden_size, self.all_head_size)
+        self.v_proj = nn.Linear(self.hidden_size, self.all_head_size)
+        self.proj = nn.Linear(self.output.dense.in_features, self.output.dense.in_features)
+        self.LayerNorm = self.output.LayerNorm
+        self.dropout = nn.Dropout(0)
+        self.r = r
+        self.lora_alpha = lora_alpha
+        self.dropout_rate = dropout_rate
+        self.enable_lora = enable_lora
+        self.seed = seed
+        LoRALayer.__init__(self, r=r, lora_alpha=lora_alpha, dropout_rate=dropout_rate)
+        # Initialize parameters
+        with torch.no_grad():
+            self.q_proj.weight.data.copy_(self.self_attn.query.weight.data)
+            if self.self_attn.query.bias.data is not None:
+                self.q_proj.bias.data.copy_(self.self_attn.query.bias.data)
+            self.k_proj.weight.data.copy_(self.self_attn.key.weight.data)
+            if self.self_attn.key.bias.data is not None:
+                self.k_proj.bias.data.copy_(self.self_attn.key.bias.data)
+            self.v_proj.weight.data.copy_(self.self_attn.value.weight.data)
+            if self.self_attn.value.bias.data is not None:
+                self.v_proj.bias.data.copy_(self.self_attn.value.bias.data)
+            self.proj.weight.data.copy_(self.output.dense.weight.data)
+            if self.output.dense.bias.data is not None:
+                self.proj.bias.data.copy_(self.output.dense.bias.data)
+        self.q_proj, self.k_proj, self.v_proj, self.proj = self.inject_lora(self.q_proj, self.k_proj, self.v_proj, self.proj)
+        self.position_embedding_type = self.self_attn.position_embedding_type
+        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
+            self.max_position_embeddings = self.self_attn.max_position_embeddings
+            self.distance_embedding = nn.Embedding(2 * self.self_attn.max_position_embeddings - 1, self.attention_head_size)
+        self.is_decoder = self.self_attn.is_decoder
+    def inject_lora(self, q, k, v, proj):
+        for item in self.enable_lora:
+            if item == 'q':
+                q = LinearLoRA(q,
+                                r=self.r,
+                                lora_alpha=self.lora_alpha,
+                                fan_in_fan_out=False,
+                                dropout_rate = self.dropout_rate,
+                                seed=self.seed)
+            elif item == 'k':
+                k = LinearLoRA(k,
+                                r=self.r,
+                                lora_alpha=self.lora_alpha,
+                                fan_in_fan_out=False,
+                                dropout_rate = self.dropout_rate,
+                                seed=self.seed)
+            elif item == 'v':
+                v = LinearLoRA(v,
+                                r=self.r,
+                                lora_alpha=self.lora_alpha,
+                                fan_in_fan_out=False,
+                                dropout_rate = self.dropout_rate,
+                                seed=self.seed)
+            elif item == 'o':
+                proj = LinearLoRA(proj,
+                                r=self.r,
+                                lora_alpha=self.lora_alpha,
+                                fan_in_fan_out=False,
+                                dropout_rate = self.dropout_rate,
+                                seed=self.seed)
+        return q, k, v, proj
+    def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
+        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
+        x = x.view(new_x_shape)
+        return x.permute(0, 2, 1, 3)
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+        output_attentions: Optional[bool] = False,
+    ) -> Tuple[torch.Tensor]:
+        mixed_query_layer = self.q_proj(hidden_states)
+        # If this is instantiated as a cross-attention module, the keys
+        # and values come from an encoder; the attention mask needs to be
+        # such that the encoder's padding tokens are not attended to.
+        is_cross_attention = encoder_hidden_states is not None
+        if is_cross_attention and past_key_value is not None:
+            # reuse k,v, cross_attentions
+            key_layer = past_key_value[0]
+            value_layer = past_key_value[1]
+            attention_mask = encoder_attention_mask
+        elif is_cross_attention:
+            key_layer = self.transpose_for_scores(self.k_proj(encoder_hidden_states))
+            value_layer = self.transpose_for_scores(self.v_proj(encoder_hidden_states))
+            attention_mask = encoder_attention_mask
+        elif past_key_value is not None:
+            key_layer = self.transpose_for_scores(self.k_proj(hidden_states))
+            value_layer = self.transpose_for_scores(self.v_proj(hidden_states))
+            key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
+            value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
+        else:
+            key_layer = self.transpose_for_scores(self.k_proj(hidden_states))
+            value_layer = self.transpose_for_scores(self.v_proj(hidden_states))
+        query_layer = self.transpose_for_scores(mixed_query_layer)
+        use_cache = past_key_value is not None
+        if self.is_decoder:
+            # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
+            # Further calls to cross_attention layer can then reuse all cross-attention
+            # key/value_states (first "if" case)
+            # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
+            # all previous decoder key/value_states. Further calls to uni-directional self-attention
+            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
+            # if encoder bi-directional self-attention `past_key_value` is always `None`
+            past_key_value = (key_layer, value_layer)
+        # Take the dot product between "query" and "key" to get the raw attention scores.
+        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
+            query_length, key_length = query_layer.shape[2], key_layer.shape[2]
+            if use_cache:
+                position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view(
+                    -1, 1
+                )
+            else:
+                position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
+            position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
+            distance = position_ids_l - position_ids_r
+            positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
+            positional_embedding = positional_embedding.to(dtype=query_layer.dtype)  # fp16 compatibility
+            if self.position_embedding_type == "relative_key":
+                relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
+                attention_scores = attention_scores + relative_position_scores
+            elif self.position_embedding_type == "relative_key_query":
+                relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
+                relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
+                attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key
+        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+        if attention_mask is not None:
+            # Apply the attention mask is (precomputed for all layers in BertModel forward() function)
+            attention_scores = attention_scores + attention_mask
+        # Normalize the attention scores to probabilities.
+        attention_probs = nn.functional.softmax(attention_scores, dim=-1)
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs = self.dropout(attention_probs)
+        # Mask heads if we want to
+        if head_mask is not None:
+            attention_probs = attention_probs * head_mask
+        context_layer = torch.matmul(attention_probs, value_layer)
+        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+        context_layer = context_layer.view(new_context_layer_shape)
+        self_attn_outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
+        if self.is_decoder:
+            self_attn_outputs = self_attn_outputs + (past_key_value,)
+        # attention_output = self.output(self_outputs[0], hidden_states)
+        self_outputs = self.proj(self_attn_outputs[0])
+        attention_output = self.LayerNorm(self_outputs + hidden_states)
+        outputs = (attention_output,) + self_attn_outputs[1:]  # add attentions if we output them
+        return outputs
+class MergedLinear(nn.Linear, LoRALayer):
+    # LoRA implemented in a dense layer
+    def __init__(
+        self,
+        in_features: int,
+        out_features: int,
+        r: int = 0,
+        lora_alpha: int = 1,
+        enable_lora: List[bool] = [False],
+        fan_in_fan_out: bool = False,
+        **kwargs
+    ):
+        nn.Linear.__init__(self, in_features, out_features, **kwargs)
+        LoRALayer.__init__(self, r=r, lora_alpha=lora_alpha)
+        assert out_features % len(enable_lora) == 0, \
+            'The length of enable_lora must divide out_features'
+        self.enable_lora = enable_lora
+        # Actual trainable parameters
+        self.params_with_lora = {'weight': 'w'}
+        if r > 0 and any(enable_lora):
+            self.w_lora_A = nn.Parameter(
+                self.weight.new_zeros((r * sum(enable_lora), in_features)))
+            self.w_lora_B = nn.Parameter(
+                self.weight.new_zeros((out_features // len(enable_lora) * sum(enable_lora), r))
+            ) # weights for Conv1D with groups=sum(enable_lora)
+            # Freezing the pre-trained weight matrix
+            self.weight.requires_grad = False
+            # Compute the indices
+            self.lora_ind = self.weight.new_zeros(
+                (out_features, ), dtype=torch.bool
+            ).view(len(enable_lora), -1)
+            self.lora_ind[enable_lora, :] = True
+            self.lora_ind = self.lora_ind.view(-1)
+        nn.Linear.reset_parameters(self)
+        self.init_lora_param()
+        self.weight.data = self.transpose(self.weight.data)
+    def zero_pad(self, x):
+        result = x.new_zeros((len(self.lora_ind), *x.shape[1:]))
+        result[self.lora_ind] = x
+        return result
+    def merge_BA(self, param_name: str):
+        lora_name = self.params_with_lora[param_name]
+        delta_w = F.conv1d(
+            eval(f'self.{lora_name}_lora_A').unsqueeze(0),
+            eval(f'self.{lora_name}_lora_B').unsqueeze(-1),
+            groups=sum(self.enable_lora)
+        ).squeeze(0)
+        return self.transpose(self.zero_pad(delta_w))
+    def train(self, mode: bool = True):
+        nn.Linear.train(self, mode)
+        self.lora_train(mode)
+    def forward(self, x: torch.Tensor, **kwargs):
+        if self.r > 0 and not self.merged:
+            self.merge_lora_param()
+            result = nn.Linear.forward(self, x, **kwargs)
+            self.sub_lora_data()
+            return result
+        else:
+            return nn.Linear.forward(self, x, **kwargs)

loralib/utils.py ADDED Viewed

	@@ -0,0 +1,213 @@

+import os
+import torch
+import torch.nn as nn
+from typing import Dict
+from .layers import LoRALayer, AttentionLoRA, BertAttentionLoRA
+from timm.models.vision_transformer import Attention
+from transformers.models.bert.modeling_bert import BertAttention
+INDEX_POSITIONS_TEXT = {
+    'top1': [11],
+    'top2': [10, 11],
+    'top3': [9, 10, 11],
+    'bottom': [0, 1, 2, 3],
+    'mid': [4, 5, 6, 7],
+    'up': [8, 9, 10, 11],
+    'half-up': [6, 7, 8, 9, 10, 11],
+    'half-bottom': [0, 1, 2, 3, 4, 5],
+    'all': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]}
+INDEX_POSITIONS_VISION = {
+    'top': [11],
+    'top3': [9, 10, 11],
+    'bottom': [0, 1, 2, 3],
+    'mid': [4, 5, 6, 7],
+    'up': [8, 9, 10, 11],
+    'half-up': [6, 7, 8, 9, 10, 11],
+    'half-bottom': [0, 1, 2, 3, 4, 5],
+    'all': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],
+}
+def mark_only_lora_as_trainable(model: nn.Module, bias: str = 'none') -> None:
+    for n, p in model.named_parameters():
+        if 'lora_' not in n:
+            p.requires_grad = False
+    if bias == 'none':
+        return
+    elif bias == 'all':
+        for n, p in model.named_parameters():
+            if 'bias' in n:
+                p.requires_grad = True
+    elif bias == 'lora_only':
+        for m in model.modules():
+            if isinstance(m, LoRALayer) and \
+                    hasattr(m, 'bias') and \
+                    m.bias is not None:
+                m.bias.requires_grad = True
+    else:
+        raise NotImplementedError
+def lora_state_dict(model: nn.Module, bias: str = 'none') -> Dict[str, torch.Tensor]:
+    my_state_dict = model.state_dict()
+    if bias == 'none':
+        return {k: my_state_dict[k] for k in my_state_dict if 'lora_' in k}
+    elif bias == 'all':
+        return {k: my_state_dict[k] for k in my_state_dict if 'lora_' in k or 'bias' in k}
+    elif bias == 'lora_only':
+        to_return = {}
+        for k in my_state_dict:
+            if 'lora_' in k:
+                to_return[k] = my_state_dict[k]
+                bias_name = k.split('lora_')[0]+'bias'
+                if bias_name in my_state_dict:
+                    to_return[bias_name] = my_state_dict[bias_name]
+        return to_return
+    else:
+        raise NotImplementedError
+def get_lora_parameters(model, bias='none'):
+    params = []
+    for name, param in model.named_parameters():
+        if bias == 'none':
+            if 'lora_' in name:
+                params.append(param)
+        elif bias == 'all':
+            if 'lora_' in name or 'bias' in name:
+                params.append(param)
+        elif bias == 'lora_only':
+            if 'lora_' in name:
+                params.append(param)
+                bias_name = name.split('lora_')[0] + 'bias'
+                if bias_name in model.state_dict():
+                    bias_param = dict(model.named_parameters())[bias_name]
+                    params.append(bias_param)
+        else:
+            raise NotImplementedError
+    return params
+def apply_lora(args, clip_model):
+    list_lora_layers = []
+    indices = INDEX_POSITIONS_TEXT[args.position]
+    text_encoder = clip_model.text.transformer.encoder
+    for i, block in enumerate(text_encoder.layer):
+        if i in indices:
+            for name, submodule in block.named_children():
+                if isinstance(submodule, BertAttention):
+                    new_multi_head_lora = BertAttentionLoRA(
+                        submodule, enable_lora=args.params, r=args.r, lora_alpha=args.alpha, dropout_rate=args.dropout_rate, seed=args.seed)
+                    setattr(block, name, new_multi_head_lora)
+                    list_lora_layers.append(new_multi_head_lora)
+    indices = INDEX_POSITIONS_VISION[args.position]
+    vision_encoder = clip_model.visual.trunk
+    for i, block in enumerate(vision_encoder.blocks):
+        if i in indices:
+            for name, submodule in block.named_children():
+                if isinstance(submodule, Attention):
+                    new_multi_head_lora = AttentionLoRA(
+                        submodule, enable_lora=args.params, r=args.r, lora_alpha=args.alpha, dropout_rate=args.dropout_rate, seed=args.seed)
+                    setattr(block, name, new_multi_head_lora)
+                    list_lora_layers.append(new_multi_head_lora)
+    return list_lora_layers
+def save_lora(args, list_lora_layers, loss_fn, msg, save_dir):
+    weights = {}
+    for i, layer in enumerate(list_lora_layers):
+        layer_weights = {}
+        if 'q' in args.params:
+            layer_weights['q_proj'] = {
+                'w_lora_A': layer.q_proj.w_lora_A.data,
+                'w_lora_B': layer.q_proj.w_lora_B.data
+            }
+        if 'k' in args.params:
+            layer_weights['k_proj'] = {
+                'w_lora_A': layer.k_proj.w_lora_A.data,
+                'w_lora_B': layer.k_proj.w_lora_B.data
+            }
+        if 'v' in args.params:
+            layer_weights['v_proj'] = {
+                'w_lora_A': layer.v_proj.w_lora_A.data,
+                'w_lora_B': layer.v_proj.w_lora_B.data
+            }
+        if 'o' in args.params:
+            layer_weights['proj'] = {
+                'w_lora_A': layer.proj.w_lora_A.data,
+                'w_lora_B': layer.proj.w_lora_B.data
+            }
+        weights[f'layer_{i}'] = layer_weights
+    if args.loss_type == 'clip_loss_ace_hgnn':
+        weights['img_edge_adapter'] = loss_fn.img_edge_adapter.state_dict()
+        weights['img_node_adapter'] = loss_fn.img_node_adapter.state_dict()
+        weights['text_edge_adapter'] = loss_fn.text_edge_adapter.state_dict()
+        weights['text_node_adapter'] = loss_fn.text_node_adapter.state_dict()
+    if args.learnable_logit_scale:
+        weights['logit_scale'] = loss_fn.logit_scale.data.cpu()
+    metadata = {
+        'r': args.r,
+        'topk': args.topk,
+        'params': args.params,
+        'position': args.position,
+        'loss_type' : args.loss_type,
+    }
+    save_data = {
+        'weights': weights,
+        'metadata': metadata
+    }
+    save_path = f'{save_dir}/{args.filename}_{msg}.pt'
+    torch.save(save_data, save_path)
+    print(f'LoRA weights saved to {save_path}')
+def load_model(args, list_lora_layers, device, loss_fn=None):
+    if not os.path.exists(args.load_path):
+        raise FileNotFoundError(f'File {args.load_path} does not exist.')
+    loaded_data = torch.load(args.load_path, map_location=device)
+    weights = loaded_data['weights']
+    for i, layer in enumerate(list_lora_layers):
+        layer_weights = weights[f'layer_{i}']
+        if 'q' in args.params and 'q_proj' in layer_weights:
+            layer.q_proj.w_lora_A.data.copy_(
+                layer_weights['q_proj']['w_lora_A'])
+            layer.q_proj.w_lora_B.data.copy_(
+                layer_weights['q_proj']['w_lora_B'])
+        if 'k' in args.params and 'k_proj' in layer_weights:
+            layer.k_proj.w_lora_A.data.copy_(
+                layer_weights['k_proj']['w_lora_A'])
+            layer.k_proj.w_lora_B.data.copy_(
+                layer_weights['k_proj']['w_lora_B'])
+        if 'v' in args.params and 'v_proj' in layer_weights:
+            layer.v_proj.w_lora_A.data.copy_(
+                layer_weights['v_proj']['w_lora_A'])
+            layer.v_proj.w_lora_B.data.copy_(
+                layer_weights['v_proj']['w_lora_B'])
+        if 'o' in args.params and 'proj' in layer_weights:
+            layer.proj.w_lora_A.data.copy_(layer_weights['proj']['w_lora_A'])
+            layer.proj.w_lora_B.data.copy_(layer_weights['proj']['w_lora_B'])
+    if args.loss_type == 'clip_loss_ace_hgnn':
+        loss_fn.img_edge_adapter.load_state_dict(weights['img_edge_adapter'])
+        loss_fn.img_node_adapter.load_state_dict(weights['img_node_adapter'])
+        loss_fn.text_edge_adapter.load_state_dict(weights['text_edge_adapter'])
+        loss_fn.text_node_adapter.load_state_dict(weights['text_node_adapter'])
+    if args.learnable_logit_scale:
+        loss_fn.logit_scale.data.copy_(weights['logit_scale'])
+    print(f'LoRA weights loaded from {args.load_path}')

loss.py ADDED Viewed

	@@ -0,0 +1,170 @@

+import warnings
+warnings.filterwarnings("ignore")
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import numpy as np
+from run_utils import set_random_seed
+class Identity(nn.Module):
+    def __init__(self):
+        super().__init__()
+    def forward(self, x):
+        return x
+class CLIPLoss(nn.Module):
+    def __init__(self, args, logit_scale):
+        super(CLIPLoss, self).__init__()
+        self.args = args
+        if args.learnable_logit_scale:
+            self.logit_scale = nn.Parameter(logit_scale.clone().detach())
+        else:
+            self.register_buffer('logit_scale', logit_scale.clone().detach())
+    def forward(self, image_features, text_features, merged_df=None, indices=None):
+        device = image_features.device
+        batch_size, feature_dim = image_features.size()
+        labels = torch.arange(batch_size, device=device, dtype=torch.long)
+        logits_per_image = self.logit_scale * image_features @ text_features.t()
+        logits_per_text = logits_per_image.T
+        if merged_df is not None:
+            compare_matrix = merged_df.iloc[indices, 2:].to_numpy()
+            vector_similarity_matrix = np.ones((compare_matrix.shape[0], compare_matrix.shape[0]), dtype=np.int32)
+            comparison = (compare_matrix[:, None, :] == compare_matrix[None, :, :]).all(axis=2)
+            vector_similarity_matrix[comparison] = 0
+            np.fill_diagonal(vector_similarity_matrix, 1)
+            vector_similarity_matrix = torch.from_numpy(vector_similarity_matrix).bool().to(device)
+            masked_logits_per_image = logits_per_image.masked_fill(~vector_similarity_matrix, float('-inf'))
+            masked_logits_per_text = logits_per_text.masked_fill(~vector_similarity_matrix.T, float('-inf'))
+            loss = (F.cross_entropy(masked_logits_per_image, labels) + F.cross_entropy(masked_logits_per_text, labels)) / 2
+        else:
+            loss = (F.cross_entropy(logits_per_image, labels) + F.cross_entropy(logits_per_text, labels)) / 2
+        return loss
+class ResidualAdapter(nn.Module):
+    def __init__(self, dim, bottleneck_dim=128):
+        super().__init__()
+        self.down = nn.Linear(dim, bottleneck_dim)
+        self.act = nn.LeakyReLU(0.2)
+        self.up = nn.Linear(bottleneck_dim, dim)
+        nn.init.kaiming_normal_(self.down.weight)
+        nn.init.kaiming_normal_(self.up.weight)
+    def forward(self, x):
+        return self.up(self.act(self.down(x)))
+class CLIPLossACE_HGAT(nn.Module):
+    def __init__(self, args, logit_scale, in_channels):
+        super(CLIPLossACE_HGAT, self).__init__()
+        set_random_seed(args.seed)
+        self.args = args
+        self.img_edge_adapter = ResidualAdapter(in_channels, args.hidden_features)
+        self.text_edge_adapter = ResidualAdapter(in_channels, args.hidden_features)
+        self.img_node_adapter = ResidualAdapter(in_channels, args.hidden_features)
+        self.text_node_adapter = ResidualAdapter(in_channels, args.hidden_features)
+        if args.learnable_logit_scale:
+            self.logit_scale = nn.Parameter(logit_scale.clone().detach())
+        else:
+            self.register_buffer('logit_scale', logit_scale.clone().detach())
+    def apply_ace_hgat(self, features, attn_weights, encoder="img"):
+        if encoder =="img":
+            edge_adapter = self.img_edge_adapter
+            node_adapter = self.img_node_adapter
+        elif encoder == 'text':
+            edge_adapter = self.text_edge_adapter
+            node_adapter = self.text_node_adapter
+        else:
+            raise ValueError(f"encoder must be img or text but given {encoder}")
+        B, N, D = features.shape
+        patches_norm = F.normalize(features[:, 1:, :], p=2, dim=-1)
+        # Similarity Matrix: (B, P, P)
+        sim = torch.zeros(size=(B, N, N), device=features.device)
+        patch_sim = torch.bmm(patches_norm, patches_norm.transpose(1, 2)) # [B, P, P]
+        sim[:, 1:, 1:] = patch_sim
+        sim[:, 0, 1:] = attn_weights
+        mask_logic = torch.eye(N, device=features.device).bool().unsqueeze(0).repeat(B, 1, 1)
+        mask_logic[:, 1:, 0] = True
+        sim = sim.masked_fill(mask_logic, -float('inf'))
+        topk_vals, topk_indices = torch.topk(sim, k=self.args.topk, dim=-1)
+        mask_sparse = torch.full_like(sim, -float('inf'))
+        mask_sparse.scatter_(-1, topk_indices, topk_vals)
+        A = F.softmax(mask_sparse, dim=-1)
+        A = A.masked_fill(torch.eye(N, device=features.device).bool().unsqueeze(0).repeat(B, 1, 1), 1)
+        A[:, 1:, 0] = A[:, 0, 1:]
+        H_edges_raw = torch.matmul(A, features)
+        H_edges_refined = edge_adapter(H_edges_raw)
+        H_context_raw = torch.matmul(A.transpose(1, 2), H_edges_refined)
+        H_context_processed = node_adapter(H_context_raw)
+        x_out = H_context_processed
+        return x_out
+    def forward(self, clip_model, images, texts, merged_df=None, indices=None):
+        device = images.device
+        clip_model.visual.trunk.global_pool = ''
+        image_features, img_attn_scores = clip_model.visual.trunk.get_attn_scores(images)
+        image_features = F.normalize(clip_model.visual.head(image_features), dim=-1)
+        text_features, text_attn_scores = clip_model.encode_text(texts, normalize=True, output_attentions=True, output_tokens=True)
+        img_attn_scores = img_attn_scores.mean(dim=1) # [B, 197, 197]
+        img_attn_weights = img_attn_scores[:, 0, 1:] # relationship between CLS token and patch embeddings [B, 196]
+        text_attn_scores = text_attn_scores[-1].mean(dim=1) # [B, 256, 256]
+        text_attn_weights = text_attn_scores[:, 0, 1:] # relationship between global token and other token embeddings [B, 255]
+        if self.args.apply_gnn_encoders == 'vision':
+            image_features = self.apply_ace_hgat(image_features, img_attn_weights, encoder="img")
+            image_features = F.normalize(image_features, dim=-1)
+            logits_per_image = self.logit_scale * image_features[:, 0] @ text_features[:, 0].t()
+            logits_per_text = logits_per_image.T
+        elif self.args.apply_gnn_encoders == 'text':
+            text_features = self.apply_ace_hgat(text_features, text_attn_weights, encoder="text")
+            text_features = F.normalize(text_features, dim=-1)
+            logits_per_image = self.logit_scale * image_features[:, 0] @ text_features[:, 0].t()
+            logits_per_text = logits_per_image.T
+        elif self.args.apply_gnn_encoders == 'both':
+            image_features = self.apply_ace_hgat(image_features, img_attn_weights, encoder="img")
+            image_features = F.normalize(image_features, dim=-1)
+            text_features = self.apply_ace_hgat(text_features, text_attn_weights, encoder="text")
+            text_features = F.normalize(text_features, dim=-1)
+            logits_per_image = self.logit_scale * image_features[:, 0] @ text_features[:, 0].t()
+            logits_per_text = logits_per_image.T
+        labels = torch.arange(image_features.shape[0], device=device, dtype=torch.long)
+        if logits_per_image.isnan().sum() > 0:
+            raise ValueError('NaN value in logits_per_image')
+        if merged_df is not None: # Label-Guided InfoNCE loss
+            compare_matrix = merged_df.iloc[indices, 2:].to_numpy()
+            vector_similarity_matrix = np.ones((compare_matrix.shape[0], compare_matrix.shape[0]), dtype=np.int32)
+            comparison = (compare_matrix[:, None, :] == compare_matrix[None, :, :]).all(axis=2)
+            vector_similarity_matrix[comparison] = 0
+            np.fill_diagonal(vector_similarity_matrix, 1)
+            vector_similarity_matrix = torch.from_numpy(vector_similarity_matrix).bool().to(device)
+            masked_logits_per_image = logits_per_image.masked_fill(~vector_similarity_matrix, float('-inf'))
+            masked_logits_per_text = logits_per_text.masked_fill(~vector_similarity_matrix.T, float('-inf'))
+            loss = (F.cross_entropy(masked_logits_per_image, labels) + F.cross_entropy(masked_logits_per_text, labels)) / 2
+        else:
+            loss = (F.cross_entropy(logits_per_image, labels) + F.cross_entropy(logits_per_text, labels)) / 2
+        return loss

open_clip_patch.py ADDED Viewed

	@@ -0,0 +1,37 @@

+from transformers.models.bert import modeling_bert
+from open_clip import CustomTextCLIP
+from open_clip.hf_model import HFTextEncoder
+import torch.nn.functional as F
+from torch import TensorType
+def patch_encode_text():
+    def encode_text_patched(self, text, normalize: bool = False, output_attentions = False, output_tokens = False):
+        if output_attentions:
+                features, attn_scores = self.text(text, output_attentions = output_attentions, output_tokens = output_tokens)
+                features = F.normalize(features, dim=-1) if normalize else features
+                return features, attn_scores
+        else:
+            features = self.text(text, output_attentions = output_attentions, output_tokens = output_tokens)
+            return F.normalize(features, dim=-1) if normalize else features
+    def HFText_encoder_patched(self, x: TensorType, output_attentions=False, output_tokens=False):
+        self.output_tokens = output_tokens
+        attn_mask = (x != self.config.pad_token_id).long()
+        out = self.transformer(input_ids=x, attention_mask=attn_mask, output_attentions=output_attentions)
+        if self.output_tokens:
+            tokens = self.proj(out[0])
+            if output_attentions:
+                return tokens, out[1]
+            else:
+                return tokens
+        else:
+            pooled_out = self.pooler(out, attn_mask)
+            projected = self.proj(pooled_out)
+            return projected
+    CustomTextCLIP.encode_text = encode_text_patched
+    HFTextEncoder.forward = HFText_encoder_patched

prompt_templates.py ADDED Viewed

	@@ -0,0 +1,4 @@

+prompt_templates = [
+    lambda c: f'a chest X-ray image of {c}.',
+    lambda c: f'Findings suggesting {c}.',
+]

timm_vit_return_attn_patch.py ADDED Viewed

	@@ -0,0 +1,65 @@

+import torch
+import timm.models.vision_transformer as vit
+def patch_timm_vit_return_attn_scores():
+    _orig_attn_forward = vit.Attention.forward
+    def attn_forward_patched(self, x, return_attn_scores = False):
+        if not return_attn_scores:
+            return _orig_attn_forward(self, x)
+        B, N, C = x.shape
+        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)
+        q, k, v = qkv.unbind(0)
+        q, k = self.q_norm(q), self.k_norm(k)
+        q = q * self.scale
+        attn_scores = q @ k.transpose(-2, -1)
+        attn = attn_scores.softmax(dim=-1)
+        attn = self.attn_drop(attn)
+        x = attn @ v
+        x = x.transpose(1, 2).reshape(B, N, C)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return (x, attn_scores)
+    vit.Attention.forward = attn_forward_patched
+    # Patch Block.forward
+    _orig_block_forward = vit.Block.forward
+    def block_forward_patched(self, x, return_attn_scores= False):
+        if not return_attn_scores:
+            return _orig_block_forward(self, x)
+        out, attn_scores = self.attn(self.norm1(x), return_attn_scores=True)
+        x = x + self.drop_path1(self.ls1(out))
+        x = x + self.drop_path2(self.ls2(self.mlp(self.norm2(x))))
+        return (x, attn_scores)
+    vit.Block.forward = block_forward_patched
+    def get_attn_scores(self, x, pre_logits: bool = False):
+        x = self.patch_embed(x)
+        x = self._pos_embed(x)
+        x = self.patch_drop(x)
+        x = self.norm_pre(x)
+        depth = len(self.blocks)
+        for i, blk in enumerate(self.blocks):
+            if i == (depth - 1):
+                x, attn_scores = blk(x, return_attn_scores=True)
+            else:
+                x = blk(x)
+        x = self.norm(x)
+        if self.global_pool:
+            x = x[:, self.num_prefix_tokens:].mean(dim=1) if self.global_pool == 'avg' else x[:, 0]
+        x = self.fc_norm(x)
+        x = self.head_drop(x)
+        if not pre_logits:
+            x = self.head(x)
+        return (x, attn_scores)
+    vit.VisionTransformer.get_attn_scores = get_attn_scores