Upload 9 files

Browse files

Files changed (9) hide show

__init__.py +2 -0
config.json +31 -0
configuration_normwear.py +54 -0
latent_bayesian.py +265 -0
layers.py +540 -0
modeling_normwear.py +45 -0
normwear2.py +706 -0
pytorch_model.bin +3 -0
utils.py +26 -0

__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ from .modeling_normwear import NormWear2Model
2	+ from .configuration_normwear import NormWear2Config

config.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "model_type": "normwear2",
+  "architectures": ["NormWear2Model"],
+  "auto_map": {
+    "AutoConfig": "configuration_normwear.NormWear2Config",
+    "AutoModel": "modeling_normwear.NormWear2Model"
+  },
+"patch_size" : 16,
+"mlp_ratio" : 4.0,
+"fuse_freq" : 2,
+"drop_p" : 0.0,
+"max_in_length" : 256,
+"trainable_pe" : true,
+"embed_dim" : 768,
+"num_heads" : 12,
+"depth" : 12,
+"decoder_embed_dim" : 512,
+"decoder_num_head" : 8,
+"decoder_depth" : 2,
+"token_level_fuse" : true,
+"use_casual" : true,
+"use_cls" : false,
+"jepa" : false,
+"jepa_post_decoder_train" : false
+}

configuration_normwear.py ADDED Viewed

	@@ -0,0 +1,54 @@

+from transformers import PretrainedConfig
class NormWear2Config(PretrainedConfig):
    """Configuration container for the NormWear2 model.

    Every named argument is stored on the instance under the same attribute
    name; any additional keyword arguments are handed to
    ``PretrainedConfig.__init__``.
    """

    model_type = "normwear2"

    def __init__(
        self,
        patch_size=16,
        embed_dim=768, decoder_embed_dim=512,
        depth=4, decoder_depth=2,
        num_heads=12, decoder_num_head=8,
        mlp_ratio=4.0, drop_p=0.0,
        fuse_freq=2,  # channel attention every 2 blocks
        max_in_length=256,  # NOTE: actual is total seq_length // patch_size
        trainable_pe=True,
        token_level_fuse=True,
        use_casual=True,
        use_cls=False,
        jepa=False, jepa_post_decoder_train=False,
        **kwargs
    ):
        super().__init__(**kwargs)
        settings = {
            # basics
            "patch_size": patch_size,
            "mlp_ratio": mlp_ratio,
            "fuse_freq": fuse_freq,
            "drop_p": drop_p,
            # absolute position embedding
            "max_in_length": max_in_length,
            "trainable_pe": trainable_pe,
            # encoder
            "embed_dim": embed_dim,
            "num_heads": num_heads,
            "depth": depth,
            # decoder
            "decoder_embed_dim": decoder_embed_dim,
            "decoder_num_head": decoder_num_head,
            "decoder_depth": decoder_depth,
            # mechanism switches
            "token_level_fuse": token_level_fuse,
            "use_casual": use_casual,
            "use_cls": use_cls,
            "jepa": jepa,
            "jepa_post_decoder_train": jepa_post_decoder_train,
        }
        for name, value in settings.items():
            setattr(self, name, value)

latent_bayesian.py ADDED Viewed

	@@ -0,0 +1,265 @@

+import torch
+import numpy as np
+from sklearn.cluster import KMeans
+from .normwear2 import NormWear2
+################## Bayesian Functions Start ########################################################################
+# helper function for determining state based on transit matrix
def get_traj_of_state(last_s, transit_p, centroids, centroid_std, sample_steps,
                      top_k=-1, temperature=1, future_action_enc_out=None,
                      embed_dim=768,
                      **kwargs):
    """Sample one trajectory of latent states from a transition matrix.

    Starting from the centroid closest to ``last_s``, the next state is drawn
    ``sample_steps`` times from the current state's row of ``transit_p`` and an
    embedding is drawn from a normal distribution around the chosen centroid.

    Args:
        last_s: tensor of shape (1, D), the last observed embedding.
        transit_p: array of shape (num_centroids, num_centroids).
        centroids: array of shape (num_centroids, D).
        centroid_std: mapping from state index to the scale used for the
            normal draw; states missing from the mapping use scale 0.
        sample_steps: number of states to sample.
        top_k: if > 0, sample only among the ``top_k`` most likely states.
        temperature: clamped to [1e-6, 2]; probabilities are raised to
            ``1 / temperature``.
        future_action_enc_out: optional tensor of shape (sample_steps, A).
            The last ``A`` columns of ``centroids`` are compared against each
            step, and transitions to closer centroids are up-weighted.
        embed_dim, **kwargs: accepted for call compatibility; unused.

    Returns:
        ``(embeds, traj_log)`` where ``embeds`` is a float tensor of shape
        (1, sample_steps, D) on ``last_s.device`` and ``traj_log`` is the
        summed log-probability of the sampled transitions.
    """
    temperature = min(max(1e-6, temperature), 2)
    centroids = np.asarray(centroids)
    start = last_s.detach().cpu().numpy()
    # the square root is monotonic, so squared distance picks the same centroid
    prev_ci = int(np.argmin(np.sum((centroids - start) ** 2, axis=1)))
    result_embeds = torch.zeros(1, sample_steps, last_s.shape[-1])
    traj_log = 0

    # loop-invariant action data: convert once instead of once per step
    action_steps = None
    centroid_action = None
    if future_action_enc_out is not None:
        action_steps = future_action_enc_out.detach().cpu().numpy()
        centroid_action = centroids[:, -action_steps.shape[-1]:]  # num_centroids, A

    for ss in range(sample_steps):
        p = np.asarray(transit_p[prev_ci], dtype=np.float64)
        if action_steps is not None:
            # min-max normalised distance between each centroid and this step's action
            dist = np.linalg.norm(centroid_action - action_steps[ss][None, :], axis=-1)
            dist = (dist - dist.min()) / (dist.max() - dist.min() + 1e-8)
            p = p * (1 - dist)
        # Rescale by the peak before applying the temperature: the normalised
        # distribution is unchanged, but the largest entry stays at 1 so a very
        # small temperature can no longer underflow the whole row to zero.
        peak = p.max()
        if np.isfinite(peak) and peak > 0:
            p = (p / peak) ** (1.0 / temperature)
        else:
            # degenerate row (all zero / NaN): fall back to a uniform choice
            p = np.ones_like(p)
        # restrict to the top-k candidates when requested
        if top_k > 0:
            topk_idx = np.argsort(p)[-top_k:]
        else:
            topk_idx = np.arange(len(p))
        topk_p = p[topk_idx]
        topk_p = topk_p / topk_p.sum()  # make sure p sums to 1
        # sampling step
        new_cidx = np.random.choice(len(topk_idx), p=topk_p)
        new_ci = topk_idx[new_cidx]
        traj_log += np.log(topk_p[new_cidx] + 1e-12)
        # a state absent from centroid_std is emitted without noise
        curr_scale = centroid_std.get(new_ci)
        if curr_scale is None:
            curr_scale = 0
        result_embeds[:, ss, :] = torch.from_numpy(
            np.random.normal(loc=centroids[new_ci], scale=curr_scale)
        )
        prev_ci = new_ci
    return result_embeds.float().to(last_s.device), traj_log
def quantile_traj_of_state(last_s, transit_p, centroids, centroid_std, sample_steps,
                           top_k=-1, temperature=1, num_traj=20, future_action_enc_out=None):
    """Sample several trajectories and fuse them into one weighted average.

    ``get_traj_of_state`` is called ``num_traj`` times (clamped to [1, 100]).
    The sampled trajectories are averaged with weights given by a softmax over
    their summed log-probabilities.

    Returns:
        Tensor with the shape of a single sampled trajectory,
        (1, sample_steps, D).
    """
    # At least one trajectory is required: with zero samples there is nothing
    # to fuse and indexing the first sample would fail.
    num_traj = int(min(100, max(1, num_traj)))
    samples = [
        get_traj_of_state(last_s, transit_p, centroids, centroid_std,
                          sample_steps, top_k=top_k, temperature=temperature,
                          future_action_enc_out=future_action_enc_out)
        for _ in range(num_traj)
    ]
    # The weighted sum does not depend on the order of the samples, so no
    # sorting by log-probability is needed.
    device = samples[0][0].device
    log_probs = torch.tensor([float(traj_log) for _, traj_log in samples]).float().to(device)
    weights = torch.softmax(log_probs, 0)[:, None, None, None]
    total_traj = torch.stack([embeds for embeds, _ in samples])  # num_traj, 1, sample_steps, D
    return (total_traj * weights).sum(dim=0)  # 1, sample_steps, D
+# Helper function to fit new bayesian
def fit_observed_bayesian(observed_emebds, num_states=16,
                          original_knowledge=None, post_w=1.0,
                          ):
    """Cluster observed embeddings and estimate a state transition matrix.

    Args:
        observed_emebds: array of shape (N, D), ordered in time.
        num_states: requested number of KMeans clusters; lowered to ``N`` when
            fewer observations than states are available.
        original_knowledge: optional ``(original_transit, original_centroids)``
            prior. Each fitted centroid is matched to its nearest prior
            centroid and the matching rows/columns of the prior transition
            matrix are used as prior probabilities.
        post_w: weight of the observed (posterior) transition probabilities;
            ``1 - post_w`` weights the prior. Forced to 1.0 without a prior.

    Returns:
        ``(transit_p, centroids, centroid_std)``: the row-normalised
        (num_states, num_states) transition matrix, the (num_states, D)
        cluster centres, and a dict mapping each observed state to the
        per-dimension root-mean-square deviation of its members from the
        centre.

    Raises:
        ValueError: if ``observed_emebds`` is empty.
    """
    observed_emebds = np.asarray(observed_emebds)
    num_obs = len(observed_emebds)
    if num_obs == 0:
        raise ValueError("observed_emebds must contain at least one embedding")
    # KMeans cannot fit more clusters than samples
    num_states = max(1, min(int(num_states), num_obs))

    reg_km = KMeans(
        n_clusters=num_states,
        random_state=42,
        n_init=1,
        algorithm="elkan",
    ).fit(observed_emebds)
    regularized_centroids = reg_km.cluster_centers_  # num_states, D
    labels = reg_km.labels_  # N

    # identify prior transit
    if original_knowledge is not None:
        original_transit, original_centroids = original_knowledge
        sq_dist = np.sum((regularized_centroids[:, None, :] - original_centroids[None, :, :]) ** 2, axis=-1)
        closest_prior_centroids = np.argmin(sq_dist, axis=-1)  # num_states
        prior_transit = original_transit[closest_prior_centroids, :][:, closest_prior_centroids]
        prior_transit_p = (prior_transit + 1e-8) / ((prior_transit + 1e-8).sum(axis=1, keepdims=True))
    else:
        prior_transit_p, post_w = 0, 1.0

    # count observed transitions between consecutive time steps
    posterior_transit = np.zeros((num_states, num_states))
    for src, dst in zip(labels[:-1], labels[1:]):
        posterior_transit[src, dst] += 1
    posterior_transit_p = (posterior_transit + 1e-8) / ((posterior_transit + 1e-8).sum(axis=-1, keepdims=True))

    # per-state spread over every member of the state (including the last
    # observation, which has no outgoing transition)
    centroid_std = {}
    for state in np.unique(labels):
        residual = observed_emebds[labels == state] - regularized_centroids[state]
        centroid_std[state] = np.sqrt(np.mean(residual ** 2, axis=0))

    # blend posterior with prior, then renormalise each row
    regularized_transit_p = (post_w * posterior_transit_p) + ((1 - post_w) * prior_transit_p)
    regularized_transit_p = (regularized_transit_p + 1e-8) / ((regularized_transit_p + 1e-8).sum(axis=-1, keepdims=True))
    return regularized_transit_p, regularized_centroids, centroid_std
def bayesian_forecast(in_tensor, n_channels, physio_channels,
                      context_length=2048-16, pred_length=2048+16,
                      num_states=16, action_channels=None,
                      condition_bayes=False, num_traj_sampled=1,
                      latent_encoder=None):
    """Forecast future signal values by sampling latent states.

    The context is encoded with ``latent_encoder.forward_encoder``, a
    transition model is fitted on the encoded steps, future latent steps are
    sampled and appended, and the result is decoded with
    ``latent_encoder.forward_decoder``.

    Args:
        in_tensor: tensor of shape (1, nvar, length).
        n_channels: number of channels used to reshape the decoder output.
        physio_channels: channel index/indices selected from the output.
        context_length: number of leading samples treated as context; also
            the start of the returned window.
        pred_length: number of samples returned after ``context_length``.
        num_states: upper bound on the number of latent states.
        action_channels: optional channel indices whose values are known for
            the future; ``None`` (default) or empty means no action channels.
        condition_bayes: if true and action channels are given, the sampling
            is conditioned on the encoded action channels.
        num_traj_sampled: number of trajectories fused by the sampler.
        latent_encoder: model providing ``forward_encoder``,
            ``forward_decoder`` and ``patch_size``.

    Returns:
        Slice ``[physio_channels, context_length:context_length+pred_length]``
        of the decoded signal.

    Raises:
        ValueError: if ``latent_encoder`` is ``None``.
    """
    if latent_encoder is None:
        raise ValueError("bayesian_forecast requires a latent_encoder")
    if action_channels is None:
        action_channels = []
    has_actions = len(action_channels) > 0
    sample_steps = (pred_length // latent_encoder.patch_size) + 1

    # in_tensor: 1, nvar, length; with action channels only the context is encoded
    end_idx = context_length if has_actions else in_tensor.shape[-1]
    with torch.no_grad():
        enc_out, ids_restore, masked_patches = latent_encoder.forward_encoder(
            in_tensor.clone()[:, :, :end_idx], masking=False)
    embed_dim = enc_out.shape[-1]
    enc_out = enc_out.permute(1, 2, 0).flatten(start_dim=1)  # steps, embed_dim*bn*nvar

    # never ask for more states than there are fitted observations
    curr_num_state = min(num_states, len(enc_out) - 1)
    regularized_transit_p, regularized_centroids, centroid_std = fit_observed_bayesian(
        enc_out[1:, :].cpu().numpy(),
        num_states=curr_num_state,
        post_w=1.0,
    )[:3]

    # The action encoding is only consumed when conditioning is enabled, so
    # the extra encoder pass is skipped otherwise.
    future_action_enc_out = None
    if condition_bayes and has_actions:
        with torch.no_grad():
            future_action_enc_out, _, _ = latent_encoder.forward_encoder(
                in_tensor.clone()[:, action_channels, :], masking=False)
        # NOTE(review): this keeps encoded steps 1..sample_steps of the whole
        # input rather than the steps after the context - confirm intended.
        future_action_enc_out = future_action_enc_out.permute(1, 2, 0).flatten(start_dim=1)[1:1 + sample_steps, :]

    appended_embeds = quantile_traj_of_state(
        enc_out[-1:, :],
        regularized_transit_p,
        regularized_centroids,
        centroid_std,
        sample_steps,
        top_k=curr_num_state,
        temperature=1.0,
        num_traj=num_traj_sampled,
        future_action_enc_out=future_action_enc_out,
    )  # 1, sample_steps, embed_dim*bn*nvar

    # decoding: append sampled steps, then restore (bn*nvar, steps, embed_dim)
    enc_with_append = torch.cat((enc_out, appended_embeds[0]), dim=0)
    enc_with_append = enc_with_append.reshape(enc_with_append.shape[0], embed_dim, -1).permute(2, 0, 1)
    dec_out = latent_encoder.forward_decoder(enc_with_append, ids_restore, masked_patches)
    dec_out = dec_out.reshape(dec_out.shape[0], -1)
    total_L = dec_out.shape[-1]
    bayesian_out = dec_out.reshape(1, n_channels, total_L)[0, physio_channels, context_length:context_length + pred_length]
    return bayesian_out
+################## Bayesian Functions End ########################################################################
+################## Base Models Start ########################################################################
def load_normwear2_model(weight_path='../train_results/ckpts/from_k8s/normwear2_fix_pos_checkpoint-19.pth'):
    """Build a NormWear2 model with fixed hyper-parameters and load a checkpoint.

    Args:
        weight_path: path to a checkpoint readable by ``torch.load``. If the
            loaded object has a non-None ``'model'`` entry, that entry is used
            as the state dict.

    Returns:
        The ``NormWear2`` instance with weights loaded (``strict=True``).
    """
    model = NormWear2(
        # basics
        patch_size=16,
        mlp_ratio=4.0,
        # encoder configuration
        embed_dim=768,
        num_heads=12,
        depth=12,
        # decoder configuration
        decoder_embed_dim=512,
        decoder_num_head=8,
        decoder_depth=2,
        # position embedding
        trainable_pe=True,
        max_in_length=4096 // 16,
        # others
        mask_prob=0.0,
        use_casual=True,
        token_level_fuse=True,
        use_cls=False,
        jepa=False,
    )
    # map_location="cpu" lets checkpoints saved from a GPU load on CPU-only
    # hosts; load_state_dict copies values into the model's own parameters.
    # SECURITY: weights_only=False unpickles arbitrary objects - only load
    # checkpoints from a trusted source.
    state_dict = torch.load(weight_path, map_location="cpu", weights_only=False)
    if state_dict.get('model') is not None:
        state_dict = state_dict['model']
    model.load_state_dict(state_dict, strict=True)
    print("Model Load Success!")
    return model

layers.py ADDED Viewed

	@@ -0,0 +1,540 @@

+import math
+from functools import partial
+from typing import Optional, Tuple
+# import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.jit import Final
+from itertools import repeat
+import collections.abc
+from .utils import *
def _ntuple(n):
    """Return a converter that turns a value into a tuple.

    Non-string iterables are converted with ``tuple``; anything else
    (including strings) is repeated ``n`` times.
    """
    def parse(x):
        is_sequence = isinstance(x, collections.abc.Iterable) and not isinstance(x, str)
        if is_sequence:
            return tuple(x)
        return (x,) * n
    return parse


to_1tuple, to_2tuple, to_3tuple, to_4tuple = (_ntuple(size) for size in (1, 2, 3, 4))
to_ntuple = _ntuple
class CheckShape(nn.Module):
    """Debugging layer: optionally prints the input shape, optionally maps it.

    ``remark`` is printed together with ``x.shape`` unless it is ``None``.
    ``key``, when given, is called on the input and its result is returned;
    otherwise the input is returned unchanged.
    """

    def __init__(self, remark, key=None):
        super().__init__()
        self.remark = remark
        self.key = key

    def forward(self, x, **kwargs):
        if self.remark is not None:
            print(self.remark, x.shape)
        if self.key is None:
            return x
        return self.key(x)
+# fix time position embedding
class tAPE(nn.Module):
    """Sinusoidal time position embedding with an optional trainable offset.

    A fixed table ``pe`` of shape (1, max_len, d_model) is registered as a
    buffer. When ``trainable`` is true, a zero-initialised parameter
    ``trainable_pe`` of the same shape is added on top of it.
    """

    def __init__(self, d_model, dropout=0.1, max_len=2048, scale_factor=1.0, trainable=False):
        super(tAPE, self).__init__()
        self.max_len = max_len
        self.trainable = trainable
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)  # positional encoding
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin((position * div_term)*(d_model/max_len))
        pe[:, 1::2] = torch.cos((position * div_term)*(d_model/max_len))
        pe = scale_factor * pe.unsqueeze(0)
        self.register_buffer('pe', pe)  # stored in the state_dict, not trained
        if self.trainable:
            self.trainable_pe = nn.Parameter(torch.zeros(pe.shape))

    def interpolate_pe(self, original_pe, target_len):
        """Resize a (1, L, C) table to ``target_len`` positions.

        Shorter targets are clipped; longer targets are up-sampled with
        ``nearest-exact`` interpolation along the position axis.
        """
        original_len = original_pe.size(1)
        if target_len <= original_len:
            return original_pe[:, :target_len, :]
        pe_reshaped = original_pe.permute(0, 2, 1)  # 1, C, L
        pe_interpolated = F.interpolate(
            pe_reshaped,
            size=target_len,
            mode='nearest-exact',
        )
        return pe_interpolated.permute(0, 2, 1)  # 1, target_len, C

    def cyclic_pe(self, original_pe, target_len):
        """Tile a (1, L, C) table along the position axis and clip to ``target_len``.

        The table is repeated at least twice and as often as needed to cover
        ``target_len`` positions (the repetition count is based on the
        position axis, dim 1).
        """
        original_len = max(original_pe.shape[1], 1)
        repeats = max(2, math.ceil(target_len / original_len))
        return original_pe.repeat(1, repeats, 1)[:, :target_len, :]

    def duplicate_pretrained_pe(self, pretrained_end_idx=256-16):
        """Fill positions after ``pretrained_end_idx`` with copies of the first ones.

        Positions ``[0, pretrained_end_idx)`` of ``pe`` (and of
        ``trainable_pe`` when the module is trainable) are tiled in place over
        the remaining positions. Intended to be called after pretrained
        weights are loaded.

        Raises:
            ValueError: if ``pretrained_end_idx`` is not positive.
        """
        if pretrained_end_idx <= 0:
            raise ValueError("pretrained_end_idx must be positive")
        # trainable_pe only exists when the module was built with trainable=True
        targets = [self.pe]
        if self.trainable:
            targets.append(self.trainable_pe)
        with torch.no_grad():
            for param in targets:
                # param shape: [1, max_length, embedding_size]
                remaining = param.shape[1] - pretrained_end_idx
                if remaining <= 0:
                    continue
                pretrained = param[:, :pretrained_end_idx, :].clone()
                repeats = math.ceil(remaining / pretrained_end_idx)
                param[:, pretrained_end_idx:, :] = pretrained.repeat(1, repeats, 1)[:, :remaining, :]

    def forward(self, x):  # (N, L, C) or (bn, nvar, L, C)
        """Add position embeddings to ``x`` and apply dropout.

        A 4-D input is flattened to (bn*nvar, L, C) for the addition and
        reshaped back before returning.
        """
        has_four_dim = False
        if len(x.shape) == 4:
            has_four_dim = True
            bn, nvar, L, C = x.shape
            x = x.reshape(bn*nvar, L, C)
        pe_adjust = self.interpolate_pe  # seems to work better than cyclic
        # NOTE: tables built with max_len >= 1024 only use their first 240
        # positions (kept from the first model version; remove afterwards)
        curr_max_len = self.max_len if self.max_len < 1024 else 256-16
        x = x + pe_adjust(self.pe[:, :curr_max_len, :], x.shape[1])
        if self.trainable:
            x = x + pe_adjust(self.trainable_pe[:, :curr_max_len, :], x.shape[1])
        x = self.dropout(x)
        if has_four_dim:
            x = x.reshape(bn, nvar, L, C)
        return x
class VAE_Latent(nn.Module):
    """Linear output head with reparameterised noise during training.

    ``mu`` is a linear projection; ``var`` is a linear projection followed by
    Softplus. In training mode the output is ``mu(x) + var(x) * eps`` with
    ``eps`` drawn from a standard normal; in eval mode only ``mu(x)`` is
    returned.
    """

    def __init__(self, emb_size, out_size, bias=None):
        super().__init__()
        self.mu = nn.Linear(emb_size, out_size, bias=bias)
        scale_head = [nn.Linear(emb_size, out_size, bias=bias), nn.Softplus()]
        self.var = nn.Sequential(*scale_head)

    def forward(self, x):
        mean = self.mu(x)
        if not self.training:
            # inference: deterministic mean only
            return mean
        spread = self.var(x)
        noise = torch.randn_like(spread)
        return mean + spread * noise
class Mlp(nn.Module):
    """MLP as used in Vision Transformer, MLP-Mixer and related networks.

    Pipeline: fc1 -> activation -> dropout -> norm -> fc2 -> dropout. With
    ``vae_out`` the second projection is a ``VAE_Latent`` head; with
    ``use_conv`` the projections are 1x1 ``Conv2d`` layers.
    """

    def __init__(
            self,
            in_features,
            hidden_features=None,
            out_features=None,
            act_layer=nn.GELU,
            norm_layer=None,
            bias=True,
            drop=0.,
            use_conv=False,
            vae_out=False,
    ):
        super().__init__()
        if not out_features:
            out_features = in_features
        if not hidden_features:
            hidden_features = in_features
        bias_pair = to_2tuple(bias)
        drop_pair = to_2tuple(drop)
        make_linear = partial(nn.Conv2d, kernel_size=1) if use_conv else nn.Linear

        self.fc1 = make_linear(in_features, hidden_features, bias=bias_pair[0])
        self.act = act_layer()
        self.drop1 = nn.Dropout(drop_pair[0])
        if norm_layer is None:
            self.norm = nn.Identity()
        else:
            self.norm = norm_layer(hidden_features)
        # final projection: plain linear layer or VAE head
        make_output = VAE_Latent if vae_out else make_linear
        self.fc2 = make_output(hidden_features, out_features, bias=bias_pair[1])
        self.drop2 = nn.Dropout(drop_pair[1])

    def forward(self, x):
        stages = (self.fc1, self.act, self.drop1, self.norm, self.fc2, self.drop2)
        for stage in stages:
            x = stage(x)
        return x
class SwiGLU_Mlp(nn.Module):
    """SwiGLU feed-forward block.

    Two parallel projections of the input are combined as
    ``silu(fc1(x)) * fc2(x)``, optionally normalised, projected by ``fc3``
    (a ``VAE_Latent`` head when ``vae_out`` is set) and passed through
    dropout. ``act_layer`` is accepted for signature compatibility and is not
    used.
    """

    def __init__(
        self,
        in_features,
        hidden_features=None,
        out_features=None,
        norm_layer=None,
        act_layer=None,
        bias=True,
        drop=0.,
        use_conv=False,
        vae_out=False,
    ):
        super().__init__()
        if not out_features:
            out_features = in_features
        if not hidden_features:
            hidden_features = int(in_features * 4)  # typical MLP ratio
        bias_pair = to_2tuple(bias)
        drop_pair = to_2tuple(drop)
        make_linear = partial(nn.Conv2d, kernel_size=1) if use_conv else nn.Linear

        # gate and value projections share the first bias setting
        self.fc1 = make_linear(in_features, hidden_features, bias=bias_pair[0])
        self.fc2 = make_linear(in_features, hidden_features, bias=bias_pair[0])
        if norm_layer is None:
            self.norm = nn.Identity()
        else:
            self.norm = norm_layer(hidden_features, eps=1e-06)
        # final projection
        make_output = VAE_Latent if vae_out else make_linear
        self.fc3 = make_output(hidden_features, out_features, bias=bias_pair[1])
        self.drop2 = nn.Dropout(drop_pair[1])

    def forward(self, x):
        gated = F.silu(self.fc1(x)) * self.fc2(x)  # SwiGLU gating
        projected = self.fc3(self.norm(gated))
        return self.drop2(projected)
class Attention(nn.Module):
    """Multi-head self-attention with optional causal masking and a KV prefix.

    ``forward`` accepts ``x`` of shape (B, N, C). When ``past_kv`` is given,
    its keys/values are concatenated in front of the current ones along the
    sequence axis, so the ``N`` queries are treated as the last ``N``
    positions of the combined sequence.
    """
    fused_attn: Final[bool]

    def __init__(
            self,
            dim: int,
            num_heads: int = 8,
            qkv_bias: bool = False,
            qk_norm: bool = False,
            attn_drop: float = 0.,
            proj_drop: float = 0.,
            norm_layer: nn.Module = nn.LayerNorm,
            use_casual: bool = False,
    ) -> None:
        super().__init__()
        assert dim % num_heads == 0, 'dim should be divisible by num_heads'
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.scale = self.head_dim ** -0.5
        self.fused_attn = True
        self.use_casual = use_casual
        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.q_norm = norm_layer(self.head_dim, eps=1e-06) if qk_norm else nn.Identity()
        self.k_norm = norm_layer(self.head_dim, eps=1e-06) if qk_norm else nn.Identity()
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)
        # reservoir adjacency matrix
        self.rc_attn = None

    @staticmethod
    def _causal_keep_mask(q_len, kv_len, device):
        """Boolean (q_len, kv_len) mask, True where attention is allowed.

        The mask is aligned to the bottom-right corner so that query ``i`` is
        treated as position ``kv_len - q_len + i``.
        """
        keep = torch.ones(q_len, kv_len, dtype=torch.bool, device=device)
        return keep.tril(diagonal=kv_len - q_len)

    def forward(
        self,
        x: torch.Tensor,
        past_kv: Optional[Tuple[torch.Tensor, torch.Tensor]] = None
    ) -> torch.Tensor:
        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)
        q, k, v = qkv.unbind(0)
        q, k = self.q_norm(q), self.k_norm(k)
        # kv cache: prepend cached keys/values -> [B, h, past+N, d]
        if past_kv is not None:
            past_k, past_v = past_kv
            k = torch.cat([past_k, k], dim=2)
            v = torch.cat([past_v, v], dim=2)
        S = k.shape[2]
        if self.fused_attn:
            dropout_p = self.attn_drop.p if self.training else 0.
            if self.use_casual and S != N:
                # is_causal=True aligns the mask to the top-left corner, which
                # would hide the cache from the new queries; pass an explicit
                # bottom-right aligned mask instead.
                x = F.scaled_dot_product_attention(
                    q, k, v,
                    attn_mask=self._causal_keep_mask(N, S, q.device),
                    dropout_p=dropout_p,
                    is_causal=False,
                )
            else:
                x = F.scaled_dot_product_attention(
                    q, k, v,
                    dropout_p=dropout_p,
                    is_causal=self.use_casual,
                )
        else:
            q = q * self.scale
            attn = q @ k.transpose(-2, -1)
            if self.use_casual:
                # the manual path must honour causality like the fused one
                blocked = self._causal_keep_mask(N, S, q.device).logical_not()
                attn = attn.masked_fill(blocked, float("-inf"))
            attn = attn.softmax(dim=-1)
            attn = self.attn_drop(attn)
            x = attn @ v
        # merge heads and project
        x = x.transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x
def scaled_dot_product_attention_kvcache(query, key, value, attn_mask=None, dropout_p=0.0,
        is_causal=False, scale=None, enable_gqa=False) -> torch.Tensor:
    """Reference scaled dot-product attention that supports a key/value cache.

    Args:
        query: tensor (..., L, E).
        key, value: tensors (..., S, E) / (..., S, Ev); ``S`` may exceed ``L``
            when cached positions precede the queries.
        attn_mask: optional boolean mask (True = attend) or additive float
            mask; must not be combined with ``is_causal``.
        dropout_p: dropout probability, always applied (pass 0.0 to disable).
        is_causal: apply a causal mask aligned to the bottom-right corner, so
            query ``i`` is treated as position ``S - L + i``.
        scale: softmax scale; defaults to ``1 / sqrt(E)``.
        enable_gqa: repeat key/value heads to match the query heads.

    Raises:
        ValueError: if ``is_causal`` is set together with ``attn_mask``.
    """
    L, S = query.size(-2), key.size(-2)
    scale_factor = 1 / math.sqrt(query.size(-1)) if scale is None else scale
    attn_bias = torch.zeros(L, S, dtype=query.dtype, device=query.device)
    if is_causal:
        if attn_mask is not None:
            raise ValueError("attn_mask must be None when is_causal is True")
        # build the mask on the query's device; the diagonal offset keeps the
        # cached prefix visible to every query
        keep = torch.ones(L, S, dtype=torch.bool, device=query.device).tril(diagonal=S - L)
        attn_bias.masked_fill_(keep.logical_not(), float("-inf"))
    if attn_mask is not None:
        if attn_mask.dtype == torch.bool:
            attn_bias.masked_fill_(attn_mask.logical_not(), float("-inf"))
        else:
            attn_bias = attn_mask + attn_bias
    if enable_gqa:
        key = key.repeat_interleave(query.size(-3)//key.size(-3), -3)
        value = value.repeat_interleave(query.size(-3)//value.size(-3), -3)
    attn_weight = query @ key.transpose(-2, -1) * scale_factor
    # out-of-place add so an additive mask with extra batch dims can broadcast
    attn_weight = attn_weight + attn_bias
    attn_weight = torch.softmax(attn_weight, dim=-1)
    attn_weight = torch.dropout(attn_weight, dropout_p, train=True)
    return attn_weight @ value
class LayerScale(nn.Module):
    """Learnable per-channel scaling.

    ``gamma`` has ``dim`` entries initialised to ``init_values``. The input is
    multiplied by ``gamma``, in place when ``inplace`` is true.
    """

    def __init__(
            self,
            dim: int,
            init_values: float = 1e-5,
            inplace: bool = False,
    ) -> None:
        super().__init__()
        self.inplace = inplace
        initial = init_values * torch.ones(dim)
        self.gamma = nn.Parameter(initial)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if self.inplace:
            return x.mul_(self.gamma)
        return x * self.gamma
class Block(nn.Module):
    """Pre-norm transformer block: attention and MLP, each with a residual.

    Each branch is ``x + drop_path(layer_scale(sublayer(norm(x))))``. Layer
    scale is used only when ``init_values`` is truthy and stochastic depth
    only when ``drop_path > 0``; otherwise identities are inserted.
    """

    def __init__(
            self,
            dim: int,
            num_heads: int,
            mlp_ratio: float = 4.,
            qkv_bias: bool = False,
            qk_norm: bool = False,
            proj_drop: float = 0.,
            attn_drop: float = 0.,
            init_values: Optional[float] = None,
            drop_path: float = 0.,
            act_layer: nn.Module = nn.GELU,
            norm_layer: nn.Module = nn.LayerNorm,
            mlp_layer: nn.Module = Mlp,
            use_casual: bool = False,
            vae_out: bool = False,
    ) -> None:
        super().__init__()

        def make_scale():
            # layer scale is optional
            return LayerScale(dim, init_values=init_values) if init_values else nn.Identity()

        def make_drop_path():
            # stochastic depth is optional
            return DropPath(drop_path) if drop_path > 0. else nn.Identity()

        # attention branch
        self.norm1 = norm_layer(dim, eps=1e-06)
        attn_kwargs = dict(
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_norm=qk_norm,
            attn_drop=attn_drop,
            proj_drop=proj_drop,
            norm_layer=norm_layer,
            use_casual=use_casual,
        )
        self.attn = Attention(dim, **attn_kwargs)
        self.ls1 = make_scale()
        self.drop_path1 = make_drop_path()

        # feed-forward branch
        self.norm2 = norm_layer(dim, eps=1e-06)
        self.mlp = mlp_layer(
            in_features=dim,
            hidden_features=int(dim * mlp_ratio),
            act_layer=act_layer,
            drop=proj_drop,
            vae_out=vae_out,
        )
        self.ls2 = make_scale()
        self.drop_path2 = make_drop_path()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        attn_branch = self.attn(self.norm1(x))
        x = x + self.drop_path1(self.ls1(attn_branch))
        mlp_branch = self.mlp(self.norm2(x))
        x = x + self.drop_path2(self.ls2(mlp_branch))
        return x
def drop_path(x, drop_prob: float = 0., training: bool = False, scale_by_keep: bool = True):
    """Drop paths (stochastic depth) per sample.

    For each sample along dim 0 a Bernoulli keep/drop decision with keep
    probability ``1 - drop_prob`` is drawn and broadcast over the remaining
    dims. Kept samples are divided by the keep probability when
    ``scale_by_keep`` is set. The input is returned unchanged when
    ``drop_prob`` is 0 or ``training`` is false.
    """
    if not training or drop_prob == 0.:
        return x
    keep_prob = 1 - drop_prob
    # one mask entry per sample, broadcastable to any tensor rank
    mask_shape = (x.shape[0], *([1] * (x.ndim - 1)))
    keep_mask = x.new_empty(mask_shape).bernoulli_(keep_prob)
    if scale_by_keep and keep_prob > 0.0:
        keep_mask.div_(keep_prob)
    return x * keep_mask
class DropPath(nn.Module):
    """Module wrapper around ``drop_path`` (stochastic depth per sample).

    The drop is only active in training mode, because the module's
    ``training`` flag is forwarded to ``drop_path``.
    """

    def __init__(self, drop_prob: float = 0., scale_by_keep: bool = True):
        super().__init__()
        self.drop_prob = drop_prob
        self.scale_by_keep = scale_by_keep

    def forward(self, x):
        return drop_path(
            x,
            drop_prob=self.drop_prob,
            training=self.training,
            scale_by_keep=self.scale_by_keep,
        )

    def extra_repr(self):
        return f'drop_prob={round(self.drop_prob,3):0.3f}'
class PatchTSTKernelEmbeddingLocal(nn.Module):
    """Patch embedding built from raw values, polynomial and Fourier features.

    For every degree ``d`` in ``2 .. 1 + poly_degrees`` a random index table
    of shape (num_poly_feats, d) is drawn at construction; the polynomial
    features are products of the patch values at those indices. Random
    Fourier features are ``sin`` / ``cos`` of a linear map of the patch. The
    concatenation ``[x, poly..., sin, cos]`` is projected from ``d_feat`` to
    ``d_out`` without bias.

    NOTE(review): ``patch_indices`` is a plain Python list of tensors, so it
    is not a registered buffer - confirm how it is restored with checkpoints.
    """

    def __init__(self, poly_degrees=2, num_poly_feats=120, patch_length=16, rff_scale=1.0, num_rff=256, rff_trainable=False, d_feat=512, d_out=512):
        super().__init__()
        self.num_poly_feats = num_poly_feats
        index_tables = []
        for degree in range(2, 2 + poly_degrees):
            table = torch.randint(
                high=patch_length,
                size=(self.num_poly_feats, degree),
                requires_grad=False,
            )
            index_tables.append(table)
        self.patch_indices = index_tables

        half_rff = num_rff // 2
        self.freq_weights = nn.Parameter(
            rff_scale * torch.randn(patch_length, half_rff),
            requires_grad=rff_trainable,
        )
        self.freq_biases = nn.Parameter(
            torch.randn(1, 1, 1, half_rff),
            requires_grad=rff_trainable,
        )
        self.projection = nn.Linear(d_feat, d_out, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Parameters:
            x (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`, *required*):
                Patch input for embedding
        return:
            `torch.Tensor` of shape `(batch_size, num_channels, num_patches, d_model)`
        """
        pieces = [x]
        for table in self.patch_indices:
            # product of the patch values selected by each index row
            pieces.append(x[..., table].prod(dim=-1))
        phase = x @ self.freq_weights + self.freq_biases
        pieces.append(torch.sin(phase))
        pieces.append(torch.cos(phase))
        return self.projection(torch.cat(pieces, dim=-1))
+class SIGReg(torch.nn.Module):
+    """Sketch Isotropic Gaussian Regularizer (single-GPU!)"""
+    def __init__(self, knots=17, num_proj=1024):
+        super().__init__()
+        self.num_proj = num_proj
+        t = torch.linspace(0, 3, knots, dtype=torch.float32)
+        dt = 3 / (knots - 1)
+        weights = torch.full((knots,), 2 * dt, dtype=torch.float32)
+        weights[[0, -1]] = dt
+        window = torch.exp(-t.square() / 2.0)
+        self.register_buffer("t", t)
+        self.register_buffer("phi", window)
+        self.register_buffer("weights", weights * window)
+    def forward(self, proj):
+        """
+        proj: (T, B, D)
+        """
+        # sample random projections
+        A = torch.randn(proj.size(-1), self.num_proj, device=proj.device)
+        A = A.div_(A.norm(p=2, dim=0))
+        # compute the epps-pulley statistic
+        x_t = (proj @ A).unsqueeze(-1) * self.t
+        err = (x_t.cos().mean(-3) - self.phi).square() + x_t.sin().mean(-3).square()
+        statistic = (err @ self.weights) * proj.size(-2)
+        return statistic.mean() # average over projections and time

modeling_normwear.py ADDED Viewed

	@@ -0,0 +1,45 @@

+import torch
+from torch import nn
+from transformers import PreTrainedModel
+from .configuration_normwear import NormWear2Config
+from .normwear2 import NormWear2
+class NormWear2Model(PreTrainedModel):
+    """Hugging Face `PreTrainedModel` wrapper around the `NormWear2` network.
+
+    The wrapper builds `NormWear2` from a `NormWear2Config` and delegates
+    `forward`, `predict` and `simulate` to it without touching the arguments.
+    """
+    config_class = NormWear2Config
+    # matches the attribute name the wrapped network is stored under in __init__
+    base_model_prefix = "normwear"
+    def __init__(self, config: NormWear2Config):
+        """Build the wrapped network from `config`.
+
+        Only the fields listed below are forwarded; every other `NormWear2`
+        constructor argument (norm/MLP layer types, `prepend_cls`, `vae_out`,
+        the deprecated masking options) keeps its default.
+        """
+        super().__init__(config)
+        self.normwear = NormWear2(
+            patch_size=config.patch_size,
+            embed_dim=config.embed_dim, decoder_embed_dim=config.decoder_embed_dim,
+            depth=config.depth, decoder_depth=config.decoder_depth,
+            num_heads=config.num_heads,decoder_num_head=config.decoder_num_head,
+            mlp_ratio=config.mlp_ratio, drop_p=config.drop_p,
+            fuse_freq=config.fuse_freq, # channel attn every 2 block
+            # layer type
+            # absolute position embedding
+            max_in_length=config.max_in_length, # NOTE: actual is total seq_length // patch_size
+            trainable_pe=config.trainable_pe,
+            # mechanism wise config
+            token_level_fuse=config.token_level_fuse,
+            use_casual=config.use_casual,
+            use_cls=config.use_cls,
+            # jepa
+            jepa=config.jepa, jepa_post_decoder_train=config.jepa_post_decoder_train,
+        )
+        # transformers hook that runs after all sub-modules have been created
+        self.post_init()
+    def forward(self, *args, **kwargs):
+        """Delegate to `NormWear2.forward` (takes a `data_pack` dict)."""
+        return self.normwear(*args, **kwargs)
+    def predict(self, *args, **kwargs):
+        """Delegate to `NormWear2.predict` (autoregressive forecasting)."""
+        return self.normwear.predict(*args, **kwargs)
+    def simulate(self, *args, **kwargs):
+        """Delegate to `NormWear2.simulate` (fill in hidden channels)."""
+        return self.normwear.simulate(*args, **kwargs)

normwear2.py ADDED Viewed

	@@ -0,0 +1,706 @@

+# Copyright (c) School of Computing, Information, and Data Science, University of California San Diego.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+# --------------------------------------------------------
+# References:
+# timm: https://github.com/rwightman/pytorch-image-models/tree/master/timm
+# DeiT: https://github.com/facebookresearch/deit
+# --------------------------------------------------------
+# import torch
+# import torch.nn as nn
+# import torch.nn.functional as F
+import numpy as np
+from .layers import *
+class EncoderLayer(nn.Module):
+    """One encoder stage: per-variate attention plus optional cross-variate fusion.
+
+    Every layer runs `variate_encoder` over the token sequence of each variate.
+    Layers whose `curr_layer` index is a multiple of `fuse_frequency` also own a
+    `cls_fusion` block that exchanges information across variates, either through
+    a single [CLS] (or mean) token per variate or, with `token_level_fuse`,
+    through every token position.
+    """
+    def __init__(self,embed_dim = 768,
+                 norm_layer=nn.RMSNorm,
+                 mlp_layer=SwiGLU_Mlp,
+                 num_heads=12,
+                 mlp_ratio=4.0,
+                 qkv_bias=True,
+                 drop_p=0.0,
+                 fuse_frequency=2,
+                 curr_layer = 0,
+                 # fusion scheme
+                 no_fusion=False,
+                 mean_fuse=False,
+                 use_casual=False,
+                 prepend_cls=True,
+                 token_level_fuse=False, # True: follows Panda's idea, where every token acts as an information-exchange liaison instead of a single cls representative.
+                 vae_out=False,
+                 ):
+        # NOTE: `drop_p` is accepted but not referenced anywhere in this class.
+        super().__init__()
+        self.no_fusion = no_fusion
+        self.mean_fuse = mean_fuse
+        self.prepend_cls = prepend_cls
+        self.token_level_fuse = token_level_fuse
+        self.curr_layer = curr_layer
+        self.fuse_frequency = fuse_frequency
+        #self.self_attn = self_attn_model.transformer.blocks[curr_layer].eval()
+        # attention over the tokens of a single variate (optionally causal)
+        self.variate_encoder = Block(
+            mlp_layer=mlp_layer,
+            dim=embed_dim,
+            num_heads=num_heads,
+            mlp_ratio=mlp_ratio,
+            qkv_bias=qkv_bias,
+            norm_layer=norm_layer,
+            use_casual=use_casual,
+            vae_out=vae_out
+        )
+        # the fusion block only exists on layers that actually fuse;
+        # it is never causal and is built without `norm_layer` (Block default applies)
+        if self.curr_layer%self.fuse_frequency==0:
+            self.cls_fusion = Block(
+                mlp_layer=mlp_layer,
+                dim=embed_dim,
+                num_heads=num_heads,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                use_casual=False
+                # proj_drop=drop # comment out for low version on jetson nano
+            )
+    def forward(self,x, nvar=5):
+        '''
+        input: x: bs*n_vars x L+1 x E
+        nvar: number of variates folded into the first dimension of x
+        output: same shape as x
+        '''
+        _, N, E = x.shape
+        x_out = self.variate_encoder(x) # bs * nvars, L+1, E
+        # cross-variate fusion (only on fusing layers)
+        if self.curr_layer%self.fuse_frequency==0 and not self.no_fusion:
+            if not self.token_level_fuse: # [CLS] liaison fusion
+                x_out = torch.reshape(x_out, (-1,nvar, N, E))   # z: [bs x nvars x num_patch x E]
+                if self.prepend_cls:
+                    patch_tokens = x_out[:,:,1:,:] # if cls was prepended
+                else:
+                    patch_tokens = x_out[:,:,:-1,:] # if cls was appended
+                # fetch the token that represents each variate
+                if self.mean_fuse:
+                    cls = x_out.mean(dim=2)
+                else:
+                    if self.prepend_cls:
+                        cls = x_out[:,:,0,:] # bs x n_vars x E, if cls was prepended
+                    else:
+                        cls = x_out[:,:,-1,:] # bs x n_vars x E, if cls was appended
+                # attend across variates, then put the fused token back in place
+                cls = self.cls_fusion(cls).unsqueeze(2) # bs x n_vars x 1 x E
+                if self.prepend_cls:
+                    x_out = torch.cat((cls,patch_tokens),dim=2) # prepend cls
+                else:
+                    x_out = torch.cat((patch_tokens, cls),dim=2) # append cls
+                bs, n_vars, N, E = x_out.shape
+                x_out = torch.reshape(x_out,(bs*n_vars,N,E)) #bs * nvars, L+1, E
+            else: # token-level liaison fusion (following Panda's logic)
+                # x_out input shape:  bs * nvars, L+1, E
+                x_out = torch.reshape(x_out, (-1,nvar, N, E)) # z: [bs x nvars x num_patch x E]
+                x_out = x_out.permute(0, 2, 1, 3) # z: [bs x num_patch x nvars x E]
+                bs, N, n_vars, E = x_out.shape
+                x_out = torch.reshape(x_out, (x_out.shape[0]*N, n_vars, E)) # combine the 1st 2 dimensions, prepare for attn
+                # cross channel forward: the variates form the attention sequence
+                x_out = self.cls_fusion(x_out) # bs*num_patch, nvars, E
+                x_out = torch.reshape(x_out, (bs, N, n_vars, E)).permute(0, 2, 1, 3) # bs, nvars, num_patch, E
+                x_out = torch.reshape(x_out, (bs*n_vars, N, E)) # bs*nvars, num_patch, E
+        return x_out
+class NormWear2(nn.Module):
+    """ Masked Autoencoder
+    """
+    def __init__(self, patch_size=16,
+                 embed_dim=768, decoder_embed_dim=512,
+                 depth=4, decoder_depth=2,
+                 num_heads=12,decoder_num_head=8,
+                 mlp_ratio=4.0, drop_p=0.0,
+                 fuse_freq=2, # channel attn every 2 block
+                 # layer type
+                 norm_layer=nn.RMSNorm,
+                 mlp_layer=SwiGLU_Mlp,
+                 # absolute position embedding
+                 max_in_length=2048, # NOTE: actual is total seq_length // patch_size
+                 trainable_pe=True,
+                 # mechanism wise config
+                 token_level_fuse=False,
+                 use_casual=False,
+                 use_cls=True,
+                 # to be deprecated
+                 mask_prob=0.5, # 0.4, 0.5, deprecated after leverage dynamic mask ratio
+                 max_pred_length=64, # deprecated
+                 prepend_cls=True,
+                 vae_out=False,
+                 # jepa
+                 jepa=False, jepa_post_decoder_train=False,
+                 ):
+        """Build the patch embedding, encoder stack and reconstruction decoder.
+
+        Args:
+            patch_size: kernel size and stride of the patch-embedding Conv1d.
+            embed_dim / decoder_embed_dim: token width in encoder / decoder.
+            depth: number of `EncoderLayer` modules.
+            decoder_depth: number of decoder `Block` modules.
+            num_heads / decoder_num_head: attention heads in encoder / decoder.
+            mlp_ratio: passed to every encoder layer and decoder block.
+            drop_p: forwarded to the encoder layers.
+            fuse_freq: cross-variate fusion happens on layers whose index is a
+                multiple of this value.
+            max_in_length: `max_len` of both positional embeddings (in patches).
+            trainable_pe: whether the positional embeddings are trainable.
+            use_cls: add a learnable cls token (position set by `prepend_cls`).
+            mask_prob, max_pred_length: stored only; marked deprecated.
+            vae_out: forwarded to the last encoder layer only.
+            jepa: when True a `SIGReg` regularizer is created and `forward`
+                routes to `forward_jepa`.
+            jepa_post_decoder_train: selects the decoder-only branch of
+                `forward_jepa`.
+        """
+        super().__init__()
+        self.patch_size = patch_size
+        self.use_cls = use_cls
+        self.max_in_length = max_in_length
+        self.mask_prob = mask_prob # deprecated
+        self.prepend_cls = prepend_cls # deprecated
+        self.max_pred_length = max_pred_length # deprecated
+        self.jepa = jepa
+        self.jepa_post_decoder_train = jepa_post_decoder_train
+        if jepa:
+            self.SIGReg = SIGReg()
+        # --------------------------------------------------------------------------
+        # MAE encoder specifics
+        self.init_embed = nn.Sequential( # in bn*nvar, L
+            CheckShape(None, key=lambda x: x.unsqueeze(1)), #  bn*nvar, 1, L
+        )
+        self.patch_embed = nn.Sequential( # in: bn*nvar, init_embed_size=1, L
+            nn.Conv1d(in_channels=1,out_channels=embed_dim,kernel_size=patch_size,stride=patch_size), # bn*nvar, embed_dim, L//patch_size
+            CheckShape(None, key=lambda x: x.permute(0, 2, 1)) # bn*nvar, L//patch_size, embed_dim
+        )
+        if self.use_cls:
+            self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
+        # learnable token written over masked / hidden patches in forward_encoder
+        self.mask_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
+        # note: the encoder positional-embedding dropout is fixed at 0.1, independent of drop_p
+        self.pos_embed = tAPE(embed_dim, max_len=max_in_length, trainable=trainable_pe, dropout=0.1)
+        # first depth-1 layers, indexed 0 .. depth-2
+        self.encoder_blocks = [
+            EncoderLayer(embed_dim = embed_dim,
+                 norm_layer = norm_layer,
+                 mlp_layer = mlp_layer,
+                 num_heads=num_heads,
+                 mlp_ratio=mlp_ratio,
+                 drop_p=drop_p,
+                 fuse_frequency=fuse_freq,
+                 curr_layer = i,
+                 # fusion scheme
+                 no_fusion=False, # False
+                 mean_fuse=False, # False
+                 use_casual=use_casual,
+                 prepend_cls=prepend_cls,
+                 token_level_fuse=token_level_fuse
+                )
+            for i in range(depth-1)]
+        # add last encoder layer
+        # NOTE(review): the last layer is indexed `depth`, not `depth - 1`, so whether it
+        # gets a fusion block depends on `depth % fuse_freq`. Changing this would alter the
+        # set of parameters in existing checkpoints - confirm before touching.
+        self.encoder_blocks.append(
+            EncoderLayer(embed_dim = embed_dim,
+                 norm_layer = norm_layer,
+                 mlp_layer = mlp_layer,
+                 num_heads=num_heads,
+                 mlp_ratio=mlp_ratio,
+                 drop_p=drop_p,
+                 fuse_frequency=fuse_freq,
+                 curr_layer = depth,
+                 # fusion scheme
+                 no_fusion=False, # False
+                 mean_fuse=False, # False
+                 use_casual=use_casual,
+                 prepend_cls=prepend_cls,
+                 token_level_fuse=token_level_fuse,
+                 vae_out=vae_out
+                )
+        )
+        self.encoder_blocks = nn.ModuleList(self.encoder_blocks)
+        # --------------------------------------------------------------------------
+        # --------------------------------------------------------------------------
+        # MAE decoder specifics
+        self.decoder_embed = nn.Linear(embed_dim, decoder_embed_dim, bias=True)
+        # self.mask_token = nn.Parameter(torch.zeros(1, 1, decoder_embed_dim))
+        self.decoder_pos_embed = tAPE(decoder_embed_dim, max_len=max_in_length, trainable=trainable_pe)
+        self.decoder_blocks = nn.ModuleList([
+            Block(dim=decoder_embed_dim,num_heads=decoder_num_head,
+                mlp_ratio=mlp_ratio,norm_layer=norm_layer, use_casual=use_casual)
+            for i in range(decoder_depth)]) # bn*nvar, L//patch_size, decoder_embed_dim
+        # reshape layer after the linear map: flatten the patches back into one series
+        # and cut off the patch_size samples that belong to the cls token, if any
+        if self.use_cls:
+            if self.prepend_cls:
+                decoder_reshape_layer = CheckShape(None, key=lambda x: x.flatten(start_dim=1)[:, self.patch_size:]) # bn*nvar, L
+            else:
+                decoder_reshape_layer = CheckShape(None, key=lambda x: x.flatten(start_dim=1)[:, :-self.patch_size]) # bn*nvar, L
+        else:
+            decoder_reshape_layer = CheckShape(None, key=lambda x: x.flatten(start_dim=1)) # bn*nvar, L
+        # regular output (same kernel for all step)
+        self.decoder_out = nn.Sequential(
+            nn.Linear(decoder_embed_dim, decoder_embed_dim//2), # bn*nvar, L//patch_size
+            nn.GELU(),
+            nn.Linear(decoder_embed_dim//2, patch_size), # bn*nvar, L//patch_size, patch_size
+            decoder_reshape_layer, # bn*nvar, L
+            # deconvolution/smoothing
+            CheckShape(None, key=lambda x: x.unsqueeze(1)), # bn*nvar, 1, L
+            nn.Conv1d(1, decoder_embed_dim//2, self.patch_size, padding='same'),
+            nn.GELU(),
+            nn.Conv1d(decoder_embed_dim//2, 1, self.patch_size, padding='same'),
+            CheckShape(None, key=lambda x: x.squeeze(1)), # bn*nvar, L
+            # # linear out
+            # nn.Linear(decoder_embed_dim, patch_size),
+            # CheckShape(None, key=lambda x: x.flatten(start_dim=1)[:, self.patch_size:])
+        )
+    def forward_encoder(self, x, masking=True, context_length=None, kv_cache=None, all_visible_length=None, non_visible_channel=list()):
+        '''Input
+        X:bn, nvar, L
+        '''
+        # embed patches
+        bn, nvar, L = x.shape
+        x = self.init_embed(x.flatten(end_dim=-2)) # bn*nvar, 1, L
+        x = self.patch_embed(x) # bn*nvar, L//patch_size, embed_dim
+        # x = self.pos_embed(x) # bn*nvar, L//patch_size, embed_dim
+        ####### MASK PART START ########################################################
+        # masking:
+        if masking:
+            # mask_prob = self.mask_prob
+            mask_prob = np.random.uniform(low=0.3, high=0.7) # varied mask ratio
+        else:
+            mask_prob = 0
+        # randomly masked out the patches
+        masked_patches = torch.ones(x.shape[0], x.shape[1], self.patch_size).to(x.device) # init
+        # use_unstructured = np.random.rand() < 0.5 # interpolation or forecasting
+        for x_i in range(len(x)):
+            # if use_unstructured:
+            # random unstructured masking
+            mask_patches_idx = torch.randperm(x.shape[1]) # shuffle idx
+            ids_restore = mask_patches_idx[torch.rand(mask_patches_idx.shape) < mask_prob].flatten().sort().values # idxs to mask
+            # else:
+            #     # masking only the later part
+            #     mask_patches_idx = torch.arange(x.shape[1]) # regular idx
+            #     if mask_prob > 0:
+            #         start_idx = np.random.choice(np.arange(int(0.3*x.shape[1]), x.shape[1]-1))
+            #         ids_restore = mask_patches_idx[start_idx:].flatten().sort().values
+            #     else:
+            #         ids_restore = mask_patches_idx[torch.rand(mask_patches_idx.shape) < mask_prob].flatten().sort().values # idxs to mask
+            # x = x.float() # dtype adjust
+            # replace those token with mask token
+            x[x_i, ids_restore, :] = self.mask_token[0].expand(len(ids_restore), x.shape[2]).to(x.dtype)
+            masked_patches[x_i, ids_restore, :] *= 2 # scaling up the mask position (for loss)
+        # replace token after context_length as mask token
+        if context_length is not None:
+            end_patch_idx = context_length // self.patch_size
+            x[:, end_patch_idx:, :] = self.mask_token.expand(x.shape[0], x.shape[1]-end_patch_idx, x.shape[2]).to(x.dtype) # replace those with mask token
+        # replace specific channel part with mask token
+        if all_visible_length is not None:
+            end_patch_idx = all_visible_length // self.patch_size
+            x = x.reshape(bn, nvar, x.shape[1], x.shape[2]) # bn, nvar, L//patch_size, embed_dim
+            x[:, non_visible_channel, end_patch_idx:, :] = self.mask_token.unsqueeze(0).expand(x.shape[0], len(non_visible_channel), x.shape[2]-end_patch_idx, x.shape[3]) # replace those with mask token
+            x = x.reshape(bn*nvar, x.shape[2], x.shape[3]) # reshape back to # bn*nvar, L//patch_size, embed_dim
+        ####### MASK PART END ###############################################################
+        ##### add position embedding #######
+        x = self.pos_embed(x) # bn*nvar, L//patch_size, embed_dim, add pos-embed after masking
+        ##### append cls token #######
+        if self.use_cls:
+            cls_tokens = self.cls_token.expand(x.shape[0], -1, -1)
+            if self.prepend_cls:
+                x = torch.cat((cls_tokens, x), dim=1) # prepend cls token
+            else:
+                x = torch.cat((x, cls_tokens), dim=1) # append cls token
+        # apply Encoder blocks
+        for blk in self.encoder_blocks:
+            x = blk(x, nvar=nvar) # bn*nvar, L//patch_size + 1, embed_dim
+        return x, ids_restore, masked_patches
+    def forward_decoder(self, x, ids_restore, masked_patches, kv_cache=None):
+        # embed tokens
+        # x: # bn*nvar, L//patch_size+1, embed_dim
+        # add pos embed
+        x_ = self.decoder_pos_embed(self.decoder_embed(x)) # bn*nvar, L//patch_size, decoder_embed_dim
+        # decode
+        for blk in self.decoder_blocks:
+            x_ = blk(x_) # bn*nvar, L//patch_size, embed_dim
+        # predictor projection
+        x_ = self.decoder_out(x_) # bn*nvar, L
+        return x_
+    def forward_loss(self,target_tss, pred, masked_patches=None):
+        """
+        target_tss: bn, nvar, L
+        pred: bn, nvar, L
+        masked_patches: bn*nvar, L//patch_size, patch_size
+        """
+        # cosim_scores = self.cosim(target_tss,pred)
+        # loss = 1 - cosim_scores
+        # cos_loss = loss.mean()
+        loss_function = F.mse_loss
+        # loss_function = F.l1_loss
+        # compute loss
+        recon_loss = loss_function(pred, target_tss, reduction='none')
+        # scale up masked area
+        if masked_patches is not None:
+            masked_patches = masked_patches.flatten(start_dim=1) # bn*nvar, L
+            recon_loss = recon_loss*(masked_patches.reshape(recon_loss.shape))
+        # reduce
+        recon_loss = recon_loss.mean()
+        loss = recon_loss
+        return loss
+    def forward(self, data_pack, output_latent=False, masking=True):
+        '''Input
+        sample: bn, nvar, L
+        target_tss: bn, nvar, L
+        '''
+        # de-pack
+        # data_pack['sample'] = torch.sign(data_pack['sample'])*torch.log1p(torch.abs(data_pack['sample']))
+        imgs = data_pack['sample'] # bn, nvar, L
+        target_tss = data_pack['sample'] # bn, nvar, L
+        # if have noise
+        if data_pack.get('noise_sample') is not None:
+            imgs = data_pack['noise_sample']
+            # print("Check:", imgs.shape, target_tss.shape)
+            # exit()
+        ## ----------- JEPA forward ----------------------
+        if self.jepa: # forward function for jepa
+            return self.forward_jepa(imgs, target_tss, lambd=0.1)
+        ## ----------- Regular MAE forward ----------------------
+        # encoder forward
+        latent, ids_restore, masked_patches = self.forward_encoder(imgs, masking=masking)
+        # decoder forward
+        pred = self.forward_decoder(latent, ids_restore, masked_patches) # bs*nvar, L
+        pred = pred.reshape(target_tss.shape) # bs,nvar, L
+        # calculate loss
+        # loss = self.forward_loss(target_tss, pred, loss_mask=data_pack['awake_mask'], masked_patches=masked_patches, reduce=(not output_latent))
+        loss = self.forward_loss(target_tss, pred, masked_patches=masked_patches)
+        # intermediate return
+        if output_latent:
+            return latent, pred, masked_patches, loss
+        # return loss, pred, mask
+        return loss
+    def forward_jepa(self, in_context, target_context, lambd=0.1):
+        '''JEPA-style training loss.
+
+        Input
+        in_context: bn, nvar, L      (fed to the encoder in both passes)
+        target_context: bn, nvar, L  (target of the raw reconstruction loss)
+        lambd: weight of the SIGReg term inside the latent loss
+
+        Joint branch (jepa_post_decoder_train is False):
+            loss = 0.5 * (latent_mse + lambd * sigreg) + raw_reconstruction_mse
+        Decoder-only branch: the encoder runs unmasked under no_grad and only
+        the raw reconstruction loss is returned.
+        '''
+        if not self.jepa_post_decoder_train:
+            # encoder forward: one masked pass and one unmasked pass over the same input
+            masked_latent, ids_restore, masked_patches = self.forward_encoder(in_context, masking=True)
+            target_latent, _, _ = self.forward_encoder(in_context, masking=False)
+            # latent shape: # bn*nvar, L//patch_size + 1, embed_dim
+            # masked_patches: bs*nvar, L, patch_size
+            # reconstruction loss
+            recon_loss = F.mse_loss(masked_latent, target_latent, reduction='none') # bs_nvar, num_patches, embed_size
+            # scale up masked area
+            # NOTE(review): masked_patches has no cls position, so this product looks like it
+            # only broadcasts when use_cls is False - confirm.
+            if masked_patches is not None:
+                latent_masked_patches = masked_patches.mean(dim=-1)[:, :, None] # bs*nvar, L, 1
+                recon_loss = recon_loss*(latent_masked_patches)
+            # reduce
+            recon_loss = recon_loss.mean()
+            # step-wise sigreg (anti-collapse)
+            # NOTE: SIGReg take proj: (T, B, D) as input (= seq_length, batch_size, embed_dim)
+            sigreg_loss = self.SIGReg(masked_latent.permute(1, 0, 2)) # SIGReg already take mean
+            # aggregate loss
+            loss = recon_loss + (lambd * sigreg_loss)
+            # if integrate the decoder loss
+            # decoder forward
+            pred = self.forward_decoder(masked_latent, ids_restore, masked_patches) # bs*nvar, L
+            pred = pred.reshape(target_context.shape) # bs,nvar, L
+            raw_recon_loss = self.forward_loss(target_context, pred, masked_patches=masked_patches)
+            # latent objective is down-weighted by 0.5 relative to the raw reconstruction
+            loss = (0.5*loss) + raw_recon_loss
+            # # check
+            # print(loss)
+            # exit()
+            return loss
+        else: # for training the decoder only
+            # encoder forward (no gradient reaches the encoder from this branch)
+            with torch.no_grad():
+                masked_latent, ids_restore, masked_patches = self.forward_encoder(in_context, masking=False)
+            # decoder forward
+            pred = self.forward_decoder(masked_latent, ids_restore, masked_patches) # bs*nvar, L
+            pred = pred.reshape(target_context.shape) # bs,nvar, L
+            # regular loss (masked_patches is all ones here because masking=False)
+            # print("Reconstruct loss here!")
+            # exit()
+            return self.forward_loss(target_context, pred, masked_patches=masked_patches)
+    def predict(self, context_tensor, prediction_length, max_pred_length=None,
+                lookback_window=None, **kwargs):
+        """Forecast `prediction_length` steps by repeated calls to `generate`.
+
+        Args:
+            context_tensor: observed series, shape (1, L, nvar).
+            prediction_length: number of future steps to return.
+            max_pred_length: steps produced per `generate` call. When None it is
+                derived from `mean_centroid` of the context (helper defined
+                outside this chunk), clamped to [patch_size, 128].
+            lookback_window: forwarded to `generate`; limits how much of the
+                context each call sees.
+            **kwargs: ignored.
+
+        Returns:
+            Tensor of shape (1, prediction_length, nvar) on the original scale.
+        """
+        # context_tensor: 1, L, nvar
+        # output: 1, pred_length, nvar
+        # determine the auto-regressive steps
+        if max_pred_length is None:
+            max_pred_length = min(128, max(
+                self.patch_size,
+                mean_centroid(context_tensor[0].T, patch_size=self.patch_size), # this function take (nvar, L) as input
+            ))
+        # if lookback_window is None:
+        #     lookback_window = 4*max_pred_length
+        # determine the observed context length
+        max_observed_context_length = min(
+            context_tensor.shape[1],
+            int(2*(self.max_in_length * self.patch_size))
+        ) # does not really matter after using the average-or-interpolate mechanism
+        if context_tensor.shape[1] > max_observed_context_length:
+            context_tensor = context_tensor[:, -max_observed_context_length:, :]
+        # z-normalize context tensor (per channel, statistics over time)
+        loc = context_tensor.mean(dim=1, keepdims=True)
+        scale = context_tensor.std(dim=1, keepdims=True)
+        scale[scale == 0] = 1.0 # constant channels: avoid dividing by zero
+        scale += 1e-8
+        context_tensor = (context_tensor - loc) / scale
+        # recursively generate
+        # kv_cache is threaded through but comes back from generate unchanged
+        forecasted_tensor, kv_cache = self.generate(context_tensor, max_pred_length,
+                                                    kv_cache=None, lookback_window=lookback_window) # 1, Lf, nvar
+        all_forecast = forecasted_tensor
+        while all_forecast.shape[1] < prediction_length:
+            # concat forecasted part from previous round
+            context_tensor = torch.concatenate((context_tensor, forecasted_tensor), dim=1) # 1, L+Lf, nvar
+            # clip observed context (keep the most recent samples)
+            if context_tensor.shape[1] > max_observed_context_length:
+                context_tensor = context_tensor[:, -max_observed_context_length:, :]
+            # forecast
+            forecasted_tensor, kv_cache = self.generate(context_tensor, max_pred_length,
+                                                        kv_cache=kv_cache, lookback_window=lookback_window) # 1, Lf, nvar
+            # update all forecast
+            all_forecast = torch.concatenate((all_forecast, forecasted_tensor), dim=1)
+        # wrap up final output
+        all_forecast = all_forecast[:, :prediction_length, :] # clip
+        all_forecast = (all_forecast * scale) + loc # de-normalize back
+        return all_forecast
+    def generate(self, context_tensor, prediction_length, kv_cache=None,
+                 lookback_window=None,**kwargs):
+        """Produce one forecasting step of `prediction_length` samples.
+
+        The context is zero-padded to cover the forecast horizon, the padded
+        region is replaced by mask tokens inside the encoder (via
+        `context_length`), and the decoder output over that region is returned.
+        No normalization happens here; `predict` normalizes before calling.
+
+        Args:
+            context_tensor: shape (1, L, nvar).
+            prediction_length: number of samples to return.
+            kv_cache: passed through to encoder/decoder and returned unchanged.
+            lookback_window: if set, only the last `lookback_window` samples of
+                the context are used.
+            **kwargs: ignored.
+
+        Returns:
+            (pred_out, kv_cache) with pred_out of shape (1, prediction_length, nvar).
+        """
+        # context_tensor: 1, L, nvar
+        # output: 1, pred_length, nvar
+        # # z-normalize context tensor
+        # loc = context_tensor.mean(dim=1, keepdims=True)
+        # scale = context_tensor.std(dim=1, keepdims=True)
+        # scale[scale == 0] = 1.0
+        # scale += 1e-8
+        # context_tensor = (context_tensor - loc) / scale
+        # reshape
+        context_tensor = context_tensor.permute(0, 2, 1) # 1, nvar, L
+        if lookback_window is not None:
+            lookback_window = min(lookback_window, context_tensor.shape[2])
+            context_tensor = context_tensor[:, :, -lookback_window:]
+        # pad context tensor
+        bn, nvar, context_length = context_tensor.shape
+        total_len = context_length+prediction_length
+        # NOTE(review): this always adds between 1 and patch_size samples, i.e. a whole
+        # extra patch when total_len is already a multiple of patch_size - confirm intended.
+        total_len = total_len + (self.patch_size-(total_len%self.patch_size)) # need to be multiple of patch_size=16
+        pad_context_tensor = torch.zeros(bn, nvar, total_len).to(context_tensor.device)
+        pad_context_tensor[:, :, :context_length] = context_tensor
+        with torch.no_grad():
+            # forward
+            enc_out, ids_restore, masked_patches = self.forward_encoder(pad_context_tensor, masking=False, context_length=context_length, kv_cache=kv_cache)
+            # enc_out shape:  bn*nvar, L//patch_size + 1, embed_dim
+            dec_out = self.forward_decoder(enc_out, ids_restore, masked_patches, kv_cache=kv_cache) # bn*nvar, L
+            # wrap-up predicted out: keep only the samples right after the context
+            bn_nvar, total_L = dec_out.shape
+            pred_out = dec_out.reshape(bn, nvar, total_L)[:, :, context_length:context_length+prediction_length]
+            pred_out = pred_out.permute(0, 2, 1) # bn, L, nvar
+            # de-normalize
+            # pred_out = (pred_out * scale) + loc
+        return pred_out.detach(), kv_cache # 1, L, nvar
+    def simulate(self, context_tensor, all_visible_length=512,
+                 non_visible_channel=list(), ar_step=None, **kwargs):
+        # context_tensor: 1, L, nvar
+        # all_visible_length: length where all channel are observed
+        # non_visible_channel: [ch0. ch1, ...]
+        # output, 1, L, nvar
+        # mask
+        context_tensor[:, all_visible_length:, non_visible_channel] = 0
+        # adjust shape for successive operations
+        context_tensor = context_tensor.permute(0, 2, 1) # 1, nvar, L
+        # determine the optimal auto-regressive step size
+        if ar_step is None:
+            # ar_step = self.patch_size
+            ar_step = min(128, max(
+                self.patch_size,
+                mean_centroid(context_tensor[0, non_visible_channel, :all_visible_length], patch_size=self.patch_size), # this function take (nvar, L) as input
+            ))
+        print(f"{ar_step=}")
+        # normalize
+        loc = context_tensor.mean(dim=2, keepdims=True)
+        scale = context_tensor.std(dim=2, keepdims=True)
+        scale[scale == 0] = 1.0
+        scale += 1e-8
+        # calculate loc and scale for non visible channel separately
+        loc[:, non_visible_channel, :all_visible_length] = context_tensor[:, non_visible_channel, :all_visible_length].mean(dim=2, keepdims=True)
+        scale[:, non_visible_channel, :all_visible_length] = context_tensor[:, non_visible_channel, :all_visible_length].std(dim=2, keepdims=True)
+        # normalize
+        context_tensor = (context_tensor - loc) / scale
+        # make sure nonvisible part stay 0
+        context_tensor[:, non_visible_channel, all_visible_length:] = 0
+        # pad context tensor
+        bn, nvar, context_length = context_tensor.shape
+        total_len = context_length
+        total_len = total_len + (self.patch_size-(total_len%self.patch_size)) # need to be multiple of patch_size=16
+        pad_context_tensor = torch.zeros(bn, nvar, total_len).to(context_tensor.device)
+        pad_context_tensor[:, :, :context_length] = context_tensor
+        # auto-regressive simulate
+        with torch.no_grad():
+            for end_idx in range(all_visible_length+ar_step, context_length+1, ar_step):
+                # forward
+                enc_out, ids_restore, masked_patches = self.forward_encoder(pad_context_tensor[:, :, :end_idx],
+                                                                            masking=False, all_visible_length=end_idx-ar_step,
+                                                                            non_visible_channel=non_visible_channel)
+                # enc_out shape:  bn*nvar, L//patch_size + 1, embed_dim
+                dec_out = self.forward_decoder(enc_out, ids_restore, masked_patches).reshape(bn, nvar, end_idx) # bn*nvar, L (end_idx)
+                # update the stored global tensor
+                curr_max_possible_length = min(end_idx, pad_context_tensor.shape[-1])
+                pad_context_tensor[:, :, all_visible_length:curr_max_possible_length] = dec_out[:, :, all_visible_length:curr_max_possible_length]
+            pred_out = (pad_context_tensor * scale) + loc # bn, nvar, L
+        # # direct simulate
+        # with torch.no_grad():
+        #     # forward
+        #     enc_out, ids_restore, masked_patches = self.forward_encoder(pad_context_tensor, masking=False, context_length=context_length, all_visible_length=all_visible_length, non_visible_channel=non_visible_channel)
+        #     # enc_out shape:  bn*nvar, L//patch_size + 1, embed_dim
+        #     dec_out = self.forward_decoder(enc_out, ids_restore, masked_patches) # bn*nvar, L
+        #     bn_nvar, total_L = dec_out.shape
+        #     # predicted out
+        #     pred_out = dec_out.reshape(bn, nvar, total_L)[:, :, :context_length]
+        #     # de-normalize back
+        #     pred_out = (pred_out * scale) + loc
+        return pred_out.detach().permute(0, 2, 1) # bn, L, nvar
+    def get_embedding(self, sample_data, criteria='mean'): # default: criteria='mean'
+        # sample_data: (nvar, L) or (bn, nvar, L)
+        if len(sample_data.shape) == 2:
+            sample_data = sample_data.unsqueeze(0).float()
+        bn, nvar, L = sample_data.shape
+        # forward
+        out, _, _ = self.forward_encoder(sample_data, masking=False) # bn*nvar, P, E
+        bn_nvar, P, E = out.shape
+        out = out.reshape(bn, nvar, P, E)
+        # aggregate
+        if criteria == 'mean':
+            out = out.mean(dim=2) # bn, nvar, E
+        elif criteria == 'last':
+            out = out[:, :, -1, :] # bn, nvar, E
+        else:
+            raise ValueError("Unsupported aggregation criteria:", criteria)
+        return out.flatten(start_dim=1) # bn, nvar*E
+if __name__ == '__main__':
+    # python3 -m normwear_on_chaotic.normwear_opt
+    model = NormWear2(
+        patch_size=16,
+        depth=12,
+        mask_prob=0.0,
+        max_in_length=4096, # 2048 for all ckpts before
+        use_casual=True, # False for all ckpts before
+        prepend_cls=True,
+        token_level_fuse=True,
+    )
+    # construct random data of shape bn, L, nvar
+    # test_x = torch.rand(2, 32, 3)
+    test_x = torch.rand(2, 64, 3)
+    out_y = model.predict(test_x, 32, max_pred_length=16)
+    # verbose
+    print("Output shape:", out_y.shape)

pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8ef2770ccdca3f9c27ba4cc3220501620eeaa4b765e234dcad2882d7530924c9
+size 748287646

utils.py ADDED Viewed

	@@ -0,0 +1,26 @@

+import torch
+def mean_centroid(x, sr=1.0, patch_size=16):
+    # x: nvar, L
+    f = torch.fft.rfft(x, dim=-1).abs()
+    freqs = torch.fft.rfftfreq(x.size(-1), 1/sr).to(x.device)
+    return int(((1 / (((f * freqs).sum(-1) / f.sum(-1)).mean())) // patch_size) * patch_size)
+def generate_reservoir_matrix(n, sparsity=0.05, spectral_radius=0.9, seed=None):
+    if seed is not None:
+        torch.manual_seed(seed)
+    # Step 1: Random matrix with values in [-1, 1]
+    W = torch.rand(n, n) * 2 - 1
+    # Step 2: Apply sparsity mask
+    mask = (torch.rand(n, n) < sparsity).float()
+    W *= mask
+    # Step 3: Normalize to desired spectral radius
+    eigenvalues = torch.linalg.eigvals(W).abs()
+    max_eigenvalue = torch.max(eigenvalues)
+    if max_eigenvalue > 0:
+        W *= spectral_radius / max_eigenvalue
+    return W