"""sentence-transformers integration for luxical embedders.

Wraps a luxical ``Embedder`` as a sentence-transformers input module and
provides helpers that flatten an ``Embedder`` into a tensor state dict (and
back) so it can be saved and loaded through the standard sentence-transformers
machinery.
"""

import os
from collections import OrderedDict
from typing import Any

import numpy as np
import torch
from safetensors.torch import save_file as save_safetensors
from sentence_transformers.models import InputModule

from luxical.embedder import Embedder, _pack_int_dict, _unpack_int_dict
from luxical.sparse_to_dense_neural_nets import SparseToDenseEmbedder
from luxical.tokenization import ArrowTokenizer


class Transformer(InputModule):
    """sentence-transformers input module backed by a luxical ``Embedder``."""

    config_keys: list[str] = []

    def __init__(self, embedder: Embedder, **kwargs):
        super().__init__()
        self.embedder = embedder

    def tokenize(self, texts: list[str], **kwargs) -> dict[str, torch.Tensor | Any]:
        return {"inputs": self.embedder.tokenize(texts)}

    def forward(
        self, features: dict[str, torch.Tensor], **kwargs
    ) -> dict[str, torch.Tensor]:
        # Tokens -> bag-of-words counts -> TF-IDF reweighting -> dense vectors.
        tokenized_docs = features["inputs"]
        bow = self.embedder.bow_from_tokens(tokenized_docs)
        tfidf = self.embedder.tfidf_from_bow(bow)
        embeddings = self.embedder.bow_to_dense_embedder(tfidf)
        features["sentence_embedding"] = torch.from_numpy(embeddings)
        return features

    def get_sentence_embedding_dimension(self) -> int:
        return self.embedder.embedding_dim

    @classmethod
    def load(
        cls,
        model_name_or_path: str,
        subfolder: str = "",
        token: bool | str | None = None,
        cache_folder: str | None = None,
        revision: str | None = None,
        local_files_only: bool = False,
        **kwargs,
    ) -> "Transformer":
        state_dict = cls.load_torch_weights(
            model_name_or_path,
            subfolder=subfolder,
            token=token,
            cache_folder=cache_folder,
            revision=revision,
            local_files_only=local_files_only,
        )
        embedder = _embedder_from_state_dict(state_dict)
        return cls(embedder=embedder, **kwargs)

    def save(
        self, output_path: str, *args, safe_serialization: bool = True, **kwargs
    ) -> None:
        state_dict = _embedder_to_state_dict(self.embedder)
        if safe_serialization:
            save_safetensors(
                state_dict, os.path.join(output_path, "model.safetensors")
            )
        else:
            torch.save(state_dict, os.path.join(output_path, "pytorch_model.bin"))
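

# Usage sketch (hypothetical; assumes a trained luxical ``Embedder`` instance
# named ``embedder``):
#
#     from sentence_transformers import SentenceTransformer
#
#     model = SentenceTransformer(modules=[Transformer(embedder)])
#     vectors = model.encode(["an example sentence"])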


def _embedder_from_state_dict(state_dict: OrderedDict[str, torch.Tensor]) -> Embedder:
    """Reconstruct an ``Embedder`` from the flat state dict written by
    ``_embedder_to_state_dict``."""
    version = int(state_dict["embedder.version"][0].item())
    if version != 1:
        raise NotImplementedError(f"Unsupported embedder version: {version}")
    # The tokenizer is stored as a uint8 tensor holding its serialized
    # UTF-8 string form.
    tok_bytes = (
        state_dict["embedder.tokenizer"]
        .cpu()
        .numpy()
        .astype(np.uint8, copy=False)
        .tobytes()
    )
    tokenizer = ArrowTokenizer(tok_bytes.decode("utf-8"))
    recognized_ngrams = (
        state_dict["embedder.recognized_ngrams"]
        .cpu()
        .numpy()
        .astype(np.int64, copy=False)
    )
    # Rebuild the ngram-hash -> ngram-index lookup from its key/value arrays.
    keys = state_dict["embedder.ngram_keys"].cpu().numpy().astype(np.int64, copy=False)
    vals = state_dict["embedder.ngram_vals"].cpu().numpy().astype(np.int64, copy=False)
    ngram_map = _pack_int_dict(keys, vals)
    idf_values = (
        state_dict["embedder.idf_values"].cpu().numpy().astype(np.float32, copy=False)
    )
    num_layers = int(state_dict["embedder.num_layers"][0].item())
    layers = [
        state_dict[f"embedder.nn_layer_{i}"]
        .cpu()
        .numpy()
        .astype(np.float32, copy=False)
        for i in range(num_layers)
    ]
    s2d = SparseToDenseEmbedder(layers=layers)
    return Embedder(
        tokenizer=tokenizer,
        recognized_ngrams=recognized_ngrams,
        ngram_hash_to_ngram_idx=ngram_map,
        idf_values=idf_values,
        bow_to_dense_embedder=s2d,
    )
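

# Loading sketch outside sentence-transformers (hypothetical path; assumes a
# ``model.safetensors`` file written by ``Transformer.save``):
#
#     from safetensors.torch import load_file
#
#     state_dict = load_file("model.safetensors")
#     embedder = _embedder_from_state_dict(state_dict)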


def _embedder_to_state_dict(embedder: Embedder) -> OrderedDict[str, torch.Tensor]:
    """Flatten an ``Embedder`` into a state dict of tensors that safetensors
    and ``torch.save`` can both serialize."""
    sd: OrderedDict[str, torch.Tensor] = OrderedDict()

    # Format version, bumped whenever the layout of this dict changes.
    sd["embedder.version"] = torch.tensor([1], dtype=torch.long)

    # Store the tokenizer's serialized string form as a uint8 byte tensor.
    tok_bytes = np.frombuffer(
        embedder.tokenizer.to_str().encode("utf-8"), dtype=np.uint8
    )
    sd["embedder.tokenizer"] = torch.from_numpy(tok_bytes.copy())

    sd["embedder.recognized_ngrams"] = torch.from_numpy(
        embedder.recognized_ngrams.astype(np.int64, copy=False)
    )

    # Split the ngram-hash -> ngram-index lookup into parallel key/value arrays.
    keys, vals = _unpack_int_dict(embedder.ngram_hash_to_ngram_idx)
    sd["embedder.ngram_keys"] = torch.from_numpy(keys.astype(np.int64, copy=False))
    sd["embedder.ngram_vals"] = torch.from_numpy(vals.astype(np.int64, copy=False))

    sd["embedder.idf_values"] = torch.from_numpy(
        embedder.idf_values.astype(np.float32, copy=False)
    )

    # One tensor per dense layer, plus the layer count for reconstruction.
    layers = embedder.bow_to_dense_embedder.layers
    sd["embedder.num_layers"] = torch.tensor([len(layers)], dtype=torch.long)
    for i, layer in enumerate(layers):
        sd[f"embedder.nn_layer_{i}"] = torch.from_numpy(
            layer.astype(np.float32, copy=False)
        )
    return sd
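

# Round-trip sketch (hypothetical ``embedder`` instance): the two helpers above
# are inverses, so flattening and reconstructing should yield an equivalent
# embedder.
#
#     sd = _embedder_to_state_dict(embedder)
#     restored = _embedder_from_state_dict(sd)
#     assert restored.embedding_dim == embedder.embedding_dim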