import numpy as np
import torch
from tokenizer.my_tokenizers import SMILES_SPE_Tokenizer
from transformers import AutoModelForMaskedLM
from scoring.functions.binding import BindingAffinity
from scoring.functions.permeability import Permeability
from scoring.functions.solubility import Solubility
from scoring.functions.hemolysis import Hemolysis
from scoring.functions.nonfouling import Nonfouling

# Placeholder root directory; the tokenizer files are looked up under
# f'{base_path}/tr2d2-pep/tokenizer/' and the same value is handed to every scorer.
base_path = 'To Be Added'


def resolve_device(requested):
    """Turn a user-supplied device request into a usable ``torch.device``.

    Args:
        requested: ``None``, the string ``"auto"`` (case-insensitive), or
            anything accepted by ``torch.device(...)``.

    Returns:
        * ``None`` / ``"auto"``: ``cuda:0`` when CUDA is available with at
          least one device, otherwise ``cpu``.
        * A request that ``torch.device`` rejects: ``cpu``.
        * A non-CUDA device: returned as parsed.
        * A CUDA device when CUDA is unavailable: ``cpu``.
        * A CUDA device without an index, or with an out-of-range index:
          ``cuda:0``.
        * Otherwise ``cuda:<index>``.
    """
    if requested is None or str(requested).lower() == "auto":
        if torch.cuda.is_available() and torch.cuda.device_count() > 0:
            return torch.device("cuda:0")
        return torch.device("cpu")

    try:
        device = torch.device(requested)
    except Exception:
        # Deliberate best-effort: an unparsable request silently falls back
        # to CPU instead of aborting.
        return torch.device("cpu")

    if device.type != "cuda":
        return device

    if not torch.cuda.is_available() or torch.cuda.device_count() == 0:
        return torch.device("cpu")

    # A bare "cuda" request carries no index; treat it as device 0.
    index = device.index if device.index is not None else 0
    if index < 0 or index >= torch.cuda.device_count():
        return torch.device("cuda:0")
    return torch.device(f"cuda:{index}")


class ScoringFunctions:
    """Evaluate a configurable set of scoring functions on input sequences."""

    def __init__(self, score_func_names=None, prot_seqs=None, device=None):
        """
        Build the embedding model, tokenizer and every scorer.

        Args:
            score_func_names: list of scoring function names to be evaluated,
                in output-column order. Recognised keys are
                'binding_affinity1', 'binding_affinity2', 'permeability',
                'nonfouling', 'solubility' and 'hemolysis'. ``None`` means no
                scoring functions (empty list).
            prot_seqs: list of target protein sequences used by the
                binding-affinity scorers. ``None`` means an empty list.
            device: device request forwarded to ``resolve_device``.

        Binding-affinity scorers are only built in two configurations:
        'binding_affinity1' requested with exactly one protein, or both
        'binding_affinity1' and 'binding_affinity2' requested with exactly
        two proteins. In every other case both entries are ``None``.
        The permeability, solubility, nonfouling and hemolysis scorers are
        always constructed, whether or not they were requested.
        """
        device = resolve_device(device)

        emb_model = AutoModelForMaskedLM.from_pretrained(
            'aaronfeller/PeptideCLM-23M-all'
        ).roformer.to(device).eval()
        tokenizer = SMILES_SPE_Tokenizer(
            f'{base_path}/tr2d2-pep/tokenizer/new_vocab.txt',
            f'{base_path}/tr2d2-pep/tokenizer/new_splits.txt',
        )

        prot_seqs = prot_seqs if prot_seqs is not None else []

        # With no names given, forward() evaluates nothing.
        self.score_func_names = [] if score_func_names is None else score_func_names
        self.target_protein = prot_seqs

        # Membership is tested on the normalised list: testing the raw
        # argument raised TypeError when score_func_names was None.
        wants_first = 'binding_affinity1' in self.score_func_names
        wants_second = 'binding_affinity2' in self.score_func_names

        binding_affinity1 = None
        binding_affinity2 = None
        if wants_first and len(prot_seqs) == 1:
            binding_affinity1 = BindingAffinity(
                prot_seqs[0], tokenizer=tokenizer, base_path=base_path, device=device
            )
        elif wants_first and wants_second and len(prot_seqs) == 2:
            binding_affinity1 = BindingAffinity(
                prot_seqs[0], tokenizer=tokenizer, base_path=base_path, device=device
            )
            binding_affinity2 = BindingAffinity(
                prot_seqs[1], tokenizer=tokenizer, base_path=base_path, device=device
            )

        # The property scorers all share the same embedding model instance.
        permeability = Permeability(
            tokenizer=tokenizer, base_path=base_path, device=device, emb_model=emb_model
        )
        sol = Solubility(
            tokenizer=tokenizer, base_path=base_path, device=device, emb_model=emb_model
        )
        nonfouling = Nonfouling(
            tokenizer=tokenizer, base_path=base_path, device=device, emb_model=emb_model
        )
        hemo = Hemolysis(
            tokenizer=tokenizer, base_path=base_path, device=device, emb_model=emb_model
        )

        self.all_funcs = {
            'binding_affinity1': binding_affinity1,
            'binding_affinity2': binding_affinity2,
            'permeability': permeability,
            'nonfouling': nonfouling,
            'solubility': sol,
            'hemolysis': hemo,
        }

    def forward(self, input_seqs):
        """
        Run every configured scoring function on ``input_seqs``.

        Args:
            input_seqs: sequences passed unchanged to each scorer as the
                ``input_seqs`` keyword argument.

        Returns:
            float32 numpy array: the per-function results stacked in
            ``score_func_names`` order and then transposed. Intended shape is
            (num_sequences, num_functions); this assumes each scorer returns
            one value per sequence (TODO confirm against the scorers).

        Raises:
            KeyError: if a configured name is not a key of ``all_funcs``.
            TypeError: if a configured binding-affinity scorer was not built
                (its ``all_funcs`` entry is ``None``).
        """
        scores = [
            self.all_funcs[score_func](input_seqs=input_seqs)
            for score_func in self.score_func_names
        ]

        # convert to numpy arrays with shape (num_sequences, num_functions)
        return np.float32(scores).T

    def __call__(self, input_seqs: list):
        """Alias for ``forward`` so the instance can be called directly."""
        return self.forward(input_seqs)