File size: 4,457 Bytes
ee6da62
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import numpy as np
import torch
from tokenizer.my_tokenizers import SMILES_SPE_Tokenizer
from transformers import AutoModelForMaskedLM
from scoring.functions.binding import BindingAffinity
from scoring.functions.permeability import Permeability
from scoring.functions.solubility import Solubility
from scoring.functions.hemolysis import Hemolysis
from scoring.functions.nonfouling import Nonfouling

# Root directory used to locate project assets: the tokenizer vocabulary and
# splits are read from `{base_path}/tr2d2-pep/tokenizer/`, and the same value is
# handed to every scoring function as its `base_path` argument.
# NOTE(review): the current value looks like a placeholder — presumably it must
# be replaced with a real filesystem path before this module is usable; confirm.
base_path = 'To Be Added'

def resolve_device(requested):
    """Turn a device request into a `torch.device` that is usable on this host.

    Args:
        requested: `None`, the string "auto" (any letter case), or anything
            accepted by the `torch.device` constructor.

    Returns:
        * `None` / "auto": `cuda:0` when CUDA is available with at least one
          device, otherwise `cpu`.
        * A request `torch.device` cannot parse: `cpu`.
        * A non-CUDA device: that device, unchanged.
        * A CUDA device: `cpu` when CUDA is unavailable; otherwise the requested
          ordinal (0 when none was given), or `cuda:0` if the ordinal is
          outside the range of visible devices.
    """

    def cuda_ready():
        # Evaluated lazily so CUDA is only queried on the paths that need it.
        return torch.cuda.is_available() and torch.cuda.device_count() > 0

    wants_auto = requested is None or str(requested).lower() == "auto"
    if wants_auto:
        return torch.device("cuda:0" if cuda_ready() else "cpu")

    try:
        parsed = torch.device(requested)
    except Exception:
        # Best-effort: an unparseable request degrades to CPU instead of failing.
        return torch.device("cpu")

    if parsed.type != "cuda":
        return parsed
    if not cuda_ready():
        return torch.device("cpu")

    # A bare "cuda" request carries no ordinal; treat it as device 0.
    ordinal = 0 if parsed.index is None else parsed.index
    in_range = 0 <= ordinal < torch.cuda.device_count()
    return torch.device(f"cuda:{ordinal}" if in_range else "cuda:0")

class ScoringFunctions:
    """Evaluate a chosen set of scoring functions on a batch of sequences.

    All scorers are built once in `__init__` and stored in `self.all_funcs`
    under the keys 'binding_affinity1', 'binding_affinity2', 'permeability',
    'nonfouling', 'solubility' and 'hemolysis'. Calling the instance runs the
    scorers named in `self.score_func_names`, in that order.
    """

    def __init__(self, score_func_names=None, prot_seqs=None, device=None):
        """
        Class for generating score vectors given generated sequence

        Args:
            score_func_names: list of scoring function names to be evaluated;
                each must be a key of `self.all_funcs`. `None` means no scoring
                functions (equivalent to an empty list).
            prot_seqs: list of target protein sequences used by the binding
                affinity scorers. `None` is treated as an empty list.
            device: device request passed through `resolve_device` (e.g.
                `None`, "auto", "cpu", "cuda:1").

        Binding affinity scorers are only built in two configurations:
            * 'binding_affinity1' requested with exactly one protein sequence;
            * 'binding_affinity1' and 'binding_affinity2' both requested with
              exactly two protein sequences.
        In every other case both entries are left as `None`.
        """
        device = resolve_device(device)
        emb_model = AutoModelForMaskedLM.from_pretrained(
            'aaronfeller/PeptideCLM-23M-all'
        ).roformer.to(device).eval()
        tokenizer = SMILES_SPE_Tokenizer(
            f'{base_path}/tr2d2-pep/tokenizer/new_vocab.txt',
            f'{base_path}/tr2d2-pep/tokenizer/new_splits.txt',
        )
        prot_seqs = prot_seqs if prot_seqs is not None else []

        # Normalise once and use the normalised value everywhere below; testing
        # membership on a raw `None` argument would raise TypeError.
        self.score_func_names = [] if score_func_names is None else score_func_names

        # binding affinities
        self.target_protein = prot_seqs

        binding_affinity1 = None
        binding_affinity2 = None
        if ('binding_affinity1' in self.score_func_names) and (len(prot_seqs) == 1):
            binding_affinity1 = BindingAffinity(
                prot_seqs[0], tokenizer=tokenizer, base_path=base_path, device=device
            )
        elif (
            ('binding_affinity1' in self.score_func_names)
            and ('binding_affinity2' in self.score_func_names)
            and (len(prot_seqs) == 2)
        ):
            binding_affinity1 = BindingAffinity(
                prot_seqs[0], tokenizer=tokenizer, base_path=base_path, device=device
            )
            binding_affinity2 = BindingAffinity(
                prot_seqs[1], tokenizer=tokenizer, base_path=base_path, device=device
            )

        # The property scorers share the same embedding model instance.
        permeability = Permeability(tokenizer=tokenizer, base_path=base_path, device=device, emb_model=emb_model)
        sol = Solubility(tokenizer=tokenizer, base_path=base_path, device=device, emb_model=emb_model)
        nonfouling = Nonfouling(tokenizer=tokenizer, base_path=base_path, device=device, emb_model=emb_model)
        hemo = Hemolysis(tokenizer=tokenizer, base_path=base_path, device=device, emb_model=emb_model)

        self.all_funcs = {
            'binding_affinity1': binding_affinity1,
            'binding_affinity2': binding_affinity2,
            'permeability': permeability,
            'nonfouling': nonfouling,
            'solubility': sol,
            'hemolysis': hemo,
        }

    def forward(self, input_seqs):
        """Run every selected scoring function on `input_seqs`.

        Args:
            input_seqs: batch of sequences, forwarded unchanged to each scorer
                as the `input_seqs` keyword argument.

        Returns:
            float32 numpy array holding the transposed stack of per-function
            results; when each scorer returns one value per sequence this has
            shape (num_sequences, num_functions). With no selected functions
            the result is an empty array.

        Raises:
            KeyError: a selected name is not a key of `self.all_funcs`.
            TypeError: a selected scorer was not built (its entry is `None`).
        """
        scores = []
        for name in self.score_func_names:
            scorer = self.all_funcs[name]
            if scorer is None:
                # Same exception type as calling `None` would raise, but with a
                # message that points at the actual configuration problem.
                raise TypeError(
                    f"scoring function '{name}' was requested but not "
                    "initialised; check that prot_seqs matches the requested "
                    "binding affinity functions"
                )
            scores.append(scorer(input_seqs=input_seqs))

        # convert to numpy arrays with shape (num_sequences, num_functions)
        return np.float32(scores).T

    def __call__(self, input_seqs: list):
        """Alias for `forward` so the instance can be called directly."""
        return self.forward(input_seqs)