Update README.md
Browse files
README.md
CHANGED
|
@@ -81,7 +81,7 @@ def load_vesm(model_name="VESM_3B", local_dir="vesm", device='cuda'):
|
|
| 81 |
```py
|
| 82 |
# scoring functions
|
| 83 |
import torch.nn.functional as F
|
| 84 |
-
#
|
| 85 |
def get_llrs(sequence_logits, input_ids):
|
| 86 |
token_probs = torch.log_softmax(sequence_logits, dim=-1)
|
| 87 |
wt_positions = F.one_hot(input_ids, num_classes=token_probs.shape[-1])
|
|
@@ -91,23 +91,23 @@ def get_llrs(sequence_logits, input_ids):
|
|
| 91 |
llrs = token_probs - wt_probs.expand(token_probs.shape)
|
| 92 |
return llrs
|
| 93 |
|
| 94 |
-
# compute
|
| 95 |
-
def
|
| 96 |
-
|
| 97 |
-
for mut in
|
| 98 |
_, idx, mt = mut[0], int(mut[1:-1]), mut[-1]
|
| 99 |
pred = llrs[idx, sequence_vocabs[mt]]
|
| 100 |
-
|
| 101 |
-
return
|
| 102 |
```
|
| 103 |
|
| 104 |
#### Sequence-only Models
|
| 105 |
|
| 106 |
-
Here, we provide sample scripts to compute
|
| 107 |
```py
|
| 108 |
-
# sequence and
|
| 109 |
sequence = "MVNSTHRGMHTSLHLWNRSSYRLHSNASESLGKGYSDGGCYEQLFVSPEVFVTLGVISLLENILV"
|
| 110 |
-
|
| 111 |
```
|
| 112 |
|
| 113 |
```py
|
|
@@ -123,21 +123,18 @@ def inference(model, tokenizer, sequence, device):
|
|
| 123 |
outputs = model(**tokens)
|
| 124 |
logits = outputs['logits'][0]
|
| 125 |
input_ids = tokens['input_ids'][0]
|
| 126 |
-
#
|
| 127 |
llrs = get_llrs(logits, input_ids)
|
| 128 |
return llrs
|
| 129 |
|
| 130 |
-
|
| 131 |
-
Prediction with VESM models
|
| 132 |
-
"""
|
| 133 |
-
# load vesm models
|
| 134 |
model_name = 'VESM_3B'
|
| 135 |
model, tokenizer = load_vesm(model_name, local_dir=local_dir, device=device)
|
| 136 |
sequence_vocabs = tokenizer.get_vocab()
|
| 137 |
-
#
|
| 138 |
llrs = inference(model, tokenizer, sequence, device)
|
| 139 |
-
|
| 140 |
-
print(f"Predicted score by {model_name}: ",
|
| 141 |
```
|
| 142 |
|
| 143 |
|
|
@@ -149,13 +146,13 @@ from esm.sdk.api import ESMProtein
|
|
| 149 |
# !wget https://alphafold.ebi.ac.uk/files/AF-P32245-F1-model_v6.pdb
|
| 150 |
pdb_file = "AF-P32245-F1-model_v6.pdb"
|
| 151 |
protein = ESMProtein.from_pdb(pdb_file)
|
| 152 |
-
|
| 153 |
```
|
| 154 |
|
| 155 |
```py
|
| 156 |
# load model
|
| 157 |
model, tokenizer = load_vesm('VESM3', local_dir=local_dir, device=device)
|
| 158 |
-
sequence_vocabs =
|
| 159 |
|
| 160 |
# inference
|
| 161 |
tokens = model.encode(protein)
|
|
@@ -168,13 +165,13 @@ with torch.no_grad():
|
|
| 168 |
|
| 169 |
# calculate log-likelihood ratio from the logits
|
| 170 |
llrs = get_llrs(logits, input_ids)
|
| 171 |
-
# compute
|
| 172 |
-
|
| 173 |
-
print("
|
| 174 |
```
|
| 175 |
|
| 176 |
|
| 177 |
## License <a name="license"></a>
|
| 178 |
|
| 179 |
-
The source code and model weights for
|
| 180 |
The VESM3 model is a fine-tuned version of ESM3-Open (EvolutionaryScale) and is available under a [non-commercial license agreement](https://www.evolutionaryscale.ai/policies/cambrian-open-license-agreement).
|
|
|
|
| 81 |
```py
|
| 82 |
# scoring functions
|
| 83 |
import torch.nn.functional as F
|
| 84 |
+
# calculate log-likelihood ratio from the logits
|
| 85 |
def get_llrs(sequence_logits, input_ids):
|
| 86 |
token_probs = torch.log_softmax(sequence_logits, dim=-1)
|
| 87 |
wt_positions = F.one_hot(input_ids, num_classes=token_probs.shape[-1])
|
|
|
|
| 91 |
llrs = token_probs - wt_probs.expand(token_probs.shape)
|
| 92 |
return llrs
|
| 93 |
|
| 94 |
+
# compute mutation score
|
| 95 |
+
def score_mutation(llrs, mutation, sequence_vocabs):
|
| 96 |
+
mutation_score = 0
|
| 97 |
+
for mut in mutation.split(":"):
|
| 98 |
_, idx, mt = mut[0], int(mut[1:-1]), mut[-1]
|
| 99 |
pred = llrs[idx, sequence_vocabs[mt]]
|
| 100 |
+
mutation_score += pred.item()
|
| 101 |
+
return mutation_score
|
| 102 |
```
|
| 103 |
|
| 104 |
#### Sequence-only Models
|
| 105 |
|
| 106 |
+
The sample scripts below compute mutation scores with the sequence-only VESM models.
|
| 107 |
```py
|
| 108 |
+
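# sequence and mutation; each substitution is "<wild-type residue><1-based position><mutant residue>", multiple substitutions are joined with ":"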
|
| 109 |
sequence = "MVNSTHRGMHTSLHLWNRSSYRLHSNASESLGKGYSDGGCYEQLFVSPEVFVTLGVISLLENILV"
|
| 110 |
+
mutation = "M1Y:V2T"
|
| 111 |
```
|
| 112 |
|
| 113 |
```py
|
|
|
|
| 123 |
outputs = model(**tokens)
|
| 124 |
logits = outputs['logits'][0]
|
| 125 |
input_ids = tokens['input_ids'][0]
|
| 126 |
+
# calculate log-likelihood ratio from the logits
|
| 127 |
llrs = get_llrs(logits, input_ids)
|
| 128 |
return llrs
|
| 129 |
|
| 130 |
+
# Prediction with VESM
|
|
|
|
|
|
|
|
|
|
| 131 |
model_name = 'VESM_3B'
|
| 132 |
model, tokenizer = load_vesm(model_name, local_dir=local_dir, device=device)
|
| 133 |
sequence_vocabs = tokenizer.get_vocab()
|
| 134 |
+
# compute mutation score
|
| 135 |
llrs = inference(model, tokenizer, sequence, device)
|
| 136 |
+
mutation_score = score_mutation(llrs, mutation, sequence_vocabs)
|
| 137 |
+
print(f"Predicted score by {model_name}: ", mutation_score)
|
| 138 |
```
|
| 139 |
|
| 140 |
|
|
|
|
| 146 |
# !wget https://alphafold.ebi.ac.uk/files/AF-P32245-F1-model_v6.pdb
|
| 147 |
pdb_file = "AF-P32245-F1-model_v6.pdb"
|
| 148 |
protein = ESMProtein.from_pdb(pdb_file)
|
| 149 |
+
mutation = "M1Y:V2T"
|
| 150 |
```
|
| 151 |
|
| 152 |
```py
|
| 153 |
# load model
|
| 154 |
model, tokenizer = load_vesm('VESM3', local_dir=local_dir, device=device)
|
| 155 |
+
sequence_vocabs = tokenizer.get_vocab()
|
| 156 |
|
| 157 |
# inference
|
| 158 |
tokens = model.encode(protein)
|
|
|
|
| 165 |
|
| 166 |
# calculate log-likelihood ratio from the logits
|
| 167 |
llrs = get_llrs(logits, input_ids)
|
| 168 |
+
# compute mutation score
|
| 169 |
+
mutation_score = score_mutation(llrs, mutation, sequence_vocabs)
|
| 170 |
+
print("mutation score: ", mutation_score)
|
| 171 |
```
|
| 172 |
|
| 173 |
|
| 174 |
## License <a name="license"></a>
|
| 175 |
|
| 176 |
+
The source code and model weights for VESM models are distributed under the MIT License.
|
| 177 |
The VESM3 model is a fine-tuned version of ESM3-Open (EvolutionaryScale) and is available under a [non-commercial license agreement](https://www.evolutionaryscale.ai/policies/cambrian-open-license-agreement).
|