tuandinh committed on
Commit
946749e
·
verified ·
1 Parent(s): c350f8f

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +21 -24
README.md CHANGED
@@ -81,7 +81,7 @@ def load_vesm(model_name="VESM_3B", local_dir="vesm", device='cuda'):
81
  ```py
82
  # scoring functions
83
  import torch.nn.functional as F
84
- # calcualte log-likelihood ratio from the logits
85
  def get_llrs(sequence_logits, input_ids):
86
  token_probs = torch.log_softmax(sequence_logits, dim=-1)
87
  wt_positions = F.one_hot(input_ids, num_classes=token_probs.shape[-1])
@@ -91,23 +91,23 @@ def get_llrs(sequence_logits, input_ids):
91
  llrs = token_probs - wt_probs.expand(token_probs.shape)
92
  return llrs
93
 
94
- # compute mutant score
95
- def score_mutant(llrs, mutant, sequence_vocabs):
96
- mutant_score = 0
97
- for mut in mutant.split(":"):
98
  _, idx, mt = mut[0], int(mut[1:-1]), mut[-1]
99
  pred = llrs[idx, sequence_vocabs[mt]]
100
- mutant_score += pred.item()
101
- return mutant_score
102
  ```
103
 
104
  #### Sequence-only Models
105
 
106
- Here, we provide sample scripts to compute mutant scores with VESM models
107
  ```py
108
- # sequence and mutant
109
  sequence = "MVNSTHRGMHTSLHLWNRSSYRLHSNASESLGKGYSDGGCYEQLFVSPEVFVTLGVISLLENILV"
110
- mutant = "M1Y:V2T"
111
  ```
112
 
113
  ```py
@@ -123,21 +123,18 @@ def inference(model, tokenizer, sequence, device):
123
  outputs = model(**tokens)
124
  logits = outputs['logits'][0]
125
  input_ids = tokens['input_ids'][0]
126
- # calcualte log-likelihood ratio from the logits
127
  llrs = get_llrs(logits, input_ids)
128
  return llrs
129
 
130
- """
131
- Prediction with VESM models
132
- """
133
- # load vesm models
134
  model_name = 'VESM_3B'
135
  model, tokenizer = load_vesm(model_name, local_dir=local_dir, device=device)
136
  sequence_vocabs = tokenizer.get_vocab()
137
- # inference
138
  llrs = inference(model, tokenizer, sequence, device)
139
- mutant_score = score_mutant(llrs, mutant, sequence_vocabs)
140
- print(f"Predicted score by {model_name}: ", mutant_score)
141
  ```
142
 
143
 
@@ -149,13 +146,13 @@ from esm.sdk.api import ESMProtein
149
  # !wget https://alphafold.ebi.ac.uk/files/AF-P32245-F1-model_v6.pdb
150
  pdb_file = "AF-P32245-F1-model_v6.pdb"
151
  protein = ESMProtein.from_pdb(pdb_file)
152
- mutant = "M1Y:V2T"
153
  ```
154
 
155
  ```py
156
  # load model
157
  model, tokenizer = load_vesm('VESM3', local_dir=local_dir, device=device)
158
- sequence_vocabs = model.tokenizers.sequence.vocab
159
 
160
  # inference
161
  tokens = model.encode(protein)
@@ -168,13 +165,13 @@ with torch.no_grad():
168
 
169
  # calculate log-likelihood ratio from the logits
170
  llrs = get_llrs(logits, input_ids)
171
- # compute mutant score
172
- mutant_score = score_mutant(llrs, mutant, sequence_vocabs)
173
- print("Mutant score: ", mutant_score)
174
  ```
175
 
176
 
177
  ## License <a name="license"></a>
178
 
179
- The source code and model weights for VESM1 and VESM2 are distributed under the MIT License.
180
  The VESM3 model is a fine-tuned version of ESM3-Open (EvolutionaryScale) and is available under a [non-commercial license agreement](https://www.evolutionaryscale.ai/policies/cambrian-open-license-agreement).
 
81
  ```py
82
  # scoring functions
83
  import torch.nn.functional as F
84
+ # calculate log-likelihood ratio from the logits
85
  def get_llrs(sequence_logits, input_ids):
86
  token_probs = torch.log_softmax(sequence_logits, dim=-1)
87
  wt_positions = F.one_hot(input_ids, num_classes=token_probs.shape[-1])
 
91
  llrs = token_probs - wt_probs.expand(token_probs.shape)
92
  return llrs
93
 
94
+ # compute mutation score
95
+ def score_mutation(llrs, mutation, sequence_vocabs):
96
+ mutation_score = 0
97
+ for mut in mutation.split(":"):
98
  _, idx, mt = mut[0], int(mut[1:-1]), mut[-1]
99
  pred = llrs[idx, sequence_vocabs[mt]]
100
+ mutation_score += pred.item()
101
+ return mutation_score
102
  ```
103
 
104
  #### Sequence-only Models
105
 
106
+ Here, we provide sample scripts to compute mutation scores.
107
  ```py
108
+ # sequence and mutation
109
  sequence = "MVNSTHRGMHTSLHLWNRSSYRLHSNASESLGKGYSDGGCYEQLFVSPEVFVTLGVISLLENILV"
110
+ mutation = "M1Y:V2T"
111
  ```
112
 
113
  ```py
 
123
  outputs = model(**tokens)
124
  logits = outputs['logits'][0]
125
  input_ids = tokens['input_ids'][0]
126
+ # calculate log-likelihood ratio from the logits
127
  llrs = get_llrs(logits, input_ids)
128
  return llrs
129
 
130
+ # Prediction with VESM
 
 
 
131
  model_name = 'VESM_3B'
132
  model, tokenizer = load_vesm(model_name, local_dir=local_dir, device=device)
133
  sequence_vocabs = tokenizer.get_vocab()
134
+ # compute mutation score
135
  llrs = inference(model, tokenizer, sequence, device)
136
+ mutation_score = score_mutation(llrs, mutation, sequence_vocabs)
137
+ print(f"Predicted score by {model_name}: ", mutation_score)
138
  ```
139
 
140
 
 
146
  # !wget https://alphafold.ebi.ac.uk/files/AF-P32245-F1-model_v6.pdb
147
  pdb_file = "AF-P32245-F1-model_v6.pdb"
148
  protein = ESMProtein.from_pdb(pdb_file)
149
+ mutation = "M1Y:V2T"
150
  ```
151
 
152
  ```py
153
  # load model
154
  model, tokenizer = load_vesm('VESM3', local_dir=local_dir, device=device)
155
+ sequence_vocabs = tokenizer.get_vocab()
156
 
157
  # inference
158
  tokens = model.encode(protein)
 
165
 
166
  # calculate log-likelihood ratio from the logits
167
  llrs = get_llrs(logits, input_ids)
168
+ # compute mutation score
169
+ mutation_score = score_mutation(llrs, mutation, sequence_vocabs)
170
+ print("mutation score: ", mutation_score)
171
  ```
172
 
173
 
174
  ## License <a name="license"></a>
175
 
176
+ The source code and model weights for VESM models are distributed under the MIT License.
177
  The VESM3 model is a fine-tuned version of ESM3-Open (EvolutionaryScale) and is available under a [non-commercial license agreement](https://www.evolutionaryscale.ai/policies/cambrian-open-license-agreement).