| import os |
| from argparse import FileType, ArgumentParser |
|
|
| import numpy as np |
| import pandas as pd |
| from Bio.PDB import PDBParser |
| from Bio.Seq import Seq |
| from Bio.SeqRecord import SeqRecord |
| from tqdm import tqdm |
| from Bio import SeqIO |
|
|
|
|
|
|
# Three-letter residue names mapped to the one-letter codes written to FASTA.
_THREE_TO_ONE = {
    "ALA": "A",
    "ARG": "R",
    "ASN": "N",
    "ASP": "D",
    "CYS": "C",
    "GLN": "Q",
    "GLU": "E",
    "GLY": "G",
    "HIS": "H",
    "ILE": "I",
    "LEU": "L",
    "LYS": "K",
    "MET": "M",
    "MSE": "M",
    "PHE": "F",
    "PRO": "P",
    "PYL": "O",
    "SER": "S",
    "SEC": "U",
    "THR": "T",
    "TRP": "W",
    "TYR": "Y",
    "VAL": "V",
    "ASX": "B",
    "GLX": "Z",
    "XAA": "X",
    "XLE": "J",
}

# A residue only contributes to the sequence when all of these atoms exist.
_BACKBONE_ATOMS = frozenset({"CA", "N", "C"})


def _chain_to_sequence(chain, file_path):
    """Return the one-letter sequence of the amino-acid residues in *chain*.

    Residues named ``HOH`` and residues missing any of the ``CA``, ``N`` or
    ``C`` atoms are skipped. A residue whose name is not in
    ``_THREE_TO_ONE`` contributes ``"-"`` and a message naming *file_path*
    is printed.
    """
    letters = []
    for residue in chain:
        resname = residue.get_resname()
        if resname == "HOH":
            continue

        # Only the presence of the backbone atoms matters here; their
        # coordinates are never used.
        atom_names = {atom.name for atom in residue}
        if not _BACKBONE_ATOMS.issubset(atom_names):
            continue

        try:
            letters.append(_THREE_TO_ONE[resname])
        except KeyError:
            letters.append("-")
            print(
                "encountered unknown AA: ",
                resname,
                " in the complex ",
                file_path,
                ". Replacing it with a dash - .",
            )
    return "".join(letters)


def esm_embedding_prep(out_file, protein_path):
    """Write the per-chain sequences of a PDB structure to a FASTA file.

    The structure at *protein_path* is parsed with ``PDBParser`` and only its
    first model (``structure[0]``) is read. Every chain of that model yields
    one FASTA record, including chains whose sequence ends up empty, so the
    chain index in the record ID always matches the chain's position in the
    model.

    Args:
        out_file: Destination passed straight to ``SeqIO.write``.
        protein_path: Path of the PDB file passed to
            ``PDBParser.get_structure``.

    Each record ID has the form ``<basename of protein_path>_chain_<i>`` and
    the record description is empty.
    """
    biopython_parser = PDBParser()

    file_paths = [protein_path]
    sequences = []
    ids = []
    for file_path in tqdm(file_paths):
        # Only the first model of the parsed structure is used.
        model = biopython_parser.get_structure("random_id", file_path)[0]
        for i, chain in enumerate(model):
            sequences.append(_chain_to_sequence(chain, file_path))
            ids.append(f"{os.path.basename(file_path)}_chain_{i}")

    records = [
        SeqRecord(Seq(seq), id=str(record_id), description="")
        for record_id, seq in zip(ids, sequences)
    ]
    SeqIO.write(records, out_file, "fasta")
|
|