| import os |
| import re |
| import gzip |
| import time |
| import shutil |
| import argparse |
| from copy import deepcopy |
| from tempfile import NamedTemporaryFile |
| import multiprocessing as mp |
|
|
| import numpy as np |
| from Bio.PDB import PDBParser,Chain,Model,Structure, PDBIO |
| from Bio.PDB.DSSP import dssp_dict_from_pdb_file |
| from Bio.SeqUtils.ProtParam import ProteinAnalysis |
| from rdkit.Chem.rdMolDescriptors import CalcTPSA |
| from freesasa import calcBioPDB |
| from rdkit.Chem import MolFromSmiles |
|
|
| from globals import CACHE_DIR, CONTACT_DIST |
| from utils.logger import print_log |
| from utils.file_utils import cnt_num_files, get_filename |
| from data.mmap_dataset import create_mmap |
| from data.converter.pdb_to_list_blocks import pdb_to_list_blocks |
| from data.converter.blocks_interface import blocks_cb_interface, blocks_interface |
|
|
| from .pepbind import clustering |
|
|
|
|
def parse():
    """Parse command-line arguments for the loop-filtering pipeline."""
    arg_parser = argparse.ArgumentParser(description='Filter peptide-like loop from monomers')
    arg_parser.add_argument('--database_dir', type=str, required=True,
                            help='Directory of pdb database processed in monomers')
    arg_parser.add_argument('--pdb_dir', type=str, required=True, help='Directory to PDB database')
    arg_parser.add_argument('--out_dir', type=str, required=True, help='Output directory')
    arg_parser.add_argument('--pocket_th', type=float, default=10.0, help='Threshold for determining pocket')
    arg_parser.add_argument('--n_cpu', type=int, default=4, help='Number of CPU to use')
    return arg_parser.parse_args()
|
|
|
|
| |
# Map of the 20 standard amino-acid 3-letter residue names to 1-letter codes;
# also used as the whitelist of standard residues (see Filter.filter_chain).
AA3TO1 = {
    'ALA':'A', 'VAL':'V', 'PHE':'F', 'PRO':'P', 'MET':'M',
    'ILE':'I', 'LEU':'L', 'ASP':'D', 'GLU':'E', 'LYS':'K',
    'ARG':'R', 'SER':'S', 'THR':'T', 'TYR':'Y', 'HIS':'H',
    'CYS':'C', 'ASN':'N', 'GLN':'Q', 'TRP':'W', 'GLY':'G',}

# Residue classes used for the composition thresholds in Filter.filter_chain
hydrophobic_residues=['V','I','L','M','F','W','C']
charged_residues=['H','R','K','D','E']
|
|
def add_cb(input_array):
    """Reconstruct an idealized C-beta position from backbone atoms.

    input_array: array-like of shape (4, 3) holding the N, CA, C, O
    coordinates of one residue (O is unused).
    Returns the CB coordinate rounded to 3 decimals.
    """
    n_coord, ca_coord, c_coord, _ = input_array
    vec_n_ca = ca_coord - n_coord
    vec_ca_c = c_coord - ca_coord
    normal = np.cross(vec_n_ca, vec_ca_c)
    # fixed linear combination of the local backbone frame
    cb = -0.58273431 * normal + 0.56802827 * vec_n_ca - 0.54067466 * vec_ca_c + ca_coord
    return np.around(cb, 3)
|
|
# SMILES strings for the 20 standard amino acids, keyed by 1-letter code.
# NOTE(review): not referenced anywhere in the visible part of this file —
# presumably intended for use with the rdkit imports (MolFromSmiles,
# CalcTPSA); confirm before removing.
aaSMILES = {'G': 'NCC(=O)O',
            'A': 'N[C@@]([H])(C)C(=O)O',
            'R': 'N[C@@]([H])(CCCNC(=N)N)C(=O)O',
            'N': 'N[C@@]([H])(CC(=O)N)C(=O)O',
            'D': 'N[C@@]([H])(CC(=O)O)C(=O)O',
            'C': 'N[C@@]([H])(CS)C(=O)O',
            'E': 'N[C@@]([H])(CCC(=O)O)C(=O)O',
            'Q': 'N[C@@]([H])(CCC(=O)N)C(=O)O',
            'H': 'N[C@@]([H])(CC1=CN=C-N1)C(=O)O',
            'I': 'N[C@@]([H])(C(CC)C)C(=O)O',
            'L': 'N[C@@]([H])(CC(C)C)C(=O)O',
            'K': 'N[C@@]([H])(CCCCN)C(=O)O',
            'M': 'N[C@@]([H])(CCSC)C(=O)O',
            'F': 'N[C@@]([H])(Cc1ccccc1)C(=O)O',
            'P': 'N1[C@@]([H])(CCC1)C(=O)O',
            'S': 'N[C@@]([H])(CO)C(=O)O',
            'T': 'N[C@@]([H])(C(O)C)C(=O)O',
            'W': 'N[C@@]([H])(CC(=CN2)C1=C2C=CC=C1)C(=O)O',
            'Y': 'N[C@@]([H])(Cc1ccc(O)cc1)C(=O)O',
            'V': 'N[C@@]([H])(C(C)C)C(=O)O'}
|
|
|
|
class Filter:
    """Filter peptide-like loops out of monomer chains.

    A span of a chain is accepted as a "peptide-like loop" when it passes a
    series of sequence-based criteria (length, forbidden motifs, amino-acid
    composition, instability index) and structure-based criteria (enough
    spatial neighbors, buried surface area against the rest of the chain,
    optionally near-cyclic termini).
    """

    def __init__(
        self,
        min_loop_len = 4,
        max_loop_len = 25,
        min_BSA = 400,
        min_relBSA = 0.2,
        max_relncBSA = 0.3,
        saved_maxlen = 25,
        saved_BSA = 400,
        saved_relBSA = 0.2,
        saved_helix_ratio = 1.0,
        saved_strand_ratio = 1.0,
        cyclic=False
    ) -> None:
        # reject sequences containing a D followed by G/P/S, two or more
        # consecutive prolines, or any cysteine
        self.re_filter = re.compile(r'D[GPS]|[P]{2,}|C')
        self.cache_dir = CACHE_DIR

        self.min_loop_len = min_loop_len          # minimum span length (residues)
        self.max_loop_len = max_loop_len          # maximum span length (residues)
        self.min_BSA = min_BSA                    # minimum absolute buried surface area
        self.min_relBSA = min_relBSA              # minimum BSA relative to unbound SASA
        self.max_relncBSA = max_relncBSA          # max relative terminal BSA (cyclic mode only)
        # thresholds deciding whether the pseudo-complex structure is materialized
        self.saved_maxlen = saved_maxlen
        self.saved_BSA = saved_BSA
        self.saved_relBSA = saved_relBSA
        self.saved_helix_ratio = saved_helix_ratio
        self.saved_strand_ratio = saved_strand_ratio
        self.cyclic = cyclic                      # whether to require close loop termini

    @classmethod
    def get_ss_info(cls, pdb_path: str):
        """Run DSSP (mkdssp) and coarse-grain the secondary structure.

        Returns a dict mapping chain id to a per-residue list of
        'a' (helix: H/G/I), 'b' (strand/turn/bend: B/E/T/S) or 'c' (coil: -).

        Raises:
            ValueError: if DSSP reports an unrecognized SS code.
        """
        dssp, keys = dssp_dict_from_pdb_file(pdb_path, DSSP='mkdssp')
        ss_info = {}
        for key in keys:
            chain_id, value = key[0], dssp[key]
            if chain_id not in ss_info:
                ss_info[chain_id] = []
            ss_type = value[1]
            if ss_type in ['H', 'G', 'I']:
                ss_info[chain_id].append('a')
            elif ss_type in ['B', 'E', 'T', 'S']:
                ss_info[chain_id].append('b')
            elif ss_type == '-':
                ss_info[chain_id].append('c')
            else:
                raise ValueError(f'SS type {ss_type} cannot be recognized!')
        return ss_info

    @classmethod
    def get_bsa(cls, receptor_chain: Chain.Chain, ligand_chain: Chain.Chain):
        # FIX: first parameter of this @classmethod was misleadingly named
        # `self`; renamed to `cls` (no caller-visible change).
        """Compute the buried surface area (BSA) of the ligand chain.

        Returns a tuple (abs_bsa, rel_bsa, rel_nc_bsa, tmp_structure):
        absolute BSA excluding the terminal residues, BSA relative to the
        unbound SASA, relative BSA of the two terminal residues, and the
        assembled temporary structure (both chains are re-parented into it).
        """
        lig_chain_id = ligand_chain.get_id()
        tmp_structure = Structure.Structure('tmp')
        tmp_model = Model.Model(0)
        tmp_structure.add(tmp_model)
        tmp_model.add(ligand_chain)
        # SASA of the ligand alone (unbound state)
        unbounded_SASA = calcBioPDB(tmp_structure)[0].residueAreas()[lig_chain_id]
        unbounded_SASA = [k.total for k in unbounded_SASA.values()]

        tmp_model.add(receptor_chain)
        # SASA of the ligand in complex with the receptor (bound state)
        bounded_SASA = calcBioPDB(tmp_structure)[0].residueAreas()[lig_chain_id]
        bounded_SASA = [k.total for k in bounded_SASA.values()]

        # terminal residues are excluded from the main BSA and reported separately
        abs_bsa = sum(unbounded_SASA[1:-1]) - sum(bounded_SASA[1:-1])
        rel_bsa = abs_bsa / sum(unbounded_SASA[1:-1])
        rel_nc_bsa = (unbounded_SASA[0] + unbounded_SASA[-1] - bounded_SASA[0] - bounded_SASA[-1]) / (unbounded_SASA[0] + unbounded_SASA[-1])

        return abs_bsa, rel_bsa, rel_nc_bsa, tmp_structure

    def filter_pdb(self, pdb_path, selected_chains=None):
        """Apply the loop filter to every (selected) chain of the first model.

        Returns the concatenated list of accepted spans from filter_chain.
        """
        parser = PDBParser(QUIET=True)
        ss_info = self.get_ss_info(pdb_path)
        structure = parser.get_structure('anonym', pdb_path)
        # only the first model is used
        for model in structure.get_models():
            structure = model
            break

        results = []
        for chain in structure.get_chains():
            if selected_chains is not None and chain.get_id() not in selected_chains:
                continue
            chain_ss_info = None if ss_info is None else ss_info[chain.get_id()]
            results.extend(self.filter_chain(chain, chain_ss_info))

        return results

    def filter_chain(self, chain, ss_info=None):
        """Find peptide-like loop spans in a single chain.

        Args:
            chain: a Bio.PDB chain of standard residues.
            ss_info: optional per-residue coarse SS list from get_ss_info.
        Returns:
            List of tuples (i, j, length, abs_bsa, rel_bsa, helix_ratio,
            strand_ratio, coil_ratio, output_structure) where the ratios are
            -1 when ss_info is None and output_structure is None unless the
            span also passed the `saved_*` thresholds.
        """
        # drop chains containing any non-standard residue
        non_standard = False
        for res in chain:
            if res.get_resname() not in AA3TO1:
                non_standard = True
                break
        if non_standard:
            return []

        # FIX: original did len(ss_info) unconditionally and crashed with a
        # TypeError when ss_info is None (None is explicitly supported below)
        if ss_info is not None and len(ss_info) != len(chain):
            return []

        # collect CB coordinates, reconstructing CB where absent (e.g. GLY)
        cb_coord = []
        seq = ''
        for res in chain:
            seq += AA3TO1[res.get_resname()]
            try:
                cb_coord.append(res['CB'].get_coord())
            except KeyError:  # FIX: was a bare except; missing atom raises KeyError
                tmp_coord = np.array([
                    res['N'].get_coord(),
                    res['CA'].get_coord(),
                    res['C'].get_coord(),
                    res['O'].get_coord()
                ])
                cb_coord.append(add_cb(tmp_coord))
        cb_coord = np.array(cb_coord)
        # pairwise CB-CB distance matrix
        cb_contact = np.linalg.norm(cb_coord[None,:,:,] - cb_coord[:,None,:],axis=-1)
        if self.cyclic:
            # cyclic mode: span endpoints must be close enough to be bridged
            possible_ss = (cb_contact >= 3.5) & (cb_contact <= 5)
        else:
            possible_ss = np.ones(cb_contact.shape, dtype=bool)
        # keep only (i, j) pairs with min_loop_len <= j - i + 1 <= max_loop_len
        possible_ss = np.triu(np.tril(possible_ss, self.max_loop_len - 1), self.min_loop_len - 1)
        ss_pair = np.where(possible_ss)
        accepted, saved_spans = [], []
        for i, j in zip(ss_pair[0],ss_pair[1]):
            # skip spans overlapping an already-accepted span by more than 40%
            redundant = False
            for exist_i, exist_j in saved_spans:
                overlap = min(j, exist_j) - max(i, exist_i) + 1
                if overlap / (j - i + 1) > 0.4 or overlap / (exist_j - exist_i + 1) > 0.4:
                    redundant = True
                    break
            if redundant:
                continue

            # require enough spatial neighbors (a real pocket): distance from
            # each residue to the span, masking +-5 sequence neighbors
            min_dist = np.min(cb_contact[i : j + 1], axis=0)
            min_dist[max(i - 5, 0):min(j + 6, len(seq))] = 21
            neighbors_20A = np.where(min_dist < 20)[0]
            if len(neighbors_20A) < 16:
                continue

            # sequence-based criteria
            pep_seq = seq[i:j+1]
            if self.re_filter.search(pep_seq) is not None:
                continue
            prot_param=ProteinAnalysis(pep_seq)
            aa_percent = prot_param.get_amino_acids_percent()
            max_ratio = max(aa_percent.values())
            # no single residue type may dominate
            if max_ratio > 0.25:
                continue
            hydrophobic_ratio = sum([aa_percent[k] for k in hydrophobic_residues])
            if hydrophobic_ratio > 0.45:
                continue
            charged_ratio = sum([aa_percent[k] for k in charged_residues])
            if charged_ratio > 0.45 or charged_ratio < 0.25:
                continue
            # discard peptides predicted unstable by the instability index
            if prot_param.instability_index() >= 40:
                continue

            # build a receptor (spatial neighbors) / ligand (span) pseudo-complex
            receptor_chain = Chain.Chain('R')
            ligand_chain = Chain.Chain('L')
            for k,res in enumerate(chain):
                if k >= i and k <= j:
                    ligand_chain.add(res.copy())
                elif k in neighbors_20A:
                    receptor_chain.add(res.copy())

            abs_bsa, rel_bsa, rel_nc_bsa, tmp_structure = self.get_bsa(receptor_chain, ligand_chain)
            if abs_bsa <= self.min_BSA or rel_bsa <= self.min_relBSA or (self.cyclic and rel_nc_bsa >= self.max_relncBSA):
                continue

            # secondary-structure composition of the span (-1 when unknown)
            length = j - i + 1
            if ss_info is None:
                helix_ratio = -1
                strand_ratio = -1
                coil_ratio = -1
            else:
                ssa = ss_info[i:j+1]
                helix_ratio = ssa.count('a') / length
                strand_ratio = ssa.count('b') / length
                coil_ratio = ssa.count('c') / length

            # only materialize the structure for spans worth saving to disk
            if length <= self.saved_maxlen and abs_bsa >= self.saved_BSA and rel_bsa >= self.saved_relBSA and helix_ratio <= self.saved_helix_ratio and strand_ratio <= self.saved_strand_ratio:
                output_structure = deepcopy(tmp_structure)
            else:
                output_structure = None
            accepted.append((
                i , j, length, abs_bsa, rel_bsa, helix_ratio, strand_ratio, coil_ratio, output_structure
            ))
            saved_spans.append((i, j))

        return accepted
| |
|
|
def get_non_redundant(mmap_dir):
    """Cluster indexed sequences at 90% identity and keep one entry per cluster.

    Reads `index.txt` under `mmap_dir` (entry id '<chain>_<pdb_file>', last
    column the sequence) and returns a dict mapping pdb filename to the list
    of surviving chain ids.
    """
    np.random.seed(12)
    index_path = os.path.join(mmap_dir, 'index.txt')
    parent_dir = mmap_dir

    # parse the index file
    entries = {}
    with open(index_path, 'r') as fin:
        for line in fin:
            cols = line.strip().split('\t')
            entry_id, sequence = cols[0], cols[-1]
            chain_id, pdb_name = entry_id.split('_')
            entries[entry_id] = (sequence, chain_id, pdb_name)

    # scratch directory for the clustering tool
    tmp_dir = os.path.join(parent_dir, 'tmp')
    if os.path.exists(tmp_dir):
        raise ValueError(f'Working directory {tmp_dir} exists!')
    os.makedirs(tmp_dir)

    # dump all sequences to fasta and cluster at 0.9 identity
    fasta = os.path.join(tmp_dir, 'seq.fasta')
    with open(fasta, 'w') as fout:
        fout.writelines(f'>{eid}\n{entries[eid][0]}\n' for eid in entries)
    _, clu2id = clustering(fasta, tmp_dir, 0.9)
    # one random representative per cluster
    representatives = [np.random.choice(clu2id[clu]) for clu in clu2id]
    print_log(f'Non-redundant entries: {len(representatives)}')
    shutil.rmtree(tmp_dir)

    # group the surviving chains by their pdb file
    file2chains = {}
    for eid in representatives:
        _, chain_id, pdb_name = entries[eid]
        file2chains.setdefault(pdb_name, []).append(chain_id)

    return file2chains
|
|
|
|
def mp_worker(data_dir, tmp_dir, pdb_file, selected_chains, pep_filter, pdb_out_dir, queue):
    """Worker process: filter one compressed PDB file and save accepted loops.

    Decompresses `pdb_file` into `tmp_dir`, runs `pep_filter` on the selected
    chains, writes each materialized pseudo-complex to `pdb_out_dir`, and
    reports (pdb_file, list of saved paths) through `queue`. The report is
    sent even on failure so the parent can always join this process.
    """
    # pdb archives are sharded by the middle two characters of the filename
    category = pdb_file[4:6]
    category_dir = os.path.join(data_dir, category)
    path = os.path.join(category_dir, pdb_file)
    tmp_file = os.path.join(tmp_dir, f'{pdb_file}.decompressed')
    pdb_id = get_filename(pdb_file.split('.')[0])

    # decompress the .gz entry to a temporary plain PDB file
    with gzip.open(path, 'rb') as fin:
        with open(tmp_file, 'wb') as fout:
            shutil.copyfileobj(fin, fout)

    files = []
    try:
        results = pep_filter.filter_pdb(tmp_file, selected_chains=selected_chains)
        for item in results:
            i, j, struct = item[0], item[1], item[-1]
            if struct is None:  # span accepted but not selected for saving
                continue
            io = PDBIO()
            io.set_structure(struct)
            _id = pdb_id + f'_{i}_{j}'
            save_path = os.path.join(pdb_out_dir, _id + '.pdb')
            io.save(save_path)
            files.append(save_path)
    except Exception as e:
        # best-effort: one bad entry must not kill the worker,
        # but FIX: log the failure instead of silently swallowing it
        print_log(f'Failed to process {pdb_file}: {e}')
    queue.put((pdb_file, files))
    os.remove(tmp_file)
| |
|
|
def process_iterator(indexes, data_dir, tmp_dir, out_dir, pocket_th, n_cpu):
    """Yield mmap entries for filtered loops using a pool of worker processes.

    Args:
        indexes: dict mapping compressed pdb filename -> list of chains to use.
        data_dir: root of the sharded compressed PDB database.
        tmp_dir: scratch directory for decompressed files.
        out_dir: output root; saved complexes go to `out_dir`/pdbs.
        pocket_th: CB-distance threshold for pocket residue selection.
        n_cpu: number of concurrent worker processes.
    Yields:
        (_id, (receptor block data, ligand block data), properties, file_cnt)
        tuples suitable for create_mmap.
    """
    pdb_out_dir = os.path.join(out_dir, 'pdbs')
    if not os.path.exists(pdb_out_dir):
        os.makedirs(pdb_out_dir)
    pep_filter = Filter()

    file_cnt, pointer, filenames = 0, 0, list(indexes.keys())
    id2task = {}
    queue = mp.Queue()
    # launch the initial batch of workers
    # FIX: cap at len(filenames) to avoid an IndexError when there are fewer
    # files than CPUs
    for _ in range(min(n_cpu, len(filenames))):
        task_id = filenames[pointer]
        id2task[task_id] = mp.Process(
            target=mp_worker,
            args=(data_dir, tmp_dir, task_id, indexes[task_id], pep_filter, pdb_out_dir, queue)
        )
        id2task[task_id].start()
        pointer += 1

    while True:
        if len(id2task) == 0:
            break

        # FIX: original tested `not queue.qsize` — `qsize` is a bound method
        # (always truthy), so the wait branch never executed; poll properly
        if queue.empty():
            time.sleep(1)
            continue

        pdb_file, paths = queue.get()
        file_cnt += 1
        id2task[pdb_file].join()
        del id2task[pdb_file]

        # refill the pool with the next pending file
        if pointer < len(filenames):
            task_id = filenames[pointer]
            id2task[task_id] = mp.Process(
                target=mp_worker,
                args=(data_dir, tmp_dir, task_id, indexes[task_id], pep_filter, pdb_out_dir, queue)
            )
            id2task[task_id].start()
            pointer += 1

        for save_path in paths:
            _id = get_filename(save_path)

            # chain 'L' is the ligand loop; make the receptor come first
            list_blocks, chains = pdb_to_list_blocks(save_path, return_chain_ids=True)
            if chains[0] == 'L':
                list_blocks, chains = (list_blocks[1], list_blocks[0]), (chains[1], chains[0])

            rec_blocks, lig_blocks = list_blocks
            rec_chain, lig_chain = chains
            try:
                _, (pocket_idx, _) = blocks_cb_interface(rec_blocks, lig_blocks, pocket_th)
            except KeyError:
                continue
            rec_num_units = sum([len(block) for block in rec_blocks])
            lig_num_units = sum([len(block) for block in lig_blocks])
            rec_data = [block.to_tuple() for block in rec_blocks]
            lig_data = [block.to_tuple() for block in lig_blocks]
            rec_seq = ''.join([AA3TO1[block.abrv] for block in rec_blocks])
            lig_seq = ''.join([AA3TO1[block.abrv] for block in lig_blocks])

            yield _id, (rec_data, lig_data), [
                len(rec_blocks), len(lig_blocks), rec_num_units, lig_num_units,
                rec_chain, lig_chain, rec_seq, lig_seq,
                ','.join([str(idx) for idx in pocket_idx]),
            ], file_cnt
|
|
|
|
def main(args):
    """Entry point: select non-redundant chains, then build the mmap dataset."""
    chain_index = get_non_redundant(args.database_dir)
    num_entries = len(chain_index)
    tmp_dir = './tmp'
    os.makedirs(tmp_dir, exist_ok=True)

    print_log(f'Processing data from directory: {args.pdb_dir}.')
    print_log(f'Number of entries: {num_entries}')
    create_mmap(
        process_iterator(chain_index, args.pdb_dir, tmp_dir, args.out_dir, args.pocket_th, args.n_cpu),
        args.out_dir, num_entries)

    print_log('Finished!')

    shutil.rmtree(tmp_dir)
|
|
|
|
| if __name__ == '__main__': |
| main(parse()) |
|
|