| import os |
| import re |
| import gzip |
| import time |
| import shutil |
| import argparse |
| from copy import deepcopy |
| from tempfile import NamedTemporaryFile |
| import multiprocessing as mp |
|
|
| import numpy as np |
| from Bio.PDB import PDBParser,Chain,Model,Structure, PDBIO |
| from Bio.PDB.DSSP import dssp_dict_from_pdb_file |
| from Bio.SeqUtils.ProtParam import ProteinAnalysis |
| from rdkit.Chem.rdMolDescriptors import CalcTPSA |
| from freesasa import calcBioPDB |
| from rdkit.Chem import MolFromSmiles |
|
|
| from globals import CACHE_DIR, CONTACT_DIST |
| from utils.logger import print_log |
| from utils.file_utils import cnt_num_files, get_filename |
| from data.mmap_dataset import create_mmap |
| from data.converter.pdb_to_list_blocks import pdb_to_list_blocks |
| from data.converter.blocks_interface import blocks_cb_interface, blocks_interface |
|
|
| from .pepbind import clustering |
|
|
|
|
def parse():
    """Parse command-line arguments for the loop-filtering pipeline."""
    arg_parser = argparse.ArgumentParser(description='Filter peptide-like loop from monomers')
    arg_parser.add_argument('--database_dir', type=str, required=True,
                            help='Directory of pdb database processed in monomers')
    arg_parser.add_argument('--pdb_dir', type=str, required=True, help='Directory to PDB database')
    arg_parser.add_argument('--out_dir', type=str, required=True, help='Output directory')
    arg_parser.add_argument('--pocket_th', type=float, default=10.0, help='Threshold for determining pocket')
    arg_parser.add_argument('--n_cpu', type=int, default=4, help='Number of CPU to use')
    return arg_parser.parse_args()
|
|
|
|
| |
# Map of the 20 standard amino-acid 3-letter residue names to 1-letter codes;
# also used as the whitelist of standard residues (see Filter.filter_chain).
AA3TO1 = {
    'ALA':'A', 'VAL':'V', 'PHE':'F', 'PRO':'P', 'MET':'M',
    'ILE':'I', 'LEU':'L', 'ASP':'D', 'GLU':'E', 'LYS':'K',
    'ARG':'R', 'SER':'S', 'THR':'T', 'TYR':'Y', 'HIS':'H',
    'CYS':'C', 'ASN':'N', 'GLN':'Q', 'TRP':'W', 'GLY':'G',}

# Residue classes used for the composition thresholds in Filter.filter_chain
hydrophobic_residues=['V','I','L','M','F','W','C']
charged_residues=['H','R','K','D','E']
|
|
def add_cb(input_array):
    """Reconstruct an idealized C-beta position from backbone atoms.

    input_array: array-like of shape (4, 3) holding the N, CA, C, O
    coordinates of one residue (O is unused).
    Returns the CB coordinate rounded to 3 decimals.
    """
    n_coord, ca_coord, c_coord, _ = input_array
    vec_n_ca = ca_coord - n_coord
    vec_ca_c = c_coord - ca_coord
    normal = np.cross(vec_n_ca, vec_ca_c)
    # fixed linear combination of the local backbone frame
    cb = -0.58273431 * normal + 0.56802827 * vec_n_ca - 0.54067466 * vec_ca_c + ca_coord
    return np.around(cb, 3)
|
|
# SMILES strings for the 20 standard amino acids, keyed by 1-letter code.
# NOTE(review): not referenced anywhere in the visible part of this file —
# presumably intended for use with the rdkit imports (MolFromSmiles,
# CalcTPSA); confirm before removing.
aaSMILES = {'G': 'NCC(=O)O',
            'A': 'N[C@@]([H])(C)C(=O)O',
            'R': 'N[C@@]([H])(CCCNC(=N)N)C(=O)O',
            'N': 'N[C@@]([H])(CC(=O)N)C(=O)O',
            'D': 'N[C@@]([H])(CC(=O)O)C(=O)O',
            'C': 'N[C@@]([H])(CS)C(=O)O',
            'E': 'N[C@@]([H])(CCC(=O)O)C(=O)O',
            'Q': 'N[C@@]([H])(CCC(=O)N)C(=O)O',
            'H': 'N[C@@]([H])(CC1=CN=C-N1)C(=O)O',
            'I': 'N[C@@]([H])(C(CC)C)C(=O)O',
            'L': 'N[C@@]([H])(CC(C)C)C(=O)O',
            'K': 'N[C@@]([H])(CCCCN)C(=O)O',
            'M': 'N[C@@]([H])(CCSC)C(=O)O',
            'F': 'N[C@@]([H])(Cc1ccccc1)C(=O)O',
            'P': 'N1[C@@]([H])(CCC1)C(=O)O',
            'S': 'N[C@@]([H])(CO)C(=O)O',
            'T': 'N[C@@]([H])(C(O)C)C(=O)O',
            'W': 'N[C@@]([H])(CC(=CN2)C1=C2C=CC=C1)C(=O)O',
            'Y': 'N[C@@]([H])(Cc1ccc(O)cc1)C(=O)O',
            'V': 'N[C@@]([H])(C(C)C)C(=O)O'}
|
|
|
|
class Filter:
    """Filter peptide-like loops out of monomer chains.

    A span of a chain is accepted as a "peptide-like loop" when it passes a
    series of sequence-based criteria (length, forbidden motifs, amino-acid
    composition, instability index) and structure-based criteria (enough
    spatial neighbors, buried surface area against the rest of the chain,
    optionally near-cyclic termini).
    """

    def __init__(
        self,
        min_loop_len = 4,
        max_loop_len = 25,
        min_BSA = 400,
        min_relBSA = 0.2,
        max_relncBSA = 0.3,
        saved_maxlen = 25,
        saved_BSA = 400,
        saved_relBSA = 0.2,
        saved_helix_ratio = 1.0,
        saved_strand_ratio = 1.0,
        cyclic=False
    ) -> None:
        # reject sequences containing a D followed by G/P/S, two or more
        # consecutive prolines, or any cysteine
        self.re_filter = re.compile(r'D[GPS]|[P]{2,}|C')
        self.cache_dir = CACHE_DIR

        self.min_loop_len = min_loop_len          # minimum span length (residues)
        self.max_loop_len = max_loop_len          # maximum span length (residues)
        self.min_BSA = min_BSA                    # minimum absolute buried surface area
        self.min_relBSA = min_relBSA              # minimum BSA relative to unbound SASA
        self.max_relncBSA = max_relncBSA          # max relative terminal BSA (cyclic mode only)
        # thresholds deciding whether the pseudo-complex structure is materialized
        self.saved_maxlen = saved_maxlen
        self.saved_BSA = saved_BSA
        self.saved_relBSA = saved_relBSA
        self.saved_helix_ratio = saved_helix_ratio
        self.saved_strand_ratio = saved_strand_ratio
        self.cyclic = cyclic                      # whether to require close loop termini

    @classmethod
    def get_ss_info(cls, pdb_path: str):
        """Run DSSP (mkdssp) and coarse-grain the secondary structure.

        Returns a dict mapping chain id to a per-residue list of
        'a' (helix: H/G/I), 'b' (strand/turn/bend: B/E/T/S) or 'c' (coil: -).

        Raises:
            ValueError: if DSSP reports an unrecognized SS code.
        """
        dssp, keys = dssp_dict_from_pdb_file(pdb_path, DSSP='mkdssp')
        ss_info = {}
        for key in keys:
            chain_id, value = key[0], dssp[key]
            if chain_id not in ss_info:
                ss_info[chain_id] = []
            ss_type = value[1]
            if ss_type in ['H', 'G', 'I']:
                ss_info[chain_id].append('a')
            elif ss_type in ['B', 'E', 'T', 'S']:
                ss_info[chain_id].append('b')
            elif ss_type == '-':
                ss_info[chain_id].append('c')
            else:
                raise ValueError(f'SS type {ss_type} cannot be recognized!')
        return ss_info

    @classmethod
    def get_bsa(cls, receptor_chain: Chain.Chain, ligand_chain: Chain.Chain):
        # FIX: first parameter of this @classmethod was misleadingly named
        # `self`; renamed to `cls` (no caller-visible change).
        """Compute the buried surface area (BSA) of the ligand chain.

        Returns a tuple (abs_bsa, rel_bsa, rel_nc_bsa, tmp_structure):
        absolute BSA excluding the terminal residues, BSA relative to the
        unbound SASA, relative BSA of the two terminal residues, and the
        assembled temporary structure (both chains are re-parented into it).
        """
        lig_chain_id = ligand_chain.get_id()
        tmp_structure = Structure.Structure('tmp')
        tmp_model = Model.Model(0)
        tmp_structure.add(tmp_model)
        tmp_model.add(ligand_chain)
        # SASA of the ligand alone (unbound state)
        unbounded_SASA = calcBioPDB(tmp_structure)[0].residueAreas()[lig_chain_id]
        unbounded_SASA = [k.total for k in unbounded_SASA.values()]

        tmp_model.add(receptor_chain)
        # SASA of the ligand in complex with the receptor (bound state)
        bounded_SASA = calcBioPDB(tmp_structure)[0].residueAreas()[lig_chain_id]
        bounded_SASA = [k.total for k in bounded_SASA.values()]

        # terminal residues are excluded from the main BSA and reported separately
        abs_bsa = sum(unbounded_SASA[1:-1]) - sum(bounded_SASA[1:-1])
        rel_bsa = abs_bsa / sum(unbounded_SASA[1:-1])
        rel_nc_bsa = (unbounded_SASA[0] + unbounded_SASA[-1] - bounded_SASA[0] - bounded_SASA[-1]) / (unbounded_SASA[0] + unbounded_SASA[-1])

        return abs_bsa, rel_bsa, rel_nc_bsa, tmp_structure

    def filter_pdb(self, pdb_path, selected_chains=None):
        """Apply the loop filter to every (selected) chain of the first model.

        Returns the concatenated list of accepted spans from filter_chain.
        """
        parser = PDBParser(QUIET=True)
        ss_info = self.get_ss_info(pdb_path)
        structure = parser.get_structure('anonym', pdb_path)
        # only the first model is used
        for model in structure.get_models():
            structure = model
            break

        results = []
        for chain in structure.get_chains():
            if selected_chains is not None and chain.get_id() not in selected_chains:
                continue
            chain_ss_info = None if ss_info is None else ss_info[chain.get_id()]
            results.extend(self.filter_chain(chain, chain_ss_info))

        return results

    def filter_chain(self, chain, ss_info=None):
        """Find peptide-like loop spans in a single chain.

        Args:
            chain: a Bio.PDB chain of standard residues.
            ss_info: optional per-residue coarse SS list from get_ss_info.
        Returns:
            List of tuples (i, j, length, abs_bsa, rel_bsa, helix_ratio,
            strand_ratio, coil_ratio, output_structure) where the ratios are
            -1 when ss_info is None and output_structure is None unless the
            span also passed the `saved_*` thresholds.
        """
        # drop chains containing any non-standard residue
        non_standard = False
        for res in chain:
            if res.get_resname() not in AA3TO1:
                non_standard = True
                break
        if non_standard:
            return []

        # FIX: original did len(ss_info) unconditionally and crashed with a
        # TypeError when ss_info is None (None is explicitly supported below)
        if ss_info is not None and len(ss_info) != len(chain):
            return []

        # collect CB coordinates, reconstructing CB where absent (e.g. GLY)
        cb_coord = []
        seq = ''
        for res in chain:
            seq += AA3TO1[res.get_resname()]
            try:
                cb_coord.append(res['CB'].get_coord())
            except KeyError:  # FIX: was a bare except; missing atom raises KeyError
                tmp_coord = np.array([
                    res['N'].get_coord(),
                    res['CA'].get_coord(),
                    res['C'].get_coord(),
                    res['O'].get_coord()
                ])
                cb_coord.append(add_cb(tmp_coord))
        cb_coord = np.array(cb_coord)
        # pairwise CB-CB distance matrix
        cb_contact = np.linalg.norm(cb_coord[None,:,:,] - cb_coord[:,None,:],axis=-1)
        if self.cyclic:
            # cyclic mode: span endpoints must be close enough to be bridged
            possible_ss = (cb_contact >= 3.5) & (cb_contact <= 5)
        else:
            possible_ss = np.ones(cb_contact.shape, dtype=bool)
        # keep only (i, j) pairs with min_loop_len <= j - i + 1 <= max_loop_len
        possible_ss = np.triu(np.tril(possible_ss, self.max_loop_len - 1), self.min_loop_len - 1)
        ss_pair = np.where(possible_ss)
        accepted, saved_spans = [], []
        for i, j in zip(ss_pair[0],ss_pair[1]):
            # skip spans overlapping an already-accepted span by more than 40%
            redundant = False
            for exist_i, exist_j in saved_spans:
                overlap = min(j, exist_j) - max(i, exist_i) + 1
                if overlap / (j - i + 1) > 0.4 or overlap / (exist_j - exist_i + 1) > 0.4:
                    redundant = True
                    break
            if redundant:
                continue

            # require enough spatial neighbors (a real pocket): distance from
            # each residue to the span, masking +-5 sequence neighbors
            min_dist = np.min(cb_contact[i : j + 1], axis=0)
            min_dist[max(i - 5, 0):min(j + 6, len(seq))] = 21
            neighbors_20A = np.where(min_dist < 20)[0]
            if len(neighbors_20A) < 16:
                continue

            # sequence-based criteria
            pep_seq = seq[i:j+1]
            if self.re_filter.search(pep_seq) is not None:
                continue
            prot_param=ProteinAnalysis(pep_seq)
            aa_percent = prot_param.get_amino_acids_percent()
            max_ratio = max(aa_percent.values())
            # no single residue type may dominate
            if max_ratio > 0.25:
                continue
            hydrophobic_ratio = sum([aa_percent[k] for k in hydrophobic_residues])
            if hydrophobic_ratio > 0.45:
                continue
            charged_ratio = sum([aa_percent[k] for k in charged_residues])
            if charged_ratio > 0.45 or charged_ratio < 0.25:
                continue
            # discard peptides predicted unstable by the instability index
            if prot_param.instability_index() >= 40:
                continue

            # build a receptor (spatial neighbors) / ligand (span) pseudo-complex
            receptor_chain = Chain.Chain('R')
            ligand_chain = Chain.Chain('L')
            for k,res in enumerate(chain):
                if k >= i and k <= j:
                    ligand_chain.add(res.copy())
                elif k in neighbors_20A:
                    receptor_chain.add(res.copy())

            abs_bsa, rel_bsa, rel_nc_bsa, tmp_structure = self.get_bsa(receptor_chain, ligand_chain)
            if abs_bsa <= self.min_BSA or rel_bsa <= self.min_relBSA or (self.cyclic and rel_nc_bsa >= self.max_relncBSA):
                continue

            # secondary-structure composition of the span (-1 when unknown)
            length = j - i + 1
            if ss_info is None:
                helix_ratio = -1
                strand_ratio = -1
                coil_ratio = -1
            else:
                ssa = ss_info[i:j+1]
                helix_ratio = ssa.count('a') / length
                strand_ratio = ssa.count('b') / length
                coil_ratio = ssa.count('c') / length

            # only materialize the structure for spans worth saving to disk
            if length <= self.saved_maxlen and abs_bsa >= self.saved_BSA and rel_bsa >= self.saved_relBSA and helix_ratio <= self.saved_helix_ratio and strand_ratio <= self.saved_strand_ratio:
                output_structure = deepcopy(tmp_structure)
            else:
                output_structure = None
            accepted.append((
                i , j, length, abs_bsa, rel_bsa, helix_ratio, strand_ratio, coil_ratio, output_structure
            ))
            saved_spans.append((i, j))

        return accepted
| |
|
|
def get_non_redundant(mmap_dir):
    """Cluster indexed sequences at 90% identity and keep one entry per cluster.

    Reads `index.txt` under `mmap_dir` (entry id '<chain>_<pdb_file>', last
    column the sequence) and returns a dict mapping pdb filename to the list
    of surviving chain ids.
    """
    np.random.seed(12)
    index_path = os.path.join(mmap_dir, 'index.txt')
    parent_dir = mmap_dir

    # parse the index file
    entries = {}
    with open(index_path, 'r') as fin:
        for line in fin:
            cols = line.strip().split('\t')
            entry_id, sequence = cols[0], cols[-1]
            chain_id, pdb_name = entry_id.split('_')
            entries[entry_id] = (sequence, chain_id, pdb_name)

    # scratch directory for the clustering tool
    tmp_dir = os.path.join(parent_dir, 'tmp')
    if os.path.exists(tmp_dir):
        raise ValueError(f'Working directory {tmp_dir} exists!')
    os.makedirs(tmp_dir)

    # dump all sequences to fasta and cluster at 0.9 identity
    fasta = os.path.join(tmp_dir, 'seq.fasta')
    with open(fasta, 'w') as fout:
        fout.writelines(f'>{eid}\n{entries[eid][0]}\n' for eid in entries)
    _, clu2id = clustering(fasta, tmp_dir, 0.9)
    # one random representative per cluster
    representatives = [np.random.choice(clu2id[clu]) for clu in clu2id]
    print_log(f'Non-redundant entries: {len(representatives)}')
    shutil.rmtree(tmp_dir)

    # group the surviving chains by their pdb file
    file2chains = {}
    for eid in representatives:
        _, chain_id, pdb_name = entries[eid]
        file2chains.setdefault(pdb_name, []).append(chain_id)

    return file2chains
|
|
|
|
def mp_worker(data_dir, tmp_dir, pdb_file, selected_chains, pep_filter, pdb_out_dir, queue):
    """Worker process: filter one compressed PDB file and save accepted loops.

    Decompresses `pdb_file` into `tmp_dir`, runs `pep_filter` on the selected
    chains, writes each materialized pseudo-complex to `pdb_out_dir`, and
    reports (pdb_file, list of saved paths) through `queue`. The report is
    sent even on failure so the parent can always join this process.
    """
    # pdb archives are sharded by the middle two characters of the filename
    category = pdb_file[4:6]
    category_dir = os.path.join(data_dir, category)
    path = os.path.join(category_dir, pdb_file)
    tmp_file = os.path.join(tmp_dir, f'{pdb_file}.decompressed')
    pdb_id = get_filename(pdb_file.split('.')[0])

    # decompress the .gz entry to a temporary plain PDB file
    with gzip.open(path, 'rb') as fin:
        with open(tmp_file, 'wb') as fout:
            shutil.copyfileobj(fin, fout)

    files = []
    try:
        results = pep_filter.filter_pdb(tmp_file, selected_chains=selected_chains)
        for item in results:
            i, j, struct = item[0], item[1], item[-1]
            if struct is None:  # span accepted but not selected for saving
                continue
            io = PDBIO()
            io.set_structure(struct)
            _id = pdb_id + f'_{i}_{j}'
            save_path = os.path.join(pdb_out_dir, _id + '.pdb')
            io.save(save_path)
            files.append(save_path)
    except Exception as e:
        # best-effort: one bad entry must not kill the worker,
        # but FIX: log the failure instead of silently swallowing it
        print_log(f'Failed to process {pdb_file}: {e}')
    queue.put((pdb_file, files))
    os.remove(tmp_file)
| |
|
|
def process_iterator(indexes, data_dir, tmp_dir, out_dir, pocket_th, n_cpu):
    """Yield mmap entries for filtered loops using a pool of worker processes.

    Args:
        indexes: dict mapping compressed pdb filename -> list of chains to use.
        data_dir: root of the sharded compressed PDB database.
        tmp_dir: scratch directory for decompressed files.
        out_dir: output root; saved complexes go to `out_dir`/pdbs.
        pocket_th: CB-distance threshold for pocket residue selection.
        n_cpu: number of concurrent worker processes.
    Yields:
        (_id, (receptor block data, ligand block data), properties, file_cnt)
        tuples suitable for create_mmap.
    """
    pdb_out_dir = os.path.join(out_dir, 'pdbs')
    if not os.path.exists(pdb_out_dir):
        os.makedirs(pdb_out_dir)
    pep_filter = Filter()

    file_cnt, pointer, filenames = 0, 0, list(indexes.keys())
    id2task = {}
    queue = mp.Queue()
    # launch the initial batch of workers
    # FIX: cap at len(filenames) to avoid an IndexError when there are fewer
    # files than CPUs
    for _ in range(min(n_cpu, len(filenames))):
        task_id = filenames[pointer]
        id2task[task_id] = mp.Process(
            target=mp_worker,
            args=(data_dir, tmp_dir, task_id, indexes[task_id], pep_filter, pdb_out_dir, queue)
        )
        id2task[task_id].start()
        pointer += 1

    while True:
        if len(id2task) == 0:
            break

        # FIX: original tested `not queue.qsize` — `qsize` is a bound method
        # (always truthy), so the wait branch never executed; poll properly
        if queue.empty():
            time.sleep(1)
            continue

        pdb_file, paths = queue.get()
        file_cnt += 1
        id2task[pdb_file].join()
        del id2task[pdb_file]

        # refill the pool with the next pending file
        if pointer < len(filenames):
            task_id = filenames[pointer]
            id2task[task_id] = mp.Process(
                target=mp_worker,
                args=(data_dir, tmp_dir, task_id, indexes[task_id], pep_filter, pdb_out_dir, queue)
            )
            id2task[task_id].start()
            pointer += 1

        for save_path in paths:
            _id = get_filename(save_path)

            # chain 'L' is the ligand loop; make the receptor come first
            list_blocks, chains = pdb_to_list_blocks(save_path, return_chain_ids=True)
            if chains[0] == 'L':
                list_blocks, chains = (list_blocks[1], list_blocks[0]), (chains[1], chains[0])

            rec_blocks, lig_blocks = list_blocks
            rec_chain, lig_chain = chains
            try:
                _, (pocket_idx, _) = blocks_cb_interface(rec_blocks, lig_blocks, pocket_th)
            except KeyError:
                continue
            rec_num_units = sum([len(block) for block in rec_blocks])
            lig_num_units = sum([len(block) for block in lig_blocks])
            rec_data = [block.to_tuple() for block in rec_blocks]
            lig_data = [block.to_tuple() for block in lig_blocks]
            rec_seq = ''.join([AA3TO1[block.abrv] for block in rec_blocks])
            lig_seq = ''.join([AA3TO1[block.abrv] for block in lig_blocks])

            yield _id, (rec_data, lig_data), [
                len(rec_blocks), len(lig_blocks), rec_num_units, lig_num_units,
                rec_chain, lig_chain, rec_seq, lig_seq,
                ','.join([str(idx) for idx in pocket_idx]),
            ], file_cnt
|
|
|
|
def main(args):
    """Entry point: select non-redundant chains, then build the mmap dataset."""
    chain_index = get_non_redundant(args.database_dir)
    num_entries = len(chain_index)
    tmp_dir = './tmp'
    os.makedirs(tmp_dir, exist_ok=True)

    print_log(f'Processing data from directory: {args.pdb_dir}.')
    print_log(f'Number of entries: {num_entries}')
    create_mmap(
        process_iterator(chain_index, args.pdb_dir, tmp_dir, args.out_dir, args.pocket_th, args.n_cpu),
        args.out_dir, num_entries)

    print_log('Finished!')

    shutil.rmtree(tmp_dir)
|
|
|
|
| if __name__ == '__main__': |
| main(parse()) |
|
|