| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| """Parses the mmCIF file format.""" |
| import collections |
| import dataclasses |
| import io |
| import json |
| import logging |
| import os |
| from typing import Any, Mapping, Optional, Sequence, Tuple |
|
|
| from Bio import PDB |
| from Bio.Data import SCOPData |
| import numpy as np |
|
|
| from openfold.data.errors import MultipleChainsError |
| import openfold.np.residue_constants as residue_constants |
|
|
|
|
| |
| ChainId = str |
| PdbHeader = Mapping[str, Any] |
| PdbStructure = PDB.Structure.Structure |
| SeqRes = str |
| MmCIFDict = Mapping[str, Sequence[str]] |
|
|
|
|
| @dataclasses.dataclass(frozen=True) |
| class Monomer: |
| id: str |
| num: int |
|
|
|
|
| |
| |
| @dataclasses.dataclass(frozen=True) |
| class AtomSite: |
| residue_name: str |
| author_chain_id: str |
| mmcif_chain_id: str |
| author_seq_num: str |
| mmcif_seq_num: int |
| insertion_code: str |
| hetatm_atom: str |
| model_num: int |
|
|
|
|
| |
| @dataclasses.dataclass(frozen=True) |
| class ResiduePosition: |
| chain_id: str |
| residue_number: int |
| insertion_code: str |
|
|
|
|
| @dataclasses.dataclass(frozen=True) |
| class ResidueAtPosition: |
| position: Optional[ResiduePosition] |
| name: str |
| is_missing: bool |
| hetflag: str |
|
|
|
|
| @dataclasses.dataclass(frozen=True) |
| class MmcifObject: |
| """Representation of a parsed mmCIF file. |
| |
| Contains: |
| file_id: A meaningful name, e.g. a pdb_id. Should be unique amongst all |
| files being processed. |
| header: Biopython header. |
| structure: Biopython structure. |
| chain_to_seqres: Dict mapping chain_id to 1 letter amino acid sequence. E.g. |
| {'A': 'ABCDEFG'} |
| seqres_to_structure: Dict; for each chain_id contains a mapping between |
| SEQRES index and a ResidueAtPosition. e.g. {'A': {0: ResidueAtPosition, |
| 1: ResidueAtPosition, |
| ...}} |
| raw_string: The raw string used to construct the MmcifObject. |
| """ |
|
|
| file_id: str |
| header: PdbHeader |
| structure: PdbStructure |
| chain_to_seqres: Mapping[ChainId, SeqRes] |
| seqres_to_structure: Mapping[ChainId, Mapping[int, ResidueAtPosition]] |
| raw_string: Any |
|
|
|
|
| @dataclasses.dataclass(frozen=True) |
| class ParsingResult: |
| """Returned by the parse function. |
| |
| Contains: |
| mmcif_object: A MmcifObject, may be None if no chain could be successfully |
| parsed. |
| errors: A dict mapping (file_id, chain_id) to any exception generated. |
| """ |
|
|
| mmcif_object: Optional[MmcifObject] |
| errors: Mapping[Tuple[str, str], Any] |
|
|
|
|
| class ParseError(Exception): |
| """An error indicating that an mmCIF file could not be parsed.""" |
|
|
|
|
| def mmcif_loop_to_list( |
| prefix: str, parsed_info: MmCIFDict |
| ) -> Sequence[Mapping[str, str]]: |
| """Extracts loop associated with a prefix from mmCIF data as a list. |
| |
| Reference for loop_ in mmCIF: |
| http://mmcif.wwpdb.org/docs/tutorials/mechanics/pdbx-mmcif-syntax.html |
| |
| Args: |
| prefix: Prefix shared by each of the data items in the loop. |
| e.g. '_entity_poly_seq.', where the data items are _entity_poly_seq.num, |
| _entity_poly_seq.mon_id. Should include the trailing period. |
| parsed_info: A dict of parsed mmCIF data, e.g. _mmcif_dict from a Biopython |
| parser. |
| |
| Returns: |
| Returns a list of dicts; each dict represents 1 entry from an mmCIF loop. |
| """ |
| cols = [] |
| data = [] |
| for key, value in parsed_info.items(): |
| if key.startswith(prefix): |
| cols.append(key) |
| data.append(value) |
|
|
| assert all([len(xs) == len(data[0]) for xs in data]), ( |
| "mmCIF error: Not all loops are the same length: %s" % cols |
| ) |
|
|
| return [dict(zip(cols, xs)) for xs in zip(*data)] |
|
|
|
|
| def mmcif_loop_to_dict( |
| prefix: str, |
| index: str, |
| parsed_info: MmCIFDict, |
| ) -> Mapping[str, Mapping[str, str]]: |
| """Extracts loop associated with a prefix from mmCIF data as a dictionary. |
| |
| Args: |
| prefix: Prefix shared by each of the data items in the loop. |
| e.g. '_entity_poly_seq.', where the data items are _entity_poly_seq.num, |
| _entity_poly_seq.mon_id. Should include the trailing period. |
| index: Which item of loop data should serve as the key. |
| parsed_info: A dict of parsed mmCIF data, e.g. _mmcif_dict from a Biopython |
| parser. |
| |
| Returns: |
| Returns a dict of dicts; each dict represents 1 entry from an mmCIF loop, |
| indexed by the index column. |
| """ |
| entries = mmcif_loop_to_list(prefix, parsed_info) |
| return {entry[index]: entry for entry in entries} |
|
|
|
|
| def parse( |
| *, file_id: str, mmcif_string: str, catch_all_errors: bool = True |
| ) -> ParsingResult: |
| """Entry point, parses an mmcif_string. |
| |
| Args: |
| file_id: A string identifier for this file. Should be unique within the |
| collection of files being processed. |
| mmcif_string: Contents of an mmCIF file. |
| catch_all_errors: If True, all exceptions are caught and error messages are |
| returned as part of the ParsingResult. If False exceptions will be allowed |
| to propagate. |
| |
| Returns: |
| A ParsingResult. |
| """ |
| errors = {} |
| try: |
| parser = PDB.MMCIFParser(QUIET=True) |
| handle = io.StringIO(mmcif_string) |
| full_structure = parser.get_structure("", handle) |
| first_model_structure = _get_first_model(full_structure) |
| |
| |
| parsed_info = parser._mmcif_dict |
|
|
| |
| for key, value in parsed_info.items(): |
| if not isinstance(value, list): |
| parsed_info[key] = [value] |
|
|
| header = _get_header(parsed_info) |
|
|
| |
| |
| valid_chains = _get_protein_chains(parsed_info=parsed_info) |
| if not valid_chains: |
| return ParsingResult( |
| None, {(file_id, ""): "No protein chains found in this file."} |
| ) |
| seq_start_num = { |
| chain_id: min([monomer.num for monomer in seq]) |
| for chain_id, seq in valid_chains.items() |
| } |
|
|
| |
| |
| |
| |
| mmcif_to_author_chain_id = {} |
| seq_to_structure_mappings = {} |
| for atom in _get_atom_site_list(parsed_info): |
| if atom.model_num != "1": |
| |
| continue |
|
|
| mmcif_to_author_chain_id[atom.mmcif_chain_id] = atom.author_chain_id |
|
|
| if atom.mmcif_chain_id in valid_chains: |
| hetflag = " " |
| if atom.hetatm_atom == "HETATM": |
| |
| |
| |
| if atom.residue_name in ("HOH", "WAT"): |
| hetflag = "W" |
| else: |
| hetflag = "H_" + atom.residue_name |
| insertion_code = atom.insertion_code |
| if not _is_set(atom.insertion_code): |
| insertion_code = " " |
| position = ResiduePosition( |
| chain_id=atom.author_chain_id, |
| residue_number=int(atom.author_seq_num), |
| insertion_code=insertion_code, |
| ) |
| seq_idx = ( |
| int(atom.mmcif_seq_num) - seq_start_num[atom.mmcif_chain_id] |
| ) |
| current = seq_to_structure_mappings.get( |
| atom.author_chain_id, {} |
| ) |
| current[seq_idx] = ResidueAtPosition( |
| position=position, |
| name=atom.residue_name, |
| is_missing=False, |
| hetflag=hetflag, |
| ) |
| seq_to_structure_mappings[atom.author_chain_id] = current |
|
|
| |
| for chain_id, seq_info in valid_chains.items(): |
| author_chain = mmcif_to_author_chain_id[chain_id] |
| current_mapping = seq_to_structure_mappings[author_chain] |
| for idx, monomer in enumerate(seq_info): |
| if idx not in current_mapping: |
| current_mapping[idx] = ResidueAtPosition( |
| position=None, |
| name=monomer.id, |
| is_missing=True, |
| hetflag=" ", |
| ) |
|
|
| author_chain_to_sequence = {} |
| for chain_id, seq_info in valid_chains.items(): |
| author_chain = mmcif_to_author_chain_id[chain_id] |
| seq = [] |
| for monomer in seq_info: |
| code = SCOPData.protein_letters_3to1.get(monomer.id, "X") |
| seq.append(code if len(code) == 1 else "X") |
| seq = "".join(seq) |
| author_chain_to_sequence[author_chain] = seq |
|
|
| mmcif_object = MmcifObject( |
| file_id=file_id, |
| header=header, |
| structure=first_model_structure, |
| chain_to_seqres=author_chain_to_sequence, |
| seqres_to_structure=seq_to_structure_mappings, |
| raw_string=parsed_info, |
| ) |
|
|
| return ParsingResult(mmcif_object=mmcif_object, errors=errors) |
| except Exception as e: |
| errors[(file_id, "")] = e |
| if not catch_all_errors: |
| raise |
| return ParsingResult(mmcif_object=None, errors=errors) |
|
|
|
|
| def _get_first_model(structure: PdbStructure) -> PdbStructure: |
| """Returns the first model in a Biopython structure.""" |
| return next(structure.get_models()) |
|
|
|
|
| _MIN_LENGTH_OF_CHAIN_TO_BE_COUNTED_AS_PEPTIDE = 21 |
|
|
|
|
| def get_release_date(parsed_info: MmCIFDict) -> str: |
| """Returns the oldest revision date.""" |
| revision_dates = parsed_info["_pdbx_audit_revision_history.revision_date"] |
| return min(revision_dates) |
|
|
|
|
| def _get_header(parsed_info: MmCIFDict) -> PdbHeader: |
| """Returns a basic header containing method, release date and resolution.""" |
| header = {} |
|
|
| experiments = mmcif_loop_to_list("_exptl.", parsed_info) |
| header["structure_method"] = ",".join( |
| [experiment["_exptl.method"].lower() for experiment in experiments] |
| ) |
|
|
| |
| |
| if "_pdbx_audit_revision_history.revision_date" in parsed_info: |
| header["release_date"] = get_release_date(parsed_info) |
| else: |
| logging.warning( |
| "Could not determine release_date: %s", parsed_info["_entry.id"] |
| ) |
|
|
| header["resolution"] = 0.00 |
| for res_key in ( |
| "_refine.ls_d_res_high", |
| "_em_3d_reconstruction.resolution", |
| "_reflns.d_resolution_high", |
| ): |
| if res_key in parsed_info: |
| try: |
| raw_resolution = parsed_info[res_key][0] |
| header["resolution"] = float(raw_resolution) |
| except ValueError: |
| logging.info( |
| "Invalid resolution format: %s", parsed_info[res_key] |
| ) |
|
|
| return header |
|
|
|
|
| def _get_atom_site_list(parsed_info: MmCIFDict) -> Sequence[AtomSite]: |
| """Returns list of atom sites; contains data not present in the structure.""" |
| return [ |
| AtomSite(*site) |
| for site in zip( |
| parsed_info["_atom_site.label_comp_id"], |
| parsed_info["_atom_site.auth_asym_id"], |
| parsed_info["_atom_site.label_asym_id"], |
| parsed_info["_atom_site.auth_seq_id"], |
| parsed_info["_atom_site.label_seq_id"], |
| parsed_info["_atom_site.pdbx_PDB_ins_code"], |
| parsed_info["_atom_site.group_PDB"], |
| parsed_info["_atom_site.pdbx_PDB_model_num"], |
| ) |
| ] |
|
|
|
|
| def _get_protein_chains( |
| *, parsed_info: Mapping[str, Any] |
| ) -> Mapping[ChainId, Sequence[Monomer]]: |
| """Extracts polymer information for protein chains only. |
| |
| Args: |
| parsed_info: _mmcif_dict produced by the Biopython parser. |
| |
| Returns: |
| A dict mapping mmcif chain id to a list of Monomers. |
| """ |
| |
| entity_poly_seqs = mmcif_loop_to_list("_entity_poly_seq.", parsed_info) |
|
|
| polymers = collections.defaultdict(list) |
| for entity_poly_seq in entity_poly_seqs: |
| polymers[entity_poly_seq["_entity_poly_seq.entity_id"]].append( |
| Monomer( |
| id=entity_poly_seq["_entity_poly_seq.mon_id"], |
| num=int(entity_poly_seq["_entity_poly_seq.num"]), |
| ) |
| ) |
|
|
| |
| |
| chem_comps = mmcif_loop_to_dict("_chem_comp.", "_chem_comp.id", parsed_info) |
|
|
| |
| |
| struct_asyms = mmcif_loop_to_list("_struct_asym.", parsed_info) |
|
|
| entity_to_mmcif_chains = collections.defaultdict(list) |
| for struct_asym in struct_asyms: |
| chain_id = struct_asym["_struct_asym.id"] |
| entity_id = struct_asym["_struct_asym.entity_id"] |
| entity_to_mmcif_chains[entity_id].append(chain_id) |
|
|
| |
| valid_chains = {} |
| for entity_id, seq_info in polymers.items(): |
| chain_ids = entity_to_mmcif_chains[entity_id] |
|
|
| |
| if any( |
| [ |
| "peptide" in chem_comps[monomer.id]["_chem_comp.type"] |
| for monomer in seq_info |
| ] |
| ): |
| for chain_id in chain_ids: |
| valid_chains[chain_id] = seq_info |
| return valid_chains |
|
|
|
|
| def _is_set(data: str) -> bool: |
| """Returns False if data is a special mmCIF character indicating 'unset'.""" |
| return data not in (".", "?") |
|
|
|
|
| def get_atom_coords( |
| mmcif_object: MmcifObject, |
| chain_id: str, |
| _zero_center_positions: bool = True |
| ) -> Tuple[np.ndarray, np.ndarray]: |
| |
| chains = list(mmcif_object.structure.get_chains()) |
| relevant_chains = [c for c in chains if c.id == chain_id] |
| if len(relevant_chains) != 1: |
| raise MultipleChainsError( |
| f"Expected exactly one chain in structure with id {chain_id}." |
| ) |
| chain = relevant_chains[0] |
|
|
| |
| num_res = len(mmcif_object.chain_to_seqres[chain_id]) |
| all_atom_positions = np.zeros( |
| [num_res, residue_constants.atom_type_num, 3], dtype=np.float32 |
| ) |
| all_atom_mask = np.zeros( |
| [num_res, residue_constants.atom_type_num], dtype=np.float32 |
| ) |
| for res_index in range(num_res): |
| pos = np.zeros([residue_constants.atom_type_num, 3], dtype=np.float32) |
| mask = np.zeros([residue_constants.atom_type_num], dtype=np.float32) |
| res_at_position = mmcif_object.seqres_to_structure[chain_id][res_index] |
| if not res_at_position.is_missing: |
| res = chain[ |
| ( |
| res_at_position.hetflag, |
| res_at_position.position.residue_number, |
| res_at_position.position.insertion_code, |
| ) |
| ] |
| for atom in res.get_atoms(): |
| atom_name = atom.get_name() |
| x, y, z = atom.get_coord() |
| if atom_name in residue_constants.atom_order.keys(): |
| pos[residue_constants.atom_order[atom_name]] = [x, y, z] |
| mask[residue_constants.atom_order[atom_name]] = 1.0 |
| elif atom_name.upper() == "SE" and res.get_resname() == "MSE": |
| |
| pos[residue_constants.atom_order["SD"]] = [x, y, z] |
| mask[residue_constants.atom_order["SD"]] = 1.0 |
|
|
| all_atom_positions[res_index] = pos |
| all_atom_mask[res_index] = mask |
|
|
| if _zero_center_positions: |
| binary_mask = all_atom_mask.astype(bool) |
| translation_vec = all_atom_positions[binary_mask].mean(axis=0) |
| all_atom_positions[binary_mask] -= translation_vec |
|
|
| return all_atom_positions, all_atom_mask |
|
|