Irwiny123 committed on
Commit
7a1bbaf
·
1 Parent(s): a4262ae

添加PhysDock初始代码

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitignore +204 -0
  2. .idea/.gitignore +8 -0
  3. .idea/PhysDock.iml +12 -0
  4. .idea/inspectionProfiles/Project_Default.xml +24 -0
  5. .idea/inspectionProfiles/profiles_settings.xml +6 -0
  6. .idea/misc.xml +7 -0
  7. .idea/modules.xml +8 -0
  8. .idea/vcs.xml +6 -0
  9. License +21 -0
  10. PhysDock/__init__.py +3 -0
  11. PhysDock/configs.py +195 -0
  12. PhysDock/configs_old.py +245 -0
  13. PhysDock/data/__init__.py +109 -0
  14. PhysDock/data/alignment_runner.py +937 -0
  15. PhysDock/data/alignment_runner_v2.py +327 -0
  16. PhysDock/data/constants/PDBData.py +348 -0
  17. PhysDock/data/constants/__init__.py +0 -0
  18. PhysDock/data/constants/periodic_table.py +27 -0
  19. PhysDock/data/constants/residue_constants.py +562 -0
  20. PhysDock/data/constants/restype_constants.py +107 -0
  21. PhysDock/data/feature_loader.py +1283 -0
  22. PhysDock/data/feature_loader_plinder.py +1258 -0
  23. PhysDock/data/generate_system.py +148 -0
  24. PhysDock/data/relaxation.py +259 -0
  25. PhysDock/data/tools/PDBData.py +348 -0
  26. PhysDock/data/tools/__init__.py +0 -0
  27. PhysDock/data/tools/alignment_runner.py +588 -0
  28. PhysDock/data/tools/convert_unifold_template_to_stfold.py +127 -0
  29. PhysDock/data/tools/dataset_manager.py +570 -0
  30. PhysDock/data/tools/feature_processing_multimer.py +257 -0
  31. PhysDock/data/tools/get_metrics.py +294 -0
  32. PhysDock/data/tools/hhblits.py +175 -0
  33. PhysDock/data/tools/hhsearch.py +126 -0
  34. PhysDock/data/tools/hmmalign.py +66 -0
  35. PhysDock/data/tools/hmmbuild.py +165 -0
  36. PhysDock/data/tools/hmmsearch.py +137 -0
  37. PhysDock/data/tools/jackhmmer.py +262 -0
  38. PhysDock/data/tools/kalign.py +114 -0
  39. PhysDock/data/tools/mmcif_parsing.py +519 -0
  40. PhysDock/data/tools/msa_identifiers.py +90 -0
  41. PhysDock/data/tools/msa_pairing.py +496 -0
  42. PhysDock/data/tools/nhmmer.py +257 -0
  43. PhysDock/data/tools/parse_msas.py +328 -0
  44. PhysDock/data/tools/parsers.py +727 -0
  45. PhysDock/data/tools/rdkit.py +220 -0
  46. PhysDock/data/tools/residue_constants.py +604 -0
  47. PhysDock/data/tools/templates.py +1357 -0
  48. PhysDock/data/tools/utils.py +48 -0
  49. PhysDock/models/__init__.py +0 -0
  50. PhysDock/models/layers/__init__.py +0 -0
.gitignore ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[codz]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py.cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ #uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+ #poetry.toml
110
+
111
+ # pdm
112
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
113
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
114
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
115
+ #pdm.lock
116
+ #pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # pixi
121
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
122
+ #pixi.lock
123
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
124
+ # in the .venv directory. It is recommended not to include this directory in version control.
125
+ .pixi
126
+
127
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
128
+ __pypackages__/
129
+
130
+ # Celery stuff
131
+ celerybeat-schedule
132
+ celerybeat.pid
133
+
134
+ # SageMath parsed files
135
+ *.sage.py
136
+
137
+ # Environments
138
+ .env
139
+ .envrc
140
+ .venv
141
+ env/
142
+ venv/
143
+ ENV/
144
+ env.bak/
145
+ venv.bak/
146
+
147
+ # Spyder project settings
148
+ .spyderproject
149
+ .spyproject
150
+
151
+ # Rope project settings
152
+ .ropeproject
153
+
154
+ # mkdocs documentation
155
+ /site
156
+
157
+ # mypy
158
+ .mypy_cache/
159
+ .dmypy.json
160
+ dmypy.json
161
+
162
+ # Pyre type checker
163
+ .pyre/
164
+
165
+ # pytype static type analyzer
166
+ .pytype/
167
+
168
+ # Cython debug symbols
169
+ cython_debug/
170
+
171
+ # PyCharm
172
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
173
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
174
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
175
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
176
+ #.idea/
177
+
178
+ # Abstra
179
+ # Abstra is an AI-powered process automation framework.
180
+ # Ignore directories containing user credentials, local state, and settings.
181
+ # Learn more at https://abstra.io/docs
182
+ .abstra/
183
+
184
+ # Visual Studio Code
185
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
186
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
187
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
188
+ # you could uncomment the following to ignore the entire vscode folder
189
+ # .vscode/
190
+
191
+ # Ruff stuff:
192
+ .ruff_cache/
193
+
194
+ # PyPI configuration file
195
+ .pypirc
196
+
197
+ # Marimo
198
+ marimo/_static/
199
+ marimo/_lsp/
200
+ __marimo__/
201
+
202
+ # Streamlit
203
+ .streamlit/secrets.toml
204
+ params/params.pt
.idea/.gitignore ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # Default ignored files
2
+ /shelf/
3
+ /workspace.xml
4
+ # Editor-based HTTP Client requests
5
+ /httpRequests/
6
+ # Datasource local storage ignored files
7
+ /dataSources/
8
+ /dataSources.local.xml
.idea/PhysDock.iml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <module type="PYTHON_MODULE" version="4">
3
+ <component name="NewModuleRootManager">
4
+ <content url="file://$MODULE_DIR$" />
5
+ <orderEntry type="jdk" jdkName="PhysDock" jdkType="Python SDK" />
6
+ <orderEntry type="sourceFolder" forTests="false" />
7
+ </component>
8
+ <component name="PyDocumentationSettings">
9
+ <option name="format" value="GOOGLE" />
10
+ <option name="myDocStringFormat" value="Google" />
11
+ </component>
12
+ </module>
.idea/inspectionProfiles/Project_Default.xml ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <component name="InspectionProjectProfileManager">
2
+ <profile version="1.0">
3
+ <option name="myName" value="Project Default" />
4
+ <inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
5
+ <option name="ignoredPackages">
6
+ <value>
7
+ <list size="11">
8
+ <item index="0" class="java.lang.String" itemvalue="tqdm" />
9
+ <item index="1" class="java.lang.String" itemvalue="scipy" />
10
+ <item index="2" class="java.lang.String" itemvalue="deepspeed" />
11
+ <item index="3" class="java.lang.String" itemvalue="PyYAML" />
12
+ <item index="4" class="java.lang.String" itemvalue="pytorch_lightning" />
13
+ <item index="5" class="java.lang.String" itemvalue="ml-collections" />
14
+ <item index="6" class="java.lang.String" itemvalue="torch" />
15
+ <item index="7" class="java.lang.String" itemvalue="typing-extensions" />
16
+ <item index="8" class="java.lang.String" itemvalue="numpy" />
17
+ <item index="9" class="java.lang.String" itemvalue="requests" />
18
+ <item index="10" class="java.lang.String" itemvalue="dm-tree" />
19
+ </list>
20
+ </value>
21
+ </option>
22
+ </inspection_tool>
23
+ </profile>
24
+ </component>
.idea/inspectionProfiles/profiles_settings.xml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <component name="InspectionProjectProfileManager">
2
+ <settings>
3
+ <option name="USE_PROJECT_PROFILE" value="false" />
4
+ <version value="1.0" />
5
+ </settings>
6
+ </component>
.idea/misc.xml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="Black">
4
+ <option name="sdkName" value="PhysDock" />
5
+ </component>
6
+ <component name="ProjectRootManager" version="2" project-jdk-name="PhysDock" project-jdk-type="Python SDK" />
7
+ </project>
.idea/modules.xml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="ProjectModuleManager">
4
+ <modules>
5
+ <module fileurl="file://$PROJECT_DIR$/.idea/PhysDock.iml" filepath="$PROJECT_DIR$/.idea/PhysDock.iml" />
6
+ </modules>
7
+ </component>
8
+ </project>
.idea/vcs.xml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="VcsDirectoryMappings">
4
+ <mapping directory="$PROJECT_DIR$" vcs="Git" />
5
+ </component>
6
+ </project>
License ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2025 ShanghaiTech University
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
PhysDock/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from PhysDock.models.model import PhysDock
2
+ from PhysDock.models.loss import PhysDockLoss
3
+ from PhysDock.configs import PhysDockConfig
PhysDock/configs.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ml_collections as mlc
2
+
3
+
4
+ def PhysDockConfig(
5
+ inference_mode=True,
6
+ model_name="medium",
7
+ num_augmentation_sample=48,
8
+
9
+ crop_size=256,
10
+ atom_crop_size=256 * 8,
11
+
12
+ alpha_confifdence=1e-4,
13
+ alpha_diffusion=4,
14
+ alpha_bond=0,
15
+ alpha_distogram=3e-2,
16
+ alpha_pae=0,
17
+ inf=1e9,
18
+ eps=1e-8,
19
+
20
+
21
+ # Inference Config
22
+ infer_pocket_type="atom", # "ca"
23
+ infer_pocket_cutoff=6, # 8 10 12
24
+ infer_pocket_dist_type="ligand", # "ligand_centre"
25
+ infer_use_pocket=True,
26
+ infer_use_key_res=True,
27
+
28
+ # Training Config
29
+ train_pocket_type_atom_ratio=0.5,
30
+ train_pocket_cutoff_ligand_min=6,
31
+ train_pocket_cutoff_ligand_max=12,
32
+ train_pocket_cutoff_ligand_centre_min=10,
33
+ train_pocket_cutoff_ligand_centre_max=16,
34
+ train_pocket_dist_type_ligand_ratio=0.5,
35
+ train_use_pocket_ratio=0.5,
36
+ train_use_key_res_ratio=0.5,
37
+
38
+ train_shuffle_sym_id=True,
39
+ train_spatial_crop_ligand_ratio=0.2,
40
+ train_spatial_crop_interface_ratio=0.4,
41
+ train_spatial_crop_interface_threshold=15.,
42
+ train_charility_augmentation_ratio=0.1,
43
+ train_use_template_ratio=0.75,
44
+ train_template_mask_max_ratio=0.4,
45
+
46
+ # Other Configs
47
+ max_msa_clusters=128,
48
+ key_res_random_mask_ratio=0.5,
49
+ token_bond_threshold=2.4,
50
+ sigma_data=16.,
51
+ ):
52
+ ref_dim = 167
53
+ target_dim = 65
54
+ msa_dim = 34
55
+
56
+ inf = inf
57
+ eps = eps
58
+
59
+ c_m = 256 # 256
60
+ c_s = 512 # 1024
61
+ c_z = 128 # 64 | 128
62
+ c_a = 128 # 128
63
+ c_ap = 16 # 16 | 32
64
+
65
+ if model_name == "toy":
66
+ no_blocks_atom = 2
67
+ no_blocks_evoformer = 2
68
+ no_blocks_pairformer = 2
69
+ no_blocks_dit = 2
70
+ no_blocks_heads = 2
71
+ elif model_name == "tiny":
72
+ no_blocks_atom = 2
73
+ no_blocks_evoformer = 2
74
+ no_blocks_pairformer = 8
75
+ no_blocks_dit = 4
76
+ no_blocks_heads = 2
77
+ elif model_name == "small":
78
+ no_blocks_atom = 2
79
+ no_blocks_evoformer = 3
80
+ no_blocks_pairformer = 16
81
+ no_blocks_dit = 8
82
+ no_blocks_heads = 2
83
+ elif model_name == "medium":
84
+ no_blocks_atom = 3
85
+ no_blocks_evoformer = 4
86
+ no_blocks_pairformer = 24
87
+ no_blocks_dit = 12
88
+ no_blocks_heads = 3
89
+ elif model_name == "full":
90
+ no_blocks_atom = 3
91
+ no_blocks_evoformer = 4
92
+ no_blocks_pairformer = 48
93
+ no_blocks_dit = 24
94
+ no_blocks_heads = 4
95
+ else:
96
+ raise ValueError("Unknown model name")
97
+
98
+ config = {
99
+ "inference_mode": inference_mode,
100
+ "sigma_data": sigma_data,
101
+ "data": {
102
+ "crop_size": crop_size,
103
+ "atom_crop_size": atom_crop_size,
104
+ "max_msa_seqs": 16384,
105
+ "max_uniprot_msa_seqs": 8192,
106
+ "interface_threshold": 15,
107
+ "token_bond_threshold": token_bond_threshold,
108
+ "covalent_bond_threshold": 1.8,
109
+ "max_msa_clusters": max_msa_clusters,
110
+ "resample_msa_in_recycling": True,
111
+ },
112
+ "model": {
113
+ "c_z": c_z,
114
+ "num_augmentation_sample": num_augmentation_sample,
115
+ "diffusion_conditioning": {
116
+ "ref_dim": ref_dim,
117
+ "target_dim": target_dim,
118
+ "msa_dim": msa_dim,
119
+ "c_a": c_a,
120
+ "c_ap": c_ap,
121
+ "c_s": c_s,
122
+ "c_m": c_m,
123
+ "c_z": c_z,
124
+ "inf": inf,
125
+ "eps": eps,
126
+ "no_blocks_atom": no_blocks_atom,
127
+ "no_blocks_evoformer": no_blocks_evoformer,
128
+ "no_blocks_pairformer": no_blocks_pairformer
129
+ },
130
+ "dit": {
131
+ "c_a": c_a,
132
+ "c_ap": c_ap,
133
+ "c_s": c_s,
134
+ "c_z": c_z,
135
+ "inf": inf,
136
+ "eps": eps,
137
+ "no_blocks_atom": no_blocks_atom,
138
+ "no_blocks_dit": no_blocks_dit,
139
+ "sigma_data": sigma_data
140
+ },
141
+ "confidence_module": {
142
+ "c_a": c_a,
143
+ "c_ap": c_ap,
144
+ "c_s": c_s,
145
+ "c_z": c_z,
146
+ "inf": inf,
147
+ "eps": eps,
148
+ "no_blocks_heads": no_blocks_heads,
149
+ "no_blocks_atom": no_blocks_atom,
150
+ }
151
+ },
152
+ "loss": {
153
+ "weighted_mse_loss": {
154
+ "weight": alpha_diffusion,
155
+ "sigma_data": sigma_data,
156
+ "alpha_dna": 5.0,
157
+ "alpha_rna": 5.0,
158
+ "alpha_ligand": 10.0,
159
+ },
160
+ "smooth_lddt_loss": {
161
+ "weight": alpha_diffusion,
162
+ "max_clamp_distance": 15.,
163
+ },
164
+
165
+ "bond_loss": {
166
+ "weight": alpha_diffusion * alpha_bond,
167
+ "sigma_data": sigma_data,
168
+ },
169
+ "key_res_loss": {
170
+ "weight": alpha_diffusion * alpha_bond,
171
+ "sigma_data": sigma_data,
172
+ },
173
+ "distogram_loss": {
174
+ "weight": alpha_distogram,
175
+ "min_bin": 3.25,
176
+ "max_bin": 50.75,
177
+ "no_bins": 39,
178
+ "eps": 1e-9,
179
+ },
180
+ "plddt_loss": {
181
+ "weight": alpha_confifdence,
182
+ "no_bins": 50,
183
+ },
184
+ "pae_loss": {
185
+ "weight": alpha_confifdence * alpha_pae,
186
+ },
187
+ "pde_loss": {
188
+ "weight": alpha_confifdence,
189
+ "min_bin": 0,
190
+ "max_bin": 32,
191
+ "no_bins": 64,
192
+ },
193
+ }
194
+ }
195
+ return mlc.ConfigDict(config)
PhysDock/configs_old.py ADDED
@@ -0,0 +1,245 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ml_collections as mlc
2
+
3
+
4
+ def model_config(
5
+ model_name="full",
6
+ max_recycling_iters=1, # 0
7
+ max_msa_clusters=128, # 32
8
+ crop_size=256, #
9
+ num_augmentation_sample=48, # 128
10
+ alpha_confifdence=1e-4,
11
+ alpha_diffusion=4,
12
+ alpha_bond=0,
13
+ alpha_distogram=3e-2,
14
+ alpha_pae=0,
15
+ use_template=True, # False
16
+ use_mini_rollout=True, # False
17
+ use_flash_attn=False, # False
18
+ custom_rel_token=-1, # 42
19
+ ref_dim=1 + 2 + 2 + 128 + 256, # 167
20
+ mini_rollout_steps=20,
21
+ atom_attention_type="full",
22
+ templ_dim=108,
23
+ interaction_aware=True,
24
+ ):
25
+ sigma_data = 16
26
+ # ref_dim = 1 + 2 + 2 + 128 + 256
27
+ msa_dim = 34
28
+ templ_dim = templ_dim
29
+
30
+ inf = 1e9
31
+ eps = 1e-8
32
+
33
+ pair_dropout = 0.25
34
+ msa_dropout = 0.15
35
+
36
+ c_m = 256 # 256
37
+ c_s = 768 # 1024
38
+ c_z = 128 # 64 | 128
39
+ c_tz = 64
40
+ c_a = 128 # 128
41
+ c_ap = 16 # 16 | 32
42
+
43
+ no_blocks_templ = 2
44
+ no_blocks_evo = 48
45
+ no_blocks_atom = 3
46
+ no_blocks_dit = 24
47
+ no_blocks_heads = 4
48
+ if model_name == "small_toy":
49
+ no_blocks_templ = 1
50
+ no_blocks_evo = 1
51
+ no_blocks_atom = 1
52
+ no_blocks_dit = 1
53
+ no_blocks_heads = 1
54
+ elif model_name == "toy":
55
+ no_blocks_templ = 2
56
+ no_blocks_evo = 2
57
+ no_blocks_atom = 2
58
+ no_blocks_dit = 2
59
+ no_blocks_heads = 2
60
+
61
+ elif model_name == "small":
62
+ no_blocks_templ = 2
63
+ no_blocks_evo = 4
64
+ no_blocks_atom = 2
65
+ no_blocks_dit = 2
66
+ no_blocks_heads = 2
67
+ elif model_name == "docking":
68
+ no_blocks_templ = 2
69
+ no_blocks_evo = 8
70
+ no_blocks_atom = 2
71
+ no_blocks_dit = 4
72
+ no_blocks_heads = 2
73
+ elif model_name == "medium":
74
+ no_blocks_templ = 2
75
+ no_blocks_evo = 16
76
+ no_blocks_atom = 3
77
+ no_blocks_dit = 8
78
+ no_blocks_heads = 2
79
+ elif model_name == "large":
80
+ no_blocks_templ = 2
81
+ no_blocks_evo = 24
82
+ no_blocks_atom = 3
83
+ no_blocks_dit = 12
84
+ no_blocks_heads = 4
85
+ elif model_name == "full":
86
+ no_blocks_templ = 2
87
+ no_blocks_evo = 48
88
+ no_blocks_atom = 3
89
+ no_blocks_dit = 24
90
+ no_blocks_heads = 4
91
+
92
+ return mlc.ConfigDict({
93
+ "use_template": use_template,
94
+ "use_mini_rollout": use_mini_rollout,
95
+ "mini_rollout_steps": mini_rollout_steps,
96
+
97
+ "data": {
98
+ "crop_size": crop_size,
99
+ "atom_crop_factor": 10,
100
+ "max_msa_seqs": 16384,
101
+ "max_uniprot_msa_seqs": 8192,
102
+ "interface_threshold": 15,
103
+ "token_bond_threshold": 2.4,
104
+ "covalent_bond_threshold": 1.8,
105
+ "max_msa_clusters": max_msa_clusters,
106
+ "resample_msa_in_recycling": True,
107
+ "max_recycling_iters": max_recycling_iters, # TODO 3
108
+ "sample_msa": {
109
+ "max_msa_clusters": 128,
110
+ "resample_msa_in_recycling": True,
111
+ },
112
+ "make_crop_ids": {
113
+ "crop_size": 384
114
+ }
115
+ },
116
+ "model": {
117
+ "input_feature_embedder": {
118
+ "msa_dim": msa_dim,
119
+ "ref_dim": ref_dim,
120
+ "c_s": c_s,
121
+ "c_m": c_m,
122
+ "c_z": c_z,
123
+ "c_ap": c_ap,
124
+ "c_a": c_a,
125
+ "no_heads": 4,
126
+ "c_hidden": 16,
127
+ "inf": inf,
128
+ "eps": eps,
129
+ "no_blocks": 3,
130
+ "interaction_aware": interaction_aware,
131
+ "custom_rel_token": custom_rel_token,
132
+ },
133
+ "template_pair_embedder": {
134
+ "templ_dim": templ_dim,
135
+ "c_z": c_z,
136
+ "c_tz": c_tz,
137
+ "c_hidden_tz": 16,
138
+ "no_heads_tz": 4,
139
+ "inf": inf,
140
+ "eps": eps,
141
+ "no_blocks": no_blocks_templ,
142
+ },
143
+ "recycling_embedder": {
144
+ "c_m": c_m,
145
+ "c_z": c_z,
146
+ },
147
+ "evoformer_stack": {
148
+ "c_m": c_m,
149
+ "c_z": c_z,
150
+ "c_hidden_m": 32,
151
+ "no_heads_m": 8,
152
+ "c_hidden_z": 32,
153
+ "no_heads_z": 4,
154
+ "c_hidden_opm": 32,
155
+ "inf": inf,
156
+ "eps": eps,
157
+ "no_blocks": no_blocks_evo,
158
+ "single_mode": False,
159
+ },
160
+ "diffusion_module": {
161
+ "ref_dim": ref_dim,
162
+ "c_m": c_m,
163
+ "c_s": c_s,
164
+ "c_z": c_z,
165
+ "c_a": c_a,
166
+ "c_ap": c_ap,
167
+ "no_heads_atom": 4,
168
+ "c_hidden_atom": 16,
169
+ "no_heads": c_ap,
170
+ "c_hidden": 32,
171
+ "inf": inf,
172
+ "eps": eps,
173
+ "no_blocks": no_blocks_dit,
174
+ "no_blocks_atom": no_blocks_atom,
175
+ "num_augmentation_sample": num_augmentation_sample,
176
+ "custom_rel_token": custom_rel_token,
177
+ "use_flash_attn": use_flash_attn,
178
+ "atom_attention_type": atom_attention_type
179
+ },
180
+ "confidence_module": {
181
+ "c_a": c_a,
182
+ "c_ap": c_ap,
183
+ "c_s": c_s,
184
+ "c_m": c_m,
185
+ "c_z": c_z,
186
+ "no_heads_a": 4,
187
+ "c_hidden_a": 16,
188
+ "c_hidden_m": 32,
189
+ "no_heads_m": 8,
190
+ "c_hidden_z": 32,
191
+ "no_heads_z": 4,
192
+ "c_hidden_opm": 32,
193
+ "inf": inf,
194
+ "eps": eps,
195
+ "no_blocks": no_blocks_heads,
196
+ "no_blocks_atom": no_blocks_atom,
197
+ "c_pae": 64,
198
+ "c_pde": 64,
199
+ "c_plddt": 50,
200
+ "c_distogram": 39,
201
+ },
202
+ "loss": {
203
+ "weighted_mse_loss": {
204
+ "weight": alpha_diffusion,
205
+ "sigma_data": sigma_data,
206
+ "alpha_dna": 5.0,
207
+ "alpha_rna": 5.0,
208
+ "alpha_ligand": 10.0,
209
+ },
210
+ "smooth_lddt_loss": {
211
+ "weight": alpha_diffusion,
212
+ "max_clamp_distance": 15.,
213
+ },
214
+ "clamp_distance_loss": {
215
+ "weight": alpha_diffusion * 0.2,
216
+ "max_clamp_distance": 10,
217
+ },
218
+
219
+ "bond_loss": {
220
+ "weight": alpha_diffusion * alpha_bond,
221
+ "sigma_data": sigma_data,
222
+ },
223
+ "distogram_loss": {
224
+ "weight": alpha_distogram,
225
+ "min_bin": 3.25,
226
+ "max_bin": 50.75,
227
+ "no_bins": 39,
228
+ "eps": 1e-9,
229
+ },
230
+ "plddt_loss": {
231
+ "weight": alpha_confifdence,
232
+ "no_bins": 50,
233
+ },
234
+ "pae_loss": {
235
+ "weight": alpha_confifdence * alpha_pae,
236
+ },
237
+ "pde_loss": {
238
+ "weight": alpha_confifdence,
239
+ "min_bin": 0,
240
+ "max_bin": 32,
241
+ "no_bins": 64,
242
+ },
243
+ }
244
+ }
245
+ })
PhysDock/data/__init__.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from typing import Dict, Union, Any
import torch
import numpy as np
# FIX: `scipy.sparse.coo` is a private submodule that was deprecated in
# SciPy 1.8 and removed in later releases; import from the public package
# namespace instead.
from scipy.sparse import coo_matrix

# TODO: Keep only ref_mask eq 1 for all atomwise features


'''
Notation:
    batch,
    token dimension: i, j, k
    flat atom dimension: l, m | WARNING: We should flatten due to local atom attention mask
    sequence dimension: s(msa) t(time)
    head dimension: h

    ####################
    z_ij: pair repr
    {z_ij}: all pair repr
    x: atom position
    {x_l}: flat atom list, full atomic structure
    exist a mapping: flat atom index -> token index and within token atom index: l -> i, a


    ####################
    a: atom representation
        have the same shape as s
        exist a transform: flat atom representation -> atom representation
    s: token representation
    z: pair representation

'''

# Type aliases for feature dictionaries used across the data pipeline.
FeatureDict = Dict[str, Union[np.ndarray, coo_matrix, None, Any]]
TensorDict = Dict[str, Union[torch.Tensor, Any]]

# PDB chain identifiers: 26 uppercase + 26 lowercase letters + 10 digits.
PDB_CHAIN_IDS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"

# Symbolic placeholders used in SHAPE_SCHIME to mark variable-length axes.
NUM_CONFORMER = "num conformer placeholder"
NUM_TOKEN = "num tokens placeholder"
NUM_ATOM = "num atoms placeholder"

NUM_SEQ = "num MSAs placeholder"
NUM_TEMPL = "num templates placeholder"

NUM_RECYCLING = "num recycling placeholder"
NUM_SAMPLE = "num sample placeholder"

# Shape schema for every feature produced by the data pipeline.
# NOTE(review): name keeps the original's misspelling of "SCHEME" because
# other modules may import it by this name.
SHAPE_SCHIME = {
    ################################################################
    # Conformerwise Feature

    # Tokenwise Feature
    "residue_index": [NUM_TOKEN],
    "restype": [NUM_TOKEN],
    "token_index": [NUM_TOKEN],
    "s_mask": [NUM_TOKEN],
    "is_protein": [NUM_TOKEN],
    "is_rna": [NUM_TOKEN],
    "is_dna": [NUM_TOKEN],
    "is_ligand": [NUM_TOKEN],
    "token_id_to_centre_atom_id": [NUM_TOKEN],
    "token_id_to_pseudo_beta_atom_id": [NUM_TOKEN],
    "token_id_to_chunk_sizes": [NUM_TOKEN],
    "token_id_to_conformer_id": [NUM_TOKEN],
    "asym_id": [NUM_TOKEN],
    "entity_id": [NUM_TOKEN],
    "sym_id": [NUM_TOKEN],
    "token_bonds": [NUM_TOKEN, NUM_TOKEN],
    "target_feat": [NUM_TOKEN],
    "token_exists": [NUM_TOKEN],
    "spatial_crop_target_res_mask": [NUM_TOKEN],

    # Atomwise features
    "ref_space_uid": [NUM_ATOM],
    "atom_index": [NUM_ATOM],
    "ref_feat": [NUM_ATOM, 389],
    "ref_pos": [NUM_ATOM, 3],
    "a_mask": [NUM_ATOM],
    "atom_id_to_token_id": [NUM_ATOM],
    "x_gt": [NUM_ATOM, 3],
    "x_exists": [NUM_ATOM],
    "rec_mask": [NUM_ATOM, NUM_ATOM],

    "msa": [NUM_SEQ, NUM_TOKEN],
    "deletion_matrix": [NUM_SEQ, NUM_TOKEN],
    "msa_feat": [NUM_SEQ, NUM_TOKEN, None],
    "crop_idx": [None],
    "crop_idx_atom": [None],

    #
    "x_centre": [None],
    # # Template features
    # "template_restype": [NUM_TEMPLATES, NUM_TOKEN],
    # "template_pseudo_beta_mask": [NUM_TEMPLATES, NUM_TOKEN],
    # "template_backbone_frame_mask": [NUM_TEMPLATES, NUM_TOKEN],
    # "template_distogram": [NUM_TEMPLATES, NUM_TOKEN, NUM_TOKEN, 39],
    # "template_unit_vector": [NUM_TEMPLATES, NUM_TOKEN, NUM_TOKEN, 3],

    ###########################################################
}

SUPERVISED_FEATURES = [

]

UNSUPERVISED_FEATURES = [

]
PhysDock/data/alignment_runner.py ADDED
@@ -0,0 +1,937 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os.path
3
+ import shutil
4
+ from functools import partial
5
+ import tqdm
6
+ from typing import Optional, Mapping, Any, Union
7
+
8
+ from PhysDock.data.tools import jackhmmer, nhmmer, hhblits, kalign, hmmalign, parsers, hmmbuild, hhsearch, templates
9
+ from PhysDock.utils.io_utils import load_pkl, load_txt, load_json, run_pool_tasks, convert_md5_string, dump_pkl
10
+ from PhysDock.data.tools.parsers import parse_fasta
11
+
12
+ TemplateSearcher = Union[hhsearch.HHSearch]
13
+
14
+
15
class AlignmentRunner:
    """Runs genetic (MSA) searches and PDB template search for one query sequence.

    Tool runners are assembled in ``__init__`` and are only enabled when both
    the tool binary and its database exist on disk; a disabled runner stays
    ``None`` and is silently skipped by :meth:`run`.
    """

    def __init__(
            self,
            # Homology search tool binaries
            jackhmmer_binary_path: Optional[str] = None,
            hhblits_binary_path: Optional[str] = None,
            nhmmer_binary_path: Optional[str] = None,
            hmmbuild_binary_path: Optional[str] = None,
            hmmalign_binary_path: Optional[str] = None,
            kalign_binary_path: Optional[str] = None,

            # Template search tools
            hhsearch_binary_path: Optional[str] = None,
            template_searcher: Optional[TemplateSearcher] = None,
            template_featurizer: Optional[templates.TemplateHitFeaturizer] = None,

            # Databases
            uniref90_database_path: Optional[str] = None,
            uniprot_database_path: Optional[str] = None,
            uniclust30_database_path: Optional[str] = None,
            uniref30_database_path: Optional[str] = None,
            bfd_database_path: Optional[str] = None,
            reduced_bfd_database_path: Optional[str] = None,
            mgnify_database_path: Optional[str] = None,
            rfam_database_path: Optional[str] = None,
            rnacentral_database_path: Optional[str] = None,
            nt_database_path: Optional[str] = None,
            # Parallelism
            no_cpus: int = 8,
            # Limitations
            uniref90_seq_limit: int = 100000,
            uniprot_seq_limit: int = 500000,
            reduced_bfd_seq_limit: int = 50000,
            mgnify_seq_limit: int = 50000,
            uniref90_max_hits: int = 10000,
            uniprot_max_hits: int = 50000,
            reduced_bfd_max_hits: int = 5000,
            mgnify_max_hits: int = 5000,
            rfam_max_hits: int = 10000,
            rnacentral_max_hits: int = 10000,
            nt_max_hits: int = 10000,
    ):
        # Every runner defaults to None and is replaced by a configured
        # ``functools.partial`` below iff its binary + database are available.
        self.uniref90_jackhmmer_runner = None
        self.uniprot_jackhmmer_runner = None
        self.reduced_bfd_jackhmmer_runner = None
        self.mgnify_jackhmmer_runner = None
        self.bfd_uniref30_hhblits_runner = None
        self.bfd_uniclust30_hhblits_runner = None
        self.rfam_nhmmer_runner = None
        self.rnacentral_nhmmer_runner = None
        self.nt_nhmmer_runner = None
        self.rna_realign_runner = None
        self.template_searcher = template_searcher
        self.template_featurizer = template_featurizer

        def _all_exists(*objs, hhblits_mode=False):
            """Returns True iff every path is non-None and exists.

            In ``hhblits_mode`` database paths are *prefixes* (e.g.
            ``.../uniclust30_2018_08``), so only the parent directory can be
            checked for existence.
            """
            if not hhblits_mode:
                for obj in objs:
                    if obj is None or not os.path.exists(obj):
                        return False
            else:
                for obj in objs:
                    if obj is None or not os.path.exists(os.path.split(obj)[0]):
                        return False
            return True

        def _run_msa_tool(
                fasta_path: str,
                msa_out_path: str,
                msa_runner,
                msa_format: str,
                max_sto_sequences: Optional[int] = None,
        ) -> Mapping[str, Any]:
            """Runs an MSA tool and writes its output to ``msa_out_path``."""
            if msa_format == "sto" and max_sto_sequences is not None:
                result = msa_runner.query(fasta_path, max_sto_sequences)[0]
            else:
                result = msa_runner.query(fasta_path)[0]

            # The output file extension must match the requested MSA format.
            assert msa_out_path.split('.')[-1] == msa_format
            with open(msa_out_path, "w") as f:
                f.write(result[msa_format])

            return result

        def _run_rna_realign_tool(
                fasta_path: str,
                msa_in_path: str,
                msa_out_path: str,
                use_precompute=True,
        ):
            """Realigns an RNA ``.sto`` MSA against the query with hmmalign.

            An empty input MSA produces an empty output file. With
            ``use_precompute`` a non-empty realigned file is reused; an empty
            realigned file next to a non-empty input is treated as a failed
            previous run and redone.
            """
            runner = hmmalign.Hmmalign(
                hmmbuild_binary_path=hmmbuild_binary_path,
                hmmalign_binary_path=hmmalign_binary_path,
            )
            if os.path.exists(msa_in_path) and os.path.getsize(msa_in_path) == 0:
                # Nothing to realign: emit an empty output file.
                with open(msa_out_path, "w") as f:
                    pass
                return
            if use_precompute:
                if os.path.exists(msa_in_path) and os.path.exists(msa_out_path):
                    if os.path.getsize(msa_in_path) > 0 and os.path.getsize(msa_out_path) == 0:
                        logging.warning(f"The msa realign file size is zero but the origin file size is over 0! "
                                        f"fasta: {fasta_path} msa_in_file: {msa_in_path}")
                        runner.realign_sto_with_fasta(fasta_path, msa_in_path, msa_out_path)
                else:
                    runner.realign_sto_with_fasta(fasta_path, msa_in_path, msa_out_path)
            else:
                runner.realign_sto_with_fasta(fasta_path, msa_in_path, msa_out_path)

        # uniclust30 and uniref30 are mutually exclusive hhblits companions.
        assert uniclust30_database_path is None or uniref30_database_path is None, "Only one used"

        # Jackhmmer
        if _all_exists(jackhmmer_binary_path, uniref90_database_path):
            self.uniref90_jackhmmer_runner = partial(
                _run_msa_tool,
                msa_runner=jackhmmer.Jackhmmer(
                    binary_path=jackhmmer_binary_path,
                    database_path=uniref90_database_path,
                    seq_limit=uniref90_seq_limit,
                    n_cpu=no_cpus,
                ),
                msa_format="sto",
                max_sto_sequences=uniref90_max_hits
            )

        if _all_exists(jackhmmer_binary_path, uniprot_database_path):
            self.uniprot_jackhmmer_runner = partial(
                _run_msa_tool,
                msa_runner=jackhmmer.Jackhmmer(
                    binary_path=jackhmmer_binary_path,
                    database_path=uniprot_database_path,
                    seq_limit=uniprot_seq_limit,
                    n_cpu=no_cpus,
                ),
                msa_format="sto",
                max_sto_sequences=uniprot_max_hits
            )

        if _all_exists(jackhmmer_binary_path, reduced_bfd_database_path):
            self.reduced_bfd_jackhmmer_runner = partial(
                _run_msa_tool,
                msa_runner=jackhmmer.Jackhmmer(
                    binary_path=jackhmmer_binary_path,
                    database_path=reduced_bfd_database_path,
                    seq_limit=reduced_bfd_seq_limit,
                    n_cpu=no_cpus,
                ),
                msa_format="sto",
                max_sto_sequences=reduced_bfd_max_hits
            )

        if _all_exists(jackhmmer_binary_path, mgnify_database_path):
            self.mgnify_jackhmmer_runner = partial(
                _run_msa_tool,
                msa_runner=jackhmmer.Jackhmmer(
                    binary_path=jackhmmer_binary_path,
                    database_path=mgnify_database_path,
                    seq_limit=mgnify_seq_limit,
                    n_cpu=no_cpus,
                ),
                msa_format="sto",
                max_sto_sequences=mgnify_max_hits
            )

        # HHblits (bfd + uniref30 preferred over bfd + uniclust30)
        if _all_exists(hhblits_binary_path, bfd_database_path, uniref30_database_path, hhblits_mode=True):
            self.bfd_uniref30_hhblits_runner = partial(
                _run_msa_tool,
                msa_runner=hhblits.HHBlits(
                    binary_path=hhblits_binary_path,
                    databases=[bfd_database_path, uniref30_database_path],
                    n_cpu=no_cpus,
                ),
                msa_format="a3m",
            )
        elif _all_exists(hhblits_binary_path, bfd_database_path, uniclust30_database_path, hhblits_mode=True):
            self.bfd_uniclust30_hhblits_runner = partial(
                _run_msa_tool,
                msa_runner=hhblits.HHBlits(
                    binary_path=hhblits_binary_path,
                    databases=[bfd_database_path, uniclust30_database_path],
                    n_cpu=no_cpus,
                ),
                msa_format="a3m",
            )

        # Nhmmer (RNA databases)
        if _all_exists(nhmmer_binary_path, rfam_database_path):
            self.rfam_nhmmer_runner = partial(
                _run_msa_tool,
                msa_runner=nhmmer.Nhmmer(
                    binary_path=nhmmer_binary_path,
                    database_path=rfam_database_path,
                    n_cpu=no_cpus
                ),
                msa_format="sto",
                max_sto_sequences=rfam_max_hits
            )
        if _all_exists(nhmmer_binary_path, rnacentral_database_path):
            self.rnacentral_nhmmer_runner = partial(
                _run_msa_tool,
                msa_runner=nhmmer.Nhmmer(
                    binary_path=nhmmer_binary_path,
                    database_path=rnacentral_database_path,
                    n_cpu=no_cpus
                ),
                msa_format="sto",
                max_sto_sequences=rnacentral_max_hits
            )
        if _all_exists(nhmmer_binary_path, nt_database_path):
            self.nt_nhmmer_runner = partial(
                _run_msa_tool,
                msa_runner=nhmmer.Nhmmer(
                    binary_path=nhmmer_binary_path,
                    database_path=nt_database_path,
                    n_cpu=no_cpus
                ),
                msa_format="sto",
                max_sto_sequences=nt_max_hits
            )

        if _all_exists(hmmbuild_binary_path, hmmalign_binary_path):
            self.rna_realign_runner = _run_rna_realign_tool

    def run(self, input_fasta_path, output_msas_dir, use_precompute=True):
        """Runs every configured search tool for one fasta query.

        Existing tool outputs are reused when ``use_precompute`` is True, and
        a whole stage is skipped when its final ``{md5}.pkl.gz`` feature file
        already exists under the ``features`` directory two levels above
        ``output_msas_dir``.

        Args:
            input_fasta_path: Path to a single-sequence fasta file.
            output_msas_dir: Per-sequence output directory for hits files.
            use_precompute: Reuse on-disk tool outputs when present.
        """
        os.makedirs(output_msas_dir, exist_ok=True)
        templates_out_path = os.path.join(output_msas_dir, "templates")
        uniref90_out_path = os.path.join(output_msas_dir, "uniref90_hits.sto")
        uniprot_out_path = os.path.join(output_msas_dir, "uniprot_hits.sto")
        reduced_bfd_out_path = os.path.join(output_msas_dir, "reduced_bfd_hits.sto")
        mgnify_out_path = os.path.join(output_msas_dir, "mgnify_hits.sto")
        bfd_uniref30_out_path = os.path.join(output_msas_dir, "bfd_uniref30_hits.a3m")
        bfd_uniclust30_out_path = os.path.join(output_msas_dir, "bfd_uniclust30_hits.a3m")

        seqs, decs = parse_fasta(load_txt(input_fasta_path))
        prefix = "protein"
        # Feature caches are keyed by md5("protein:<sequence>") and live two
        # directory levels above the msas dir.
        md5 = convert_md5_string(f"{prefix}:{seqs[0]}")
        output_feature = os.path.dirname(os.path.dirname(output_msas_dir))
        pkl_save_path_msa = os.path.join(output_feature, "msa_features", f"{md5}.pkl.gz")
        pkl_save_path_msa_uni = os.path.join(output_feature, "uniprot_msa_features", f"{md5}.pkl.gz")
        pkl_save_path_temp = os.path.join(output_feature, "template_features", f"{md5}.pkl.gz")

        if self.uniref90_jackhmmer_runner is not None and not os.path.exists(pkl_save_path_temp):
            if not os.path.exists(uniref90_out_path) or not use_precompute or not os.path.exists(templates_out_path):
                if not os.path.exists(uniref90_out_path):
                    print(uniref90_out_path)
                    self.uniref90_jackhmmer_runner(input_fasta_path, uniref90_out_path)

                print("begin templates")
                if templates_out_path is not None:
                    try:
                        os.makedirs(templates_out_path, exist_ok=True)
                        seq, dec = parsers.parse_fasta(load_txt(input_fasta_path))
                        input_sequence = seq[0]
                        # Prepare the uniref90 MSA for the template searcher.
                        msa_for_templates = parsers.truncate_stockholm_msa(
                            uniref90_out_path, max_sequences=10000
                        )
                        msa_for_templates = parsers.deduplicate_stockholm_msa(msa_for_templates)
                        msa_for_templates = parsers.remove_empty_columns_from_stockholm_msa(
                            msa_for_templates
                        )
                        if self.template_searcher.input_format == "sto":
                            pdb_templates_result = self.template_searcher.query(msa_for_templates)
                        elif self.template_searcher.input_format == "a3m":
                            uniref90_msa_as_a3m = parsers.convert_stockholm_to_a3m(msa_for_templates)
                            pdb_templates_result = self.template_searcher.query(uniref90_msa_as_a3m)
                        else:
                            raise ValueError(
                                "Unrecognized template input format: "
                                f"{self.template_searcher.input_format}"
                            )

                        pdb_hits_out_path = os.path.join(
                            templates_out_path, f"pdb_hits.{self.template_searcher.output_format}.pkl.gz"
                        )
                        with open(os.path.join(
                                templates_out_path, f"pdb_hits.{self.template_searcher.output_format}"
                        ), "w") as f:
                            f.write(pdb_templates_result)

                        pdb_template_hits = self.template_searcher.get_template_hits(
                            output_string=pdb_templates_result, input_sequence=input_sequence
                        )
                        templates_result = self.template_featurizer.get_templates(
                            query_sequence=input_sequence, hits=pdb_template_hits
                        )
                        # Bug fix: this dump previously ran *after* the except
                        # clause, raising NameError on the unbound
                        # ``templates_result``/``pdb_hits_out_path`` whenever
                        # template search failed. Keep it inside the try so a
                        # failure is logged and skipped cleanly.
                        dump_pkl(templates_result.features, pdb_hits_out_path, compress=True)
                    except Exception:
                        logging.exception("An error in template searching")

        if self.uniprot_jackhmmer_runner is not None and not os.path.exists(pkl_save_path_msa_uni):
            if not os.path.exists(uniprot_out_path) or not use_precompute:
                self.uniprot_jackhmmer_runner(input_fasta_path, uniprot_out_path)
        if self.reduced_bfd_jackhmmer_runner is not None and not os.path.exists(pkl_save_path_msa):
            if not os.path.exists(reduced_bfd_out_path) or not use_precompute:
                self.reduced_bfd_jackhmmer_runner(input_fasta_path, reduced_bfd_out_path)
        if self.mgnify_jackhmmer_runner is not None and not os.path.exists(pkl_save_path_msa):
            if not os.path.exists(mgnify_out_path) or not use_precompute:
                self.mgnify_jackhmmer_runner(input_fasta_path, mgnify_out_path)
        if self.bfd_uniref30_hhblits_runner is not None and not os.path.exists(pkl_save_path_msa):
            if not os.path.exists(bfd_uniref30_out_path) or not use_precompute:
                self.bfd_uniref30_hhblits_runner(input_fasta_path, bfd_uniref30_out_path)
        if self.bfd_uniclust30_hhblits_runner is not None and not os.path.exists(pkl_save_path_msa):
            if not os.path.exists(bfd_uniclust30_out_path) or not use_precompute:
                self.bfd_uniclust30_hhblits_runner(input_fasta_path, bfd_uniclust30_out_path)
358
+
359
+
360
class DataProcessor:
    """Dispatches :class:`AlignmentRunner` jobs over batches of fasta files.

    Resolves all database paths under ``alphafold3_database_path`` once and
    keeps a ``runner_args_map`` of named tool/database bundles ("uniref90",
    "alphafold3", "rna", ...) used to configure one AlignmentRunner per task.
    """

    def __init__(
            self,
            alphafold3_database_path,
            jackhmmer_binary_path: Optional[str] = None,
            hhblits_binary_path: Optional[str] = None,
            nhmmer_binary_path: Optional[str] = None,
            kalign_binary_path: Optional[str] = None,
            hmmbuild_binary_path: Optional[str] = None,
            hmmalign_binary_path: Optional[str] = None,
            hhsearch_binary_path: Optional[str] = None,
            template_searcher: Optional[TemplateSearcher] = None,
            template_featurizer: Optional[templates.TemplateHitFeaturizer] = None,
            n_cpus: int = 8,
            n_workers: int = 1,
    ):
        """
        Database Versions:
            Training:
                uniref90: v2022_05, uniclust30: v2018_08, uniprot: v2020_05,
                mgnify: v2022_05, rfam: v14.9, rnacentral: v21.0, nt: v2023_02_23
            Inference:
                same as training except uniprot: v2021_04
            Inference Ligand:
                same as training except uniref90: v2020_01, mgnify: v2018_12

        Args:
            alphafold3_database_path: Database dir that contains all alphafold3 databases.
            jackhmmer_binary_path: Path to the jackhmmer executable.
            hhblits_binary_path: Path to the hhblits executable.
            nhmmer_binary_path: Path to the nhmmer executable.
            kalign_binary_path: Path to the kalign executable (currently unused here).
            hmmbuild_binary_path: Path to the hmmbuild executable.
            hmmalign_binary_path: Path to the hmmalign executable.
            hhsearch_binary_path: Path to the hhsearch executable.
            template_searcher: Pre-built template searcher (e.g. HHSearch).
            template_featurizer: Pre-built template hit featurizer.
            n_cpus: CPUs given to each search tool invocation.
            n_workers: Number of parallel alignment tasks.
        """
        self.jackhmmer_binary_path = jackhmmer_binary_path
        self.hhblits_binary_path = hhblits_binary_path
        self.nhmmer_binary_path = nhmmer_binary_path
        self.hmmbuild_binary_path = hmmbuild_binary_path
        self.hmmalign_binary_path = hmmalign_binary_path
        self.hhsearch_binary_path = hhsearch_binary_path

        self.template_searcher = template_searcher
        self.template_featurizer = template_featurizer

        self.n_cpus = n_cpus
        self.n_workers = n_workers

        # Fixed on-disk layout of the alphafold3 database directory.
        self.uniref90_database_path = os.path.join(
            alphafold3_database_path, "uniref90", "uniref90.fasta"
        )
        self.uniprot_database_path = os.path.join(
            alphafold3_database_path, "uniprot", "uniprot.fasta"
        )
        self.bfd_database_path = os.path.join(
            alphafold3_database_path, "bfd", "bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt"
        )
        self.uniclust30_database_path = os.path.join(
            alphafold3_database_path, "uniclust30", "uniclust30_2018_08", "uniclust30_2018_08"
        )
        # TODO: check alphafold2 multimer uniref30 version
        self.uniref_30_database_path = os.path.join(
            alphafold3_database_path, "uniref30", "v2020_06"
        )

        self.mgnify_database_path = os.path.join(
            alphafold3_database_path, "mgnify", "mgnify", "mgy_clusters.fa"
        )
        self.rfam_database_path = os.path.join(
            alphafold3_database_path, "rfam", "v14.9", "Rfam_af3_clustered_rep_seq.fasta"
        )
        self.rnacentral_database_path = os.path.join(
            alphafold3_database_path, "rnacentral", "v21.0", "rnacentral_db_rep_seq.fasta"
        )
        self.nt_database_path = os.path.join(
            alphafold3_database_path, "nt", "v2023_02_23", "nt.fasta"
        )

        # Named bundles of AlignmentRunner keyword arguments; every key must
        # match an AlignmentRunner.__init__ parameter name exactly.
        self.runner_args_map = {
            "uniref90": {
                "jackhmmer_binary_path": self.jackhmmer_binary_path,
                "uniref90_database_path": self.uniref90_database_path,
            },
            "bfd_uniclust30": {
                "hhblits_binary_path": self.hhblits_binary_path,
                "bfd_database_path": self.bfd_database_path,
                "uniclust30_database_path": self.uniclust30_database_path
            },
            "bfd_uniref30": {
                "hhblits_binary_path": self.hhblits_binary_path,
                "bfd_database_path": self.bfd_database_path,
                # Bug fix: was "uniref_30_database_path", which is not an
                # AlignmentRunner parameter and raised TypeError when used.
                "uniref30_database_path": self.uniref_30_database_path
            },
            "mgnify": {
                "jackhmmer_binary_path": self.jackhmmer_binary_path,
                "mgnify_database_path": self.mgnify_database_path,
            },
            "uniprot": {
                "jackhmmer_binary_path": self.jackhmmer_binary_path,
                "uniprot_database_path": self.uniprot_database_path,
            },
            ###################### RNA ########################
            "rfam": {
                "nhmmer_binary_path": self.nhmmer_binary_path,
                "rfam_database_path": self.rfam_database_path,
                "hmmbuild_binary_path": self.hmmbuild_binary_path,
                "hmmalign_binary_path": self.hmmalign_binary_path,
            },
            "rnacentral": {
                "nhmmer_binary_path": self.nhmmer_binary_path,
                "rnacentral_database_path": self.rnacentral_database_path,
                "hmmbuild_binary_path": self.hmmbuild_binary_path,
                "hmmalign_binary_path": self.hmmalign_binary_path,
            },
            "nt": {
                "nhmmer_binary_path": self.nhmmer_binary_path,
                "nt_database_path": self.nt_database_path,
                "hmmbuild_binary_path": self.hmmbuild_binary_path,
                "hmmalign_binary_path": self.hmmalign_binary_path,
            },
            ###################################################
            "alphafold2": {
                "jackhmmer_binary_path": self.jackhmmer_binary_path,
                "hhblits_binary_path": self.hhblits_binary_path,
                "uniref90_database_path": self.uniref90_database_path,
                "bfd_database_path": self.bfd_database_path,
                "uniclust30_database_path": self.uniclust30_database_path,
                "mgnify_database_path": self.mgnify_database_path,
            },
            "alphafold2_multimer": {
                "jackhmmer_binary_path": self.jackhmmer_binary_path,
                "hhblits_binary_path": self.hhblits_binary_path,
                "uniref90_database_path": self.uniref90_database_path,
                "bfd_database_path": self.bfd_database_path,
                # Bug fix: was "uniref_30_database_path" (see "bfd_uniref30").
                "uniref30_database_path": self.uniref_30_database_path,
                "mgnify_database_path": self.mgnify_database_path,
                "uniprot_database_path": self.uniprot_database_path,
            },
            "alphafold3": {
                "jackhmmer_binary_path": self.jackhmmer_binary_path,
                "hhblits_binary_path": self.hhblits_binary_path,
                "template_searcher": self.template_searcher,
                "template_featurizer": self.template_featurizer,
                "uniref90_database_path": self.uniref90_database_path,
                "bfd_database_path": self.bfd_database_path,
                "uniclust30_database_path": self.uniclust30_database_path,
                "mgnify_database_path": self.mgnify_database_path,
                "uniprot_database_path": self.uniprot_database_path,
            },
            "rna": {
                "nhmmer_binary_path": self.nhmmer_binary_path,
                "rfam_database_path": self.rfam_database_path,
                "rnacentral_database_path": self.rnacentral_database_path,
                "hmmbuild_binary_path": self.hmmbuild_binary_path,
                "hmmalign_binary_path": self.hmmalign_binary_path,
            },
        }

    def _parse_io_tuples(self, input_fasta_path, output_dir, convert_md5=True, prefix="protein"):
        """Expands the input into (fasta_path, per-sequence msas dir) pairs.

        ``input_fasta_path`` may be a list of fasta paths, a directory of
        fasta files, or a single fasta file. Output directories are named by
        md5("{prefix}:{sequence}") when ``convert_md5`` is set, otherwise by
        the fasta file stem.

        Raises:
            ValueError: if the input is neither a list, directory, nor file.
        """
        os.makedirs(output_dir, exist_ok=True)
        if isinstance(input_fasta_path, list):
            input_fasta_paths = input_fasta_path
        elif os.path.isdir(input_fasta_path):
            input_fasta_paths = [os.path.join(input_fasta_path, i) for i in os.listdir(input_fasta_path)]
        elif os.path.isfile(input_fasta_path):
            input_fasta_paths = [input_fasta_path]
        else:
            # Bug fix: the original built this Exception without raising it,
            # silently producing an empty task list.
            raise ValueError("Can't parse input fasta path!")
        seqs = [parse_fasta(load_txt(i))[0][0] for i in input_fasta_paths]
        if convert_md5:
            output_msas_dirs = [
                os.path.join(output_dir, convert_md5_string(f"{prefix}:{seq}"))
                for seq in seqs
            ]
        else:
            output_msas_dirs = [
                os.path.join(output_dir, os.path.split(p)[1].split(".")[0])
                for p in input_fasta_paths
            ]
        return [(i, o) for i, o in zip(input_fasta_paths, output_msas_dirs)]

    def _process_iotuple(self, io_tuple, msas_type, use_precompute=True):
        """Runs one alignment task; failures are logged, never propagated."""
        i, o = io_tuple
        alignment_runner = AlignmentRunner(
            **self.runner_args_map[msas_type],
            no_cpus=self.n_cpus
        )
        try:
            alignment_runner.run(i, o, use_precompute=use_precompute)
        except Exception:
            # Narrowed from a bare ``except:`` so KeyboardInterrupt/SystemExit
            # still propagate; include the traceback for debuggability.
            logging.warning(f"{i}:{o} task failed!", exc_info=True)

    def process(self, input_fasta_path, output_dir, msas_type="rfam", convert_md5=True, use_precompute=True):
        """Runs alignment tasks for all inputs in a worker pool."""
        # RNA database bundles key their md5 cache with the "rna" prefix.
        prefix = "rna" if msas_type in ["rfam", "rnacentral", "nt", "rna"] else "protein"
        io_tuples = self._parse_io_tuples(input_fasta_path, output_dir, convert_md5=convert_md5, prefix=prefix)
        run_pool_tasks(partial(self._process_iotuple, msas_type=msas_type, use_precompute=use_precompute), io_tuples,
                       num_workers=self.n_workers,
                       return_dict=False)

    def convert_output_to_md5(self, input_fasta_path, output_dir, md5_output_dir, prefix="protein"):
        """Copies per-sequence output dirs into md5-named dirs under ``md5_output_dir``."""
        io_tuples = self._parse_io_tuples(input_fasta_path, output_dir, convert_md5=False, prefix=prefix)
        io_tuples_md5 = self._parse_io_tuples(input_fasta_path, md5_output_dir, convert_md5=True, prefix=prefix)

        for io0, io1 in tqdm.tqdm(zip(io_tuples, io_tuples_md5)):
            o, o_md5 = io0[1], io1[1]
            # NOTE(review): shell copy breaks on paths with spaces/metacharacters;
            # consider shutil.copytree if the "cp -r into existing dir" semantics
            # are not required here.
            os.system(f"cp -r {os.path.abspath(o)} {os.path.abspath(o_md5)}")
598
+
599
+
600
def run_homo_search(
        out_dir,
        save_dir,
        feature_dir,
        pdb_70_dir,
        template_mmcif_dir,
        max_template_date="2021-09-30",
        obsolete_pdbs_path=None,
        use_precompute=True,
):
    """Runs the full alphafold3-style homology pipeline for every fasta in ``save_dir``.

    Stages (each is best-effort; failures are printed/ignored so later stages
    still run):
      1. MSA + template search into ``{out_dir}/features/msas``.
      2. Conversion of raw MSA outputs to msa / uniprot-msa feature pickles.
      3. Conversion of template hits to template feature pickles.

    Args:
        out_dir: Root output directory; features land under ``{out_dir}/features``.
        save_dir: Directory containing the input fasta files, one per sequence.
        feature_dir: alphafold3 database root passed to DataProcessor.
        pdb_70_dir: pdb70 database prefix for HHSearch.
        template_mmcif_dir: Directory of template mmCIF files.
        max_template_date: Latest allowed template release date.
        obsolete_pdbs_path: Optional obsolete-PDB mapping file.
        use_precompute: Reuse existing on-disk search outputs when present.
    """
    # save_dir = os.path.join(out_dir,"cache")
    # NOTE(review): tool binaries are hard-coded to /usr/bin/* — confirm they
    # exist on the deployment host (no configurability here).
    data_processor = DataProcessor(
        alphafold3_database_path=feature_dir,
        # nhmmer_binary_path="/usr/bin/nhmmer",
        jackhmmer_binary_path="/usr/bin/jackhmmer",
        hhblits_binary_path="/usr/bin/hhblits",
        hhsearch_binary_path="/usr/bin/hhsearch",
        template_searcher=hhsearch.HHSearch(
            binary_path="/usr/bin/hhsearch",
            databases=[pdb_70_dir]
        ),
        template_featurizer=templates.HhsearchHitFeaturizer(
            mmcif_dir=template_mmcif_dir,
            max_template_date=max_template_date,
            max_hits=20,
            kalign_binary_path="/usr/bin/kalign",
            release_dates_path=None,
            obsolete_pdbs_path=obsolete_pdbs_path,
        ),
        n_cpus=32,
        n_workers=12
    )

    output_dir = os.path.join(out_dir, "features/msas")
    # output_dir = "/2022133002/data/stfold-data-v5/features/msas"
    # output_dir = "/2022133002/data/benchmark/stfold/dta/features/msas"
    # output_dir = "/2022133002/data/benchmark/features/msas"

    os.makedirs(output_dir, exist_ok=True)
    files = os.listdir(save_dir)

    files = [os.path.join(save_dir, file) for file in files]
    # files = chunk_lists(files,num_workers=4)[3]

    # Stage 1: raw MSA + template search (best-effort).
    try:
        data_processor.process(
            input_fasta_path=files,
            output_dir=output_dir,
            msas_type="alphafold3",
            convert_md5=True,
            use_precompute=use_precompute
        )
        print(f"save msa to {output_dir}")
    except Exception as e:
        print(e)
        pass

    # Stage 2a: convert raw MSAs to msa feature pickles (best-effort).
    # msa_dir = "/2022133002/data/stfold-data-v5/features/msa_features"
    msa_dir = os.path.join(out_dir, "features/msa_features")
    os.makedirs(msa_dir, exist_ok=True)
    from PhysDock.data.tools.dataset_manager import DatasetManager
    from PhysDock.data.tools.convert_unifold_template_to_stfold import \
        convert_unifold_template_feature_to_stfold_unifold_feature

    try:
        out = DatasetManager.convert_msas_out_to_msa_features(
            input_fasta_path=save_dir,
            output_dir=output_dir,
            msa_feature_dir=msa_dir,
            convert_md5=True,
            num_workers=2
        )
        print(f"save msa feature to {msa_dir}")
    except:
        # NOTE(review): bare except silently swallows all errors here,
        # including KeyboardInterrupt — consider `except Exception`.
        pass

    # Stage 2b: convert uniprot MSAs to pairing feature pickles (best-effort).
    try:
        msa_dir_uni = os.path.join(out_dir, "features/uniprot_msa_features")
        # msa_dir_uni = "/2022133002/data/stfold-data-v5/features/uniprot_msa_features"
        os.makedirs(msa_dir_uni, exist_ok=True)
        out = DatasetManager.convert_msas_out_to_uniprot_msa_features(
            input_fasta_path=save_dir,
            output_dir=output_dir,
            uniprot_msa_feature_dir=msa_dir_uni,
            convert_md5=True,
            num_workers=2
        )
        print(f"save uni msa feature to {msa_dir_uni}")
    except Exception as e:
        print(e)
        pass

    # Stage 3: convert template hits to template feature pickles (best-effort).
    templ_dir_uni = os.path.join(out_dir, "features/template_features")
    # templ_dir_uni = "/2022133002/data/stfold-data-v5/features/template_features"
    os.makedirs(templ_dir_uni, exist_ok=True)
    try:
        files = os.listdir(save_dir)
        # NOTE(review): `files` names come from save_dir but are joined with
        # out_dir here — confirm this is intentional and not a save_dir typo.
        files = [os.path.join(out_dir, file) for file in files[::-1]]
        run_pool_tasks(convert_unifold_template_feature_to_stfold_unifold_feature, files, num_workers=16)
    except:
        pass
701
+
702
+
703
+ class STDockAlignmentRunner():
704
+ def __init__(
705
+ self,
706
+ # Homo Search Tools Path
707
+ jackhmmer_binary_path: Optional[str] = None,
708
+ hhblits_binary_path: Optional[str] = None,
709
+ kalign_binary_path: Optional[str] = None,
710
+ hhsearch_binary_path: Optional[str] = None,
711
+
712
+ # Databases
713
+ uniref90_database_path: Optional[str] = None,
714
+ uniprot_database_path: Optional[str] = None,
715
+ uniclust30_database_path: Optional[str] = None,
716
+ bfd_database_path: Optional[str] = None,
717
+ mgnify_database_path: Optional[str] = None,
718
+ pdb_70_database_path: Optional[str] = None,
719
+ mmcif_files_path: Optional[str] = None,
720
+ obsolete_pdbs_path: Optional[str] = None,
721
+
722
+ # Settings
723
+ max_template_date: str = "2021-09-30",
724
+ max_template_hits: int = 20,
725
+ #
726
+ no_cpus: int = 8,
727
+ # Limitations
728
+ uniref90_seq_limit: int = 100000,
729
+ uniprot_seq_limit: int = 500000,
730
+ mgnify_seq_limit: int = 50000,
731
+ uniref90_max_hits: int = 10000,
732
+ uniprot_max_hits: int = 50000,
733
+ mgnify_max_hits: int = 5000,
734
+ ):
735
+ super().__init__()
736
+ #
737
+ self.jackhmmer_binary_path = jackhmmer_binary_path
738
+ self.hhblits_binary_path = hhblits_binary_path
739
+
740
+ self.uniref90_database_path = uniref90_database_path
741
+ self.mgnify_database_path = mgnify_database_path
742
+ self.uniclust30_database_path = uniclust30_database_path
743
+ self.bfd_database_path = bfd_database_path
744
+ self.uniprot_database_path = uniprot_database_path
745
+
746
+ self.template_searcher = hhsearch.HHSearch(
747
+ binary_path=hhsearch_binary_path,
748
+ databases=[pdb_70_database_path]
749
+ )
750
+ self.template_featurizer = templates.HhsearchHitFeaturizer(
751
+ mmcif_dir=mmcif_files_path,
752
+ max_template_date=max_template_date,
753
+ max_hits=max_template_hits,
754
+ kalign_binary_path=kalign_binary_path,
755
+ obsolete_pdbs_path=obsolete_pdbs_path,
756
+ )
757
+
758
+ def _all_exists(*objs, hhblits_mode=False):
759
+ if not hhblits_mode:
760
+ for obj in objs:
761
+ if obj is None or not os.path.exists(obj):
762
+ return False
763
+ else:
764
+ for obj in objs:
765
+ if obj is None or not os.path.exists(os.path.split(obj)[0]):
766
+ return False
767
+ return True
768
+
769
+ def _run_msa_tool(
770
+ fasta_path: str,
771
+ msa_out_path: str,
772
+ msa_runner,
773
+ msa_format: str,
774
+ max_sto_sequences: Optional[int] = None,
775
+ ) -> Mapping[str, Any]:
776
+ """Runs an MSA tool, checking if output already exists first."""
777
+ if (msa_format == "sto" and max_sto_sequences is not None):
778
+ result = msa_runner.query(fasta_path, max_sto_sequences)[0]
779
+ else:
780
+ result = msa_runner.query(fasta_path)[0]
781
+
782
+ assert msa_out_path.split('.')[-1] == msa_format
783
+ with open(msa_out_path, "w") as f:
784
+ f.write(result[msa_format])
785
+
786
+ return result
787
+
788
+ # Jackhmmer
789
+ if _all_exists(jackhmmer_binary_path, uniref90_database_path):
790
+ self.uniref90_jackhmmer_runner = partial(
791
+ _run_msa_tool,
792
+ msa_runner=jackhmmer.Jackhmmer(
793
+ binary_path=jackhmmer_binary_path,
794
+ database_path=uniref90_database_path,
795
+ seq_limit=uniref90_seq_limit,
796
+ n_cpu=no_cpus,
797
+ ),
798
+ msa_format="sto",
799
+ max_sto_sequences=uniref90_max_hits
800
+ )
801
+
802
+ if _all_exists(jackhmmer_binary_path, uniprot_database_path):
803
+ self.uniprot_jackhmmer_runner = partial(
804
+ _run_msa_tool,
805
+ msa_runner=jackhmmer.Jackhmmer(
806
+ binary_path=jackhmmer_binary_path,
807
+ database_path=uniprot_database_path,
808
+ seq_limit=uniprot_seq_limit,
809
+ n_cpu=no_cpus,
810
+ ),
811
+ msa_format="sto",
812
+ max_sto_sequences=uniprot_max_hits
813
+ )
814
+
815
+ if _all_exists(jackhmmer_binary_path, mgnify_database_path):
816
+ self.mgnify_jackhmmer_runner = partial(
817
+ _run_msa_tool,
818
+ msa_runner=jackhmmer.Jackhmmer(
819
+ binary_path=jackhmmer_binary_path,
820
+ database_path=mgnify_database_path,
821
+ seq_limit=mgnify_seq_limit,
822
+ n_cpu=no_cpus,
823
+ ),
824
+ msa_format="sto",
825
+ max_sto_sequences=mgnify_max_hits
826
+ )
827
+
828
+ # HHblits
829
+ if _all_exists(hhblits_binary_path, bfd_database_path, uniclust30_database_path, hhblits_mode=True):
830
+ self.bfd_uniclust30_hhblits_runner = partial(
831
+ _run_msa_tool,
832
+ msa_runner=hhblits.HHBlits(
833
+ binary_path=hhblits_binary_path,
834
+ databases=[bfd_database_path, uniclust30_database_path],
835
+ n_cpu=no_cpus,
836
+ ),
837
+ msa_format="a3m",
838
+ )
839
+
840
    def run_protein_msas(
        self,
        input_fasta_path,
        output_msas_dir,
        use_precompute=True,
        copy_to_dataset=False,
        dataset_path=None,
    ):
        """Run all configured homology searches for a single protein fasta.

        Raw alignments are written into ``output_msas_dir`` and packed
        feature files are expected under ``output_msas_dir/features/...``.
        A search is skipped when its runner was not configured, when a
        cached raw output exists (and ``use_precompute`` is True), or when
        the corresponding packed feature file is already present.

        Args:
            input_fasta_path: Path to a fasta file; only the first sequence
                is used for the md5 cache key.
            output_msas_dir: Directory receiving raw hits and feature files.
            use_precompute: If True, reuse existing raw alignment outputs.
            copy_to_dataset: NOTE(review): accepted but never read in this
                method — confirm whether it was meant to gate the
                dataset-copy step below.
            dataset_path: Optional dataset root; pre-computed feature files
                found there are copied into ``output_msas_dir`` first, which
                then causes the matching searches to be skipped.
        """
        # Feature sub-directories mirror the dataset layout so files can be
        # copied back and forth 1:1.
        os.makedirs(output_msas_dir, exist_ok=True)
        os.makedirs(os.path.join(output_msas_dir, "features"), exist_ok=True)
        os.makedirs(os.path.join(output_msas_dir, "features", "msa_features"), exist_ok=True)
        os.makedirs(os.path.join(output_msas_dir, "features", "uniprot_msa_features"), exist_ok=True)
        os.makedirs(os.path.join(output_msas_dir, "features", "template_features"), exist_ok=True)

        templates_out_path = os.path.join(output_msas_dir, "templates")
        uniref90_out_path = os.path.join(output_msas_dir, "uniref90_hits.sto")
        uniprot_out_path = os.path.join(output_msas_dir, "uniprot_hits.sto")
        mgnify_out_path = os.path.join(output_msas_dir, "mgnify_hits.sto")
        bfd_uniclust30_out_path = os.path.join(output_msas_dir, f"bfd_uniclust30_hits.a3m")

        # Cache key: md5 over "protein:<first sequence>", so identical
        # sequences share feature files regardless of fasta filename.
        seqs, decs = parse_fasta(load_txt(input_fasta_path))
        prefix = "protein"
        md5 = convert_md5_string(f"{prefix}:{seqs[0]}")

        msa_md5_save_path = os.path.join(output_msas_dir, "features", "msa_features", f"{md5}.pkl.gz")
        uniprot_msa_md5_save_path = os.path.join(output_msas_dir, "features", "uniprot_msa_features", f"{md5}.pkl.gz")
        template_md5_save_path = os.path.join(output_msas_dir, "features", "template_features", f"{md5}.pkl.gz")

        # If a dataset already holds features for this sequence, copy them in
        # so the expensive searches below are skipped.
        if dataset_path is not None:
            dataset_msa_md5_save_path = os.path.join(
                dataset_path, "features", "msa_features", f"{md5}.pkl.gz")
            dataset_uniprot_msa_md5_save_path = os.path.join(
                dataset_path, "features", "uniprot_msa_features", f"{md5}.pkl.gz")
            dataset_template_md5_save_path = os.path.join(
                dataset_path, "features", "template_features", f"{md5}.pkl.gz")

            if os.path.exists(dataset_msa_md5_save_path):
                shutil.copyfile(dataset_msa_md5_save_path, msa_md5_save_path)
            if os.path.exists(dataset_uniprot_msa_md5_save_path):
                shutil.copyfile(dataset_uniprot_msa_md5_save_path, uniprot_msa_md5_save_path)
            if os.path.exists(dataset_template_md5_save_path):
                shutil.copyfile(dataset_template_md5_save_path, template_md5_save_path)

        # Uniref90 search + template search. The uniref90 MSA both feeds the
        # template searcher and serves as a cached raw alignment.
        if self.uniref90_jackhmmer_runner is not None and not os.path.exists(template_md5_save_path):
            if not os.path.exists(uniref90_out_path) or not use_precompute or not os.path.exists(templates_out_path):
                if not os.path.exists(uniref90_out_path):
                    self.uniref90_jackhmmer_runner(input_fasta_path, uniref90_out_path)
                if templates_out_path is not None:
                    # Template search is best-effort: any failure is logged
                    # and the remaining MSA searches still run.
                    try:
                        os.makedirs(templates_out_path, exist_ok=True)
                        seq, dec = parsers.parse_fasta(load_txt(input_fasta_path))
                        input_sequence = seq[0]
                        # msa_for_templates = jackhmmer_uniref90_result["sto"]
                        # Trim/dedup the Stockholm MSA before handing it to
                        # the template searcher.
                        msa_for_templates = parsers.truncate_stockholm_msa(
                            uniref90_out_path, max_sequences=10000
                        )
                        msa_for_templates = parsers.deduplicate_stockholm_msa(msa_for_templates)
                        msa_for_templates = parsers.remove_empty_columns_from_stockholm_msa(
                            msa_for_templates
                        )
                        # The searcher declares which MSA format it accepts.
                        if self.template_searcher.input_format == "sto":
                            pdb_templates_result = self.template_searcher.query(msa_for_templates)
                        elif self.template_searcher.input_format == "a3m":
                            uniref90_msa_as_a3m = parsers.convert_stockholm_to_a3m(msa_for_templates)
                            pdb_templates_result = self.template_searcher.query(uniref90_msa_as_a3m)
                        else:
                            raise ValueError(
                                "Unrecognized template input format: "
                                f"{self.template_searcher.input_format}"
                            )

                        # NOTE(review): the featurized templates (not the raw
                        # hits) are dumped to this "pdb_hits.*.pkl.gz" path;
                        # the raw searcher output goes to the plain-text file
                        # written just below — confirm the naming is intended.
                        pdb_hits_out_path = os.path.join(
                            templates_out_path, f"pdb_hits.{self.template_searcher.output_format}.pkl.gz"
                        )
                        with open(os.path.join(
                            templates_out_path, f"pdb_hits.{self.template_searcher.output_format}"
                        ), "w") as f:
                            f.write(pdb_templates_result)

                        pdb_template_hits = self.template_searcher.get_template_hits(
                            output_string=pdb_templates_result, input_sequence=input_sequence
                        )
                        templates_result = self.template_featurizer.get_templates(
                            query_sequence=input_sequence, hits=pdb_template_hits
                        )
                        dump_pkl(templates_result.features, pdb_hits_out_path, compress=True)
                    except Exception as e:
                        logging.exception("An error in template searching")

        # Remaining MSA searches, each skipped when its packed feature file
        # already exists or a cached raw output may be reused.
        if self.uniprot_jackhmmer_runner is not None and not os.path.exists(uniprot_msa_md5_save_path):
            if not os.path.exists(uniprot_out_path) or not use_precompute:
                self.uniprot_jackhmmer_runner(input_fasta_path, uniprot_out_path)
        if self.mgnify_jackhmmer_runner is not None and not os.path.exists(msa_md5_save_path):
            if not os.path.exists(mgnify_out_path) or not use_precompute:
                self.mgnify_jackhmmer_runner(input_fasta_path, mgnify_out_path)
        if self.bfd_uniclust30_hhblits_runner is not None and not os.path.exists(msa_md5_save_path):
            if not os.path.exists(bfd_uniclust30_out_path) or not use_precompute:
                self.bfd_uniclust30_hhblits_runner(input_fasta_path, bfd_uniclust30_out_path)
PhysDock/data/alignment_runner_v2.py ADDED
@@ -0,0 +1,327 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os.path
3
+ import shutil
4
+ from functools import partial
5
+ import tqdm
6
+ from typing import Optional, Mapping, Any, Union
7
+
8
+ from PhysDock.data.tools import jackhmmer, nhmmer, hhblits, kalign, hmmalign, parsers, hmmbuild, hhsearch, templates
9
+ from PhysDock.utils.io_utils import load_pkl, load_txt, load_json, run_pool_tasks, convert_md5_string, dump_pkl
10
+ from PhysDock.data.tools.parsers import parse_fasta
11
+ from PhysDock.data.tools.dataset_manager import DatasetManager
12
+
13
+ TemplateSearcher = Union[hhsearch.HHSearch]
14
+
15
+
16
class AlignmentRunner:
    """Configures and runs the jackhmmer/HHblits homology searches.

    Each search tool is wired up only when its binary and database paths
    exist on disk; otherwise the corresponding ``*_runner`` attribute stays
    ``None`` and that search is silently skipped at :meth:`run` time.
    """

    def __init__(
        self,
        # Databases
        uniref90_database_path: Optional[str] = None,
        uniprot_database_path: Optional[str] = None,
        uniclust30_database_path: Optional[str] = None,
        bfd_database_path: Optional[str] = None,
        mgnify_database_path: Optional[str] = None,

        # Homo Search Tools
        jackhmmer_binary_path: str = "/usr/bin/jackhmmer",
        hhblits_binary_path: str = "/usr/bin/hhblits",

        # Params
        no_cpus: int = 8,

        # Thresholds
        uniref90_seq_limit: int = 100000,
        uniprot_seq_limit: int = 500000,
        mgnify_seq_limit: int = 50000,
        uniref90_max_hits: int = 10000,
        uniprot_max_hits: int = 50000,
        mgnify_max_hits: int = 5000,
    ):
        # Runners default to None and are only built when the matching
        # binary + database files are found below.
        self.uniref90_jackhmmer_runner = None
        self.uniprot_jackhmmer_runner = None
        self.mgnify_jackhmmer_runner = None
        # NOTE(review): never assigned anywhere else in this class; kept for
        # attribute compatibility with code that may introspect it.
        self.bfd_uniref30_hhblits_runner = None
        self.bfd_uniclust30_hhblits_runner = None

        def _all_exists(*objs, hhblits_mode=False):
            """True iff every path is non-None and present on disk.

            HHblits databases are path *prefixes* (e.g.
            ``.../uniclust30_2018_08``), so in ``hhblits_mode`` only the
            containing directory is checked.
            """
            if not hhblits_mode:
                for obj in objs:
                    if obj is None or not os.path.exists(obj):
                        return False
            else:
                for obj in objs:
                    if obj is None or not os.path.exists(os.path.split(obj)[0]):
                        return False
            return True

        def _run_msa_tool(
            fasta_path: str,
            msa_out_path: str,
            msa_runner,
            msa_format: str,
            max_sto_sequences: Optional[int] = None,
        ) -> Mapping[str, Any]:
            """Run an MSA tool and write its output to ``msa_out_path``."""
            if msa_format == "sto" and max_sto_sequences is not None:
                result = msa_runner.query(fasta_path, max_sto_sequences)[0]
            else:
                result = msa_runner.query(fasta_path)[0]

            # Fixed: was a bare ``assert``, which is stripped under
            # ``python -O``; validate the output extension explicitly.
            if msa_out_path.split('.')[-1] != msa_format:
                raise ValueError(
                    f"MSA output path {msa_out_path!r} does not end with "
                    f"expected format {msa_format!r}"
                )
            with open(msa_out_path, "w") as f:
                f.write(result[msa_format])

            return result

        # Jackhmmer
        if _all_exists(jackhmmer_binary_path, uniref90_database_path):
            self.uniref90_jackhmmer_runner = partial(
                _run_msa_tool,
                msa_runner=jackhmmer.Jackhmmer(
                    binary_path=jackhmmer_binary_path,
                    database_path=uniref90_database_path,
                    seq_limit=uniref90_seq_limit,
                    n_cpu=no_cpus,
                ),
                msa_format="sto",
                max_sto_sequences=uniref90_max_hits
            )

        if _all_exists(jackhmmer_binary_path, uniprot_database_path):
            self.uniprot_jackhmmer_runner = partial(
                _run_msa_tool,
                msa_runner=jackhmmer.Jackhmmer(
                    binary_path=jackhmmer_binary_path,
                    database_path=uniprot_database_path,
                    seq_limit=uniprot_seq_limit,
                    n_cpu=no_cpus,
                ),
                msa_format="sto",
                max_sto_sequences=uniprot_max_hits
            )

        if _all_exists(jackhmmer_binary_path, mgnify_database_path):
            self.mgnify_jackhmmer_runner = partial(
                _run_msa_tool,
                msa_runner=jackhmmer.Jackhmmer(
                    binary_path=jackhmmer_binary_path,
                    database_path=mgnify_database_path,
                    seq_limit=mgnify_seq_limit,
                    n_cpu=no_cpus,
                ),
                msa_format="sto",
                max_sto_sequences=mgnify_max_hits
            )

        # HHblits
        if _all_exists(hhblits_binary_path, bfd_database_path, uniclust30_database_path, hhblits_mode=True):
            self.bfd_uniclust30_hhblits_runner = partial(
                _run_msa_tool,
                msa_runner=hhblits.HHBlits(
                    binary_path=hhblits_binary_path,
                    databases=[bfd_database_path, uniclust30_database_path],
                    n_cpu=no_cpus,
                ),
                msa_format="a3m",
            )

    def run(self, input_fasta_path, output_msas_dir, use_precompute=True):
        """Run every configured search for one fasta file.

        A search is skipped when its runner is ``None``, when the packed
        feature file for the sequence's md5 already exists two directories
        above ``output_msas_dir``, or when the raw alignment output exists
        and ``use_precompute`` is True.
        """
        os.makedirs(output_msas_dir, exist_ok=True)
        uniref90_out_path = os.path.join(output_msas_dir, "uniref90_hits.sto")
        uniprot_out_path = os.path.join(output_msas_dir, "uniprot_hits.sto")
        mgnify_out_path = os.path.join(output_msas_dir, "mgnify_hits.sto")
        bfd_uniclust30_out_path = os.path.join(output_msas_dir, f"bfd_uniclust30_hits.a3m")

        # Cache key: md5 over "protein:<first sequence>".
        seqs, decs = parse_fasta(load_txt(input_fasta_path))
        prefix = "protein"
        md5 = convert_md5_string(f"{prefix}:{seqs[0]}")
        # Feature files live two levels above the per-sequence msas dir.
        output_feature = os.path.dirname(output_msas_dir)
        output_feature = os.path.dirname(output_feature)

        pkl_save_path_msa = os.path.join(output_feature, "msa_features", f"{md5}.pkl.gz")
        pkl_save_path_msa_uni = os.path.join(output_feature, "uniprot_msa_features", f"{md5}.pkl.gz")

        if self.uniref90_jackhmmer_runner is not None and not os.path.exists(pkl_save_path_msa):
            if not os.path.exists(uniref90_out_path) or not use_precompute:
                self.uniref90_jackhmmer_runner(input_fasta_path, uniref90_out_path)

        if self.uniprot_jackhmmer_runner is not None and not os.path.exists(pkl_save_path_msa_uni):
            if not os.path.exists(uniprot_out_path) or not use_precompute:
                self.uniprot_jackhmmer_runner(input_fasta_path, uniprot_out_path)
        if self.mgnify_jackhmmer_runner is not None and not os.path.exists(pkl_save_path_msa):
            if not os.path.exists(mgnify_out_path) or not use_precompute:
                self.mgnify_jackhmmer_runner(input_fasta_path, mgnify_out_path)
        if self.bfd_uniclust30_hhblits_runner is not None and not os.path.exists(pkl_save_path_msa):
            if not os.path.exists(bfd_uniclust30_out_path) or not use_precompute:
                self.bfd_uniclust30_hhblits_runner(input_fasta_path, bfd_uniclust30_out_path)
158
+
159
+
160
class DataProcessor:
    """Batch driver that runs :class:`AlignmentRunner` over many fasta files."""

    def __init__(
        self,
        bfd_database_path,
        uniclust30_database_path,
        uniref90_database_path,
        mgnify_database_path,
        uniprot_database_path,
        jackhmmer_binary_path: Optional[str] = None,
        hhblits_binary_path: Optional[str] = None,

        n_cpus: int = 8,
        n_workers: int = 1,
    ):
        """Store tool/database paths; nothing is validated until processing."""
        self.jackhmmer_binary_path = jackhmmer_binary_path
        self.hhblits_binary_path = hhblits_binary_path

        # n_cpus is passed to each search tool; n_workers is the number of
        # parallel alignment tasks.
        self.n_cpus = n_cpus
        self.n_workers = n_workers

        self.uniref90_database_path = uniref90_database_path
        self.uniprot_database_path = uniprot_database_path
        self.bfd_database_path = bfd_database_path
        self.uniclust30_database_path = uniclust30_database_path
        self.mgnify_database_path = mgnify_database_path

    def _parse_io_tuples(self, input_fasta_path, output_dir, convert_md5=True, prefix="protein"):
        """Return ``(input_fasta, output_msas_dir)`` pairs for every input.

        ``input_fasta_path`` may be a list of fasta files, a directory of
        fasta files, or a single fasta file.

        Raises:
            ValueError: if ``input_fasta_path`` is none of the above.
        """
        os.makedirs(output_dir, exist_ok=True)
        if isinstance(input_fasta_path, list):
            input_fasta_paths = input_fasta_path
        elif os.path.isdir(input_fasta_path):
            input_fasta_paths = [os.path.join(input_fasta_path, i) for i in os.listdir(input_fasta_path)]
        elif os.path.isfile(input_fasta_path):
            input_fasta_paths = [input_fasta_path]
        else:
            # Bug fix: the original constructed this exception without
            # raising it, silently returning an empty task list instead.
            raise ValueError("Can't parse input fasta path!")
        seqs = [parse_fasta(load_txt(i))[0][0] for i in input_fasta_paths]
        if convert_md5:
            # Output folders keyed by md5("protein:<sequence>") so identical
            # sequences share one result directory.
            output_msas_dirs = [
                os.path.join(output_dir, convert_md5_string(f"{prefix}:{i}")) for i in seqs
            ]
        else:
            # Otherwise key by the fasta filename stem.
            output_msas_dirs = [
                os.path.join(output_dir, os.path.split(i)[1].split(".")[0]) for i in input_fasta_paths
            ]
        io_tuples = [(i, o) for i, o in zip(input_fasta_paths, output_msas_dirs)]
        return io_tuples

    def _process_iotuple(self, io_tuple, use_precompute=True):
        """Run one (fasta, out_dir) alignment task; failures are logged, not raised."""
        i, o = io_tuple
        kwargs = {
            "jackhmmer_binary_path": self.jackhmmer_binary_path,
            "hhblits_binary_path": self.hhblits_binary_path,
            "uniref90_database_path": self.uniref90_database_path,
            "bfd_database_path": self.bfd_database_path,
            "uniclust30_database_path": self.uniclust30_database_path,
            "mgnify_database_path": self.mgnify_database_path,
            "uniprot_database_path": self.uniprot_database_path,
        }
        alignment_runner = AlignmentRunner(
            **kwargs,
            no_cpus=self.n_cpus
        )
        try:
            alignment_runner.run(i, o, use_precompute=use_precompute)
        except Exception:
            # Narrowed from a bare ``except:`` so KeyboardInterrupt/SystemExit
            # still propagate; logging.exception records the traceback.
            logging.exception(f"{i}:{o} task failed!")

    def process(self, input_fasta_path, output_dir, convert_md5=True, use_precompute=True):
        """Align every input fasta, fanning tasks out over ``n_workers``."""
        prefix = "protein"
        io_tuples = self._parse_io_tuples(input_fasta_path, output_dir, convert_md5=convert_md5, prefix=prefix)
        run_pool_tasks(partial(self._process_iotuple, use_precompute=use_precompute), io_tuples,
                       num_workers=self.n_workers,
                       return_dict=False)

    def convert_output_to_md5(self, input_fasta_path, output_dir, md5_output_dir, prefix="protein"):
        """Copy filename-keyed result directories to md5-keyed ones."""
        io_tuples = self._parse_io_tuples(input_fasta_path, output_dir, convert_md5=False, prefix=prefix)
        io_tuples_md5 = self._parse_io_tuples(input_fasta_path, md5_output_dir, convert_md5=True, prefix=prefix)

        for io0, io1 in tqdm.tqdm(zip(io_tuples, io_tuples_md5)):
            o, o_md5 = io0[1], io1[1]
            # NOTE(review): shell copy with unquoted interpolated paths —
            # breaks on paths containing spaces/shell metacharacters;
            # consider shutil.copytree.
            os.system(f"cp -r {os.path.abspath(o)} {os.path.abspath(o_md5)}")
261
+
262
+
263
def run_homo_search(
    bfd_database_path,
    uniclust30_database_path,
    uniref90_database_path,
    mgnify_database_path,
    uniprot_database_path,
    jackhmmer_binary_path,
    hhblits_binary_path,

    input_fasta_path,
    out_dir,

    n_cpus=16,
    n_workers=1,
):
    """Run the full homology-search pipeline for one or more fasta files.

    Raw alignments are written to ``<out_dir>/msas`` (one md5-keyed
    sub-directory per sequence), then packed into feature files under
    ``<out_dir>/msa_features`` and ``<out_dir>/uniprot_msa_features``.

    Args:
        *_database_path: Search database locations handed to DataProcessor.
        jackhmmer_binary_path / hhblits_binary_path: Search tool binaries.
        input_fasta_path: A single fasta file or a directory of fasta files.
        out_dir: Root output directory.
        n_cpus: CPUs per search tool invocation.
        n_workers: Number of parallel alignment tasks.
    """
    data_processor = DataProcessor(
        bfd_database_path,
        uniclust30_database_path,
        uniref90_database_path,
        mgnify_database_path,
        uniprot_database_path,
        jackhmmer_binary_path=jackhmmer_binary_path,
        hhblits_binary_path=hhblits_binary_path,
        n_cpus=n_cpus,
        n_workers=n_workers
    )

    output_dir = os.path.join(out_dir, "msas")
    os.makedirs(output_dir, exist_ok=True)
    if os.path.isfile(input_fasta_path):
        files = [input_fasta_path]
    else:
        # Directory input: entries are processed in reverse listing order
        # (kept from the original implementation).
        files = [os.path.join(input_fasta_path, file) for file in os.listdir(input_fasta_path)[::-1]]

    data_processor.process(
        input_fasta_path=files,
        output_dir=output_dir,
        convert_md5=True
    )
    print(f"save msa to {output_dir}")

    msa_dir = os.path.join(out_dir, "msa_features")
    os.makedirs(msa_dir, exist_ok=True)

    # Fixed: return values were previously bound to an unused ``out`` local;
    # the calls are made for their file-writing side effects.
    DatasetManager.convert_msas_out_to_msa_features(
        input_fasta_path=input_fasta_path,
        output_dir=output_dir,
        msa_feature_dir=msa_dir,
        convert_md5=True,
        num_workers=2
    )
    print(f"save msa feature to {msa_dir}")

    msa_dir_uni = os.path.join(out_dir, "uniprot_msa_features")
    os.makedirs(msa_dir_uni, exist_ok=True)
    DatasetManager.convert_msas_out_to_uniprot_msa_features(
        input_fasta_path=input_fasta_path,
        output_dir=output_dir,
        uniprot_msa_feature_dir=msa_dir_uni,
        convert_md5=True,
        num_workers=2
    )
    print(f"save uni msa feature to {msa_dir_uni}")
PhysDock/data/constants/PDBData.py ADDED
@@ -0,0 +1,348 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Copyright 2000 Andrew Dalke. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Information about the IUPAC alphabets."""

# One-letter codes of the 20 standard amino acids.
protein_letters = "ACDEFGHIKLMNPQRSTVWY"
# Standard letters plus the ambiguity/rare codes documented below.
extended_protein_letters = "ACDEFGHIKLMNPQRSTVWYBXZJUO"

# B = "Asx"; aspartic acid or asparagine (D or N)
# X = "Xxx"; unknown or 'other' amino acid
# Z = "Glx"; glutamic acid or glutamine (E or Q)
# http://www.chem.qmul.ac.uk/iupac/AminoAcid/A2021.html#AA212
#
# J = "Xle"; leucine or isoleucine (L or I, used in NMR)
# Mentioned in http://www.chem.qmul.ac.uk/iubmb/newsletter/1999/item3.html
# Also the International Nucleotide Sequence Database Collaboration (INSDC)
# (i.e. GenBank, EMBL, DDBJ) adopted this in 2006
# http://www.ddbj.nig.ac.jp/insdc/icm2006-e.html
#
# Xle (J); Leucine or Isoleucine
# The residue abbreviations, Xle (the three-letter abbreviation) and J
# (the one-letter abbreviation) are reserved for the case that cannot
# experimentally distinguish leucine from isoleucine.
#
# U = "Sec"; selenocysteine
# http://www.chem.qmul.ac.uk/iubmb/newsletter/1999/item3.html
#
# O = "Pyl"; pyrrolysine
# http://www.chem.qmul.ac.uk/iubmb/newsletter/2009.html#item35

# One-letter -> mixed-case three-letter code; normalized to upper case
# immediately below.
protein_letters_1to3 = {
    "A": "Ala",
    "C": "Cys",
    "D": "Asp",
    "E": "Glu",
    "F": "Phe",
    "G": "Gly",
    "H": "His",
    "I": "Ile",
    "K": "Lys",
    "L": "Leu",
    "M": "Met",
    "N": "Asn",
    "P": "Pro",
    "Q": "Gln",
    "R": "Arg",
    "S": "Ser",
    "T": "Thr",
    "V": "Val",
    "W": "Trp",
    "Y": "Tyr",
}
# Normalize to upper-case three-letter codes (e.g. "Ala" -> "ALA").
protein_letters_1to3 = {k.upper(): v.upper() for k, v in protein_letters_1to3.items()}
# Inverse mapping: upper-case three-letter code -> one-letter code.
protein_letters_3to1 = {v: k for k, v in protein_letters_1to3.items()}
58
+
59
# Extended map from three-letter PDB residue codes — including modified and
# non-standard amino acids — to the one-letter code of the closest standard
# parent residue. Note "MA " is a space-padded three-character key.
protein_letters_3to1_extended = {
    "A5N": "N", "A8E": "V", "A9D": "S", "AA3": "A", "AA4": "A", "AAR": "R",
    "ABA": "A", "ACL": "R", "AEA": "C", "AEI": "D", "AFA": "N", "AGM": "R",
    "AGQ": "Y", "AGT": "C", "AHB": "N", "AHL": "R", "AHO": "A", "AHP": "A",
    "AIB": "A", "AKL": "D", "AKZ": "D", "ALA": "A", "ALC": "A", "ALM": "A",
    "ALN": "A", "ALO": "T", "ALS": "A", "ALT": "A", "ALV": "A", "ALY": "K",
    "AME": "M", "AN6": "L", "AN8": "A", "API": "K", "APK": "K", "AR2": "R",
    "AR4": "E", "AR7": "R", "ARG": "R", "ARM": "R", "ARO": "R", "AS7": "N",
    "ASA": "D", "ASB": "D", "ASI": "D", "ASK": "D", "ASL": "D", "ASN": "N",
    "ASP": "D", "ASQ": "D", "AYA": "A", "AZH": "A", "AZK": "K", "AZS": "S",
    "AZY": "Y", "AVJ": "H", "A30": "Y", "A3U": "F", "ECC": "Q", "ECX": "C",
    "EFC": "C", "EHP": "F", "ELY": "K", "EME": "E", "EPM": "M", "EPQ": "Q",
    "ESB": "Y", "ESC": "M", "EXY": "L", "EXA": "K", "E0Y": "P", "E9V": "H",
    "E9M": "W", "EJA": "C", "EUP": "T", "EZY": "G", "E9C": "Y", "EW6": "S",
    "EXL": "W", "I2M": "I", "I4G": "G", "I58": "K", "IAM": "A", "IAR": "R",
    "ICY": "C", "IEL": "K", "IGL": "G", "IIL": "I", "ILE": "I", "ILG": "E",
    "ILM": "I", "ILX": "I", "ILY": "K", "IML": "I", "IOR": "R", "IPG": "G",
    "IT1": "K", "IYR": "Y", "IZO": "M", "IC0": "G", "M0H": "C", "M2L": "K",
    "M2S": "M", "M30": "G", "M3L": "K", "M3R": "K", "MA ": "A", "MAA": "A",
    "MAI": "R", "MBQ": "Y", "MC1": "S", "MCL": "K", "MCS": "C", "MD3": "C",
    "MD5": "C", "MD6": "G", "MDF": "Y", "ME0": "M", "MEA": "F", "MEG": "E",
    "MEN": "N", "MEQ": "Q", "MET": "M", "MEU": "G", "MFN": "E", "MGG": "R",
    "MGN": "Q", "MGY": "G", "MH1": "H", "MH6": "S", "MHL": "L", "MHO": "M",
    "MHS": "H", "MHU": "F", "MIR": "S", "MIS": "S", "MK8": "L", "ML3": "K",
    "MLE": "L", "MLL": "L", "MLY": "K", "MLZ": "K", "MME": "M", "MMO": "R",
    "MNL": "L", "MNV": "V", "MP8": "P", "MPQ": "G", "MSA": "G", "MSE": "M",
    "MSL": "M", "MSO": "M", "MT2": "M", "MTY": "Y", "MVA": "V", "MYK": "K",
    "MYN": "R", "QCS": "C", "QIL": "I", "QMM": "Q", "QPA": "C", "QPH": "F",
    "Q3P": "K", "QVA": "C", "QX7": "A", "Q2E": "W", "Q75": "M", "Q78": "F",
    "QM8": "L", "QMB": "A", "QNQ": "C", "QNT": "C", "QNW": "C", "QO2": "C",
    "QO5": "C", "QO8": "C", "QQ8": "Q", "U2X": "Y", "U3X": "F", "UF0": "S",
    "UGY": "G", "UM1": "A", "UM2": "A", "UMA": "A", "UQK": "A", "UX8": "W",
    "UXQ": "F", "YCM": "C", "YOF": "Y", "YPR": "P", "YPZ": "Y", "YTH": "T",
    "Y1V": "L", "Y57": "K", "YHA": "K", "200": "F", "23F": "F", "23P": "A",
    "26B": "T", "28X": "T", "2AG": "A", "2CO": "C", "2FM": "M", "2GX": "F",
    "2HF": "H", "2JG": "S", "2KK": "K", "2KP": "K", "2LT": "Y", "2LU": "L",
    "2ML": "L", "2MR": "R", "2MT": "P", "2OR": "R", "2P0": "P", "2QZ": "T",
    "2R3": "Y", "2RA": "A", "2RX": "S", "2SO": "H", "2TY": "Y", "2VA": "V",
    "2XA": "C", "2ZC": "S", "6CL": "K", "6CW": "W", "6GL": "A", "6HN": "K",
    "60F": "C", "66D": "I", "6CV": "A", "6M6": "C", "6V1": "C", "6WK": "C",
    "6Y9": "P", "6DN": "K", "DA2": "R", "DAB": "A", "DAH": "F", "DBS": "S",
    "DBU": "T", "DBY": "Y", "DBZ": "A", "DC2": "C", "DDE": "H", "DDZ": "A",
    "DI7": "Y", "DHA": "S", "DHN": "V", "DIR": "R", "DLS": "K", "DM0": "K",
    "DMH": "N", "DMK": "D", "DNL": "K", "DNP": "A", "DNS": "K", "DNW": "A",
    "DOH": "D", "DON": "L", "DP1": "R", "DPL": "P", "DPP": "A", "DPQ": "Y",
    "DYS": "C", "D2T": "D", "DYA": "D", "DJD": "F", "DYJ": "P", "DV9": "E",
    "H14": "F", "H1D": "M", "H5M": "P", "HAC": "A", "HAR": "R", "HBN": "H",
    "HCM": "C", "HGY": "G", "HHI": "H", "HIA": "H", "HIC": "H", "HIP": "H",
    "HIQ": "H", "HIS": "H", "HL2": "L", "HLU": "L", "HMR": "R", "HNC": "C",
    "HOX": "F", "HPC": "F", "HPE": "F", "HPH": "F", "HPQ": "F", "HQA": "A",
    "HR7": "R", "HRG": "R", "HRP": "W", "HS8": "H", "HS9": "H", "HSE": "S",
    "HSK": "H", "HSL": "S", "HSO": "H", "HT7": "W", "HTI": "C", "HTR": "W",
    "HV5": "A", "HVA": "V", "HY3": "P", "HYI": "M", "HYP": "P", "HZP": "P",
    "HIX": "A", "HSV": "H", "HLY": "K", "HOO": "H", "H7V": "A", "L5P": "K",
    "LRK": "K", "L3O": "L", "LA2": "K", "LAA": "D", "LAL": "A", "LBY": "K",
    "LCK": "K", "LCX": "K", "LDH": "K", "LE1": "V", "LED": "L", "LEF": "L",
    "LEH": "L", "LEM": "L", "LEN": "L", "LET": "K", "LEU": "L", "LEX": "L",
    "LGY": "K", "LLO": "K", "LLP": "K", "LLY": "K", "LLZ": "K", "LME": "E",
    "LMF": "K", "LMQ": "Q", "LNE": "L", "LNM": "L", "LP6": "K", "LPD": "P",
    "LPG": "G", "LPS": "S", "LSO": "K", "LTR": "W", "LVG": "G", "LVN": "V",
    "LWY": "P", "LYF": "K", "LYK": "K", "LYM": "K", "LYN": "K", "LYO": "K",
    "LYP": "K", "LYR": "K", "LYS": "K", "LYU": "K", "LYX": "K", "LYZ": "K",
    "LAY": "L", "LWI": "F", "LBZ": "K", "P1L": "C", "P2Q": "Y", "P2Y": "P",
    "P3Q": "Y", "PAQ": "Y", "PAS": "D", "PAT": "W", "PBB": "C", "PBF": "F",
    "PCA": "Q", "PCC": "P", "PCS": "F", "PE1": "K", "PEC": "C", "PF5": "F",
    "PFF": "F", "PG1": "S", "PGY": "G", "PHA": "F", "PHD": "D", "PHE": "F",
    "PHI": "F", "PHL": "F", "PHM": "F", "PKR": "P", "PLJ": "P", "PM3": "F",
    "POM": "P", "PPN": "F", "PR3": "C", "PR4": "P", "PR7": "P", "PR9": "P",
    "PRJ": "P", "PRK": "K", "PRO": "P", "PRS": "P", "PRV": "G", "PSA": "F",
    "PSH": "H", "PTH": "Y", "PTM": "Y", "PTR": "Y", "PVH": "H", "PXU": "P",
    "PYA": "A", "PYH": "K", "PYX": "C", "PH6": "P", "P9S": "C", "P5U": "S",
    "POK": "R", "T0I": "Y", "T11": "F", "TAV": "D", "TBG": "V", "TBM": "T",
    "TCQ": "Y", "TCR": "W", "TEF": "F", "TFQ": "F", "TH5": "T", "TH6": "T",
    "THC": "T", "THR": "T", "THZ": "R", "TIH": "A", "TIS": "S", "TLY": "K",
    "TMB": "T", "TMD": "T", "TNB": "C", "TNR": "S", "TNY": "T", "TOQ": "W",
    "TOX": "W", "TPJ": "P", "TPK": "P", "TPL": "W", "TPO": "T", "TPQ": "Y",
    "TQI": "W", "TQQ": "W", "TQZ": "C", "TRF": "W", "TRG": "K", "TRN": "W",
    "TRO": "W", "TRP": "W", "TRQ": "W", "TRW": "W", "TRX": "W", "TRY": "W",
    "TS9": "I", "TSY": "C", "TTQ": "W", "TTS": "Y", "TXY": "Y", "TY1": "Y",
    "TY2": "Y", "TY3": "Y", "TY5": "Y", "TY8": "Y", "TY9": "Y", "TYB": "Y",
    "TYC": "Y", "TYE": "Y", "TYI": "Y", "TYJ": "Y", "TYN": "Y", "TYO": "Y",
    "TYQ": "Y", "TYR": "Y", "TYS": "Y", "TYT": "Y", "TYW": "Y", "TYY": "Y",
    "T8L": "T", "T9E": "T", "TNQ": "W", "TSQ": "F", "TGH": "W", "X2W": "E",
    "XCN": "C", "XPR": "P", "XSN": "N", "XW1": "A", "XX1": "K", "XYC": "A",
    "XA6": "F", "11Q": "P", "11W": "E", "12L": "P", "12X": "P", "12Y": "P",
    "143": "C", "1AC": "A", "1L1": "A", "1OP": "Y", "1PA": "F", "1PI": "A",
    "1TQ": "W", "1TY": "Y", "1X6": "S", "56A": "H", "5AB": "A", "5CS": "C",
    "5CW": "W", "5HP": "E", "5OH": "A", "5PG": "G", "51T": "Y", "54C": "W",
    "5CR": "F", "5CT": "K", "5FQ": "A", "5GM": "I", "5JP": "S", "5T3": "K",
    "5MW": "K", "5OW": "K", "5R5": "S", "5VV": "N", "5XU": "A", "55I": "F",
    "999": "D", "9DN": "N", "9NE": "E", "9NF": "F", "9NR": "R", "9NV": "V",
    "9E7": "K", "9KP": "K", "9WV": "A", "9TR": "K", "9TU": "K", "9TX": "K",
    "9U0": "K", "9IJ": "F", "B1F": "F", "B27": "T", "B2A": "A", "B2F": "F",
    "B2I": "I", "B2V": "V", "B3A": "A", "B3D": "D", "B3E": "E", "B3K": "K",
    "B3U": "H", "B3X": "N", "B3Y": "Y", "BB6": "C", "BB7": "C", "BB8": "F",
    "BB9": "C", "BBC": "C", "BCS": "C", "BCX": "C", "BFD": "D", "BG1": "S",
    "BH2": "D", "BHD": "D", "BIF": "F", "BIU": "I", "BL2": "L", "BLE": "L",
    "BLY": "K", "BMT": "T", "BNN": "F", "BOR": "R", "BP5": "A", "BPE": "C",
    "BSE": "S", "BTA": "L", "BTC": "C", "BTK": "K", "BTR": "W", "BUC": "C",
    "BUG": "V", "BYR": "Y", "BWV": "R", "BWB": "S", "BXT": "S", "F2F": "F",
    "F2Y": "Y", "FAK": "K", "FB5": "A", "FB6": "A", "FC0": "F", "FCL": "F",
    "FDL": "K", "FFM": "C", "FGL": "G", "FGP": "S", "FH7": "K", "FHL": "K",
    "FHO": "K", "FIO": "R", "FLA": "A", "FLE": "L", "FLT": "Y", "FME": "M",
    "FOE": "C", "FP9": "P", "FPK": "P", "FT6": "W", "FTR": "W", "FTY": "Y",
    "FVA": "V", "FZN": "K", "FY3": "Y", "F7W": "W", "FY2": "Y", "FQA": "K",
    "F7Q": "Y", "FF9": "K", "FL6": "D", "JJJ": "C", "JJK": "C", "JJL": "C",
    "JLP": "K", "J3D": "C", "J9Y": "R", "J8W": "S", "JKH": "P", "N10": "S",
    "N7P": "P", "NA8": "A", "NAL": "A", "NAM": "A", "NBQ": "Y", "NC1": "S",
    "NCB": "A", "NEM": "H", "NEP": "H", "NFA": "F", "NIY": "Y", "NLB": "L",
    "NLE": "L", "NLN": "L", "NLO": "L", "NLP": "L", "NLQ": "Q", "NLY": "G",
    "NMC": "G", "NMM": "R", "NNH": "R", "NOT": "L", "NPH": "C", "NPI": "A",
    "NTR": "Y", "NTY": "Y", "NVA": "V", "NWD": "A", "NYB": "C", "NYS": "C",
    "NZH": "H", "N80": "P", "NZC": "T", "NLW": "L", "N0A": "F", "N9P": "A",
    "N65": "K", "R1A": "C", "R4K": "W", "RE0": "W", "RE3": "W", "RGL": "R",
    "RGP": "E", "RT0": "P", "RVX": "S", "RZ4": "S", "RPI": "R", "RVJ": "A",
    "VAD": "V", "VAF": "V", "VAH": "V", "VAI": "V", "VAL": "V", "VB1": "K",
    "VH0": "P", "VR0": "R", "V44": "C", "V61": "F", "VPV": "K", "V5N": "H",
    "V7T": "K", "Z01": "A", "Z3E": "T", "Z70": "H", "ZBZ": "C", "ZCL": "F",
    "ZU0": "T", "ZYJ": "P", "ZYK": "P", "ZZD": "C", "ZZJ": "A", "ZIQ": "W",
    "ZPO": "P", "ZDJ": "Y", "ZT1": "K", "30V": "C", "31Q": "C", "33S": "F",
    "33W": "A", "34E": "V", "3AH": "H", "3BY": "P", "3CF": "F", "3CT": "Y",
    "3GA": "A", "3GL": "E", "3MD": "D", "3MY": "Y", "3NF": "Y", "3O3": "E",
    "3PX": "P", "3QN": "K", "3TT": "P", "3XH": "G", "3YM": "Y", "3WS": "A",
    "3WX": "P", "3X9": "C", "3ZH": "H", "7JA": "I", "73C": "S", "73N": "R",
    "73O": "Y", "73P": "K", "74P": "K", "7N8": "F", "7O5": "A", "7XC": "F",
    "7ID": "D", "7OZ": "A", "C1S": "C", "C1T": "C", "C1X": "K", "C22": "A",
    "C3Y": "C", "C4R": "C", "C5C": "C", "C6C": "C", "CAF": "C", "CAS": "C",
    "CAY": "C", "CCS": "C", "CEA": "C", "CGA": "E", "CGU": "E", "CGV": "C",
    "CHP": "G", "CIR": "R", "CLE": "L", "CLG": "K", "CLH": "K", "CME": "C",
    "CMH": "C", "CML": "C", "CMT": "C", "CR5": "G", "CS0": "C", "CS1": "C",
    "CS3": "C", "CS4": "C", "CSA": "C", "CSB": "C", "CSD": "C", "CSE": "C",
    "CSJ": "C", "CSO": "C", "CSP": "C", "CSR": "C", "CSS": "C", "CSU": "C",
    "CSW": "C", "CSX": "C", "CSZ": "C", "CTE": "W", "CTH": "T", "CWD": "A",
    "CWR": "S", "CXM": "M", "CY0": "C", "CY1": "C", "CY3": "C", "CY4": "C",
    "CYA": "C", "CYD": "C", "CYF": "C", "CYG": "C", "CYJ": "K", "CYM": "C",
    "CYQ": "C", "CYR": "C", "CYS": "C", "CYW": "C", "CZ2": "C", "CZZ": "C",
    "CG6": "C", "C1J": "R", "C4G": "R", "C67": "R", "C6D": "R", "CE7": "N",
    "CZS": "A", "G01": "E", "G8M": "E", "GAU": "E", "GEE": "G", "GFT": "S",
    "GHC": "E", "GHG": "Q", "GHW": "E", "GL3": "G", "GLH": "Q", "GLJ": "E",
    "GLK": "E", "GLN": "Q", "GLQ": "E", "GLU": "E", "GLY": "G", "GLZ": "G",
    "GMA": "E", "GME": "E", "GNC": "Q", "GPL": "K", "GSC": "G", "GSU": "E",
    "GT9": "C", "GVL": "S", "G3M": "R", "G5G": "L", "G1X": "Y", "G8X": "P",
    "K1R": "C", "KBE": "K", "KCX": "K", "KFP": "K", "KGC": "K", "KNB": "A",
    "KOR": "M", "KPI": "K", "KPY": "K", "KST": "K", "KYN": "W", "KYQ": "K",
    "KCR": "K", "KPF": "K", "K5L": "S", "KEO": "K", "KHB": "K", "KKD": "D",
    "K5H": "C", "K7K": "S", "OAR": "R", "OAS": "S", "OBS": "K", "OCS": "C",
    "OCY": "C", "OHI": "H", "OHS": "D", "OLD": "H", "OLT": "T", "OLZ": "S",
    "OMH": "S", "OMT": "M", "OMX": "Y", "OMY": "Y", "ONH": "A", "ORN": "A",
    "ORQ": "R", "OSE": "S", "OTH": "T", "OXX": "D", "OYL": "H", "O7A": "T",
    "O7D": "W", "O7G": "V", "O2E": "S", "O6H": "W", "OZW": "F", "S12": "S",
    "S1H": "S", "S2C": "C", "S2P": "A", "SAC": "S", "SAH": "C", "SAR": "G",
    "SBG": "S", "SBL": "S", "SCH": "C", "SCS": "C", "SCY": "C", "SD4": "N",
    "SDB": "S", "SDP": "S", "SEB": "S", "SEE": "S", "SEG": "A", "SEL": "S",
    "SEM": "S", "SEN": "S", "SEP": "S", "SER": "S", "SET": "S", "SGB": "S",
    "SHC": "C", "SHP": "G", "SHR": "K", "SIB": "C", "SLL": "K", "SLZ": "K",
    "SMC": "C", "SME": "M", "SMF": "F", "SNC": "C", "SNN": "N", "SOY": "S",
    "SRZ": "S", "STY": "Y", "SUN": "S", "SVA": "S", "SVV": "S", "SVW": "S",
    "SVX": "S", "SVY": "S", "SVZ": "S", "SXE": "S", "SKH": "K", "SNM": "S",
    "SNK": "H", "SWW": "S", "WFP": "F", "WLU": "L", "WPA": "F", "WRP": "W",
    "WVL": "V", "02K": "A", "02L": "N", "02O": "A", "02Y": "A", "033": "V",
    "037": "P", "03Y": "C", "04U": "P", "04V": "P", "05N": "P", "07O": "C",
    "0A0": "D", "0A1": "Y", "0A2": "K", "0A8": "C", "0A9": "F", "0AA": "V",
    "0AB": "V", "0AC": "G", "0AF": "W", "0AG": "L", "0AH": "S", "0AK": "D",
    "0AR": "R", "0BN": "F", "0CS": "A", "0E5": "T", "0EA": "Y", "0FL": "A",
    "0LF": "P", "0NC": "A", "0PR": "Y", "0QL": "C", "0TD": "D", "0UO": "W",
    "0WZ": "Y", "0X9": "R", "0Y8": "P", "4AF": "F", "4AR": "R", "4AW": "W",
    "4BF": "F", "4CF": "F", "4CY": "M", "4DP": "W", "4FB": "P", "4FW": "W",
    "4HL": "Y", "4HT": "W", "4IN": "W", "4MM": "M", "4PH": "F", "4U7": "A",
    "41H": "F", "41Q": "N", "42Y": "S", "432": "S", "45F": "P", "4AK": "K",
    "4D4": "R", "4GJ": "C", "4KY": "P", "4L0": "P", "4LZ": "Y", "4N7": "P",
    "4N8": "P", "4N9": "P", "4OG": "W", "4OU": "F", "4OV": "S", "4OZ": "S",
    "4PQ": "W", "4SJ": "F", "4WQ": "A", "4HH": "S", "4HJ": "S", "4J4": "C",
    "4J5": "R", "4II": "F", "4VI": "R", "823": "N", "8SP": "S", "8AY": "A",
}
233
+
234
# Nucleic Acids
# Keys are CCD residue codes space-padded to a fixed width of 3 characters
# (e.g. "A  ", "DA ").  NOTE(review): the padding width is reconstructed to 3
# to match the f"{code:<3}" convention used elsewhere in this package — confirm
# against the repository file, since the rendered diff collapses spaces.
nucleic_letters_3to1 = {
    "A  ": "A", "C  ": "C", "G  ": "G", "U  ": "U",
    "DA ": "A", "DC ": "C", "DG ": "G", "DT ": "T",
}

# RNA-only subset of the mapping above.
rna_letters_3to1 = {
    "A  ": "A", "C  ": "C", "G  ": "G", "U  ": "U",
}

# DNA-only subset ("D"-prefixed CCD codes).
dna_letters_3to1 = {
    "DA ": "A", "DC ": "C", "DG ": "G", "DT ": "T",
}
247
+
248
+ # fmt: off
249
+ nucleic_letters_3to1_extended = {
250
+ "A ": "A", "A23": "A", "A2L": "A", "A2M": "A", "A34": "A", "A35": "A",
251
+ "A38": "A", "A39": "A", "A3A": "A", "A3P": "A", "A40": "A", "A43": "A",
252
+ "A44": "A", "A47": "A", "A5L": "A", "A5M": "C", "A5O": "A", "A6A": "A",
253
+ "A6C": "C", "A6G": "G", "A6U": "U", "A7E": "A", "A9Z": "A", "ABR": "A",
254
+ "ABS": "A", "AD2": "A", "ADI": "A", "ADP": "A", "AET": "A", "AF2": "A",
255
+ "AFG": "G", "AMD": "A", "AMO": "A", "AP7": "A", "AS ": "A", "ATD": "T",
256
+ "ATL": "T", "ATM": "T", "AVC": "A", "AI5": "C", "E ": "A", "E1X": "A",
257
+ "EDA": "A", "EFG": "G", "EHG": "G", "EIT": "T", "EXC": "C", "E3C": "C",
258
+ "E6G": "G", "E7G": "G", "EQ4": "G", "EAN": "T", "I5C": "C", "IC ": "C",
259
+ "IG ": "G", "IGU": "G", "IMC": "C", "IMP": "G", "IU ": "U", "I4U": "U",
260
+ "IOO": "G", "M1G": "G", "M2G": "G", "M4C": "C", "M5M": "C", "MA6": "A",
261
+ "MA7": "A", "MAD": "A", "MCY": "C", "ME6": "C", "MEP": "U", "MG1": "G",
262
+ "MGQ": "A", "MGT": "G", "MGV": "G", "MIA": "A", "MMT": "T", "MNU": "U",
263
+ "MRG": "G", "MTR": "T", "MTU": "A", "MFO": "G", "M7A": "A", "MHG": "G",
264
+ "MMX": "C", "QUO": "G", "QCK": "T", "QSQ": "A", "U ": "U", "U25": "U",
265
+ "U2L": "U", "U2P": "U", "U31": "U", "U34": "U", "U36": "U", "U37": "U",
266
+ "U8U": "U", "UAR": "U", "UBB": "U", "UBD": "U", "UD5": "U", "UPV": "U",
267
+ "UR3": "U", "URD": "U", "US3": "T", "US5": "U", "UZR": "U", "UMO": "U",
268
+ "U23": "U", "U48": "C", "U7B": "C", "Y ": "A", "YCO": "C", "YG ": "G",
269
+ "YYG": "G", "23G": "G", "26A": "A", "2AR": "A", "2AT": "T", "2AU": "U",
270
+ "2BT": "T", "2BU": "A", "2DA": "A", "2DT": "T", "2EG": "G", "2GT": "T",
271
+ "2JV": "G", "2MA": "A", "2MG": "G", "2MU": "U", "2NT": "T", "2OM": "U",
272
+ "2OT": "T", "2PR": "G", "2SG": "G", "2ST": "T", "63G": "G", "63H": "G",
273
+ "64T": "T", "68Z": "G", "6CT": "T", "6HA": "A", "6HB": "A", "6HC": "C",
274
+ "6HG": "G", "6HT": "T", "6IA": "A", "6MA": "A", "6MC": "A", "6MP": "A",
275
+ "6MT": "A", "6MZ": "A", "6OG": "G", "6PO": "G", "6FK": "G", "6NW": "A",
276
+ "6OO": "C", "D00": "C", "D3T": "T", "D4M": "T", "DA ": "A", "DC ": "C",
277
+ "DCG": "G", "DCT": "C", "DDG": "G", "DFC": "C", "DFG": "G", "DG ": "G",
278
+ "DG8": "G", "DGI": "G", "DGP": "G", "DHU": "U", "DNR": "C", "DOC": "C",
279
+ "DPB": "T", "DRT": "T", "DT ": "T", "DZM": "A", "D4B": "C", "H2U": "U",
280
+ "HN0": "G", "HN1": "G", "LC ": "C", "LCA": "A", "LCG": "G", "LG ": "G",
281
+ "LGP": "G", "LHU": "U", "LSH": "T", "LST": "T", "LDG": "G", "L3X": "A",
282
+ "LHH": "C", "LV2": "C", "L1J": "G", "P ": "G", "P2T": "T", "P5P": "A",
283
+ "PG7": "G", "PGN": "G", "PGP": "G", "PMT": "C", "PPU": "A", "PPW": "G",
284
+ "PR5": "A", "PRN": "A", "PST": "T", "PSU": "U", "PU ": "A", "PVX": "C",
285
+ "PYO": "U", "PZG": "G", "P4U": "U", "P7G": "G", "T ": "T", "T2S": "T",
286
+ "T31": "U", "T32": "T", "T36": "T", "T37": "T", "T38": "T", "T39": "T",
287
+ "T3P": "T", "T41": "T", "T48": "T", "T49": "T", "T4S": "T", "T5S": "T",
288
+ "T64": "T", "T6A": "A", "TA3": "T", "TAF": "T", "TBN": "A", "TC1": "C",
289
+ "TCP": "T", "TCY": "A", "TDY": "T", "TED": "T", "TFE": "T", "TFF": "T",
290
+ "TFO": "A", "TFT": "T", "TGP": "G", "TCJ": "C", "TLC": "T", "TP1": "T",
291
+ "TPC": "C", "TPG": "G", "TSP": "T", "TTD": "T", "TTM": "T", "TXD": "A",
292
+ "TXP": "A", "TC ": "C", "TG ": "G", "T0N": "G", "T0Q": "G", "X ": "G",
293
+ "XAD": "A", "XAL": "A", "XCL": "C", "XCR": "C", "XCT": "C", "XCY": "C",
294
+ "XGL": "G", "XGR": "G", "XGU": "G", "XPB": "G", "XTF": "T", "XTH": "T",
295
+ "XTL": "T", "XTR": "T", "XTS": "G", "XUA": "A", "XUG": "G", "102": "G",
296
+ "10C": "C", "125": "U", "126": "U", "127": "U", "12A": "A", "16B": "C",
297
+ "18M": "G", "1AP": "A", "1CC": "C", "1FC": "C", "1MA": "A", "1MG": "G",
298
+ "1RN": "U", "1SC": "C", "5AA": "A", "5AT": "T", "5BU": "U", "5CG": "G",
299
+ "5CM": "C", "5FA": "A", "5FC": "C", "5FU": "U", "5HC": "C", "5HM": "C",
300
+ "5HT": "T", "5IC": "C", "5IT": "T", "5MC": "C", "5MU": "U", "5NC": "C",
301
+ "5PC": "C", "5PY": "T", "9QV": "U", "94O": "T", "9SI": "A", "9SY": "A",
302
+ "B7C": "C", "BGM": "G", "BOE": "T", "B8H": "U", "B8K": "G", "B8Q": "C",
303
+ "B8T": "C", "B8W": "G", "B9B": "G", "B9H": "C", "BGH": "G", "F3H": "T",
304
+ "F3N": "A", "F4H": "T", "FA2": "A", "FDG": "G", "FHU": "U", "FMG": "G",
305
+ "FNU": "U", "FOX": "G", "F2T": "U", "F74": "G", "F4Q": "G", "F7H": "C",
306
+ "F7K": "G", "JDT": "T", "JMH": "C", "J0X": "C", "N5M": "C", "N6G": "G",
307
+ "N79": "A", "NCU": "C", "NMS": "T", "NMT": "T", "NTT": "T", "N7X": "C",
308
+ "R ": "A", "RBD": "A", "RDG": "G", "RIA": "A", "RMP": "A", "RPC": "C",
309
+ "RSP": "C", "RSQ": "C", "RT ": "T", "RUS": "U", "RFJ": "G", "V3L": "A",
310
+ "VC7": "G", "Z ": "C", "ZAD": "A", "ZBC": "C", "ZBU": "U", "ZCY": "C",
311
+ "ZGU": "G", "31H": "A", "31M": "A", "3AU": "U", "3DA": "A", "3ME": "U",
312
+ "3MU": "U", "3TD": "U", "70U": "U", "7AT": "A", "7DA": "A", "7GU": "G",
313
+ "7MG": "G", "7BG": "G", "73W": "C", "75B": "U", "7OK": "C", "7S3": "G",
314
+ "7SN": "G", "C ": "C", "C25": "C", "C2L": "C", "C2S": "C", "C31": "C",
315
+ "C32": "C", "C34": "C", "C36": "C", "C37": "C", "C38": "C", "C42": "C",
316
+ "C43": "C", "C45": "C", "C46": "C", "C49": "C", "C4S": "C", "C5L": "C",
317
+ "C6G": "G", "CAR": "C", "CB2": "C", "CBR": "C", "CBV": "C", "CCC": "C",
318
+ "CDW": "C", "CFL": "C", "CFZ": "C", "CG1": "G", "CH ": "C", "CMR": "C",
319
+ "CNU": "U", "CP1": "C", "CSF": "C", "CSL": "C", "CTG": "T", "CX2": "C",
320
+ "C7S": "C", "C7R": "C", "G ": "G", "G1G": "G", "G25": "G", "G2L": "G",
321
+ "G2S": "G", "G31": "G", "G32": "G", "G33": "G", "G36": "G", "G38": "G",
322
+ "G42": "G", "G46": "G", "G47": "G", "G48": "G", "G49": "G", "G7M": "G",
323
+ "GAO": "G", "GCK": "C", "GDO": "G", "GDP": "G", "GDR": "G", "GF2": "G",
324
+ "GFL": "G", "GH3": "G", "GMS": "G", "GN7": "G", "GNG": "G", "GOM": "G",
325
+ "GRB": "G", "GS ": "G", "GSR": "G", "GSS": "G", "GTP": "G", "GX1": "G",
326
+ "KAG": "G", "KAK": "G", "O2G": "G", "OGX": "G", "OMC": "C", "OMG": "G",
327
+ "OMU": "U", "ONE": "U", "O2Z": "A", "OKN": "C", "OKQ": "C", "S2M": "T",
328
+ "S4A": "A", "S4C": "C", "S4G": "G", "S4U": "U", "S6G": "G", "SC ": "C",
329
+ "SDE": "A", "SDG": "G", "SDH": "G", "SMP": "A", "SMT": "T", "SPT": "T",
330
+ "SRA": "A", "SSU": "U", "SUR": "U", "00A": "A", "0AD": "G", "0AM": "A",
331
+ "0AP": "C", "0AV": "A", "0R8": "C", "0SP": "A", "0UH": "G", "47C": "C",
332
+ "4OC": "C", "4PC": "C", "4PD": "C", "4PE": "C", "4SC": "C", "4SU": "U",
333
+ "45A": "A", "4U3": "C", "8AG": "G", "8AN": "A", "8BA": "A", "8FG": "G",
334
+ "8MG": "G", "8OG": "G", "8PY": "G", "8AA": "G", "85Y": "U", "8OS": "G",
335
+ "UNK": "X", # DEBUG
336
+ }
337
+
338
# Convenience views over the protein/nucleic tables defined above.
standard_protein_letters_3to1 = protein_letters_3to1
standard_protein_letters_1to3 = protein_letters_1to3
# Extended entries that are NOT one of the standard residue codes.
nonstandard_protein_letters_3to1 = {
    code: one_letter
    for code, one_letter in protein_letters_3to1_extended.items()
    if code not in standard_protein_letters_3to1
}

standard_nucleic_letters_3to1 = nucleic_letters_3to1
# Inverse mapping.  When two codes share a one-letter symbol (e.g. "A  " and
# "DA " both map to "A"), the code iterated last wins.
standard_nucleic_letters_1to3 = {
    one_letter: code for code, one_letter in standard_nucleic_letters_3to1.items()
}
nonstandard_nucleic_letters_3to1 = {
    code: one_letter
    for code, one_letter in nucleic_letters_3to1_extended.items()
    if code not in standard_nucleic_letters_3to1
}

# Combined protein + nucleic extended lookup (nucleic entries win on key clash).
letters_3to1_extended = {**protein_letters_3to1_extended, **nucleic_letters_3to1_extended}
PhysDock/data/constants/__init__.py ADDED
File without changes
PhysDock/data/constants/periodic_table.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# 118 IUPAC element symbols, ordered by atomic number (index = Z - 1).
PeriodicTable = [
    "H", "He",
    "Li", "Be", "B", "C", "N", "O", "F", "Ne",
    "Na", "Mg", "Al", "Si", "P", "S", "Cl", "Ar",
    "K", "Ca", "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn", "Ga", "Ge", "As", "Se", "Br", "Kr",
    "Rb", "Sr", "Y", "Zr", "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd", "In", "Sn", "Sb", "Te", "I", "Xe",
    "Cs", "Ba",
    "La", "Ce", "Pr", "Nd", "Pm", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb", "Lu",
    "Hf", "Ta", "W", "Re", "Os", "Ir", "Pt", "Au", "Hg", "Tl", "Pb", "Bi", "Po", "At", "Rn",
    "Fr", "Ra",
    "Ac", "Th", "Pa", "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm", "Md", "No", "Lr",
    "Rf", "Db", "Sg", "Bh", "Hs", "Mt", "Ds", "Rg", "Cn", "Nh", "Fl", "Mc", "Lv", "Ts", "Og",
]

# Lowercase variant, derived from PeriodicTable so the two lists can never
# drift apart (the original kept two hand-maintained 118-element copies).
periodic_table = [symbol.lower() for symbol in PeriodicTable]
PhysDock/data/constants/residue_constants.py ADDED
@@ -0,0 +1,562 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+
3
# One-letter -> three-letter amino-acid codes; "X"/"UNK" is the unknown residue.
amino_acid_1to3 = {
    one: three
    for one, three in zip(
        "ARNDCQEGHILKMFPSTWYVX",
        "ALA ARG ASN ASP CYS GLN GLU GLY HIS ILE "
        "LEU LYS MET PHE PRO SER THR TRP TYR VAL UNK".split(),
    )
}

# Inverse mapping: three-letter -> one-letter.
amino_acid_3to1 = {three: one for one, three in amino_acid_1to3.items()}
28
+
29
# Ligand atoms are represented as "UNK" at the token level.
# The standard residue codes below double as CCD component ids.
standard_protein = ["ALA", "ARG", "ASN", "ASP", "CYS", "GLN", "GLU", "GLY", "HIS", "ILE",
                    "LEU", "LYS", "MET", "PHE", "PRO", "SER", "THR", "TRP", "TYR", "VAL", "UNK", ]
# NOTE(review): nucleic codes are space-padded to width 3 ("A  ", "DA "),
# matching the f"{code:<3}" convention used elsewhere in the package — confirm
# against the repository file (the rendered diff collapses spaces).
standard_rna = ["A  ", "G  ", "C  ", "U  ", "N  ", ]
standard_dna = ["DA ", "DG ", "DC ", "DT ", "DN ", ]
standard_nucleics = standard_rna + standard_dna
standard_ccds_without_gap = standard_protein + standard_nucleics
GAP = ["GAP"]  # used in msa one-hot
standard_ccds = standard_protein + standard_nucleics + GAP

# Map each standard CCD code to its index in `standard_ccds`
# (loop variable renamed: the original shadowed the builtin `id`).
standard_ccd_to_order = {ccd: idx for idx, ccd in enumerate(standard_ccds)}

standard_purines = ["A  ", "G  ", "DA ", "DG "]
standard_pyrimidines = ["C  ", "U  ", "DC ", "DT "]


# Plain functions instead of lambda assignments (PEP 8 E731); behavior unchanged.
def is_standard(ccd):
    """True if `ccd` is one of the standard protein/nucleic codes (incl. GAP)."""
    return ccd in standard_ccds


def is_unk(ccd):
    """True for unknown/placeholder codes (protein UNK, nucleic N/DN, GAP, ligand UNL)."""
    return ccd in ["UNK", "N  ", "DN ", "GAP", "UNL"]


def is_protein(ccd):
    """True for a concrete (non-UNK) standard amino-acid code."""
    return ccd in standard_protein and not is_unk(ccd)


def is_rna(ccd):
    """True for a concrete (non-N) standard RNA code."""
    return ccd in standard_rna and not is_unk(ccd)


def is_dna(ccd):
    """True for a concrete (non-DN) standard DNA code."""
    return ccd in standard_dna and not is_unk(ccd)


def is_nucleics(ccd):
    """True for any concrete standard RNA/DNA code."""
    return ccd in standard_nucleics and not is_unk(ccd)


def is_purines(ccd):
    """True for A/G (RNA or DNA) codes."""
    return ccd in standard_purines


def is_pyrimidines(ccd):
    """True for C/U/T (RNA or DNA) codes."""
    return ccd in standard_pyrimidines
53
+
54
# Heavy-atom count per standard residue, in `standard_ccds` order.
# Placeholders (UNK, N, DN, GAP) have no defined atom count -> None.
standard_ccd_to_atoms_num = dict(zip(standard_ccds, [
    5, 11, 8, 8, 6, 9, 9, 4, 10, 8,
    8, 9, 8, 11, 7, 6, 7, 14, 12, 7, None,
    22, 23, 20, 20, None,
    21, 22, 19, 20, None,
    None,
]))
61
+
62
# Representative atom names per standard residue, keyed by CCD code.

# Token centre atom: CA for amino acids, C1' for nucleotides.
standard_ccd_to_token_centre_atom_name = {
    **dict.fromkeys(standard_protein, "CA"),
    **dict.fromkeys(standard_nucleics, "C1'"),
}

# Backbone frame atoms: (N, CA, C) for protein, (C1', C3', C4') for nucleics.
standard_ccd_to_frame_atom_name_0 = {
    **dict.fromkeys(standard_protein, "N"),
    **dict.fromkeys(standard_nucleics, "C1'"),
}

standard_ccd_to_frame_atom_name_1 = {
    **dict.fromkeys(standard_protein, "CA"),
    **dict.fromkeys(standard_nucleics, "C3'"),
}

standard_ccd_to_frame_atom_name_2 = {
    **dict.fromkeys(standard_protein, "C"),
    **dict.fromkeys(standard_nucleics, "C4'"),
}

# Pseudo-beta atom: CB for amino acids (CA for glycine, which has no CB),
# C4 for purines and C2 for pyrimidines.
standard_ccd_to_token_pseudo_beta_atom_name = {
    **dict.fromkeys(standard_protein, "CB"),
    **dict.fromkeys(standard_purines, "C4"),
    **dict.fromkeys(standard_pyrimidines, "C2"),
}
standard_ccd_to_token_pseudo_beta_atom_name["GLY"] = "CA"
88
+
89
########################################################
#     periodic table that used to encode elements      #
########################################################
# 118 element symbols (lowercase), ordered by atomic number (index = Z - 1).
periodic_table = (
    "h he "
    "li be b c n o f ne "
    "na mg al si p s cl ar "
    "k ca sc ti v cr mn fe co ni cu zn ga ge as se br kr "
    "rb sr y zr nb mo tc ru rh pd ag cd in sn sb te i xe "
    "cs ba "
    "la ce pr nd pm sm eu gd tb dy ho er tm yb lu "
    "hf ta w re os ir pt au hg tl pb bi po at rn "
    "fr ra "
    "ac th pa u np pu am cm bk cf es fm md no lr "
    "rf db sg bh hs mt ds rg cn nh fl mc lv ts og"
).split()

# Lowercase element symbol -> zero-based index (atomic number - 1).
get_element_id = {symbol: index for index, symbol in enumerate(periodic_table)}
107
+
108
+ ##########################################################
109
+
110
+ standard_ccd_to_reference_features_table = {
111
+ # letters_3: [ref_pos,ref_charge, ref_mask, ref_elements, ref_atom_name_chars]
112
+ "ALA": [
113
+ [-0.966, 0.493, 1.500, 0., 1, "N", "N"],
114
+ [0.257, 0.418, 0.692, 0., 1, "C", "CA"],
115
+ [-0.094, 0.017, -0.716, 0., 1, "C", "C"],
116
+ [-1.056, -0.682, -0.923, 0., 1, "O", "O"],
117
+ [1.204, -0.620, 1.296, 0., 1, "C", "CB"],
118
+ [0.661, 0.439, -1.742, 0., 0, "O", "OXT"],
119
+ ],
120
+ "ARG": [
121
+ [-0.469, 1.110, -0.993, 0., 1, "N", "N"],
122
+ [0.004, 2.294, -1.708, 0., 1, "C", "CA"],
123
+ [-0.907, 2.521, -2.901, 0., 1, "C", "C"],
124
+ [-1.827, 1.789, -3.242, 0., 1, "O", "O"],
125
+ [1.475, 2.150, -2.127, 0., 1, "C", "CB"],
126
+ [1.745, 1.017, -3.130, 0., 1, "C", "CG"],
127
+ [3.210, 0.954, -3.557, 0., 1, "C", "CD"],
128
+ [4.071, 0.726, -2.421, 0., 1, "N", "NE"],
129
+ [5.469, 0.624, -2.528, 0., 1, "C", "CZ"],
130
+ [6.259, 0.404, -1.405, 0., 1, "N", "NH1"],
131
+ [6.078, 0.744, -3.773, 0., 1, "N", "NH2"],
132
+ [-0.588, 3.659, -3.574, 0., 0, "O", "OXT"],
133
+ ],
134
+ "ASN": [
135
+ [-0.293, 1.686, 0.094, 0., 1, "N", "N"],
136
+ [-0.448, 0.292, -0.340, 0., 1, "C", "CA"],
137
+ [-1.846, -0.179, -0.031, 0., 1, "C", "C"],
138
+ [-2.510, 0.402, 0.794, 0., 1, "O", "O"],
139
+ [0.562, -0.588, 0.401, 0., 1, "C", "CB"],
140
+ [1.960, -0.197, -0.002, 0., 1, "C", "CG"],
141
+ [2.132, 0.697, -0.804, 0., 1, "O", "OD1"],
142
+ [3.019, -0.841, 0.527, 0., 1, "N", "ND2"],
143
+ [-2.353, -1.243, -0.673, 0., 0, "O", "OXT"],
144
+ ],
145
+ "ASP": [
146
+ [-0.317, 1.688, 0.066, 0., 1, "N", "N"],
147
+ [-0.470, 0.286, -0.344, 0., 1, "C", "CA"],
148
+ [-1.868, -0.180, -0.029, 0., 1, "C", "C"],
149
+ [-2.534, 0.415, 0.786, 0., 1, "O", "O"],
150
+ [0.539, -0.580, 0.413, 0., 1, "C", "CB"],
151
+ [1.938, -0.195, 0.004, 0., 1, "C", "CG"],
152
+ [2.109, 0.681, -0.810, 0., 1, "O", "OD1"],
153
+ [2.992, -0.826, 0.543, 0., 1, "O", "OD2"],
154
+ [-2.374, -1.256, -0.652, 0., 0, "O", "OXT"],
155
+ ],
156
+ "CYS": [
157
+ [1.585, 0.483, -0.081, 0., 1, "N", "N"],
158
+ [0.141, 0.450, 0.186, 0., 1, "C", "CA"],
159
+ [-0.095, 0.006, 1.606, 0., 1, "C", "C"],
160
+ [0.685, -0.742, 2.143, 0., 1, "O", "O"],
161
+ [-0.533, -0.530, -0.774, 0., 1, "C", "CB"],
162
+ [-0.247, 0.004, -2.484, 0., 1, "S", "SG"],
163
+ [-1.174, 0.443, 2.275, 0., 0, "O", "OXT"],
164
+ ],
165
+ "GLN": [
166
+ [1.858, -0.148, 1.125, 0., 1, "N", "N"],
167
+ [0.517, 0.451, 1.112, 0., 1, "C", "CA"],
168
+ [-0.236, 0.022, 2.344, 0., 1, "C", "C"],
169
+ [-0.005, -1.049, 2.851, 0., 1, "O", "O"],
170
+ [-0.236, -0.013, -0.135, 0., 1, "C", "CB"],
171
+ [0.529, 0.421, -1.385, 0., 1, "C", "CG"],
172
+ [-0.213, -0.036, -2.614, 0., 1, "C", "CD"],
173
+ [-1.252, -0.650, -2.500, 0., 1, "O", "OE1"],
174
+ [0.277, 0.236, -3.839, 0., 1, "N", "NE2"],
175
+ [-1.165, 0.831, 2.878, 0., 0, "O", "OXT"],
176
+ ],
177
+ "GLU": [
178
+ [1.199, 1.867, -0.117, 0., 1, "N", "N"],
179
+ [1.138, 0.515, 0.453, 0., 1, "C", "CA"],
180
+ [2.364, -0.260, 0.041, 0., 1, "C", "C"],
181
+ [3.010, 0.096, -0.916, 0., 1, "O", "O"],
182
+ [-0.113, -0.200, -0.062, 0., 1, "C", "CB"],
183
+ [-1.360, 0.517, 0.461, 0., 1, "C", "CG"],
184
+ [-2.593, -0.187, -0.046, 0., 1, "C", "CD"],
185
+ [-2.485, -1.161, -0.753, 0., 1, "O", "OE1"],
186
+ [-3.811, 0.269, 0.287, 0., 1, "O", "OE2"],
187
+ [2.737, -1.345, 0.737, 0., 0, "O", "OXT"],
188
+ ],
189
+ "GLY": [
190
+ [1.931, 0.090, -0.034, 0., 1, "N", "N"],
191
+ [0.761, -0.799, -0.008, 0., 1, "C", "CA"],
192
+ [-0.498, 0.029, -0.005, 0., 1, "C", "C"],
193
+ [-0.429, 1.235, -0.023, 0., 1, "O", "O"],
194
+ [-1.697, -0.574, 0.018, 0., 0, "O", "OXT"],
195
+ ],
196
+ "HIS": [
197
+ [-0.040, -1.210, 0.053, 0., 1, "N", "N"],
198
+ [1.172, -1.709, 0.652, 0., 1, "C", "CA"],
199
+ [1.083, -3.207, 0.905, 0., 1, "C", "C"],
200
+ [0.040, -3.770, 1.222, 0., 1, "O", "O"],
201
+ [1.484, -0.975, 1.962, 0., 1, "C", "CB"],
202
+ [2.940, -1.060, 2.353, 0., 1, "C", "CG"],
203
+ [3.380, -2.075, 3.129, 0., 1, "N", "ND1"],
204
+ [3.960, -0.251, 2.046, 0., 1, "C", "CD2"],
205
+ [4.693, -1.908, 3.317, 0., 1, "C", "CE1"],
206
+ [5.058, -0.801, 2.662, 0., 1, "N", "NE2"],
207
+ [2.247, -3.882, 0.744, 0., 0, "O", "OXT"],
208
+ ],
209
+ "ILE": [
210
+ [-1.944, 0.335, -0.343, 0., 1, "N", "N"],
211
+ [-0.487, 0.519, -0.369, 0., 1, "C", "CA"],
212
+ [0.066, -0.032, -1.657, 0., 1, "C", "C"],
213
+ [-0.484, -0.958, -2.203, 0., 1, "O", "O"],
214
+ [0.140, -0.219, 0.814, 0., 1, "C", "CB"],
215
+ [-0.421, 0.341, 2.122, 0., 1, "C", "CG1"],
216
+ [1.658, -0.027, 0.788, 0., 1, "C", "CG2"],
217
+ [0.206, -0.397, 3.305, 0., 1, "C", "CD1"],
218
+ [1.171, 0.504, -2.197, 0., 0, "O", "OXT"],
219
+ ],
220
+ "LEU": [
221
+ [-1.661, 0.627, -0.406, 0., 1, "N", "N"],
222
+ [-0.205, 0.441, -0.467, 0., 1, "C", "CA"],
223
+ [0.180, -0.055, -1.836, 0., 1, "C", "C"],
224
+ [-0.591, -0.731, -2.474, 0., 1, "O", "O"],
225
+ [0.221, -0.583, 0.585, 0., 1, "C", "CB"],
226
+ [-0.170, -0.079, 1.976, 0., 1, "C", "CG"],
227
+ [0.256, -1.104, 3.029, 0., 1, "C", "CD1"],
228
+ [0.526, 1.254, 2.250, 0., 1, "C", "CD2"],
229
+ [1.382, 0.254, -2.348, 0., 0, "O", "OXT"],
230
+ ],
231
+ "LYS": [
232
+ [1.422, 1.796, 0.198, 0., 1, "N", "N"],
233
+ [1.394, 0.355, 0.484, 0., 1, "C", "CA"],
234
+ [2.657, -0.284, -0.032, 0., 1, "C", "C"],
235
+ [3.316, 0.275, -0.876, 0., 1, "O", "O"],
236
+ [0.184, -0.278, -0.206, 0., 1, "C", "CB"],
237
+ [-1.102, 0.282, 0.407, 0., 1, "C", "CG"],
238
+ [-2.313, -0.351, -0.283, 0., 1, "C", "CD"],
239
+ [-3.598, 0.208, 0.329, 0., 1, "C", "CE"],
240
+ [-4.761, -0.400, -0.332, 0., 1, "N", "NZ"],
241
+ [3.050, -1.476, 0.446, 0., 0, "O", "OXT"],
242
+ ],
243
+ "MET": [
244
+ [-1.816, 0.142, -1.166, 0., 1, "N", "N"],
245
+ [-0.392, 0.499, -1.214, 0., 1, "C", "CA"],
246
+ [0.206, 0.002, -2.504, 0., 1, "C", "C"],
247
+ [-0.236, -0.989, -3.033, 0., 1, "O", "O"],
248
+ [0.334, -0.145, -0.032, 0., 1, "C", "CB"],
249
+ [-0.273, 0.359, 1.277, 0., 1, "C", "CG"],
250
+ [0.589, -0.405, 2.678, 0., 1, "S", "SD"],
251
+ [-0.314, 0.353, 4.056, 0., 1, "C", "CE"],
252
+ [1.232, 0.661, -3.066, 0., 0, "O", "OXT"],
253
+ ],
254
+ "PHE": [
255
+ [1.317, 0.962, 1.014, 0., 1, "N", "N"],
256
+ [-0.020, 0.426, 1.300, 0., 1, "C", "CA"],
257
+ [-0.109, 0.047, 2.756, 0., 1, "C", "C"],
258
+ [0.879, -0.317, 3.346, 0., 1, "O", "O"],
259
+ [-0.270, -0.809, 0.434, 0., 1, "C", "CB"],
260
+ [-0.181, -0.430, -1.020, 0., 1, "C", "CG"],
261
+ [1.031, -0.498, -1.680, 0., 1, "C", "CD1"],
262
+ [-1.314, -0.018, -1.698, 0., 1, "C", "CD2"],
263
+ [1.112, -0.150, -3.015, 0., 1, "C", "CE1"],
264
+ [-1.231, 0.333, -3.032, 0., 1, "C", "CE2"],
265
+ [-0.018, 0.265, -3.691, 0., 1, "C", "CZ"],
266
+ [-1.286, 0.113, 3.396, 0., 0, "O", "OXT"],
267
+ ],
268
+ "PRO": [
269
+ [-0.816, 1.108, 0.254, 0., 1, "N", "N"],
270
+ [0.001, -0.107, 0.509, 0., 1, "C", "CA"],
271
+ [1.408, 0.091, 0.005, 0., 1, "C", "C"],
272
+ [1.650, 0.980, -0.777, 0., 1, "O", "O"],
273
+ [-0.703, -1.227, -0.286, 0., 1, "C", "CB"],
274
+ [-2.163, -0.753, -0.439, 0., 1, "C", "CG"],
275
+ [-2.218, 0.614, 0.276, 0., 1, "C", "CD"],
276
+ [2.391, -0.721, 0.424, 0., 0, "O", "OXT"],
277
+ ],
278
+ "SER": [
279
+ [1.525, 0.493, -0.608, 0., 1, "N", "N"],
280
+ [0.100, 0.469, -0.252, 0., 1, "C", "CA"],
281
+ [-0.053, 0.004, 1.173, 0., 1, "C", "C"],
282
+ [0.751, -0.760, 1.649, 0., 1, "O", "O"],
283
+ [-0.642, -0.489, -1.184, 0., 1, "C", "CB"],
284
+ [-0.496, -0.049, -2.535, 0., 1, "O", "OG"],
285
+ [-1.084, 0.440, 1.913, 0., 0, "O", "OXT"],
286
+ ],
287
+ "THR": [
288
+ [1.543, -0.702, 0.430, 0., 1, "N", "N"],
289
+ [0.122, -0.706, 0.056, 0., 1, "C", "CA"],
290
+ [-0.038, -0.090, -1.309, 0., 1, "C", "C"],
291
+ [0.732, 0.761, -1.683, 0., 1, "O", "O"],
292
+ [-0.675, 0.104, 1.079, 0., 1, "C", "CB"],
293
+ [-0.193, 1.448, 1.103, 0., 1, "O", "OG1"],
294
+ [-0.511, -0.521, 2.466, 0., 1, "C", "CG2"],
295
+ [-1.039, -0.488, -2.110, 0., 0, "O", "OXT"],
296
+ ],
297
+ "TRP": [
298
+ [1.278, 1.121, 2.059, 0., 1, "N", "N"],
299
+ [-0.008, 0.417, 1.970, 0., 1, "C", "CA"],
300
+ [-0.490, 0.076, 3.357, 0., 1, "C", "C"],
301
+ [0.308, -0.130, 4.240, 0., 1, "O", "O"],
302
+ [0.168, -0.868, 1.161, 0., 1, "C", "CB"],
303
+ [0.650, -0.526, -0.225, 0., 1, "C", "CG"],
304
+ [1.928, -0.418, -0.622, 0., 1, "C", "CD1"],
305
+ [-0.186, -0.256, -1.396, 0., 1, "C", "CD2"],
306
+ [1.978, -0.095, -1.951, 0., 1, "N", "NE1"],
307
+ [0.701, 0.014, -2.454, 0., 1, "C", "CE2"],
308
+ [-1.564, -0.210, -1.615, 0., 1, "C", "CE3"],
309
+ [0.190, 0.314, -3.712, 0., 1, "C", "CZ2"],
310
+ [-2.044, 0.086, -2.859, 0., 1, "C", "CZ3"],
311
+ [-1.173, 0.348, -3.907, 0., 1, "C", "CH2"],
312
+ [-1.806, 0.001, 3.610, 0., 0, "O", "OXT"],
313
+ ],
314
+ "TYR": [
315
+ [1.320, 0.952, 1.428, 0., 1, "N", "N"],
316
+ [-0.018, 0.429, 1.734, 0., 1, "C", "CA"],
317
+ [-0.103, 0.094, 3.201, 0., 1, "C", "C"],
318
+ [0.886, -0.254, 3.799, 0., 1, "O", "O"],
319
+ [-0.274, -0.831, 0.907, 0., 1, "C", "CB"],
320
+ [-0.189, -0.496, -0.559, 0., 1, "C", "CG"],
321
+ [1.022, -0.589, -1.219, 0., 1, "C", "CD1"],
322
+ [-1.324, -0.102, -1.244, 0., 1, "C", "CD2"],
323
+ [1.103, -0.282, -2.563, 0., 1, "C", "CE1"],
324
+ [-1.247, 0.210, -2.587, 0., 1, "C", "CE2"],
325
+ [-0.032, 0.118, -3.252, 0., 1, "C", "CZ"],
326
+ [0.044, 0.420, -4.574, 0., 1, "O", "OH"],
327
+ [-1.279, 0.184, 3.842, 0., 0, "O", "OXT"],
328
+ ],
329
+ "VAL": [
330
+ [1.564, -0.642, 0.454, 0., 1, "N", "N"],
331
+ [0.145, -0.698, 0.079, 0., 1, "C", "CA"],
332
+ [-0.037, -0.093, -1.288, 0., 1, "C", "C"],
333
+ [0.703, 0.784, -1.664, 0., 1, "O", "O"],
334
+ [-0.682, 0.086, 1.098, 0., 1, "C", "CB"],
335
+ [-0.497, -0.528, 2.487, 0., 1, "C", "CG1"],
336
+ [-0.218, 1.543, 1.119, 0., 1, "C", "CG2"],
337
+ [-1.022, -0.529, -2.089, 0., 0, "O", "OXT"],
338
+ ],
339
+ "A ": [
340
+ [2.135, -1.141, -5.313, 0., 0, "O", "OP3"],
341
+ [1.024, -0.137, -4.723, 0., 1, "P", "P"],
342
+ [1.633, 1.190, -4.488, 0., 1, "O", "OP1"],
343
+ [-0.183, 0.005, -5.778, 0., 1, "O", "OP2"],
344
+ [0.456, -0.720, -3.334, 0., 1, "O", "O5'"],
345
+ [-0.520, 0.209, -2.863, 0., 1, "C", "C5'"],
346
+ [-1.101, -0.287, -1.538, 0., 1, "C", "C4'"],
347
+ [-0.064, -0.383, -0.538, 0., 1, "O", "O4'"],
348
+ [-2.105, 0.739, -0.969, 0., 1, "C", "C3'"],
349
+ [-3.445, 0.360, -1.287, 0., 1, "O", "O3'"],
350
+ [-1.874, 0.684, 0.558, 0., 1, "C", "C2'"],
351
+ [-3.065, 0.271, 1.231, 0., 1, "O", "O2'"],
352
+ [-0.755, -0.367, 0.729, 0., 1, "C", "C1'"],
353
+ [0.158, 0.029, 1.803, 0., 1, "N", "N9"],
354
+ [1.265, 0.813, 1.672, 0., 1, "C", "C8"],
355
+ [1.843, 0.963, 2.828, 0., 1, "N", "N7"],
356
+ [1.143, 0.292, 3.773, 0., 1, "C", "C5"],
357
+ [1.290, 0.091, 5.156, 0., 1, "C", "C6"],
358
+ [2.344, 0.664, 5.846, 0., 1, "N", "N6"],
359
+ [0.391, -0.656, 5.787, 0., 1, "N", "N1"],
360
+ [-0.617, -1.206, 5.136, 0., 1, "C", "C2"],
361
+ [-0.792, -1.051, 3.841, 0., 1, "N", "N3"],
362
+ [0.056, -0.320, 3.126, 0., 1, "C", "C4"],
363
+ ],
364
+ "G ": [
365
+ [-1.945, -1.360, 5.599, 0., 0, "O", "OP3"],
366
+ [-0.911, -0.277, 5.008, 0., 1, "P", "P"],
367
+ [-1.598, 1.022, 4.844, 0., 1, "O", "OP1"],
368
+ [0.325, -0.105, 6.025, 0., 1, "O", "OP2"],
369
+ [-0.365, -0.780, 3.580, 0., 1, "O", "O5'"],
370
+ [0.542, 0.217, 3.109, 0., 1, "C", "C5'"],
371
+ [1.100, -0.200, 1.748, 0., 1, "C", "C4'"],
372
+ [0.033, -0.318, 0.782, 0., 1, "O", "O4'"],
373
+ [2.025, 0.898, 1.182, 0., 1, "C", "C3'"],
374
+ [3.395, 0.582, 1.439, 0., 1, "O", "O3'"],
375
+ [1.741, 0.884, -0.338, 0., 1, "C", "C2'"],
376
+ [2.927, 0.560, -1.066, 0., 1, "O", "O2'"],
377
+ [0.675, -0.220, -0.507, 0., 1, "C", "C1'"],
378
+ [-0.297, 0.162, -1.534, 0., 1, "N", "N9"],
379
+ [-1.440, 0.880, -1.334, 0., 1, "C", "C8"],
380
+ [-2.066, 1.037, -2.464, 0., 1, "N", "N7"],
381
+ [-1.364, 0.431, -3.453, 0., 1, "C", "C5"],
382
+ [-1.556, 0.279, -4.846, 0., 1, "C", "C6"],
383
+ [-2.534, 0.755, -5.397, 0., 1, "O", "O6"],
384
+ [-0.626, -0.401, -5.551, 0., 1, "N", "N1"],
385
+ [0.459, -0.934, -4.923, 0., 1, "C", "C2"],
386
+ [1.384, -1.626, -5.664, 0., 1, "N", "N2"],
387
+ [0.649, -0.800, -3.630, 0., 1, "N", "N3"],
388
+ [-0.226, -0.134, -2.868, 0., 1, "C", "C4"],
389
+ ],
390
+ "C ": [
391
+ [2.147, -1.021, -4.678, 0., 0, "O", "OP3"],
392
+ [1.049, -0.039, -4.028, 0., 1, "P", "P"],
393
+ [1.692, 1.237, -3.646, 0., 1, "O", "OP1"],
394
+ [-0.116, 0.246, -5.102, 0., 1, "O", "OP2"],
395
+ [0.415, -0.733, -2.721, 0., 1, "O", "O5'"],
396
+ [-0.546, 0.181, -2.193, 0., 1, "C", "C5'"],
397
+ [-1.189, -0.419, -0.942, 0., 1, "C", "C4'"],
398
+ [-0.190, -0.648, 0.076, 0., 1, "O", "O4'"],
399
+ [-2.178, 0.583, -0.307, 0., 1, "C", "C3'"],
400
+ [-3.518, 0.283, -0.703, 0., 1, "O", "O3'"],
401
+ [-2.001, 0.373, 1.215, 0., 1, "C", "C2'"],
402
+ [-3.228, -0.059, 1.806, 0., 1, "O", "O2'"],
403
+ [-0.924, -0.729, 1.317, 0., 1, "C", "C1'"],
404
+ [-0.036, -0.470, 2.453, 0., 1, "N", "N1"],
405
+ [0.652, 0.683, 2.514, 0., 1, "C", "C2"],
406
+ [0.529, 1.504, 1.620, 0., 1, "O", "O2"],
407
+ [1.467, 0.945, 3.535, 0., 1, "N", "N3"],
408
+ [1.620, 0.070, 4.520, 0., 1, "C", "C4"],
409
+ [2.464, 0.350, 5.569, 0., 1, "N", "N4"],
410
+ [0.916, -1.151, 4.483, 0., 1, "C", "C5"],
411
+ [0.087, -1.399, 3.442, 0., 1, "C", "C6"],
412
+ ],
413
+ "U ": [
414
+ [-2.122, 1.033, -4.690, 0., 0, "O", "OP3"],
415
+ [-1.030, 0.047, -4.037, 0., 1, "P", "P"],
416
+ [-1.679, -1.228, -3.660, 0., 1, "O", "OP1"],
417
+ [0.138, -0.241, -5.107, 0., 1, "O", "OP2"],
418
+ [-0.399, 0.736, -2.726, 0., 1, "O", "O5'"],
419
+ [0.557, -0.182, -2.196, 0., 1, "C", "C5'"],
420
+ [1.197, 0.415, -0.942, 0., 1, "C", "C4'"],
421
+ [0.194, 0.645, 0.074, 0., 1, "O", "O4'"],
422
+ [2.181, -0.588, -0.301, 0., 1, "C", "C3'"],
423
+ [3.524, -0.288, -0.686, 0., 1, "O", "O3'"],
424
+ [1.995, -0.383, 1.218, 0., 1, "C", "C2'"],
425
+ [3.219, 0.046, 1.819, 0., 1, "O", "O2'"],
426
+ [0.922, 0.723, 1.319, 0., 1, "C", "C1'"],
427
+ [0.028, 0.464, 2.451, 0., 1, "N", "N1"],
428
+ [-0.690, -0.671, 2.486, 0., 1, "C", "C2"],
429
+ [-0.587, -1.474, 1.580, 0., 1, "O", "O2"],
430
+ [-1.515, -0.936, 3.517, 0., 1, "N", "N3"],
431
+ [-1.641, -0.055, 4.530, 0., 1, "C", "C4"],
432
+ [-2.391, -0.292, 5.460, 0., 1, "O", "O4"],
433
+ [-0.894, 1.146, 4.502, 0., 1, "C", "C5"],
434
+ [-0.070, 1.384, 3.459, 0., 1, "C", "C6"],
435
+ ],
436
+ "DA ": [
437
+ [1.845, -1.282, -5.339, 0., 0, "O", "OP3"],
438
+ [0.934, -0.156, -4.636, 0., 1, "P", "P"],
439
+ [1.781, 0.996, -4.255, 0., 1, "O", "OP1"],
440
+ [-0.204, 0.331, -5.665, 0., 1, "O", "OP2"],
441
+ [0.241, -0.771, -3.320, 0., 1, "O", "O5'"],
442
+ [-0.549, 0.270, -2.744, 0., 1, "C", "C5'"],
443
+ [-1.239, -0.251, -1.482, 0., 1, "C", "C4'"],
444
+ [-0.267, -0.564, -0.458, 0., 1, "O", "O4'"],
445
+ [-2.105, 0.859, -0.835, 0., 1, "C", "C3'"],
446
+ [-3.409, 0.895, -1.418, 0., 1, "O", "O3'"],
447
+ [-2.173, 0.398, 0.640, 0., 1, "C", "C2'"],
448
+ [-0.965, -0.545, 0.797, 0., 1, "C", "C1'"],
449
+ [-0.078, -0.047, 1.852, 0., 1, "N", "N9"],
450
+ [0.962, 0.817, 1.689, 0., 1, "C", "C8"],
451
+ [1.535, 1.044, 2.835, 0., 1, "N", "N7"],
452
+ [0.897, 0.346, 3.805, 0., 1, "C", "C5"],
453
+ [1.069, 0.196, 5.191, 0., 1, "C", "C6"],
454
+ [2.079, 0.869, 5.856, 0., 1, "N", "N6"],
455
+ [0.236, -0.603, 5.850, 0., 1, "N", "N1"],
456
+ [-0.729, -1.249, 5.224, 0., 1, "C", "C2"],
457
+ [-0.925, -1.144, 3.927, 0., 1, "N", "N3"],
458
+ [-0.142, -0.368, 3.184, 0., 1, "C", "C4"],
459
+ ],
460
+ "DG ": [
461
+ [-1.603, -1.547, 5.624, 0., 0, "O", "OP3"],
462
+ [-0.818, -0.321, 4.935, 0., 1, "P", "P"],
463
+ [-1.774, 0.766, 4.630, 0., 1, "O", "OP1"],
464
+ [0.312, 0.224, 5.941, 0., 1, "O", "OP2"],
465
+ [-0.126, -0.826, 3.572, 0., 1, "O", "O5'"],
466
+ [0.550, 0.300, 3.011, 0., 1, "C", "C5'"],
467
+ [1.233, -0.113, 1.706, 0., 1, "C", "C4'"],
468
+ [0.253, -0.471, 0.705, 0., 1, "O", "O4'"],
469
+ [1.976, 1.091, 1.073, 0., 1, "C", "C3'"],
470
+ [3.294, 1.218, 1.612, 0., 1, "O", "O3'"],
471
+ [2.026, 0.692, -0.421, 0., 1, "C", "C2'"],
472
+ [0.897, -0.345, -0.573, 0., 1, "C", "C1'"],
473
+ [-0.068, 0.111, -1.575, 0., 1, "N", "N9"],
474
+ [-1.172, 0.877, -1.341, 0., 1, "C", "C8"],
475
+ [-1.804, 1.094, -2.458, 0., 1, "N", "N7"],
476
+ [-1.145, 0.482, -3.472, 0., 1, "C", "C5"],
477
+ [-1.361, 0.377, -4.866, 0., 1, "C", "C6"],
478
+ [-2.321, 0.914, -5.391, 0., 1, "O", "O6"],
479
+ [-0.473, -0.327, -5.601, 0., 1, "N", "N1"],
480
+ [0.593, -0.928, -5.003, 0., 1, "C", "C2"],
481
+ [1.474, -1.643, -5.774, 0., 1, "N", "N2"],
482
+ [0.804, -0.839, -3.709, 0., 1, "N", "N3"],
483
+ [-0.027, -0.152, -2.917, 0., 1, "C", "C4"],
484
+ ],
485
+ "DC ": [
486
+ [1.941, -1.055, -4.672, 0., 0, "O", "OP3"],
487
+ [0.987, -0.017, -3.894, 0., 1, "P", "P"],
488
+ [1.802, 1.099, -3.365, 0., 1, "O", "OP1"],
489
+ [-0.119, 0.560, -4.910, 0., 1, "O", "OP2"],
490
+ [0.255, -0.772, -2.674, 0., 1, "O", "O5'"],
491
+ [-0.571, 0.196, -2.027, 0., 1, "C", "C5'"],
492
+ [-1.300, -0.459, -0.852, 0., 1, "C", "C4'"],
493
+ [-0.363, -0.863, 0.171, 0., 1, "O", "O4'"],
494
+ [-2.206, 0.569, -0.129, 0., 1, "C", "C3'"],
495
+ [-3.488, 0.649, -0.756, 0., 1, "O", "O3'"],
496
+ [-2.322, -0.040, 1.288, 0., 1, "C", "C2'"],
497
+ [-1.106, -0.981, 1.395, 0., 1, "C", "C1'"],
498
+ [-0.267, -0.584, 2.528, 0., 1, "N", "N1"],
499
+ [0.270, 0.648, 2.563, 0., 1, "C", "C2"],
500
+ [0.052, 1.424, 1.647, 0., 1, "O", "O2"],
501
+ [1.037, 1.035, 3.581, 0., 1, "N", "N3"],
502
+ [1.291, 0.212, 4.589, 0., 1, "C", "C4"],
503
+ [2.085, 0.622, 5.635, 0., 1, "N", "N4"],
504
+ [0.746, -1.088, 4.580, 0., 1, "C", "C5"],
505
+ [-0.035, -1.465, 3.541, 0., 1, "C", "C6"],
506
+ ],
507
+ "DT ": [
508
+ [-3.912, -2.311, 1.636, 0., 0, "O", "OP3"],
509
+ [-3.968, -1.665, 3.118, 0., 1, "P", "P"],
510
+ [-4.406, -2.599, 4.208, 0., 1, "O", "OP1"],
511
+ [-4.901, -0.360, 2.920, 0., 1, "O", "OP2"],
512
+ [-2.493, -1.028, 3.315, 0., 1, "O", "O5'"],
513
+ [-2.005, -0.136, 2.327, 0., 1, "C", "C5'"],
514
+ [-0.611, 0.328, 2.728, 0., 1, "C", "C4'"],
515
+ [0.247, -0.829, 2.764, 0., 1, "O", "O4'"],
516
+ [0.008, 1.286, 1.720, 0., 1, "C", "C3'"],
517
+ [0.965, 2.121, 2.368, 0., 1, "O", "O3'"],
518
+ [0.710, 0.360, 0.754, 0., 1, "C", "C2'"],
519
+ [1.157, -0.778, 1.657, 0., 1, "C", "C1'"],
520
+ [1.164, -2.047, 0.989, 0., 1, "N", "N1"],
521
+ [2.333, -2.544, 0.374, 0., 1, "C", "C2"],
522
+ [3.410, -1.945, 0.363, 0., 1, "O", "O2"],
523
+ [2.194, -3.793, -0.240, 0., 1, "N", "N3"],
524
+ [1.047, -4.570, -0.300, 0., 1, "C", "C4"],
525
+ [0.995, -5.663, -0.857, 0., 1, "O", "O4"],
526
+ [-0.143, -3.980, 0.369, 0., 1, "C", "C5"],
527
+ [-1.420, -4.757, 0.347, 0., 1, "C", "C7"],
528
+ [-0.013, -2.784, 0.958, 0., 1, "C", "C6"],
529
+ ],
530
+ }
531
+
532
# Per-residue list of reference atom names, in reference-table row order.
# Placeholder codes (UNK / N / DN / GAP) carry no reference atoms and are skipped.
standard_ccd_to_ref_atom_name_chars = {
    code: [atom_row[-1] for atom_row in standard_ccd_to_reference_features_table[code]]
    for code in standard_ccds
    if not is_unk(code)
}
536
+
537
# Identity matrices reused as one-hot lookup tables when building features.
eye_64 = np.eye(64)
eye_128 = np.eye(128)
eye_9 = np.eye(9)
eye_7 = np.eye(7)
eye_3 = np.eye(3)
eye_32 = np.eye(32)
eye_5 = np.eye(5)
# NOTE(review): eye8/eye5 break the eye_* naming scheme above, and eye5 is a
# second, independent copy of eye_5; both are kept for backward compatibility.
eye8 = np.eye(8)
eye5 = np.eye(5)
546
+
547
+
548
def _get_ref_feat_from_ccd_data(ccd, ref_feat_table):
    """Assemble the per-atom reference feature matrix for one residue.

    Each table row looks like [x, y, z, charge, mask, element, atom_name].
    The output row concatenates: the 5 leading scalars, a 128-way one-hot of
    the element symbol, and four 64-way one-hots of the atom name padded to
    4 characters (printable ASCII, offset by 32).

    Returns an array of shape (num_atoms, 5 + 128 + 4 * 64).
    """
    rows = []
    for atom in ref_feat_table[ccd]:
        element_one_hot = eye_128[get_element_id[atom[5].lower()]]
        name_one_hots = [eye_64[ord(char) - 32] for char in f"{atom[-1]:<4}"]
        rows.append(
            np.concatenate([np.array(atom[:5]), element_one_hot, *name_one_hots], axis=-1)
        )
    return np.stack(rows, axis=0)
557
+
558
+
559
# Precomputed reference-feature matrices for every concrete standard residue;
# placeholder codes (UNK / N / DN / GAP) have no reference geometry and are omitted.
standard_ccd_to_ref_feat = {
    code: _get_ref_feat_from_ccd_data(code, standard_ccd_to_reference_features_table)
    for code in standard_ccds
    if not is_unk(code)
}
PhysDock/data/constants/restype_constants.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ from .PDBData import protein_letters_3to1_extended, nucleic_letters_3to1_extended
3
+
4
# One-letter residue code -> CCD-style code. Amino acids use the standard
# alphabet plus "X" for UNK; nucleotides are encoded with digits "0"-"9"
# (RNA A/G/C/U/N, then DNA DA/DG/DC/DT/DN), space-padded so every value is a
# fixed-width three-character key.
restype_1_to_3 = {
    "A": "ALA", "R": "ARG", "N": "ASN", "D": "ASP", "C": "CYS",
    "Q": "GLN", "E": "GLU", "G": "GLY", "H": "HIS", "I": "ILE",
    "L": "LEU", "K": "LYS", "M": "MET", "F": "PHE", "P": "PRO",
    "S": "SER", "T": "THR", "W": "TRP", "Y": "TYR", "V": "VAL",
    "X": "UNK",
    "0": "A ", "1": "G ", "2": "C ", "3": "U ", "4": "N ",
    "5": "DA ", "6": "DG ", "7": "DC ", "8": "DT ", "9": "DN ",
}
# Single nucleic-acid letter -> padded RNA-style code; "T" and "X" fall back
# to "T "/"N " respectively.
na_c_to_type = {
    "A": "A ", "G": "G ", "C": "C ", "U": "U ", "N": "N ", "T": "T ", "X": "N "
}

# Inverse mapping (values of restype_1_to_3 are unique, so this is lossless).
# Ribo-thymidine "T " has no one-letter code of its own and is folded onto
# "8" (the DT digit).
restype_3_to_1 = {v: k for k, v in restype_1_to_3.items()}
restype_3_to_1["T "]="8"
19
+
20
+
21
# Canonical ordering of the 31 supported CCD codes: 20 amino acids + UNK,
# 5 RNA, 5 DNA. This order defines the integer restype ids used elsewhere.
restypes3 = [
    "ALA", "ARG", "ASN", "ASP", "CYS", "GLN", "GLU", "GLY", "HIS", "ILE",
    "LEU", "LYS", "MET", "PHE", "PRO", "SER", "THR", "TRP", "TYR", "VAL", "UNK",
    "A ", "G ", "C ", "U ", "N ",
    "DA ", "DG ", "DC ", "DT ", "DN ",
]
# Same codes in one-letter/digit form, in the same order.
restypes1 = [restype_3_to_1[ccd] for ccd in restypes3]

# Extended 3-char -> 1-char mapping covering modified residues/nucleotides.
# Protein keys are left-padded to width 3 to match the fixed-width convention.
restype_3_to_1_extended = {}

for c3, c in protein_letters_3to1_extended.items():
    restype_3_to_1_extended[f"{c3:<3}"] = c
# TODO: How to distinguish RNA and DNA
# NOTE(review): modified nucleotides are mapped through na_c_to_type, which is
# RNA-biased — DNA-only modifications also resolve to RNA codes here.
for c3, c in nucleic_letters_3to1_extended.items():
    restype_3_to_1_extended[c3] = restype_3_to_1[na_c_to_type[c]]
# Standard codes take precedence over any extended entry with the same key.
restype_3_to_1_extended.update(restype_3_to_1)
37
+
38
+ ############
39
+ standard_protein = ["ALA", "ARG", "ASN", "ASP", "CYS", "GLN", "GLU", "GLY", "HIS", "ILE",
40
+ "LEU", "LYS", "MET", "PHE", "PRO", "SER", "THR", "TRP", "TYR", "VAL", "UNK", ]
41
+ standard_rna = ["A ", "G ", "C ", "U ", "N ", ]
42
+ standard_dna = ["DA ", "DG ", "DC ", "DT ", "DN ", ]
43
+ standard_nucleics = standard_rna + standard_dna
44
+ standard_ccds_without_gap = standard_protein + standard_nucleics
45
+ GAP = ["GAP"] # used in msa one-hot
46
+ standard_ccds = standard_protein + standard_nucleics + GAP
47
+
48
+ standard_ccd_to_order = {ccd: id for id, ccd in enumerate(standard_ccds)}
49
+
50
+ standard_purines = ["A ", "G ", "DA ", "DG "]
51
+ standard_pyrimidines = ["C ", "U ", "DC ", "DT "]
52
+
53
+ is_standard = lambda x: x in standard_ccds
54
+ is_unk = lambda x: x in ["UNK", "N ", "DN ", "GAP", "UNL"]
55
+ is_protein = lambda x: x in standard_protein and not is_unk(x)
56
+ is_rna = lambda x: x in standard_rna and not is_unk(x)
57
+ is_dna = lambda x: x in standard_dna and not is_unk(x)
58
+ is_nucleics = lambda x: x in standard_nucleics and not is_unk(x)
59
+ is_purines = lambda x: x in standard_purines
60
+ is_pyrimidines = lambda x: x in standard_pyrimidines
61
+
62
+
63
+ standard_ccd_to_atoms_num = {s: n for s, n in zip(standard_ccds, [
64
+ 5, 11, 8, 8, 6, 9, 9, 4, 10, 8,
65
+ 8, 9, 8, 11, 7, 6, 7, 14, 12, 7, None,
66
+ 22, 23, 20, 20, None,
67
+ 21, 22, 19, 20, None,
68
+ None,
69
+ ])}
70
+
71
# Representative ("centre") atom used to collapse a token to a single atom:
# CA for amino acids, C1' for nucleotides.
standard_ccd_to_token_centre_atom_name = {
    **{residue: "CA" for residue in standard_protein},
    **{residue: "C1'" for residue in standard_nucleics},
}

# The three atoms defining a per-token rigid frame:
# proteins use (N, CA, C); nucleotides use (C1', C3', C4').
standard_ccd_to_frame_atom_name_0 = {
    **{residue: "N" for residue in standard_protein},
    **{residue: "C1'" for residue in standard_nucleics},
}

standard_ccd_to_frame_atom_name_1 = {
    **{residue: "CA" for residue in standard_protein},
    **{residue: "C3'" for residue in standard_nucleics},
}

standard_ccd_to_frame_atom_name_2 = {
    **{residue: "C" for residue in standard_protein},
    **{residue: "C4'" for residue in standard_nucleics},
}

# Pseudo-beta atom per token: CB for amino acids (overridden to CA for GLY,
# which has no CB), C4 for purines and C2 for pyrimidines.
standard_ccd_to_token_pseudo_beta_atom_name = {
    **{residue: "CB" for residue in standard_protein},
    **{residue: "C4" for residue in standard_purines},
    **{residue: "C2" for residue in standard_pyrimidines},
}
standard_ccd_to_token_pseudo_beta_atom_name.update({"GLY": "CA"})
97
+
98
+
99
# Pre-built identity matrices used as one-hot lookup tables by feature
# builders importing this module.
# NOTE(review): eye_5/eye5 and eye8 (vs the eye_* spelling) are inconsistent
# duplicates; both spellings are kept because callers may import either.
eye_64 = np.eye(64)
eye_128 = np.eye(128)
eye_9 = np.eye(9)
eye_7 = np.eye(7)
eye_3 = np.eye(3)
eye_32 = np.eye(32)
eye_5 = np.eye(5)
eye8 = np.eye(8)
eye5 = np.eye(5)
PhysDock/data/feature_loader.py ADDED
@@ -0,0 +1,1283 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import os
3
+ import random
4
+ from functools import reduce
5
+ from operator import add
6
+ import torch
7
+ import torch.nn.functional as F
8
+ import numpy as np
9
+ from typing import Optional
10
+
11
+ from PhysDock.data.constants.PDBData import protein_letters_3to1_extended
12
+ from PhysDock.data.constants import restype_constants as rc
13
+ from PhysDock.utils.io_utils import convert_md5_string, load_json, load_pkl, dump_txt, find_files
14
+ from PhysDock.data.tools.feature_processing_multimer import pair_and_merge
15
+ from PhysDock.utils.tensor_utils import centre_random_augmentation_np_apply, dgram_from_positions, \
16
+ centre_random_augmentation_np_batch
17
+ from PhysDock.data.constants.periodic_table import PeriodicTable
18
+ from PhysDock.data.tools.rdkit import get_features_from_smi
19
+
20
# The 62 single-character chain identifiers usable in PDB files (A-Z, a-z, 0-9).
PDB_CHAIN_IDS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
21
+
22
+
23
+ class FeatureLoader:
24
    def __init__(
            self,
            # Dataset Config
            dataset_path=None,
            msa_features_dir=None,
            ccd_id_meta_data=None,

            crop_size=256,
            atom_crop_size=256 * 8,

            # Infer config
            inference_mode=False,
            infer_pocket_type="atom",  # "ca"
            infer_pocket_cutoff=6,  # 8 10 12
            infer_pocket_dist_type="ligand",  # "ligand_centre"
            infer_use_pocket=True,
            infer_use_key_res=True,

            # Train Config
            train_pocket_type_atom_ratio=0.5,
            train_pocket_cutoff_ligand_min=6,
            train_pocket_cutoff_ligand_max=12,
            train_pocket_cutoff_ligand_centre_min=10,
            train_pocket_cutoff_ligand_centre_max=16,
            train_pocket_dist_type_ligand_ratio=0.5,
            train_use_pocket_ratio=0.5,
            train_use_key_res_ratio=0.5,

            train_shuffle_sym_id=True,
            train_spatial_crop_ligand_ratio=0.2,
            train_spatial_crop_interface_ratio=0.4,
            train_spatial_crop_interface_threshold=15.,
            train_charility_augmentation_ratio=0.1,
            train_use_template_ratio=0.75,
            train_template_mask_max_ratio=0.4,

            # Other Configs
            max_msa_clusters=128,
            key_res_random_mask_ratio=0.5,

            # Abalation
            use_x_gt_ligand_as_ref_pos=False,

            # Recycle
            num_recycles=None
    ):
        """Configure a feature loader for training or inference.

        ``dataset_path`` is expected to contain ``msa_features/``,
        ``uniprot_msa_features/``, optionally ``train_val/`` +
        ``train_val_weights.json`` (sampling weights), and
        ``ccd_id_meta_data.pkl.gz``. ``msa_features_dir`` overrides the MSA
        locations; ``ccd_id_meta_data`` overrides the metadata pickle path.
        The remaining arguments are stored verbatim as sampling/cropping/
        augmentation knobs consumed by the other methods.
        """
        # Init Dataset
        if dataset_path is not None:
            self.msa_features_path = os.path.join(dataset_path, "msa_features")
            self.uniprot_msa_features_path = os.path.join(dataset_path, "uniprot_msa_features")
            if os.path.exists(os.path.join(dataset_path, "train_val")):
                self.used_sample_ids = find_files(os.path.join(dataset_path, "train_val"))
                if os.path.exists(os.path.join(dataset_path, "train_val_weights.json")):
                    weights = load_json(os.path.join(dataset_path, "train_val_weights.json"))
                    self.weights = np.array([weights[sample_id] for sample_id in self.used_sample_ids])
                    # Normalised sampling distribution over train/val samples.
                    self.probabilities = torch.from_numpy(self.weights / self.weights.sum())
        if msa_features_dir is not None:
            self.msa_features_path = os.path.join(msa_features_dir, "msa_features")
            self.uniprot_msa_features_path = os.path.join(msa_features_dir, "uniprot_msa_features")
        if ccd_id_meta_data is None:
            # NOTE(review): the ``or ccd_id_meta_data is not None`` disjunct is
            # always False inside this branch — the assert reduces to the
            # os.path.exists check; also requires dataset_path to be set here.
            assert os.path.exists(os.path.join(dataset_path, "ccd_id_meta_data.pkl.gz")) or ccd_id_meta_data is not None
            self.ccd_id_meta_data = load_pkl(os.path.join(dataset_path, "ccd_id_meta_data.pkl.gz"))
        else:
            self.ccd_id_meta_data = load_pkl(ccd_id_meta_data)

        # Inference Config
        self.inference_mode = inference_mode
        self.infer_use_pocket = infer_use_pocket
        self.infer_use_key_res = infer_use_key_res
        self.infer_pocket_type = infer_pocket_type
        self.infer_pocket_cutoff = infer_pocket_cutoff
        self.infer_pocket_dist_type = infer_pocket_dist_type

        # Training Config
        self.train_pocket_type_atom_ratio = train_pocket_type_atom_ratio
        self.train_pocket_cutoff_ligand_min = train_pocket_cutoff_ligand_min
        self.train_pocket_cutoff_ligand_max = train_pocket_cutoff_ligand_max
        self.train_pocket_cutoff_ligand_centre_min = train_pocket_cutoff_ligand_centre_min
        self.train_pocket_cutoff_ligand_centre_max = train_pocket_cutoff_ligand_centre_max
        self.train_pocket_dist_type_ligand_ratio = train_pocket_dist_type_ligand_ratio
        self.train_use_pocket_ratio = train_use_pocket_ratio
        self.train_use_key_res_ratio = train_use_key_res_ratio
        self.train_shuffle_sym_id = train_shuffle_sym_id
        self.train_spatial_crop_ligand_ratio = train_spatial_crop_ligand_ratio
        self.train_spatial_crop_interface_ratio = train_spatial_crop_interface_ratio
        self.train_spatial_crop_interface_threshold = train_spatial_crop_interface_threshold
        self.train_charility_augmentation_ratio = train_charility_augmentation_ratio
        self.train_use_template_ratio = train_use_template_ratio
        self.train_template_mask_max_ratio = train_template_mask_max_ratio

        # Other Configs
        # Max covalent-bond distance (Angstrom) used when deriving token bonds.
        self.token_bond_threshold = 2.4
        self.key_res_random_mask_ratio = key_res_random_mask_ratio
        self.crop_size = crop_size
        self.atom_crop_size = atom_crop_size
        self.max_msa_clusters = max_msa_clusters

        self.use_x_gt_ligand_as_ref_pos = use_x_gt_ligand_as_ref_pos

        self.num_recycles = num_recycles
137
+
138
    def _update_CONF_META_DATA(self, CONF_META_DATA, ccds):
        """Ensure CONF_META_DATA has an entry for every CCD code in *ccds*.

        Each entry bundles per-atom reference features ("ref_feat": centred
        reference positions + charge + one-hot element/degree/hybridization/
        valence/chirality + ring-membership flags), a pairwise token-relation
        tensor ("rel_tok_feat": graph distance, bond type and bond flags),
        plus raw atom names, element ids and the bond matrix. Existing
        entries are left untouched; the dict is mutated in place and returned.
        """
        for ccd in ccds:
            if ccd in CONF_META_DATA:
                continue
            ccd_features = self.ccd_id_meta_data[ccd]
            ref_pos = ccd_features["ref_pos"]
            # Centre the reference conformer at the origin.
            ref_pos = ref_pos - np.mean(ref_pos, axis=0, keepdims=True)
            CONF_META_DATA[ccd] = {
                "ref_feat": np.concatenate([
                    ref_pos,
                    ccd_features["ref_charge"][..., None],
                    rc.eye_128[ccd_features["ref_element"]].astype(np.float32),
                    ccd_features["ref_is_aromatic"].astype(np.float32)[..., None],
                    rc.eye_9[ccd_features["ref_degree"]].astype(np.float32),
                    rc.eye_7[ccd_features["ref_hybridization"]].astype(np.float32),
                    rc.eye_9[ccd_features["ref_implicit_valence"]].astype(np.float32),
                    rc.eye_3[ccd_features["ref_chirality"]].astype(np.float32),
                    ccd_features["ref_in_ring_of_3"].astype(np.float32)[..., None],
                    ccd_features["ref_in_ring_of_4"].astype(np.float32)[..., None],
                    ccd_features["ref_in_ring_of_5"].astype(np.float32)[..., None],
                    ccd_features["ref_in_ring_of_6"].astype(np.float32)[..., None],
                    ccd_features["ref_in_ring_of_7"].astype(np.float32)[..., None],
                    ccd_features["ref_in_ring_of_8"].astype(np.float32)[..., None],
                ], axis=-1),
                "rel_tok_feat": np.concatenate([
                    rc.eye_32[ccd_features["d_token"]].astype(np.float32),
                    rc.eye_5[ccd_features["bond_type"]].astype(np.float32),
                    ccd_features["token_bonds"].astype(np.float32)[..., None],
                    ccd_features["bond_as_double"].astype(np.float32)[..., None],
                    ccd_features["bond_in_ring"].astype(np.float32)[..., None],
                    ccd_features["bond_is_conjugated"].astype(np.float32)[..., None],
                    ccd_features["bond_is_aromatic"].astype(np.float32)[..., None],
                ], axis=-1),
                "ref_atom_name_chars": ccd_features["ref_atom_name_chars"],
                "ref_element": ccd_features["ref_element"],
                "token_bonds": ccd_features["token_bonds"],
            }

        return CONF_META_DATA
177
+
178
+ def _update_chain_feature(self, chain_feature, CONF_META_DATA, use_pocket, use_key_res, ):
179
+ ccds_ori = chain_feature["ccds"]
180
+ chain_class = chain_feature["chain_class"]
181
+ if chain_class == "protein":
182
+ sequence = "".join([protein_letters_3to1_extended.get(ccd, "X") for ccd in ccds_ori])
183
+ md5 = convert_md5_string(f"protein:{sequence}")
184
+ # with open("add_msa.fasta", "a") as f:
185
+ # f.write(f">{md5}\n{sequence}\n")
186
+ try:
187
+ # import shutil
188
+ # shutil.copy(
189
+ # os.path.join(self.msa_features_path, f"{md5}.pkl.gz"),
190
+ # os.path.join("/home/zhangkexin/research/PhysDock/examples/demo/features/msa_features",
191
+ # f"{md5}.pkl.gz")
192
+ # )
193
+ # shutil.copy(
194
+ # os.path.join(self.uniprot_msa_features_path, f"{md5}.pkl.gz"),
195
+ # os.path.join("/home/zhangkexin/research/PhysDock/examples/demo/features/uniprot_msa_features",
196
+ # f"{md5}.pkl.gz")
197
+ # )
198
+ chain_feature.update(
199
+ load_pkl(os.path.join(self.msa_features_path, f"{md5}.pkl.gz"))
200
+ )
201
+ except:
202
+ print(f"Can't find msa feature!!! md5: {md5}")
203
+ with open("add_msa.fasta", "a") as f:
204
+ f.write(f">{md5}\n{sequence}\n")
205
+
206
+ chain_feature.update(
207
+ load_pkl(os.path.join(self.uniprot_msa_features_path, f"{md5}.pkl.gz"))
208
+ )
209
+ else:
210
+ chain_feature["msa"] = np.array([[rc.standard_ccds.index(ccd)
211
+ if ccd in rc.standard_ccds else 20 for ccd in ccds_ori]] * 2,
212
+ dtype=np.int8)
213
+ chain_feature["deletion_matrix"] = np.zeros_like(chain_feature["msa"])
214
+
215
+ # Merge Key Res Feat & Augmentation
216
+ if "salt bridges" in chain_feature and use_key_res:
217
+ key_res_feat = np.stack([
218
+ chain_feature["salt bridges"],
219
+ chain_feature["pi-cation interactions"],
220
+ chain_feature["hydrophobic interactions"],
221
+ chain_feature["pi-stacking"],
222
+ chain_feature["hydrogen bonds"],
223
+ chain_feature["metal complexes"],
224
+ np.zeros_like(chain_feature["salt bridges"]),
225
+ ], axis=-1).astype(np.float32)
226
+ else:
227
+ key_res_feat = np.zeros([len(ccds_ori), 7], dtype=np.float32)
228
+ is_key_res = np.any(key_res_feat.astype(np.bool_), axis=-1).astype(np.float32)
229
+ # Augmentation
230
+ # if not self.inference_mode:
231
+ key_res_feat = (key_res_feat *
232
+ (np.random.random([len(ccds_ori), 7]) > self.key_res_random_mask_ratio))
233
+ if "pocket_res_feat" in chain_feature and use_pocket:
234
+ pocket_res_feat = chain_feature["pocket_res_feat"]
235
+ else:
236
+ pocket_res_feat = np.zeros([len(ccds_ori)], dtype=np.float32)
237
+ x_gt = []
238
+ atom_id_to_conformer_atom_id = []
239
+
240
+ # Conformer
241
+ conformer_id_to_chunk_sizes = []
242
+ residue_index = []
243
+ restype = []
244
+ ccds = []
245
+
246
+ conformer_exists = []
247
+
248
+ for c_id, ccd in enumerate(chain_feature["ccds"]):
249
+ no_atom_this_conf = len(CONF_META_DATA[ccd]["ref_feat"])
250
+ conformer_atom_ids_this_conf = np.arange(no_atom_this_conf)
251
+
252
+ x_gt_this_conf = chain_feature["all_atom_positions"][c_id]
253
+ x_exists_this_conf = chain_feature["all_atom_mask"][c_id].astype(np.bool_)
254
+
255
+ # TODO DEBUG
256
+ # conformer_exist = np.sum(x_exists_this_conf).item() > len(x_exists_this_conf) - 2
257
+ conformer_exist = np.any(x_exists_this_conf).item()
258
+
259
+ if rc.is_standard(ccd):
260
+ conformer_exist = conformer_exist and x_exists_this_conf[1]
261
+ if ccd != "GLY":
262
+ conformer_exist = conformer_exist and x_exists_this_conf[4]
263
+
264
+ conformer_exists.append(conformer_exist)
265
+ if conformer_exist:
266
+ # Atomwise
267
+ x_gt.append(x_gt_this_conf[x_exists_this_conf])
268
+ atom_id_to_conformer_atom_id.append(conformer_atom_ids_this_conf[x_exists_this_conf])
269
+ # Tokenwise
270
+ residue_index.append(c_id)
271
+ conformer_id_to_chunk_sizes.append(np.sum(x_exists_this_conf).item())
272
+ restype.append(rc.standard_ccds.index(ccd) if ccd in rc.standard_ccds else 20)
273
+ ccds.append(ccd)
274
+ # print(ccds)
275
+ # print("x_gt", x_gt)
276
+ x_gt = np.concatenate(x_gt, axis=0)
277
+ atom_id_to_conformer_atom_id = np.concatenate(atom_id_to_conformer_atom_id, axis=0, dtype=np.int32)
278
+ residue_index = np.array(residue_index, dtype=np.int64)
279
+ conformer_id_to_chunk_sizes = np.array(conformer_id_to_chunk_sizes, dtype=np.int64)
280
+ restype = np.array(restype, dtype=np.int64)
281
+
282
+ conformer_exists = np.array(conformer_exists, dtype=np.bool_)
283
+
284
+ chain_feature_update = {
285
+ "x_gt": x_gt,
286
+ "atom_id_to_conformer_atom_id": atom_id_to_conformer_atom_id,
287
+ "residue_index": residue_index,
288
+ "conformer_id_to_chunk_sizes": conformer_id_to_chunk_sizes,
289
+ "restype": restype,
290
+ "ccds": ccds,
291
+ "msa": chain_feature["msa"][:, conformer_exists],
292
+ "deletion_matrix": chain_feature["deletion_matrix"][:, conformer_exists],
293
+ "chain_class": chain_class,
294
+ "key_res_feat": key_res_feat[conformer_exists],
295
+ "is_key_res": is_key_res[conformer_exists],
296
+ "pocket_res_feat": pocket_res_feat[conformer_exists],
297
+ }
298
+
299
+ chain_feature_update["is_protein"] = np.array([chain_class == "protein"] * len(ccds)).astype(np.float32)
300
+ chain_feature_update["is_ligand"] = np.array([chain_class != "protein"] * len(ccds)).astype(np.float32)
301
+ # Assert Short Poly Chain like peptide
302
+ chain_feature_update["is_short_poly"] = np.array(
303
+ [chain_class != "protein" and len(ccds) >= 2 and rc.is_standard(ccd) for ccd in ccds]
304
+ ).astype(np.float32)
305
+
306
+ if "msa_all_seq" in chain_feature:
307
+ chain_feature_update["msa_all_seq"] = chain_feature["msa_all_seq"][:, conformer_exists]
308
+ chain_feature_update["deletion_matrix_all_seq"] = \
309
+ chain_feature["deletion_matrix_all_seq"][:, conformer_exists]
310
+ chain_feature_update["msa_species_identifiers_all_seq"] = chain_feature["msa_species_identifiers_all_seq"]
311
+ del chain_feature
312
+ return chain_feature_update
313
+
314
    def _update_smi(self, smi, all_chain_labels, CONF_META_DATA):
        """Register a SMILES-defined ligand under the fixed code "XXX"/chain "99".

        Runs RDKit-based featurization on *smi*, stores its label features in
        *all_chain_labels* and its conformer meta data (same layout as
        ``_update_CONF_META_DATA`` entries) in *CONF_META_DATA*. Atom names
        are synthesized as element symbol + atom index, left-justified to
        width 4. Returns the two updated dicts plus the RDKit reference mol.
        """
        ccd = "XXX"
        chain_id = "99"
        label_feature, conf_feature, ref_mol = get_features_from_smi(smi)
        all_chain_labels[chain_id] = {
            "all_atom_positions": label_feature["x_gt"][None],
            "all_atom_mask": label_feature["x_exists"][None],
            "ccds": [ccd]
        }
        # Synthesize unique atom names, e.g. "C0  ", "N1  ".
        ref_atom_name_chars = []
        for id, ele in enumerate(conf_feature["ref_element"]):
            atom_name = f"{PeriodicTable[ele] + str(id):<4}"
            ref_atom_name_chars.append(atom_name)
        CONF_META_DATA[ccd] = {
            "ref_feat": np.concatenate([
                conf_feature["ref_pos"],
                conf_feature["ref_charge"][..., None],
                rc.eye_128[conf_feature["ref_element"]].astype(np.float32),
                conf_feature["ref_is_aromatic"].astype(np.float32)[..., None],
                rc.eye_9[conf_feature["ref_degree"]].astype(np.float32),
                rc.eye_7[conf_feature["ref_hybridization"]].astype(np.float32),
                rc.eye_9[conf_feature["ref_implicit_valence"]].astype(np.float32),
                rc.eye_3[conf_feature["ref_chirality"]].astype(np.float32),
                conf_feature["ref_in_ring_of_3"].astype(np.float32)[..., None],
                conf_feature["ref_in_ring_of_4"].astype(np.float32)[..., None],
                conf_feature["ref_in_ring_of_5"].astype(np.float32)[..., None],
                conf_feature["ref_in_ring_of_6"].astype(np.float32)[..., None],
                conf_feature["ref_in_ring_of_7"].astype(np.float32)[..., None],
                conf_feature["ref_in_ring_of_8"].astype(np.float32)[..., None],
            ], axis=-1),
            "rel_tok_feat": np.concatenate([
                rc.eye_32[conf_feature["d_token"]].astype(np.float32),
                rc.eye_5[conf_feature["bond_type"]].astype(np.float32),
                conf_feature["token_bonds"].astype(np.float32)[..., None],
                conf_feature["bond_as_double"].astype(np.float32)[..., None],
                conf_feature["bond_in_ring"].astype(np.float32)[..., None],
                conf_feature["bond_is_conjugated"].astype(np.float32)[..., None],
                conf_feature["bond_is_aromatic"].astype(np.float32)[..., None],
            ], axis=-1),
            "ref_atom_name_chars": ref_atom_name_chars,
            "ref_element": conf_feature["ref_element"],
            "token_bonds": conf_feature["token_bonds"],
        }

        return all_chain_labels, CONF_META_DATA, ref_mol
359
+
360
+ def _add_assembly_feature(self, all_chain_features, SEQ3, ASYM_ID):
361
+ entities = {}
362
+ for chain_id, seq3 in SEQ3.items():
363
+ if seq3 not in entities:
364
+ entities[seq3] = [chain_id]
365
+ else:
366
+ entities[seq3].append(chain_id)
367
+
368
+ asym_id = 0
369
+ for entity_id, seq3 in enumerate(list(entities.keys())):
370
+ chain_ids = copy.deepcopy(entities[seq3])
371
+ if not self.inference_mode and self.train_shuffle_sym_id:
372
+ # sym_id augmentation
373
+ random.shuffle(chain_ids)
374
+ for sym_id, chain_id in enumerate(chain_ids):
375
+ num_conformers = len(all_chain_features[chain_id]["ccds"])
376
+ all_chain_features[chain_id]["asym_id"] = \
377
+ np.full([num_conformers], fill_value=asym_id, dtype=np.int32)
378
+ all_chain_features[chain_id]["sym_id"] = \
379
+ np.full([num_conformers], fill_value=sym_id, dtype=np.int32)
380
+ all_chain_features[chain_id]["entity_id"] = \
381
+ np.full([num_conformers], fill_value=entity_id, dtype=np.int32)
382
+
383
+ all_chain_features[chain_id]["sequence_3"] = seq3
384
+ ASYM_ID[asym_id] = chain_id
385
+
386
+ asym_id += 1
387
+ return all_chain_features, ASYM_ID
388
+
389
    def _crop_all_chain_features(self, all_chain_features, infer_meta_data, crop_centre=None):
        """Spatially crop the assembly to at most crop_size tokens / atom_crop_size atoms.

        Picks a crop centre (ligand centroid at inference; ligand atom /
        interface CA / random CA during training, chosen by
        train_spatial_crop_* ratios), then greedily keeps whole conformers in
        order of centre-atom distance until a token or atom budget would be
        exceeded. All per-chain features are masked down to the kept
        conformers/atoms, and chains left with nothing are removed.
        NOTE(review): the *crop_centre* parameter is currently unused.
        """
        CONF_META_DATA = infer_meta_data["CONF_META_DATA"]
        ordered_chain_ids = list(all_chain_features.keys())

        x_gt = np.concatenate([all_chain_features[chain_id]["x_gt"] for chain_id in ordered_chain_ids], axis=0)

        # Flat token-level bookkeeping across all chains.
        token_id_to_centre_atom_id = []
        token_id_to_conformer_id = []
        token_id_to_ccd_chunk_sizes = []
        token_id_to_ccd = []
        asym_id_ca = []
        token_id = 0
        atom_id = 0
        conf_id = 0
        x_gt_ligand = []
        for chain_id in ordered_chain_ids:
            # Single-conformer chains with numeric ids are treated as ligands.
            if chain_id.isdigit() and len(all_chain_features[chain_id]["ccds"]) == 1:
                x_gt_ligand.append(all_chain_features[chain_id]["x_gt"])
            atom_offset = 0
            for ccd, chunk_size_this_ccd, asym_id in zip(
                    all_chain_features[chain_id]["ccds"],
                    all_chain_features[chain_id]["conformer_id_to_chunk_sizes"],
                    all_chain_features[chain_id]["asym_id"],
            ):
                inner_atom_idx = all_chain_features[chain_id]["atom_id_to_conformer_atom_id"][
                                 atom_offset:atom_offset + chunk_size_this_ccd]
                atom_names = [CONF_META_DATA[ccd]["ref_atom_name_chars"][i] for i in inner_atom_idx]
                if rc.is_standard(ccd):
                    # One token per residue, anchored at its centre atom (CA/C1').
                    for atom_id_this_ccd, atom_name in enumerate(atom_names):
                        if atom_name == rc.standard_ccd_to_token_centre_atom_name[ccd]:
                            token_id_to_centre_atom_id.append(atom_id)
                            token_id_to_conformer_id.append(conf_id)
                            token_id_to_ccd_chunk_sizes.append(chunk_size_this_ccd)
                            token_id_to_ccd.append(ccd)
                            asym_id_ca.append(asym_id)
                        atom_id += 1
                    token_id += 1
                else:
                    # Ligands / non-standard residues: one token per atom.
                    for atom_id_this_ccd, atom_name in enumerate(atom_names):
                        token_id_to_centre_atom_id.append(atom_id)
                        token_id_to_conformer_id.append(conf_id)
                        token_id_to_ccd_chunk_sizes.append(chunk_size_this_ccd)
                        token_id_to_ccd.append(ccd)
                        asym_id_ca.append(asym_id)
                        atom_id += 1
                        token_id += 1
                atom_offset += chunk_size_this_ccd
            conf_id += 1

        x_gt_ca = x_gt[token_id_to_centre_atom_id]
        asym_id_ca = np.array(asym_id_ca)

        crop_scheme_seed = random.random()

        # Crop Ligand Centre (inference with exactly one ligand).
        if self.inference_mode and len(x_gt_ligand) == 1:
            x_gt_ligand = np.concatenate(x_gt_ligand, axis=0)
            x_gt_sel = np.mean(x_gt_ligand, axis=0)[None]

        # Spatial Crop Ligand: centre on a random ligand atom.
        elif crop_scheme_seed < (self.train_spatial_crop_ligand_ratio if not self.inference_mode else 1.0) and len(
                x_gt_ligand) > 0:
            x_gt_ligand = np.concatenate(x_gt_ligand, axis=0)
            x_gt_sel = random.choice(x_gt_ligand)[None]
        # Spatial Crop Interface: centre on a CA close to a different chain.
        elif crop_scheme_seed < self.train_spatial_crop_ligand_ratio + self.train_spatial_crop_interface_ratio and len(
                set(asym_id_ca.tolist())) > 1:
            # NOTE(review): the product only distinguishes pairs when one id
            # is 0 — an equality test (asym_id_ca[None] == asym_id_ca[:, None])
            # looks intended here; confirm before changing.
            chain_same = asym_id_ca[None] * asym_id_ca[:, None]
            dist = np.linalg.norm(x_gt_ca[:, None] - x_gt_ca[None], axis=-1)

            # Push "same-chain" pairs beyond the interface threshold.
            dist = dist + chain_same * 100
            mask = np.any(dist < self.train_spatial_crop_interface_threshold, axis=-1)
            if sum(mask) > 0:
                x_gt_sel = random.choice(x_gt_ca[mask])[None]
            else:
                x_gt_sel = random.choice(x_gt_ca)[None]
        # Spatial Crop: centre on a random token centre atom.
        else:
            x_gt_sel = random.choice(x_gt_ca)[None]
        dist = np.linalg.norm(x_gt_ca - x_gt_sel, axis=-1)
        token_idxs = np.argsort(dist)

        # Greedily keep whole conformers, nearest first, within both budgets.
        select_ccds_idx = []
        to_sum_atom = 0
        to_sum_token = 0
        for token_idx in token_idxs:
            ccd_idx = token_id_to_conformer_id[token_idx]
            ccd_chunk_size = token_id_to_ccd_chunk_sizes[token_idx]
            ccd_this_token = token_id_to_ccd[token_idx]
            if ccd_idx in select_ccds_idx:
                continue
            if to_sum_atom + ccd_chunk_size > self.atom_crop_size:
                break
            # Standard residues cost 1 token; ligand conformers cost one per atom.
            to_add_token = 1 if rc.is_standard(ccd_this_token) else ccd_chunk_size
            if to_sum_token + to_add_token > self.crop_size:
                break
            select_ccds_idx.append(ccd_idx)
            to_sum_atom += ccd_chunk_size
            to_sum_token += to_add_token

        # Mask every per-chain feature down to the selected conformers/atoms.
        ccd_all_id = 0
        crop_chains = []
        for chain_id in ordered_chain_ids:
            conformer_used_mask = []
            atom_used_mask = []
            ccds = []
            for ccd, chunk_size_this_ccd in zip(
                    all_chain_features[chain_id]["ccds"],
                    all_chain_features[chain_id]["conformer_id_to_chunk_sizes"],
            ):
                if ccd_all_id in select_ccds_idx:
                    ccds.append(ccd)
                    if chain_id not in crop_chains:
                        crop_chains.append(chain_id)
                conformer_used_mask.append(ccd_all_id in select_ccds_idx)
                atom_used_mask += [ccd_all_id in select_ccds_idx] * chunk_size_this_ccd
                ccd_all_id += 1
            conf_mask = np.array(conformer_used_mask).astype(np.bool_)
            atom_mask = np.array(atom_used_mask).astype(np.bool_)
            # Update All Chain Features
            all_chain_features[chain_id]["x_gt"] = all_chain_features[chain_id]["x_gt"][atom_mask]
            all_chain_features[chain_id]["atom_id_to_conformer_atom_id"] = \
                all_chain_features[chain_id]["atom_id_to_conformer_atom_id"][atom_mask]
            all_chain_features[chain_id]["restype"] = all_chain_features[chain_id]["restype"][conf_mask]
            all_chain_features[chain_id]["residue_index"] = all_chain_features[chain_id]["residue_index"][conf_mask]
            all_chain_features[chain_id]["conformer_id_to_chunk_sizes"] = \
                all_chain_features[chain_id]["conformer_id_to_chunk_sizes"][conf_mask]
            # BUG Fix
            all_chain_features[chain_id]["key_res_feat"] = all_chain_features[chain_id]["key_res_feat"][conf_mask]
            all_chain_features[chain_id]["pocket_res_feat"] = all_chain_features[chain_id]["pocket_res_feat"][conf_mask]
            all_chain_features[chain_id]["is_key_res"] = all_chain_features[chain_id]["is_key_res"][conf_mask]
            all_chain_features[chain_id]["is_protein"] = all_chain_features[chain_id]["is_protein"][conf_mask]
            all_chain_features[chain_id]["is_short_poly"] = all_chain_features[chain_id]["is_short_poly"][conf_mask]
            all_chain_features[chain_id]["is_ligand"] = all_chain_features[chain_id]["is_ligand"][conf_mask]
            all_chain_features[chain_id]["asym_id"] = all_chain_features[chain_id]["asym_id"][conf_mask]
            all_chain_features[chain_id]["sym_id"] = all_chain_features[chain_id]["sym_id"][conf_mask]
            all_chain_features[chain_id]["entity_id"] = all_chain_features[chain_id]["entity_id"][conf_mask]

            all_chain_features[chain_id]["ccds"] = ccds
            if "msa" in all_chain_features[chain_id]:
                all_chain_features[chain_id]["msa"] = all_chain_features[chain_id]["msa"][:, conf_mask]
                all_chain_features[chain_id]["deletion_matrix"] = \
                    all_chain_features[chain_id]["deletion_matrix"][:, conf_mask]
            if "msa_all_seq" in all_chain_features[chain_id]:
                all_chain_features[chain_id]["msa_all_seq"] = all_chain_features[chain_id]["msa_all_seq"][:, conf_mask]
                all_chain_features[chain_id]["deletion_matrix_all_seq"] = \
                    all_chain_features[chain_id]["deletion_matrix_all_seq"][:, conf_mask]
        # Remove Unused Chains
        for chain_id in list(all_chain_features.keys()):
            if chain_id not in crop_chains:
                all_chain_features.pop(chain_id, None)
        return all_chain_features, infer_meta_data
544
+
545
    def _make_ccd_features(self, raw_feats, infer_meta_data):
        """Build atom-wise and token-wise indexing features from the CCD list.

        Walks the conformer (CCD) sequence once while maintaining running
        `atom_id` / `token_id` counters:
          * UNK conformers produce a single masked token with no atoms.
          * Standard residues produce one token spanning all their atoms.
          * Non-standard residues and ligands are tokenized per atom.

        Args:
            raw_feats: merged features; must contain "ccds",
                "atom_id_to_conformer_atom_id" (global atom id -> index into
                the reference conformer) and "conformer_id_to_chunk_sizes".
            infer_meta_data: carries "CONF_META_DATA", a per-CCD dict with
                "ref_atom_name_chars", "ref_feat", etc.

        Returns:
            dict of numpy arrays — atom-wise ("atom_id_to_conformer_id",
            "atom_id_to_token_id", "ref_feat", "ref_pos") and token-wise
            ("token_id_to_*", "s_mask") features.
        """
        CONF_META_DATA = infer_meta_data["CONF_META_DATA"]
        ccds = raw_feats["ccds"]
        atom_id_to_conformer_atom_id = raw_feats["atom_id_to_conformer_atom_id"]
        conformer_id_to_chunk_sizes = raw_feats["conformer_id_to_chunk_sizes"]

        # Atomwise
        atom_id_to_conformer_id = []
        atom_id_to_token_id = []
        ref_feat = []

        # Tokenwise
        s_mask = []
        token_id_to_conformer_id = []
        token_id_to_chunk_sizes = []
        token_id_to_centre_atom_id = []
        token_id_to_pseudo_beta_atom_id = []

        token_id = 0
        atom_id = 0
        for conf_id, (ccd, ccd_atoms) in enumerate(zip(ccds, conformer_id_to_chunk_sizes)):
            conf_meta_data = CONF_META_DATA[ccd]
            # UNK Conformer: one masked token, no atoms consumed.
            if rc.is_unk(ccd):
                s_mask.append(0)
                token_id_to_chunk_sizes.append(0)
                token_id_to_conformer_id.append(conf_id)
                token_id_to_centre_atom_id.append(-1)  # -1 marks "no such atom"
                token_id_to_pseudo_beta_atom_id.append(-1)
                token_id += 1
            # Standard Residue: one token covering all of its atoms.
            elif rc.is_standard(ccd):
                inner_atom_idx = atom_id_to_conformer_atom_id[atom_id:atom_id + ccd_atoms.item()]
                atom_names = [conf_meta_data["ref_atom_name_chars"][i] for i in inner_atom_idx]
                ref_feat.append(conf_meta_data["ref_feat"][inner_atom_idx])
                token_id_to_conformer_id.append(conf_id)
                token_id_to_chunk_sizes.append(ccd_atoms.item())
                s_mask.append(1)
                for atom_id_this_ccd, atom_name in enumerate(atom_names):
                    # Update Atomwise Features
                    atom_id_to_conformer_id.append(conf_id)
                    atom_id_to_token_id.append(token_id)
                    # Record the special atom ids (token centre / pseudo-beta)
                    # by matching reference atom names for this CCD.
                    if atom_name == rc.standard_ccd_to_token_centre_atom_name[ccd]:
                        token_id_to_centre_atom_id.append(atom_id)
                    if atom_name == rc.standard_ccd_to_token_pseudo_beta_atom_name[ccd]:
                        token_id_to_pseudo_beta_atom_id.append(atom_id)
                    atom_id += 1
                token_id += 1
            # Non-standard Residue & Ligand: one token per atom; each atom is
            # its own centre and pseudo-beta atom.
            else:
                inner_atom_idx = atom_id_to_conformer_atom_id[atom_id:atom_id + ccd_atoms.item()]
                atom_names = [conf_meta_data["ref_atom_name_chars"][i] for i in inner_atom_idx]
                ref_feat.append(conf_meta_data["ref_feat"][inner_atom_idx])
                for atom_id_this_ccd, atom_name in enumerate(atom_names):
                    # Update Atomwise Features
                    atom_id_to_conformer_id.append(conf_id)
                    atom_id_to_token_id.append(token_id)
                    # Update Tokenwise Features
                    token_id_to_chunk_sizes.append(1)
                    token_id_to_conformer_id.append(conf_id)
                    s_mask.append(1)
                    token_id_to_centre_atom_id.append(atom_id)
                    token_id_to_pseudo_beta_atom_id.append(atom_id)
                    atom_id += 1
                    token_id += 1

        # NOTE(review): assumes at least one non-UNK conformer exists; an
        # all-UNK system would raise IndexError on ref_feat[0].
        if len(ref_feat) > 1:
            ref_feat = np.concatenate(ref_feat, axis=0).astype(np.float32)
        else:
            ref_feat = ref_feat[0].astype(np.float32)

        features = {
            # Atomwise
            "atom_id_to_conformer_id": np.array(atom_id_to_conformer_id, dtype=np.int64),
            "atom_id_to_token_id": np.array(atom_id_to_token_id, dtype=np.int64),
            "ref_feat": ref_feat,
            # Tokenwise
            "token_id_to_conformer_id": np.array(token_id_to_conformer_id, dtype=np.int64),
            "s_mask": np.array(s_mask, dtype=np.int64),
            "token_id_to_centre_atom_id": np.array(token_id_to_centre_atom_id, dtype=np.int64),
            "token_id_to_pseudo_beta_atom_id": np.array(token_id_to_pseudo_beta_atom_id, dtype=np.int64),
            "token_id_to_chunk_sizes": np.array(token_id_to_chunk_sizes, dtype=np.int64),
        }
        # The first three channels of ref_feat are the reference coordinates.
        features["ref_pos"] = features["ref_feat"][..., :3]
        return features
+ def pair_and_merge(self, all_chain_features, infer_meta_data):
634
+ CHAIN_CLASS = infer_meta_data["CHAIN_CLASS"] # Dict
635
+ CONF_META_DATA = infer_meta_data["CONF_META_DATA"]
636
+ ASYM_ID = infer_meta_data["ASYM_ID"]
637
+ homo_feats = {}
638
+
639
+ all_chain_ids = list(all_chain_features.keys())
640
+ if len(all_chain_ids) == 1 and CHAIN_CLASS[all_chain_ids[0]] == "ligand":
641
+ ordered_chain_ids = all_chain_ids
642
+ raw_feats = all_chain_features[all_chain_ids[0]]
643
+ raw_feats["msa"] = np.repeat(raw_feats["msa"][:1], 256, axis=0)
644
+ raw_feats["deletion_matrix"] = np.repeat(raw_feats["msa"][:1], 256, axis=0)
645
+ keys = list(raw_feats.keys())
646
+
647
+ for feature_name in keys:
648
+ if feature_name not in ["x_gt", "atom_id_to_conformer_atom_id", "residue_index",
649
+ "conformer_id_to_chunk_sizes", "restype", "is_protein", "is_short_poly",
650
+ "is_ligand",
651
+ "asym_id", "sym_id", "entity_id", "msa", "deletion_matrix", "ccds",
652
+ "pocket_res_feat", "key_res_feat", "is_key_res"]:
653
+ raw_feats.pop(feature_name)
654
+
655
+ # Update Profile and Deletion Mean
656
+ msa_one_hot = F.one_hot(torch.from_numpy(raw_feats["msa"]).long(), 32).type(torch.float32)
657
+ raw_feats["profile"] = torch.mean(msa_one_hot, dim=-3).numpy()
658
+ del msa_one_hot
659
+ raw_feats["deletion_mean"] = (torch.atan(
660
+ torch.sum(torch.from_numpy(raw_feats["deletion_matrix"]), dim=0) / 3.0
661
+ ) * (2.0 / torch.pi)).numpy()
662
+ else:
663
+
664
+ for chain_id in list(all_chain_features.keys()):
665
+ homo_feats[chain_id] = {
666
+ "asym_id": copy.deepcopy(all_chain_features[chain_id]["asym_id"]),
667
+ "sym_id": copy.deepcopy(all_chain_features[chain_id]["sym_id"]),
668
+ "entity_id": copy.deepcopy(all_chain_features[chain_id]["entity_id"]),
669
+ }
670
+ for chain_id in list(all_chain_features.keys()):
671
+ homo_feats[chain_id]["chain_class"] = all_chain_features[chain_id].pop("chain_class")
672
+ homo_feats[chain_id]["sequence_3"] = all_chain_features[chain_id].pop("sequence_3")
673
+ homo_feats[chain_id]["msa"] = all_chain_features[chain_id].pop("msa")
674
+ homo_feats[chain_id]["deletion_matrix"] = all_chain_features[chain_id].pop("deletion_matrix")
675
+ if "msa_all_seq" in all_chain_features[chain_id]:
676
+ homo_feats[chain_id]["msa_all_seq"] = all_chain_features[chain_id].pop("msa_all_seq")
677
+ homo_feats[chain_id]["deletion_matrix_all_seq"] = all_chain_features[chain_id].pop(
678
+ "deletion_matrix_all_seq")
679
+ homo_feats[chain_id]["msa_species_identifiers_all_seq"] = all_chain_features[chain_id].pop(
680
+ "msa_species_identifiers_all_seq")
681
+
682
+ # Initial raw feats with merged homo feats
683
+ raw_feats = pair_and_merge(homo_feats, is_homomer_or_monomer=False)
684
+
685
+ # Update Profile and Deletion Mean
686
+ msa_one_hot = F.one_hot(torch.from_numpy(raw_feats["msa"]).long(), 32).type(torch.float32)
687
+ raw_feats["profile"] = torch.mean(msa_one_hot, dim=-3).numpy()
688
+ del msa_one_hot
689
+ raw_feats["deletion_mean"] = (torch.atan(
690
+ torch.sum(torch.from_numpy(raw_feats["deletion_matrix"]), dim=0) / 3.0
691
+ ) * (2.0 / torch.pi)).numpy()
692
+
693
+ # Merge no homo feats according to asym_id
694
+ ordered_asym_ids = []
695
+ for i in raw_feats["asym_id"]:
696
+ if i not in ordered_asym_ids:
697
+ ordered_asym_ids.append(i)
698
+ ordered_chain_ids = [ASYM_ID[i] for i in ordered_asym_ids]
699
+ for feature_name in ["chain_class", "sequence_3", "assembly_num_chains", "entity_mask", "seq_length",
700
+ "num_alignments"]:
701
+ raw_feats.pop(feature_name, None)
702
+
703
+ for feature_name in ["x_gt", "atom_id_to_conformer_atom_id", "residue_index", "conformer_id_to_chunk_sizes",
704
+ "restype", "is_protein", "is_short_poly", "is_ligand", "pocket_res_feat",
705
+ "key_res_feat", "is_key_res"]:
706
+ raw_feats[feature_name] = np.concatenate([
707
+ all_chain_features[chain_id].pop(feature_name) for chain_id in ordered_chain_ids
708
+ ], axis=0)
709
+
710
+ # Conformerwise Chain Class
711
+ CHAIN_CLASS_NEW = []
712
+ for chain_id in ordered_chain_ids:
713
+ CHAIN_CLASS_NEW += [CHAIN_CLASS[chain_id]] * len(all_chain_features[chain_id]["ccds"])
714
+ infer_meta_data["CHAIN_CLASS"] = CHAIN_CLASS_NEW
715
+
716
+ raw_feats["ccds"] = reduce(add, [all_chain_features[chain_id].pop("ccds") for chain_id in ordered_chain_ids])
717
+
718
+ # Create Atomwise and Tokenwise Features
719
+ raw_feats.update(self._make_ccd_features(raw_feats, infer_meta_data))
720
+ if self.use_x_gt_ligand_as_ref_pos:
721
+ is_ligand_atom = raw_feats["is_ligand"][raw_feats["atom_id_to_conformer_id"]].astype(np.bool_)
722
+ raw_feats["ref_pos"][is_ligand_atom] = raw_feats["x_gt"][is_ligand_atom] - np.mean(
723
+ raw_feats["x_gt"][is_ligand_atom], axis=0, keepdims=True)
724
+
725
+ asym_id_conformerwise = copy.deepcopy(raw_feats["asym_id"])
726
+ residue_index_conformerwise = copy.deepcopy(raw_feats["residue_index"])
727
+
728
+ # Conformerwise to Tokenwise
729
+ token_id_to_conformer_id = raw_feats["token_id_to_conformer_id"]
730
+ for key in ["is_protein", "is_short_poly", "is_ligand", "residue_index", "restype", "asym_id", "entity_id",
731
+ "sym_id", "deletion_mean", "profile", "pocket_res_feat", "key_res_feat", "is_key_res"]:
732
+ raw_feats[key] = raw_feats[key][token_id_to_conformer_id]
733
+ for key in ["msa", "deletion_matrix"]:
734
+ if key in raw_feats:
735
+ raw_feats[key] = raw_feats[key][:, token_id_to_conformer_id]
736
+ ###################################################
737
+ # Centre Random Augmentation of ref pos #
738
+ ###################################################
739
+ # atom_id_to_token_id
740
+ # atom_id_to_conformer_id
741
+ raw_feats["ref_pos"] = centre_random_augmentation_np_apply(
742
+ raw_feats["ref_pos"], raw_feats["atom_id_to_conformer_id"]).astype(np.float32)
743
+ raw_feats["ref_feat"][:, :3] = raw_feats["ref_pos"]
744
+
745
+ ###################################################
746
+ # Create token pair features #
747
+ ###################################################
748
+ no_token = len(raw_feats["token_id_to_conformer_id"])
749
+ token_bonds = np.zeros([no_token, no_token], dtype=np.float32)
750
+ rel_tok_feat = np.zeros([no_token, no_token, 42], dtype=np.float32)
751
+ offset = 0
752
+ atom_offset = 0
753
+ for ccd, len_atoms in zip(
754
+ raw_feats["ccds"],
755
+ raw_feats["conformer_id_to_chunk_sizes"]
756
+ ):
757
+ if rc.is_standard(ccd) or rc.is_unk(ccd):
758
+ offset += 1
759
+ else:
760
+ len_atoms = len_atoms.item()
761
+ inner_atom_idx = raw_feats["atom_id_to_conformer_atom_id"][atom_offset:atom_offset + len_atoms]
762
+ token_bonds[offset:offset + len_atoms, offset:offset + len_atoms] = \
763
+ CONF_META_DATA[ccd]["token_bonds"][inner_atom_idx][:, inner_atom_idx]
764
+ rel_tok_feat[offset:offset + len_atoms, offset:offset + len_atoms] = \
765
+ CONF_META_DATA[ccd]["rel_tok_feat"][inner_atom_idx][:, inner_atom_idx]
766
+ offset += len_atoms
767
+ atom_offset += len_atoms
768
+ raw_feats["token_bonds"] = token_bonds.astype(np.float32)
769
+ raw_feats["token_bonds_feature"] = token_bonds.astype(np.float32)
770
+ raw_feats["rel_tok_feat"] = rel_tok_feat.astype(np.float32)
771
+ ###################################################
772
+ # Charility Augmentation #
773
+ ###################################################
774
+ if not self.inference_mode:
775
+ # TODO Charility probs
776
+ charility_seed = random.random()
777
+ if charility_seed < self.train_charility_augmentation_ratio:
778
+ ref_chirality = raw_feats["ref_feat"][:, 158:161]
779
+ ref_chirality_replace = np.zeros_like(ref_chirality)
780
+ ref_chirality_replace[:, 2] = 1
781
+
782
+ is_ligand_atom = raw_feats["is_ligand"][raw_feats["atom_id_to_token_id"]]
783
+ remove_charility = (np.random.randint(0, 2, [len(is_ligand_atom)]) * is_ligand_atom).astype(
784
+ np.bool_)
785
+ ref_chirality = np.where(remove_charility[:, None], ref_chirality_replace, ref_chirality)
786
+ raw_feats["ref_feat"][:, 158:161] = ref_chirality
787
+
788
+ # MASKS
789
+ raw_feats["x_exists"] = np.ones_like(raw_feats["x_gt"][..., 0]).astype(np.float32)
790
+ raw_feats["a_mask"] = raw_feats["x_exists"]
791
+ raw_feats["s_mask"] = np.ones_like(raw_feats["asym_id"]).astype(np.float32)
792
+ raw_feats["ref_space_uid"] = raw_feats["atom_id_to_conformer_id"]
793
+
794
+ # Write Infer Meta Data
795
+ infer_meta_data["ccds"] = raw_feats.pop("ccds")
796
+ infer_meta_data["atom_id_to_conformer_atom_id"] = raw_feats.pop("atom_id_to_conformer_atom_id")
797
+ infer_meta_data["residue_index"] = residue_index_conformerwise
798
+ infer_meta_data["asym_id"] = asym_id_conformerwise
799
+ infer_meta_data["conformer_id_to_chunk_sizes"] = raw_feats.pop("conformer_id_to_chunk_sizes")
800
+
801
+ return raw_feats, infer_meta_data
802
+
803
+ def make_feats(self, tensors):
804
+ # Target Feat
805
+ tensors["target_feat"] = torch.cat([
806
+ F.one_hot(tensors["restype"].long(), 32).float(),
807
+ tensors["profile"].float(),
808
+ tensors["deletion_mean"][..., None].float()
809
+ ], dim=-1)
810
+
811
+ if self.num_recycles is None:
812
+ # MSA Feat
813
+ inds = [0] + torch.randperm(len(tensors["msa"]))[:self.max_msa_clusters - 1].tolist()
814
+
815
+ tensors["msa"] = tensors["msa"][inds]
816
+ tensors["deletion_matrix"] = tensors["deletion_matrix"][inds]
817
+
818
+ has_deletion = torch.clamp(tensors["deletion_matrix"].float(), min=0., max=1.)
819
+ pi = torch.acos(torch.zeros(1, device=tensors["deletion_matrix"].device)) * 2
820
+ deletion_value = (torch.atan(tensors["deletion_matrix"] / 3.) * (2. / pi))
821
+ tensors["msa_feat"] = torch.cat([
822
+ F.one_hot(tensors["msa"].long(), 32).float(),
823
+ has_deletion[..., None].float(),
824
+ deletion_value[..., None].float(),
825
+ ], dim=-1)
826
+ else:
827
+ batch_msa_feat = []
828
+ for i in range(self.num_recycles):
829
+ inds = [0] + torch.randperm(len(tensors["msa"]))[:self.max_msa_clusters - 1].tolist()
830
+
831
+ tensors["msa"] = tensors["msa"][inds]
832
+ tensors["deletion_matrix"] = tensors["deletion_matrix"][inds]
833
+
834
+ has_deletion = torch.clamp(tensors["deletion_matrix"].float(), min=0., max=1.)
835
+ pi = torch.acos(torch.zeros(1, device=tensors["deletion_matrix"].device)) * 2
836
+ deletion_value = (torch.atan(tensors["deletion_matrix"] / 3.) * (2. / pi))
837
+ msa_feat = torch.cat([
838
+ F.one_hot(tensors["msa"].long(), 32).float(),
839
+ has_deletion[..., None].float(),
840
+ deletion_value[..., None].float(),
841
+ ], dim=-1)
842
+ batch_msa_feat.append(msa_feat)
843
+ tensors["msa_feat"] = batch_msa_feat[0]
844
+ tensors["batch_msa_feat"] = torch.stack(batch_msa_feat, dim=0)
845
+
846
+ tensors.pop("msa", None)
847
+ tensors.pop("deletion_mean", None)
848
+ tensors.pop("profile", None)
849
+ tensors.pop("deletion_matrix", None)
850
+
851
+ return tensors
852
+
853
+ def _make_token_bonds(self, tensors):
854
+ # Get Polymer-Ligand & Ligand-Ligand Within Conformer Token Bond
855
+ # Atomwise asym_id
856
+ asym_id = tensors["asym_id"][tensors["atom_id_to_token_id"]]
857
+ is_ligand = tensors["is_ligand"][tensors["atom_id_to_token_id"]]
858
+
859
+ x_gt = tensors["x_gt"]
860
+ a_mask = tensors["a_mask"]
861
+
862
+ # Get
863
+ atom_id_to_token_id = tensors["atom_id_to_token_id"]
864
+
865
+ num_token = len(tensors["asym_id"])
866
+ between_conformer_token_bonds = torch.zeros([num_token, num_token])
867
+
868
+ # create chainwise feature
869
+ asym_id_chain = []
870
+ asym_id_atom_offset = []
871
+ asym_id_is_ligand = []
872
+ for atom_offset, (a_id, i_id) in enumerate(zip(asym_id.tolist(), is_ligand.tolist())):
873
+ if len(asym_id_chain) == 0 or asym_id_chain[-1] != a_id:
874
+ asym_id_chain.append(a_id)
875
+ asym_id_atom_offset.append(atom_offset)
876
+ asym_id_is_ligand.append(i_id)
877
+
878
+ len_asym_id_chain = len(asym_id_chain)
879
+ if len_asym_id_chain >= 2:
880
+ for i in range(0, len_asym_id_chain - 1):
881
+ asym_id_i = asym_id_chain[i]
882
+ mask_i = asym_id == asym_id_i
883
+ x_gt_i = x_gt[mask_i]
884
+ a_mask_i = a_mask[mask_i]
885
+ for j in range(i + 1, len_asym_id_chain):
886
+ if not bool(asym_id_is_ligand[i]) and not bool(asym_id_is_ligand[j]):
887
+ continue
888
+ asym_id_j = asym_id_chain[j]
889
+ mask_j = asym_id == asym_id_j
890
+ x_gt_j = x_gt[mask_j]
891
+ a_mask_j = a_mask[mask_j]
892
+ dis_ij = torch.norm(x_gt_i[:, None, :] - x_gt_j[None, :, :], dim=-1)
893
+ dis_ij = dis_ij + (1 - a_mask_i[:, None] * a_mask_j[None]) * 1000
894
+ if torch.min(dis_ij) < self.token_bond_threshold:
895
+ ij = torch.argmin(dis_ij).item()
896
+ l_j = len(x_gt_j)
897
+ atom_i = int(ij // l_j) # raw
898
+ atom_j = int(ij % l_j) # col
899
+ global_atom_i = atom_i + asym_id_atom_offset[i]
900
+ global_atom_j = atom_j + asym_id_atom_offset[j]
901
+ token_i = atom_id_to_token_id[global_atom_i]
902
+ token_j = atom_id_to_token_id[global_atom_j]
903
+
904
+ between_conformer_token_bonds[token_i, token_j] = 1
905
+ between_conformer_token_bonds[token_j, token_i] = 1
906
+ token_bond_seed = random.random()
907
+ tensors["token_bonds"] = tensors["token_bonds"] + between_conformer_token_bonds
908
+ # Docking Indicate Token Bond
909
+ # if token_bond_seed >= 1:
910
+ # tensors["token_bonds_feature"] = tensors["token_bonds"]
911
+ return tensors
912
+
913
+ def _pad_to_size(self, tensors):
914
+
915
+ to_pad_atom = self.atom_crop_size - len(tensors["x_gt"])
916
+ to_pad_token = self.crop_size - len(tensors["residue_index"])
917
+ if to_pad_token > 0:
918
+ for k in ["restype", "residue_index", "is_protein", "is_short_poly", "is_ligand", "is_key_res",
919
+ "asym_id", "entity_id", "sym_id", "token_id_to_conformer_id", "s_mask",
920
+ "token_id_to_centre_atom_id", "token_id_to_pseudo_beta_atom_id", "token_id_to_chunk_sizes",
921
+ "pocket_res_feat"]:
922
+ tensors[k] = torch.nn.functional.pad(tensors[k], [0, to_pad_token])
923
+ for k in ["target_feat", "msa_feat", "key_res_feat", "batch_msa_feat"]:
924
+ if k in tensors:
925
+ tensors[k] = torch.nn.functional.pad(tensors[k], [0, 0, 0, to_pad_token])
926
+ for k in ["token_bonds", "token_bonds_feature"]:
927
+ tensors[k] = torch.nn.functional.pad(tensors[k], [0, to_pad_token, 0, to_pad_token])
928
+ for k in ["rel_tok_feat"]:
929
+ tensors[k] = torch.nn.functional.pad(tensors[k], [0, 0, 0, to_pad_token, 0, to_pad_token])
930
+ if to_pad_atom > 0:
931
+ for k in ["a_mask", "x_exists", "atom_id_to_conformer_id", "atom_id_to_token_id", "ref_space_uid"]:
932
+ tensors[k] = torch.nn.functional.pad(tensors[k], [0, to_pad_atom])
933
+ for k in ["x_gt", "ref_feat", "ref_pos"]: # , "ref_pos_new"
934
+ tensors[k] = torch.nn.functional.pad(tensors[k], [0, 0, 0, to_pad_atom])
935
+ # for k in ["z_mask"]: # , "ref_pos_new"
936
+ # tensors[k] = torch.nn.functional.pad(tensors[k], [0, to_pad_atom, 0, to_pad_atom])
937
+ # for k in ["conformer_mask_atom"]:
938
+ # tensors[k] = torch.nn.functional.pad(tensors[k], [0, to_pad_atom, 0, to_pad_atom])
939
+ # for k in ["rel_token_feat_atom"]:
940
+ # tensors[k] = torch.nn.functional.pad(tensors[k], [0,0,0, to_pad_atom, 0, to_pad_atom])
941
+ # rel_token_feat_atom
942
+ return tensors
943
+
944
    def get_template_feat(self, tensors):
        """Build the pseudo-template distogram feature from ground-truth coords.

        Produces a 39-bin distogram over pseudo-beta atom positions restricted
        to protein-protein pairs, concatenated with its pair mask as
        tensors["templ_feat"]; also sets the scalar tensors["t_mask"].

        Training: with probability (1 - train_use_template_ratio) the template
        is enabled (t_mask=1) under a random bert-style residue mask whose
        keep-rate is at least train_template_mask_max_ratio; otherwise
        t_mask=0 and the distogram is still computed under the full mask.
        Inference: the full template is always enabled.

        NOTE(review): the branch condition is
        `random.random() > self.train_use_template_ratio` — i.e. the ratio
        reads as the probability of *disabling* the template; confirm the
        intended semantics of the config name.
        """
        x_gt = tensors["x_gt"][tensors["token_id_to_pseudo_beta_atom_id"]]
        z_mask = tensors["z_mask"]
        asym_id = tensors["asym_id"]
        is_protein = tensors["is_protein"]
        # NOTE(review): chain_same is computed but currently unused.
        chain_same = (asym_id[None] == asym_id[:, None]).float()
        protein2d = is_protein[None] * is_protein[:, None]
        dgram = dgram_from_positions(x_gt, no_bins=39)
        # Zero out non-protein pairs and invalid (masked) token pairs.
        dgram = dgram * protein2d[..., None] * z_mask[..., None]

        if not self.inference_mode:
            if random.random() > self.train_use_template_ratio:
                tensors["t_mask"] = torch.tensor(1, dtype=torch.float32)
                # Random per-token keep mask; a pair survives only when both
                # of its tokens are kept.
                bert_mask = torch.rand([len(x_gt)]) > random.random() * (1 - self.train_template_mask_max_ratio)
                template_pseudo_beta_mask = (bert_mask[None] * bert_mask[:, None]) * z_mask * protein2d
            else:
                tensors["t_mask"] = torch.tensor(0, dtype=torch.float32)
                template_pseudo_beta_mask = z_mask * protein2d
        else:
            tensors["t_mask"] = torch.tensor(1, dtype=torch.float32)
            template_pseudo_beta_mask = z_mask * protein2d
        dgram = dgram * template_pseudo_beta_mask[..., None]
        # Final feature: distogram bins plus the pair mask channel (40 total).
        templ_feat = torch.cat([dgram, template_pseudo_beta_mask[..., None]], dim=-1)
        tensors["templ_feat"] = templ_feat.float()
        return tensors
+ def transform(self, raw_feats):
971
+ # np to tensor
972
+ tensors = dict()
973
+ for key in raw_feats.keys():
974
+ tensors[key] = torch.from_numpy(raw_feats[key])
975
+ # Make Target & MSA Feat
976
+ tensors = self.make_feats(tensors)
977
+
978
+ # Make Token Bond Feat
979
+ tensors = self._make_token_bonds(tensors)
980
+
981
+ # Padding
982
+ if not self.inference_mode:
983
+ tensors = self._pad_to_size(tensors)
984
+
985
+ # Mask
986
+ tensors["z_mask"] = tensors["s_mask"][None] * tensors["s_mask"][:, None]
987
+ tensors["ap_mask"] = tensors["a_mask"][None] * tensors["a_mask"][:, None]
988
+ tensors["is_dna"] = torch.zeros_like(tensors["is_protein"])
989
+ tensors["is_rna"] = torch.zeros_like(tensors["is_protein"])
990
+
991
+ # Template
992
+ tensors = self.get_template_feat(tensors)
993
+
994
+ # Correct Type
995
+ is_short_poly = tensors.pop("is_short_poly")
996
+ tensors["is_protein"] = tensors["is_protein"] + is_short_poly
997
+ tensors["is_ligand"] = tensors["is_ligand"] - is_short_poly
998
+ return tensors
999
+
1000
    # NOTE(review): scratch notes left by the author — they appear to describe
    # the inputs of load() below: per-chain residue_index values (roughly
    # 0-100) and the "ccds" lists, whose ligand entries are named like
    # "CCD<RES_ID>". TODO: confirm and fold into the load() docstring, or
    # remove.
+ def load(
1005
+ self,
1006
+ system_pkl_path, # Receptor chains: all_atom_positions pocket_res_feat Ligand_chains
1007
+ template_receptor_chain_ids=None, # ["A"]
1008
+ template_ligand_chain_ids=None, # ["1"]
1009
+ remove_receptor=False,
1010
+ remove_ligand=False, # True, CCD_META_DATA ref_mol
1011
+ smi=None, # "CCCCC"
1012
+ ):
1013
+ ##########################################################
1014
+ # Initialization of Configs #
1015
+ ##########################################################
1016
+ if self.inference_mode:
1017
+ pocket_type = self.infer_pocket_type
1018
+ pocket_cutoff = self.infer_pocket_cutoff
1019
+ pocket_dist_type = self.infer_pocket_dist_type
1020
+ use_pocket = self.infer_use_pocket
1021
+ use_key_res = self.infer_use_key_res
1022
+ else:
1023
+ pocket_type = random.choices(
1024
+ ["atom", "ca"],
1025
+ [self.train_pocket_type_atom_ratio, 1 - self.train_pocket_type_atom_ratio])
1026
+
1027
+ pocket_dist_type = random.choices(
1028
+ ["ligand", "ligand_cetre"],
1029
+ [self.train_pocket_dist_type_ligand_ratio, 1 - self.train_pocket_dist_type_ligand_ratio])
1030
+
1031
+ if pocket_dist_type == "ligand":
1032
+ pocket_cutoff = self.train_pocket_cutoff_ligand_min + random.random() * (
1033
+ self.train_pocket_cutoff_ligand_max - self.train_pocket_cutoff_ligand_min)
1034
+ else:
1035
+ pocket_cutoff = self.train_pocket_cutoff_ligand_centre_min + random.random() * (
1036
+ self.train_pocket_cutoff_ligand_centre_max - self.train_pocket_cutoff_ligand_centre_min)
1037
+
1038
+ use_pocket = random.random() < self.train_use_pocket_ratio
1039
+ use_key_res = random.random() < self.train_use_key_res_ratio
1040
+
1041
+ ##########################################################
1042
+ # Initialization of features #
1043
+ ##########################################################
1044
+ system_id = os.path.split(system_pkl_path)[1][:-7]
1045
+ all_chain_labels = {}
1046
+ all_chain_features = {}
1047
+
1048
+ CONF_META_DATA = {}
1049
+ ref_mol = None
1050
+ CHAIN_CLASS = {}
1051
+ SEQ3 = {}
1052
+ ASYM_ID = {}
1053
+
1054
+ ##########################################################
1055
+ # Load All Chain Labels #
1056
+ ##########################################################
1057
+ data = load_pkl(system_pkl_path)
1058
+ # print(data)
1059
+ if template_receptor_chain_ids is None:
1060
+ template_receptor_chain_ids = [chain_id for chain_id in data.keys() if not chain_id.isdigit()]
1061
+
1062
+ if template_ligand_chain_ids is None:
1063
+ template_ligand_chain_ids = [chain_id for chain_id in data.keys() if chain_id.isdigit()]
1064
+ # TODO: Save Ligand Centre for cropped screening
1065
+ # Calculate Pocket Residue According to Template receptor and ligand
1066
+ if not remove_receptor and len(template_ligand_chain_ids) > 0:
1067
+ for receptor_chain_id in template_receptor_chain_ids:
1068
+ ccds_receptor = data[receptor_chain_id]["ccds"]
1069
+ x_gt_receptor = data[receptor_chain_id]["all_atom_positions"]
1070
+ x_exists_receptor = data[receptor_chain_id]["all_atom_mask"]
1071
+ x_gt_this_receptor = []
1072
+ atom_id_to_ccd_id = []
1073
+ for ccd_id, (ccd, x_gt_ccd, x_exists_ccd) in enumerate(
1074
+ zip(ccds_receptor, x_gt_receptor, x_exists_receptor)):
1075
+
1076
+ if rc.is_standard(ccd):
1077
+ x_exists_ccd_bool = x_exists_ccd.astype(np.bool_)
1078
+ if x_exists_ccd_bool[1]: # CA exsits
1079
+ if pocket_type == "atom":
1080
+ num_atoms = sum(x_exists_ccd_bool)
1081
+ x_gt_this_receptor.append(x_gt_ccd[x_exists_ccd_bool])
1082
+ atom_id_to_ccd_id += num_atoms * [ccd_id]
1083
+ else:
1084
+ x_gt_this_receptor.append(x_gt_ccd[1][None])
1085
+ atom_id_to_ccd_id.append(ccd_id)
1086
+ x_gt_this_receptor = np.concatenate(x_gt_this_receptor, axis=0)
1087
+ atom_id_to_ccd_id = np.array(atom_id_to_ccd_id)
1088
+ used_ccd_ids = []
1089
+ for ligand_chain_id in template_ligand_chain_ids:
1090
+ x_gt_ligand = data[ligand_chain_id]["all_atom_positions"]
1091
+ x_exists_ligand = data[ligand_chain_id]["all_atom_mask"]
1092
+ x_gt_ligand = np.concatenate(x_gt_ligand, axis=0)[
1093
+ np.concatenate(x_exists_ligand, axis=0).astype(np.bool_)]
1094
+ if pocket_dist_type == "ligand":
1095
+ used_ccd_bool = np.any(
1096
+ np.linalg.norm(x_gt_this_receptor[:, None] - x_gt_ligand[None], axis=-1) < pocket_cutoff,
1097
+ axis=-1)
1098
+ elif pocket_dist_type == "ligand_centre":
1099
+ x_mean = np.min(x_gt_ligand, axis=0, keepdims=True)
1100
+ used_ccd_bool = np.any(
1101
+ np.linalg.norm(x_gt_this_receptor[:, None] - x_mean[None], axis=-1) < pocket_cutoff,
1102
+ axis=-1)
1103
+ else:
1104
+ raise NotImplementedError()
1105
+ used_ccd_ids.append(atom_id_to_ccd_id[used_ccd_bool])
1106
+ used_ccd_ids = list(sorted(list(set(np.concatenate(used_ccd_ids, axis=0).tolist()))))
1107
+ pocket_res_feat = np.zeros([len(ccds_receptor)], dtype=np.float32)
1108
+ pocket_res_feat[used_ccd_ids] = 1.
1109
+ all_chain_labels[receptor_chain_id] = data[receptor_chain_id]
1110
+ all_chain_labels[receptor_chain_id]["pocket_res_feat"] = pocket_res_feat
1111
+
1112
+ if remove_ligand:
1113
+ if remove_receptor:
1114
+ assert smi is not None and self.inference_mode
1115
+ if smi is not None:
1116
+ all_chain_labels, CONF_META_DATA, ref_mol = self._update_smi(smi, all_chain_labels, CONF_META_DATA)
1117
+ else:
1118
+ assert smi is None
1119
+ for ligand_chain_id in template_ligand_chain_ids:
1120
+ all_chain_labels[ligand_chain_id] = data[ligand_chain_id]
1121
+ # For Benchmarking
1122
+ if len(template_ligand_chain_ids) == 1:
1123
+ ccds = all_chain_labels[template_ligand_chain_ids[0]]["ccds"]
1124
+ if len(ccds) == 1:
1125
+ # ref_mol = self.ccd_id_ref_mol[ccds[0]]
1126
+ ref_mol = self.ccd_id_meta_data[ccds[0]]["ref_mol"]
1127
+ ##########################################################
1128
+ # Init All Chain Features #
1129
+ ##########################################################
1130
+ for chain_id, chain_feature in all_chain_labels.items():
1131
+ ccds = chain_feature["ccds"]
1132
+ CONF_META_DATA = self._update_CONF_META_DATA(CONF_META_DATA, ccds)
1133
+ SEQ3[chain_id] = "-".join(ccds)
1134
+ chain_class = "protein" if not chain_id.isdigit() else "ligand"
1135
+ chain_feature["chain_class"] = chain_class
1136
+ # print(chain_id)
1137
+ all_chain_features[chain_id] = self._update_chain_feature(
1138
+ chain_feature,
1139
+ CONF_META_DATA,
1140
+ use_pocket,
1141
+ use_key_res,
1142
+ )
1143
+ CHAIN_CLASS[chain_id] = chain_class
1144
+ ##########################################################
1145
+ # Add Assembly Feature #
1146
+ ##########################################################
1147
+ all_chain_features, ASYM_ID = self._add_assembly_feature(all_chain_features, SEQ3, ASYM_ID)
1148
+
1149
+ infer_meta_data = {
1150
+ "CONF_META_DATA": CONF_META_DATA,
1151
+ "SEQ3": SEQ3,
1152
+ "ASYM_ID": ASYM_ID,
1153
+ "CHAIN_CLASS": CHAIN_CLASS,
1154
+ "ref_mol": ref_mol,
1155
+ "system_id": system_id,
1156
+ }
1157
+ ##########################################################
1158
+ # Cropping #
1159
+ ##########################################################
1160
+ # if not self.inference_mode:
1161
+ if self.crop_size is not None:
1162
+ all_chain_features, infer_meta_data = self._crop_all_chain_features(
1163
+ all_chain_features, infer_meta_data, crop_centre=None) # TODO: Add Cropping Centre
1164
+ ##########################################################
1165
+ # Pair & Merge #
1166
+ ##########################################################
1167
+ raw_feats, infer_meta_data = self.pair_and_merge(all_chain_features, infer_meta_data)
1168
+
1169
+ ##########################################################
1170
+ # Transform #
1171
+ ##########################################################
1172
+ tensors = self.transform(raw_feats)
1173
+ return tensors, infer_meta_data
1174
+
1175
+ def write_pdb(self, x_pred, fname, infer_meta_data, receptor_only=False, ligand_only=False):
1176
+ ccds = infer_meta_data["ccds"]
1177
+ atom_id_to_conformer_atom_id = infer_meta_data["atom_id_to_conformer_atom_id"]
1178
+ ccd_chunk_sizes = infer_meta_data["conformer_id_to_chunk_sizes"].tolist()
1179
+ CHAIN_CLASS = infer_meta_data["CHAIN_CLASS"]
1180
+ conf_meta_data = infer_meta_data["CONF_META_DATA"]
1181
+ residue_index = infer_meta_data["residue_index"].tolist()
1182
+ asym_id = infer_meta_data["asym_id"].tolist()
1183
+
1184
+ atom_lines = []
1185
+ atom_offset = 0
1186
+ for ccd_id, (ccd, chunk_size, res_id) in enumerate(zip(ccds, ccd_chunk_sizes, residue_index)):
1187
+ inner_atom_idx = atom_id_to_conformer_atom_id[atom_offset:atom_offset + chunk_size]
1188
+ atom_names = [conf_meta_data[ccd]["ref_atom_name_chars"][i] for i in inner_atom_idx]
1189
+ atom_elements = [PeriodicTable[conf_meta_data[ccd]["ref_element"][i]] for i in inner_atom_idx]
1190
+ chain_tag = PDB_CHAIN_IDS[int(asym_id[ccd_id])]
1191
+ record_type = "HETATM" if CHAIN_CLASS[ccd_id] == "ligand" else "ATOM"
1192
+
1193
+ for ccd_atom_idx, atom_name in enumerate(atom_names):
1194
+ x = x_pred[atom_offset]
1195
+ name = atom_name if len(atom_name) == 4 else f" {atom_name}"
1196
+ res_name_3 = ccd
1197
+ alt_loc = ""
1198
+ insertion_code = ""
1199
+ occupancy = 1.00
1200
+ element = atom_elements[ccd_atom_idx]
1201
+ # b_factor = torch.argmax(plddt[atom_offset],dim=-1).item()*2 +1
1202
+ b_factor = 70.
1203
+ charge = 0
1204
+ pos = x.tolist()
1205
+ atom_line = (
1206
+ f"{record_type:<6}{atom_offset + 1:>5} {name:<4}{alt_loc:>1}"
1207
+ f"{res_name_3.split()[0][-3:]:>3} {chain_tag:>1}"
1208
+ f"{res_id + 1:>4}{insertion_code:>1} "
1209
+ f"{pos[0]:>8.3f}{pos[1]:>8.3f}{pos[2]:>8.3f}"
1210
+ f"{occupancy:>6.2f}{b_factor:>6.2f} "
1211
+ f"{element:>2}{charge:>2}"
1212
+ )
1213
+ if receptor_only and not ligand_only:
1214
+ if record_type == "ATOM":
1215
+ atom_lines.append(atom_line)
1216
+ elif not receptor_only and ligand_only:
1217
+ if record_type == "HETATM":
1218
+ atom_lines.append(atom_line)
1219
+ elif not receptor_only and not ligand_only:
1220
+ atom_lines.append(atom_line)
1221
+ else:
1222
+ raise NotImplementedError()
1223
+ atom_offset += 1
1224
+ if atom_offset == len(atom_id_to_conformer_atom_id):
1225
+ break
1226
+ out = "\n".join(atom_lines)
1227
+ out = f"MODEL 1\n{out}\nTER\nENDMDL\nEND"
1228
+ dump_txt(out, fname)
1229
+
1230
    def write_pdb_block(self, x_pred, infer_meta_data, receptor_only=False, ligand_only=False):
        """Format predicted coordinates as a single-model PDB string.

        Args:
            x_pred: per-atom predicted coordinates, indexable as [atom_id] -> (3,).
            infer_meta_data: bookkeeping produced by load()/pair_and_merge():
                "ccds", "atom_id_to_conformer_atom_id",
                "conformer_id_to_chunk_sizes", "CHAIN_CLASS" (per conformer),
                "CONF_META_DATA", "residue_index", "asym_id".
            receptor_only: keep only ATOM records (polymer chains).
            ligand_only: keep only HETATM records (ligand chains).

        Returns:
            The full PDB content as a string (MODEL/TER/ENDMDL/END wrapped).

        Raises:
            NotImplementedError: if both receptor_only and ligand_only are set.
        """
        ccds = infer_meta_data["ccds"]
        atom_id_to_conformer_atom_id = infer_meta_data["atom_id_to_conformer_atom_id"]
        ccd_chunk_sizes = infer_meta_data["conformer_id_to_chunk_sizes"].tolist()
        CHAIN_CLASS = infer_meta_data["CHAIN_CLASS"]
        conf_meta_data = infer_meta_data["CONF_META_DATA"]
        residue_index = infer_meta_data["residue_index"].tolist()
        asym_id = infer_meta_data["asym_id"].tolist()

        atom_lines = []
        atom_offset = 0
        for ccd_id, (ccd, chunk_size, res_id) in enumerate(zip(ccds, ccd_chunk_sizes, residue_index)):
            # Map global atom ids back into this conformer's reference atoms.
            inner_atom_idx = atom_id_to_conformer_atom_id[atom_offset:atom_offset + chunk_size]
            atom_names = [conf_meta_data[ccd]["ref_atom_name_chars"][i] for i in inner_atom_idx]
            atom_elements = [PeriodicTable[conf_meta_data[ccd]["ref_element"][i]] for i in inner_atom_idx]
            chain_tag = PDB_CHAIN_IDS[int(asym_id[ccd_id])]
            # Ligand conformers are written as HETATM, polymers as ATOM.
            record_type = "HETATM" if CHAIN_CLASS[ccd_id] == "ligand" else "ATOM"

            for ccd_atom_idx, atom_name in enumerate(atom_names):
                x = x_pred[atom_offset]
                # PDB atom-name convention: 4-char names fill the column,
                # shorter names are shifted right by one space.
                name = atom_name if len(atom_name) == 4 else f" {atom_name}"
                res_name_3 = ccd
                alt_loc = ""
                insertion_code = ""
                occupancy = 1.00
                element = atom_elements[ccd_atom_idx]
                # Fixed placeholder B-factor; no per-atom confidence written.
                b_factor = 70.
                charge = 0
                pos = x.tolist()
                # Fixed-column PDB ATOM/HETATM record; residue names longer
                # than 3 characters are truncated to their last 3.
                atom_line = (
                    f"{record_type:<6}{atom_offset + 1:>5} {name:<4}{alt_loc:>1}"
                    f"{res_name_3.split()[0][-3:]:>3} {chain_tag:>1}"
                    f"{res_id + 1:>4}{insertion_code:>1}   "
                    f"{pos[0]:>8.3f}{pos[1]:>8.3f}{pos[2]:>8.3f}"
                    f"{occupancy:>6.2f}{b_factor:>6.2f}          "
                    f"{element:>2}{charge:>2}"
                )
                # Record filtering: receptor_only keeps ATOM, ligand_only
                # keeps HETATM; both set is unsupported.
                if receptor_only and not ligand_only:
                    if record_type == "ATOM":
                        atom_lines.append(atom_line)
                elif not receptor_only and ligand_only:
                    if record_type == "HETATM":
                        atom_lines.append(atom_line)
                elif not receptor_only and not ligand_only:
                    atom_lines.append(atom_line)
                else:
                    raise NotImplementedError()
                atom_offset += 1
                # Stop once the (possibly cropped) atom list is exhausted;
                # any remaining conformers would slice empty anyway.
                if atom_offset == len(atom_id_to_conformer_atom_id):
                    break
        out = "\n".join(atom_lines)
        out = f"MODEL 1\n{out}\nTER\nENDMDL\nEND"
        return out
PhysDock/data/feature_loader_plinder.py ADDED
@@ -0,0 +1,1258 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ############################
2
+ # 0.85 Receptor+Ligand
3
+ # 0.5 APO Template
4
+ # 0.5 HOLO Template
5
+ # 0.05 Protein (All APO or PRED)
6
+ # 0.1 Ligand
7
+ ############################
8
+ import copy
9
+ import os
10
+ import random
11
+ from functools import reduce
12
+ from operator import add
13
+ import torch
14
+ import torch.nn.functional as F
15
+
16
+ # Key Res
17
+ # Dynamic Cutoff
18
+
19
+ import numpy as np
20
+
21
+ from stdock.data.constants.PDBData import protein_letters_3to1_extended
22
+ from stdock.data.constants import restype_constants as rc
23
+ from stdock.utils.io_utils import convert_md5_string, load_json, load_pkl, dump_txt
24
+ from stdock.data.tools.feature_processing_multimer import pair_and_merge
25
+ from stdock.utils.tensor_utils import centre_random_augmentation_np_apply, dgram_from_positions, \
26
+ centre_random_augmentation_np_batch
27
+ from stdock.data.constants.periodic_table import PeriodicTable
28
+
29
+ PDB_CHAIN_IDS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
30
+
31
+
32
+ class FeatureLoader:
33
+ def __init__(
34
+ self,
35
+ # config,
36
+ token_crop_size=256,
37
+ atom_crop_size=256 * 8,
38
+ inference_mode=False,
39
+ ):
40
+ self.inference_mode = inference_mode
41
+ self.msa_features_path = "/2022133002/data/stfold-data-v5/features/msa_features/"
42
+ self.uniprot_msa_features_path = "/2022133002/data/stfold-data-v5/features/uniprot_msa_features/"
43
+
44
+ self.token_crop_size = token_crop_size
45
+ self.atom_crop_size = atom_crop_size
46
+ self.token_bond_threshold = 2.4
47
+
48
+ self.ccd_id_meta_data = load_pkl(
49
+ "/2022133002/projects/stdock/stdock_v9.5/scripts/ccd_meta_data_confs_chars.pkl.gz")
50
+
51
+ self.samples_path = "/2022133002/data/plinder/2024-06/v2/plinder_samples_raw_data_v2"
52
+ to_remove = set(load_json("/2022133002/projects/stdock/stdock_v9.6/scripts/to_remove.json"))
53
+ weights = load_json(
54
+ "/2022133002/projects/stdock/stdock_v9.5/scripts/cluster_scripts/train_samples_new_weight_seq.json")
55
+ self.splits = load_json("/2022133002/data/plinder/2024-06/v2/splits/splits.json")
56
+ self.used_sample_ids = [sample_id for sample_id in weights
57
+ if sample_id in self.splits and self.splits[sample_id] != "test"
58
+ and sample_id not in to_remove]
59
+ print("train samples", len(self.used_sample_ids))
60
+ # self.weights = np.array(list(weights.values()))
61
+ self.weights = np.array([weights[sample_id] for sample_id in self.used_sample_ids])
62
+
63
+ self.probabilities = torch.from_numpy(self.weights / self.weights.sum())
64
+
65
+ self.used_test_sample_ids = [sample_id for sample_id in weights
66
+ if sample_id in self.splits and self.splits[sample_id] == "test"
67
+ and sample_id not in to_remove]
68
+ print("test samples", len(self.used_test_sample_ids))
69
+ # self.weights = np.array(list(weights.values()))
70
+ self.test_weights = np.array([weights[sample_id] for sample_id in self.used_test_sample_ids])
71
+
72
+ self.test_probabilities = torch.from_numpy(self.test_weights / self.test_weights.sum())
73
+
74
+ def _update_CONF_META_DATA(self, CONF_META_DATA, ccds):
75
+ for ccd in ccds:
76
+ if ccd in CONF_META_DATA:
77
+ continue
78
+ ccd_features = self.ccd_id_meta_data[ccd]
79
+ ref_pos = ccd_features["ref_pos"]
80
+
81
+ ref_pos = ref_pos - np.mean(ref_pos, axis=0, keepdims=True)
82
+
83
+ CONF_META_DATA[ccd] = {
84
+ "ref_feat": np.concatenate([
85
+ ref_pos,
86
+ ccd_features["ref_charge"][..., None],
87
+ rc.eye_128[ccd_features["ref_element"]].astype(np.float32),
88
+ ccd_features["ref_is_aromatic"].astype(np.float32)[..., None],
89
+ rc.eye_9[ccd_features["ref_degree"]].astype(np.float32),
90
+ rc.eye_7[ccd_features["ref_hybridization"]].astype(np.float32),
91
+ rc.eye_9[ccd_features["ref_implicit_valence"]].astype(np.float32),
92
+ rc.eye_3[ccd_features["ref_chirality"]].astype(np.float32),
93
+ ccd_features["ref_in_ring_of_3"].astype(np.float32)[..., None],
94
+ ccd_features["ref_in_ring_of_4"].astype(np.float32)[..., None],
95
+ ccd_features["ref_in_ring_of_5"].astype(np.float32)[..., None],
96
+ ccd_features["ref_in_ring_of_6"].astype(np.float32)[..., None],
97
+ ccd_features["ref_in_ring_of_7"].astype(np.float32)[..., None],
98
+ ccd_features["ref_in_ring_of_8"].astype(np.float32)[..., None],
99
+ ], axis=-1),
100
+ "rel_tok_feat": np.concatenate([
101
+ rc.eye_32[ccd_features["d_token"]].astype(np.float32),
102
+ rc.eye_5[ccd_features["bond_type"]].astype(np.float32),
103
+ ccd_features["token_bonds"].astype(np.float32)[..., None],
104
+ ccd_features["bond_as_double"].astype(np.float32)[..., None],
105
+ ccd_features["bond_in_ring"].astype(np.float32)[..., None],
106
+ ccd_features["bond_is_conjugated"].astype(np.float32)[..., None],
107
+ ccd_features["bond_is_aromatic"].astype(np.float32)[..., None],
108
+ ], axis=-1),
109
+ "ref_atom_name_chars": ccd_features["ref_atom_name_chars"],
110
+ "ref_element": ccd_features["ref_element"],
111
+ "token_bonds": ccd_features["token_bonds"],
112
+
113
+ }
114
+ if not rc.is_standard(ccd):
115
+ conformers = ccd_features["conformers"]
116
+ if conformers is None:
117
+ conformers = np.repeat(ccd_features["ref_pos"][None], 32, axis=0)
118
+ else:
119
+ conformers = np.stack([random.choice(ccd_features["conformers"]) for i in range(32)], axis=0)
120
+ CONF_META_DATA[ccd]["batch_ref_pos"] = centre_random_augmentation_np_batch(conformers)
121
+
122
+ return CONF_META_DATA
123
+
124
+ def _update_CONF_META_DATA_ligand(self, CONF_META_DATA, sequence_3, ccd_features):
125
+ ccds = sequence_3.split("-")
126
+ # ccd_features = self.ccd_id_meta_data[ccd]
127
+ for ccd in ccds:
128
+ CONF_META_DATA[ccd] = {
129
+ "ref_feat": np.concatenate([
130
+ ccd_features["ref_pos"],
131
+ ccd_features["ref_charge"][..., None],
132
+ rc.eye_128[ccd_features["ref_element"]].astype(np.float32),
133
+ ccd_features["ref_is_aromatic"].astype(np.float32)[..., None],
134
+ rc.eye_9[ccd_features["ref_degree"]].astype(np.float32),
135
+ rc.eye_7[ccd_features["ref_hybridization"]].astype(np.float32),
136
+ rc.eye_9[ccd_features["ref_implicit_valence"]].astype(np.float32),
137
+ rc.eye_3[ccd_features["ref_chirality"]].astype(np.float32),
138
+ ccd_features["ref_in_ring_of_3"].astype(np.float32)[..., None],
139
+ ccd_features["ref_in_ring_of_4"].astype(np.float32)[..., None],
140
+ ccd_features["ref_in_ring_of_5"].astype(np.float32)[..., None],
141
+ ccd_features["ref_in_ring_of_6"].astype(np.float32)[..., None],
142
+ ccd_features["ref_in_ring_of_7"].astype(np.float32)[..., None],
143
+ ccd_features["ref_in_ring_of_8"].astype(np.float32)[..., None],
144
+ ], axis=-1),
145
+ "rel_tok_feat": np.concatenate([
146
+ rc.eye_32[ccd_features["d_token"]].astype(np.float32),
147
+ rc.eye_5[ccd_features["bond_type"]].astype(np.float32),
148
+ ccd_features["token_bonds"].astype(np.float32)[..., None],
149
+ ccd_features["bond_as_double"].astype(np.float32)[..., None],
150
+ ccd_features["bond_in_ring"].astype(np.float32)[..., None],
151
+ ccd_features["bond_is_conjugated"].astype(np.float32)[..., None],
152
+ ccd_features["bond_is_aromatic"].astype(np.float32)[..., None],
153
+ ], axis=-1),
154
+ "ref_atom_name_chars": ccd_features["ref_atom_name_chars"],
155
+ "ref_element": ccd_features["ref_element"],
156
+ "token_bonds": ccd_features["token_bonds"],
157
+
158
+ }
159
+ if not rc.is_standard(ccd):
160
+ conformers = ccd_features["conformers"]
161
+ if conformers is None:
162
+ conformers = np.repeat(ccd_features["ref_pos"][None], 32, axis=0)
163
+ else:
164
+ conformers = np.stack([random.choice(ccd_features["conformers"]) for i in range(32)], axis=0)
165
+ CONF_META_DATA[ccd]["batch_ref_pos"] = centre_random_augmentation_np_batch(conformers)
166
+
167
+ return CONF_META_DATA
168
+
169
+ def _update_chain_feature(self, chain_feature, CONF_META_DATA):
170
+
171
+ ccds_ori = chain_feature["ccds"]
172
+ chain_class = chain_feature["chain_class"]
173
+ if chain_class == "protein":
174
+
175
+ sequence = "".join([protein_letters_3to1_extended.get(ccd, "X") for ccd in ccds_ori])
176
+ md5 = convert_md5_string(f"protein:{sequence}")
177
+
178
+ chain_feature.update(
179
+ load_pkl(os.path.join(self.msa_features_path, f"{md5}.pkl.gz"))
180
+ )
181
+ chain_feature.update(
182
+ load_pkl(os.path.join(self.uniprot_msa_features_path, f"{md5}.pkl.gz"))
183
+ )
184
+ else:
185
+ chain_feature["msa"] = np.array([[rc.standard_ccds.index(ccd)
186
+ if ccd in rc.standard_ccds else 20 for ccd in ccds_ori]] * 2,
187
+ dtype=np.int8)
188
+ chain_feature["deletion_matrix"] = np.zeros_like(chain_feature["msa"])
189
+
190
+ # Merge Key Res Feat & Augmentation
191
+ if "salt bridges" in chain_feature:
192
+ key_res_feat = np.stack([
193
+ chain_feature["salt bridges"],
194
+ chain_feature["pi-cation interactions"],
195
+ chain_feature["hydrophobic interactions"],
196
+ chain_feature["pi-stacking"],
197
+ chain_feature["hydrogen bonds"],
198
+ chain_feature["metal complexes"],
199
+ np.zeros_like(chain_feature["salt bridges"]),
200
+ ], axis=-1).astype(np.float32)
201
+ else:
202
+ key_res_feat = np.zeros(
203
+ [len(ccds_ori), 7], dtype=np.float32
204
+ )
205
+ is_key_res = np.any(key_res_feat.astype(np.bool_), axis=-1).astype(np.float32)
206
+ # Augmentation
207
+ if not self.inference_mode:
208
+ key_res_feat = key_res_feat * (np.random.random([len(ccds_ori), 7]) < 0.5)
209
+ # else:
210
+ # # TODO: No key res in inference mode
211
+ # key_res_feat = key_res_feat * 0
212
+ # Atom
213
+ x_gt = []
214
+ atom_id_to_conformer_atom_id = []
215
+
216
+ # Conformer
217
+ conformer_id_to_chunk_sizes = []
218
+ residue_index = []
219
+ restype = []
220
+ ccds = []
221
+
222
+ conformer_exists = []
223
+
224
+ for c_id, ccd in enumerate(chain_feature["ccds"]):
225
+ no_atom_this_conf = len(CONF_META_DATA[ccd]["ref_feat"])
226
+ conformer_atom_ids_this_conf = np.arange(no_atom_this_conf)
227
+ x_gt_this_conf = chain_feature["all_atom_positions"][c_id]
228
+ x_exists_this_conf = chain_feature["all_atom_mask"][c_id].astype(np.bool_)
229
+
230
+ # TODO DEBUG
231
+ #
232
+ conformer_exist = np.any(x_exists_this_conf).item()
233
+ if rc.is_standard(ccd):
234
+ conformer_exist = np.sum(x_exists_this_conf).item() > len(x_exists_this_conf) - 2
235
+ # conformer_exist = conformer_exist and x_exists_this_conf[1]
236
+ # if ccd != "GLY":
237
+ # conformer_exist = conformer_exist and x_exists_this_conf[4]
238
+
239
+ conformer_exists.append(conformer_exist)
240
+ if conformer_exist:
241
+ # Atomwise
242
+ x_gt.append(x_gt_this_conf[x_exists_this_conf])
243
+ atom_id_to_conformer_atom_id.append(conformer_atom_ids_this_conf[x_exists_this_conf])
244
+ # Tokenwise
245
+ residue_index.append(c_id)
246
+ conformer_id_to_chunk_sizes.append(np.sum(x_exists_this_conf).item())
247
+ restype.append(rc.standard_ccds.index(ccd) if ccd in rc.standard_ccds else 20)
248
+ ccds.append(ccd)
249
+ x_gt = np.concatenate(x_gt, axis=0)
250
+ atom_id_to_conformer_atom_id = np.concatenate(atom_id_to_conformer_atom_id, axis=0, dtype=np.int32)
251
+ residue_index = np.array(residue_index, dtype=np.int64)
252
+ conformer_id_to_chunk_sizes = np.array(conformer_id_to_chunk_sizes, dtype=np.int64)
253
+ restype = np.array(restype, dtype=np.int64)
254
+
255
+ conformer_exists = np.array(conformer_exists, dtype=np.bool_)
256
+
257
+ chain_feature_update = {
258
+ "x_gt": x_gt,
259
+ "atom_id_to_conformer_atom_id": atom_id_to_conformer_atom_id,
260
+ "residue_index": residue_index,
261
+ "conformer_id_to_chunk_sizes": conformer_id_to_chunk_sizes,
262
+ "restype": restype,
263
+ "ccds": ccds,
264
+ "msa": chain_feature["msa"][:, conformer_exists],
265
+ "deletion_matrix": chain_feature["deletion_matrix"][:, conformer_exists],
266
+ "chain_class": chain_class,
267
+ "key_res_feat": key_res_feat[conformer_exists],
268
+ "is_key_res": is_key_res[conformer_exists],
269
+ }
270
+
271
+ chain_feature_update["is_protein"] = np.array([chain_class == "protein"] * len(ccds)).astype(np.float32)
272
+ chain_feature_update["is_ligand"] = np.array([chain_class != "protein"] * len(ccds)).astype(np.float32)
273
+ # Assert Short Poly Chain like peptide
274
+ chain_feature_update["is_short_poly"] = np.array(
275
+ [chain_class != "protein" and len(ccds) >= 2 and rc.is_standard(ccd) for ccd in ccds]
276
+ ).astype(np.float32)
277
+
278
+ if "msa_all_seq" in chain_feature:
279
+ chain_feature_update["msa_all_seq"] = chain_feature["msa_all_seq"][:, conformer_exists]
280
+ chain_feature_update["deletion_matrix_all_seq"] = \
281
+ chain_feature["deletion_matrix_all_seq"][:, conformer_exists]
282
+ chain_feature_update["msa_species_identifiers_all_seq"] = chain_feature["msa_species_identifiers_all_seq"]
283
+ del chain_feature
284
+ return chain_feature_update
285
+
286
+ def _add_assembly_feature(self, all_chain_features, SEQ3):
287
+ entities = {}
288
+ for chain_id, seq3 in SEQ3.items():
289
+ if seq3 not in entities:
290
+ entities[seq3] = [chain_id]
291
+ else:
292
+ entities[seq3].append(chain_id)
293
+
294
+ asym_id = 0
295
+ ASYM_ID = {}
296
+ for entity_id, seq3 in enumerate(list(entities.keys())):
297
+ chain_ids = copy.deepcopy(entities[seq3])
298
+ if not self.inference_mode:
299
+ # sym_id augmentation
300
+ random.shuffle(chain_ids)
301
+ for sym_id, chain_id in enumerate(chain_ids):
302
+ num_conformers = len(all_chain_features[chain_id]["ccds"])
303
+ all_chain_features[chain_id]["asym_id"] = \
304
+ np.full([num_conformers], fill_value=asym_id, dtype=np.int32)
305
+ all_chain_features[chain_id]["sym_id"] = \
306
+ np.full([num_conformers], fill_value=sym_id, dtype=np.int32)
307
+ all_chain_features[chain_id]["entity_id"] = \
308
+ np.full([num_conformers], fill_value=entity_id, dtype=np.int32)
309
+
310
+ all_chain_features[chain_id]["sequence_3"] = seq3
311
+ ASYM_ID[asym_id] = chain_id
312
+
313
+ asym_id += 1
314
+ return all_chain_features, ASYM_ID
315
+
316
+ def load_all_chain_features(self, all_chain_labels):
317
+ all_chain_features = {}
318
+ CONF_META_DATA = {}
319
+ SEQ3 = {}
320
+ CHAIN_CLASS = {}
321
+
322
+ for chain_id, chain_feature in all_chain_labels.items():
323
+ ccds = chain_feature["ccds"]
324
+ CONF_META_DATA = self._update_CONF_META_DATA(CONF_META_DATA, ccds)
325
+ SEQ3[chain_id] = "-".join(ccds)
326
+ chain_class = "protein" if not chain_id.isdigit() else "ligand"
327
+ chain_feature["chain_class"] = chain_class
328
+
329
+ all_chain_features[chain_id] = self._update_chain_feature(
330
+ chain_feature,
331
+ CONF_META_DATA
332
+ )
333
+ CHAIN_CLASS[chain_id] = chain_class
334
+
335
+ all_chain_features, ASYM_ID = self._add_assembly_feature(all_chain_features, SEQ3)
336
+
337
+ infer_meta_data = {
338
+ "CONF_META_DATA": CONF_META_DATA,
339
+ "SEQ3": SEQ3,
340
+ "ASYM_ID": ASYM_ID,
341
+ "CHAIN_CLASS": CHAIN_CLASS
342
+ }
343
+
344
+ return all_chain_features, infer_meta_data
345
+
346
+ def _spatial_crop_v2(self, all_chain_features, infer_meta_data):
347
+ CONF_META_DATA = infer_meta_data["CONF_META_DATA"]
348
+ ordered_chain_ids = list(all_chain_features.keys())
349
+
350
+ x_gt = np.concatenate([all_chain_features[chain_id]["x_gt"] for chain_id in ordered_chain_ids], axis=0)
351
+ # asym_id = np.concatenate([all_chain_features[chain_id]["asym_id"] for chain_id in ordered_chain_ids], axis=0)
352
+
353
+ token_id_to_centre_atom_id = []
354
+ token_id_to_conformer_id = []
355
+ token_id_to_ccd_chunk_sizes = []
356
+ token_id_to_ccd = []
357
+ asym_id_ca = []
358
+ token_id = 0
359
+ atom_id = 0
360
+ conf_id = 0
361
+ x_gt_ligand = []
362
+ for chain_id in ordered_chain_ids:
363
+ if chain_id.isdigit() and len(all_chain_features[chain_id]["ccds"]) == 1:
364
+ x_gt_ligand.append(all_chain_features[chain_id]["x_gt"])
365
+ atom_offset = 0
366
+ for ccd, chunk_size_this_ccd, asym_id in zip(
367
+ all_chain_features[chain_id]["ccds"],
368
+ all_chain_features[chain_id]["conformer_id_to_chunk_sizes"],
369
+ all_chain_features[chain_id]["asym_id"],
370
+ ):
371
+ inner_atom_idx = all_chain_features[chain_id]["atom_id_to_conformer_atom_id"][
372
+ atom_offset:atom_offset + chunk_size_this_ccd]
373
+ atom_names = [CONF_META_DATA[ccd]["ref_atom_name_chars"][i] for i in inner_atom_idx]
374
+ if rc.is_standard(ccd):
375
+
376
+ for atom_id_this_ccd, atom_name in enumerate(atom_names):
377
+ if atom_name == rc.standard_ccd_to_token_centre_atom_name[ccd]:
378
+ token_id_to_centre_atom_id.append(atom_id)
379
+ token_id_to_conformer_id.append(conf_id)
380
+ token_id_to_ccd_chunk_sizes.append(chunk_size_this_ccd)
381
+ token_id_to_ccd.append(ccd)
382
+ asym_id_ca.append(asym_id)
383
+ atom_id += 1
384
+ token_id += 1
385
+
386
+ else:
387
+ for atom_id_this_ccd, atom_name in enumerate(atom_names):
388
+ token_id_to_centre_atom_id.append(atom_id)
389
+ token_id_to_conformer_id.append(conf_id)
390
+ token_id_to_ccd_chunk_sizes.append(chunk_size_this_ccd)
391
+ token_id_to_ccd.append(ccd)
392
+ asym_id_ca.append(asym_id)
393
+ atom_id += 1
394
+ token_id += 1
395
+ atom_offset += chunk_size_this_ccd
396
+ conf_id += 1
397
+
398
+ x_gt_ca = x_gt[token_id_to_centre_atom_id]
399
+ asym_id_ca = np.array(asym_id_ca)
400
+
401
+ crop_scheme_seed = random.random()
402
+ # Spatial Crop Ligand
403
+
404
+ if crop_scheme_seed < (0.6 if not self.inference_mode else 1.0) and len(x_gt_ligand) > 0:
405
+ x_gt_ligand = np.concatenate(x_gt_ligand, axis=0)
406
+ x_gt_sel = random.choice(x_gt_ligand)[None]
407
+ # Spatial Crop Interface
408
+ elif crop_scheme_seed < 0.8 and len(set(asym_id_ca.tolist())) > 1:
409
+ chain_same = asym_id_ca[None] * asym_id_ca[:, None]
410
+ dist = np.linalg.norm(x_gt_ca[:, None] - x_gt_ca[None], axis=-1)
411
+
412
+ dist = dist + chain_same * 100
413
+ # interface_threshold
414
+ mask = np.any(dist < 15, axis=-1)
415
+ if sum(mask) > 0:
416
+ x_gt_sel = random.choice(x_gt_ca[mask])[None]
417
+ else:
418
+ x_gt_sel = random.choice(x_gt_ca)[None]
419
+ # Spatial Crop
420
+ else:
421
+ x_gt_sel = random.choice(x_gt_ca)[None]
422
+ dist = np.linalg.norm(x_gt_ca - x_gt_sel, axis=-1)
423
+ token_idxs = np.argsort(dist)
424
+
425
+ select_ccds_idx = []
426
+ to_sum_atom = 0
427
+ to_sum_token = 0
428
+ for token_idx in token_idxs:
429
+ ccd_idx = token_id_to_conformer_id[token_idx]
430
+ ccd_chunk_size = token_id_to_ccd_chunk_sizes[token_idx]
431
+ ccd_this_token = token_id_to_ccd[token_idx]
432
+ if ccd_idx in select_ccds_idx:
433
+ continue
434
+ if to_sum_atom + ccd_chunk_size > self.atom_crop_size:
435
+ break
436
+ to_add_token = 1 if rc.is_standard(ccd_this_token) else ccd_chunk_size
437
+ if to_sum_token + to_add_token > self.token_crop_size:
438
+ break
439
+ select_ccds_idx.append(ccd_idx)
440
+ to_sum_atom += ccd_chunk_size
441
+ to_sum_token += to_add_token
442
+
443
+ ccd_all_id = 0
444
+ crop_chains = []
445
+ for chain_id in ordered_chain_ids:
446
+ conformer_used_mask = []
447
+ atom_used_mask = []
448
+ ccds = []
449
+ for ccd, chunk_size_this_ccd in zip(
450
+ all_chain_features[chain_id]["ccds"],
451
+ all_chain_features[chain_id]["conformer_id_to_chunk_sizes"],
452
+ ):
453
+ if ccd_all_id in select_ccds_idx:
454
+ ccds.append(ccd)
455
+ if chain_id not in crop_chains:
456
+ crop_chains.append(chain_id)
457
+ conformer_used_mask.append(ccd_all_id in select_ccds_idx)
458
+ atom_used_mask += [ccd_all_id in select_ccds_idx] * chunk_size_this_ccd
459
+ ccd_all_id += 1
460
+ conf_mask = np.array(conformer_used_mask).astype(np.bool_)
461
+ atom_mask = np.array(atom_used_mask).astype(np.bool_)
462
+ # Update All Chain Features
463
+ all_chain_features[chain_id]["x_gt"] = all_chain_features[chain_id]["x_gt"][atom_mask]
464
+ all_chain_features[chain_id]["atom_id_to_conformer_atom_id"] = \
465
+ all_chain_features[chain_id]["atom_id_to_conformer_atom_id"][atom_mask]
466
+ all_chain_features[chain_id]["restype"] = all_chain_features[chain_id]["restype"][conf_mask]
467
+ all_chain_features[chain_id]["residue_index"] = all_chain_features[chain_id]["residue_index"][conf_mask]
468
+ all_chain_features[chain_id]["conformer_id_to_chunk_sizes"] = \
469
+ all_chain_features[chain_id]["conformer_id_to_chunk_sizes"][conf_mask]
470
+ # BUG Fix
471
+ all_chain_features[chain_id]["key_res_feat"] = all_chain_features[chain_id]["key_res_feat"][conf_mask]
472
+ all_chain_features[chain_id]["is_key_res"] = all_chain_features[chain_id]["is_key_res"][conf_mask]
473
+ all_chain_features[chain_id]["is_protein"] = all_chain_features[chain_id]["is_protein"][conf_mask]
474
+ all_chain_features[chain_id]["is_short_poly"] = all_chain_features[chain_id]["is_short_poly"][conf_mask]
475
+ all_chain_features[chain_id]["is_ligand"] = all_chain_features[chain_id]["is_ligand"][conf_mask]
476
+ all_chain_features[chain_id]["asym_id"] = all_chain_features[chain_id]["asym_id"][conf_mask]
477
+ all_chain_features[chain_id]["sym_id"] = all_chain_features[chain_id]["sym_id"][conf_mask]
478
+ all_chain_features[chain_id]["entity_id"] = all_chain_features[chain_id]["entity_id"][conf_mask]
479
+
480
+ all_chain_features[chain_id]["ccds"] = ccds
481
+ if "msa" in all_chain_features[chain_id]:
482
+ all_chain_features[chain_id]["msa"] = all_chain_features[chain_id]["msa"][:, conf_mask]
483
+ all_chain_features[chain_id]["deletion_matrix"] = \
484
+ all_chain_features[chain_id]["deletion_matrix"][:, conf_mask]
485
+ if "msa_all_seq" in all_chain_features[chain_id]:
486
+ all_chain_features[chain_id]["msa_all_seq"] = all_chain_features[chain_id]["msa_all_seq"][:, conf_mask]
487
+ all_chain_features[chain_id]["deletion_matrix_all_seq"] = \
488
+ all_chain_features[chain_id]["deletion_matrix_all_seq"][:, conf_mask]
489
+ # Remove Unused Chains
490
+ for chain_id in list(all_chain_features.keys()):
491
+ if chain_id not in crop_chains:
492
+ all_chain_features.pop(chain_id, None)
493
+
494
+ return all_chain_features
495
+
496
+ def _spatial_crop(self, all_chain_features):
497
+
498
+ ordered_chain_ids = list(all_chain_features.keys())
499
+ atom_id_to_ccd_id = []
500
+ atom_id_to_ccd_chunk_sizes = []
501
+ atom_id_to_ccd = []
502
+
503
+ ccd_all_id = 0
504
+ for chain_id in ordered_chain_ids:
505
+ for ccd, chunk_size_this_ccd in zip(
506
+ all_chain_features[chain_id]["ccds"],
507
+ all_chain_features[chain_id]["conformer_id_to_chunk_sizes"],
508
+ ):
509
+ atom_id_to_ccd_id += [ccd_all_id] * chunk_size_this_ccd
510
+ atom_id_to_ccd_chunk_sizes += [chunk_size_this_ccd] * chunk_size_this_ccd
511
+ atom_id_to_ccd += [ccd] * chunk_size_this_ccd
512
+ ccd_all_id += 1
513
+
514
+ to_sum_atom = 0
515
+ to_sum_token = 0
516
+ x_gt = np.concatenate([all_chain_features[chain_id]["x_gt"] for chain_id in ordered_chain_ids], axis=0)
517
+
518
+ spatial_crop_ratio = 0.3 if not self.inference_mode else 0
519
+ if random.random() < spatial_crop_ratio or len(ordered_chain_ids) == 1:
520
+ x_gt_sel = random.choice(x_gt)[None]
521
+ else:
522
+ asym_id = np.array(reduce(add, [
523
+ [asym_id] * len(all_chain_features[chain_id]["x_gt"])
524
+ for asym_id, chain_id in enumerate(ordered_chain_ids)
525
+ ]))
526
+ chain_same = asym_id[None] * asym_id[:, None]
527
+ dist = np.linalg.norm(x_gt[:, None] - x_gt[None], axis=-1)
528
+
529
+ dist = dist + chain_same * 100
530
+ mask = np.any(dist < 4, axis=-1)
531
+ if sum(mask) > 0:
532
+ x_gt_ = x_gt[mask]
533
+ x_gt_sel = random.choice(x_gt)[None]
534
+ dist = np.linalg.norm(x_gt - x_gt_sel, axis=-1)
535
+ atom_idxs = np.argsort(dist)
536
+ select_ccds_idx = []
537
+ for atom_idx in atom_idxs:
538
+ ccd_idx = atom_id_to_ccd_id[atom_idx]
539
+ ccd_chunk_size = atom_id_to_ccd_chunk_sizes[atom_idx]
540
+ ccd_this_atom = atom_id_to_ccd[atom_idx]
541
+ if ccd_idx in select_ccds_idx:
542
+ continue
543
+ if to_sum_atom + ccd_chunk_size > self.atom_crop_size:
544
+ break
545
+ to_add_token = 1 if rc.is_standard(ccd_this_atom) else ccd_chunk_size
546
+ if to_sum_token + to_add_token > self.token_crop_size:
547
+ break
548
+ select_ccds_idx.append(ccd_idx)
549
+ to_sum_atom += ccd_chunk_size
550
+ to_sum_token += to_add_token
551
+ ccd_all_id = 0
552
+ crop_chains = []
553
+ for chain_id in ordered_chain_ids:
554
+ conformer_used_mask = []
555
+ atom_used_mask = []
556
+ ccds = []
557
+ for ccd, chunk_size_this_ccd in zip(
558
+ all_chain_features[chain_id]["ccds"],
559
+ all_chain_features[chain_id]["conformer_id_to_chunk_sizes"],
560
+ ):
561
+ if ccd_all_id in select_ccds_idx:
562
+ ccds.append(ccd)
563
+ if chain_id not in crop_chains:
564
+ crop_chains.append(chain_id)
565
+ conformer_used_mask.append(ccd_all_id in select_ccds_idx)
566
+ atom_used_mask += [ccd_all_id in select_ccds_idx] * chunk_size_this_ccd
567
+ ccd_all_id += 1
568
+ conf_mask = np.array(conformer_used_mask).astype(np.bool_)
569
+ atom_mask = np.array(atom_used_mask).astype(np.bool_)
570
+ # Update All Chain Features
571
+ all_chain_features[chain_id]["x_gt"] = all_chain_features[chain_id]["x_gt"][atom_mask]
572
+ all_chain_features[chain_id]["atom_id_to_conformer_atom_id"] = \
573
+ all_chain_features[chain_id]["atom_id_to_conformer_atom_id"][atom_mask]
574
+ all_chain_features[chain_id]["restype"] = all_chain_features[chain_id]["restype"][conf_mask]
575
+ all_chain_features[chain_id]["residue_index"] = all_chain_features[chain_id]["residue_index"][conf_mask]
576
+ all_chain_features[chain_id]["conformer_id_to_chunk_sizes"] = \
577
+ all_chain_features[chain_id]["conformer_id_to_chunk_sizes"][conf_mask]
578
+ all_chain_features[chain_id]["ccds"] = ccds
579
+ if "msa" in all_chain_features[chain_id]:
580
+ all_chain_features[chain_id]["msa"] = all_chain_features[chain_id]["msa"][:, conf_mask]
581
+ all_chain_features[chain_id]["deletion_matrix"] = \
582
+ all_chain_features[chain_id]["deletion_matrix"][:, conf_mask]
583
+ if "msa_all_seq" in all_chain_features[chain_id]:
584
+ all_chain_features[chain_id]["msa_all_seq"] = all_chain_features[chain_id]["msa_all_seq"][:, conf_mask]
585
+ all_chain_features[chain_id]["deletion_matrix_all_seq"] = \
586
+ all_chain_features[chain_id]["deletion_matrix_all_seq"][:, conf_mask]
587
+ # Remove Unused Chains
588
+ for chain_id in list(all_chain_features.keys()):
589
+ if chain_id not in crop_chains:
590
+ all_chain_features.pop(chain_id, None)
591
+ return all_chain_features
592
+
593
+ def crop_all_chain_features(self, all_chain_features, infer_meta_data):
594
+ # all_chain_features = self._spatial_crop(all_chain_features)
595
+ all_chain_features = self._spatial_crop_v2(all_chain_features, infer_meta_data)
596
+ return all_chain_features, infer_meta_data
597
+
598
+ def _make_pocket_features(self, all_chain_features):
599
+ # minimium distance 6-12
600
+ all_chain_ids = list(all_chain_features.keys())
601
+
602
+ for chain_id in all_chain_ids:
603
+ all_chain_features[chain_id]["pocket_res_feat"] = np.zeros(
604
+ [len(all_chain_features[chain_id]["ccds"])], dtype=np.bool_)
605
+
606
+ ligand_chain_ids = [i for i in all_chain_ids if i.isdigit()]
607
+ receptor_chain_ids = [i for i in all_chain_ids if not i.isdigit()]
608
+
609
+ use_pocket = random.random() < 0.5
610
+ # TODO: Inference mode assign
611
+ if len(ligand_chain_ids) == 0 or len(receptor_chain_ids) == 0 or not use_pocket:
612
+ for chain_id in all_chain_ids:
613
+ all_chain_features[chain_id]["pocket_res_feat"] = all_chain_features[chain_id][
614
+ "pocket_res_feat"].astype(np.float32)
615
+ return all_chain_features
616
+
617
+ # Aug Part
618
+ for ligand_chain_id in ligand_chain_ids:
619
+ x_gt_ligand = all_chain_features[ligand_chain_id]["x_gt"]
620
+
621
+ # x_gt_mean = np.mean(x_gt_ligand, axis=0) + np.random.randn(3)
622
+
623
+ for receptor_chain_id in receptor_chain_ids:
624
+ x_gt_receptor = all_chain_features[receptor_chain_id]["x_gt"]
625
+
626
+ # dist = np.linalg.norm(x_gt_receptor - x_gt_mean[None], axis=-1)
627
+ # is_pocket_atom = (dist < (random.random() * 6 + 8)).astype(np.bool_)
628
+ is_pocket_atom = np.any(
629
+ np.linalg.norm(x_gt_receptor[:, None] - x_gt_ligand[None], axis=-1) < (random.random() * 6 + 6),
630
+ axis=-1
631
+ )
632
+
633
+ is_pocket_ccd = []
634
+ offset = 0
635
+ for chunk_size in all_chain_features[receptor_chain_id]["conformer_id_to_chunk_sizes"]:
636
+ is_pocket_ccd.append(np.any(is_pocket_atom[offset:offset + chunk_size]).item())
637
+ offset += chunk_size
638
+ is_pocket_ccd = np.array(is_pocket_ccd, dtype=np.bool_)
639
+
640
+ is_pocket_ccd = np.array([np.any(i).item() for i in is_pocket_ccd], dtype=np.bool_)
641
+ all_chain_features[receptor_chain_id]["pocket_res_feat"] = all_chain_features[receptor_chain_id][
642
+ "pocket_res_feat"] | is_pocket_ccd
643
+
644
+ for chain_id in all_chain_ids:
645
+ all_chain_features[chain_id]["pocket_res_feat"] = all_chain_features[chain_id]["pocket_res_feat"].astype(
646
+ np.float32)
647
+
648
+ return all_chain_features
649
+
650
    def _make_ccd_features(self, raw_feats, infer_meta_data):
        """Expand per-conformer (CCD) chunks into atom-wise and token-wise arrays.

        Tokenisation scheme:
          * standard residue              -> one token spanning all of its atoms;
          * non-standard residue / ligand -> one token per heavy atom;
          * UNK conformer                 -> one masked token with no atoms
                                             (sentinel -1 for its special atoms).

        Args:
            raw_feats: merged chain features; uses "ccds",
                "atom_id_to_conformer_atom_id" and "conformer_id_to_chunk_sizes".
            infer_meta_data: carries "CONF_META_DATA" with per-CCD reference
                atom names and reference features.

        Returns:
            Dict of atom-wise ("atom_id_to_*", "ref_feat", "ref_pos") and
            token-wise ("token_id_to_*", "s_mask") index/feature arrays.
        """
        CONF_META_DATA = infer_meta_data["CONF_META_DATA"]
        ccds = raw_feats["ccds"]
        atom_id_to_conformer_atom_id = raw_feats["atom_id_to_conformer_atom_id"]
        conformer_id_to_chunk_sizes = raw_feats["conformer_id_to_chunk_sizes"]

        # Atomwise
        atom_id_to_conformer_id = []
        atom_id_to_token_id = []
        ref_feat = []

        # Tokenwise
        s_mask = []
        token_id_to_conformer_id = []
        token_id_to_chunk_sizes = []
        token_id_to_centre_atom_id = []
        token_id_to_pseudo_beta_atom_id = []

        token_id = 0
        atom_id = 0
        for conf_id, (ccd, ccd_atoms) in enumerate(zip(ccds, conformer_id_to_chunk_sizes)):
            conf_meta_data = CONF_META_DATA[ccd]
            # UNK Conformer: single masked token, consumes no atom ids.
            if rc.is_unk(ccd):
                s_mask.append(0)
                token_id_to_chunk_sizes.append(0)
                token_id_to_conformer_id.append(conf_id)
                token_id_to_centre_atom_id.append(-1)
                token_id_to_pseudo_beta_atom_id.append(-1)
                token_id += 1
            # Standard Residue: one token covering all resolved atoms.
            elif rc.is_standard(ccd):
                # Indices of this chunk's atoms within the reference conformer.
                inner_atom_idx = atom_id_to_conformer_atom_id[atom_id:atom_id + ccd_atoms.item()]
                atom_names = [conf_meta_data["ref_atom_name_chars"][i] for i in inner_atom_idx]
                ref_feat.append(conf_meta_data["ref_feat"][inner_atom_idx])
                token_id_to_conformer_id.append(conf_id)
                token_id_to_chunk_sizes.append(ccd_atoms.item())
                s_mask.append(1)
                for atom_id_this_ccd, atom_name in enumerate(atom_names):
                    # Update Atomwise Features
                    atom_id_to_conformer_id.append(conf_id)
                    atom_id_to_token_id.append(token_id)
                    # Update special atom ids (token centre / pseudo-beta atoms,
                    # e.g. CA / CB for amino acids).
                    if atom_name == rc.standard_ccd_to_token_centre_atom_name[ccd]:
                        token_id_to_centre_atom_id.append(atom_id)
                    if atom_name == rc.standard_ccd_to_token_pseudo_beta_atom_name[ccd]:
                        token_id_to_pseudo_beta_atom_id.append(atom_id)
                    atom_id += 1
                token_id += 1
            # Nonstandard Residue & Ligand: every atom becomes its own token,
            # and is its own centre / pseudo-beta atom.
            else:
                inner_atom_idx = atom_id_to_conformer_atom_id[atom_id:atom_id + ccd_atoms.item()]
                atom_names = [conf_meta_data["ref_atom_name_chars"][i] for i in inner_atom_idx]
                ref_feat.append(conf_meta_data["ref_feat"][inner_atom_idx])
                # ref_pos_new.append(conf_meta_data["ref_pos_new"][:, inner_atom_idx])
                for atom_id_this_ccd, atom_name in enumerate(atom_names):
                    # Update Atomwise Features
                    atom_id_to_conformer_id.append(conf_id)
                    atom_id_to_token_id.append(token_id)
                    # Update Tokenwise Features
                    token_id_to_chunk_sizes.append(1)
                    token_id_to_conformer_id.append(conf_id)
                    s_mask.append(1)
                    token_id_to_centre_atom_id.append(atom_id)
                    token_id_to_pseudo_beta_atom_id.append(atom_id)
                    atom_id += 1
                    token_id += 1

        # NOTE(review): if every conformer were UNK, ref_feat would be empty and
        # ref_feat[0] below would raise IndexError -- presumably such systems
        # never reach this point; confirm upstream filtering.
        if len(ref_feat) > 1:
            ref_feat = np.concatenate(ref_feat, axis=0).astype(np.float32)
        else:
            ref_feat = ref_feat[0].astype(np.float32)

        features = {
            # Atomwise
            "atom_id_to_conformer_id": np.array(atom_id_to_conformer_id, dtype=np.int64),
            "atom_id_to_token_id": np.array(atom_id_to_token_id, dtype=np.int64),
            "ref_feat": ref_feat,
            # Tokenwise
            "token_id_to_conformer_id": np.array(token_id_to_conformer_id, dtype=np.int64),
            "s_mask": np.array(s_mask, dtype=np.int64),
            "token_id_to_centre_atom_id": np.array(token_id_to_centre_atom_id, dtype=np.int64),
            "token_id_to_pseudo_beta_atom_id": np.array(token_id_to_pseudo_beta_atom_id, dtype=np.int64),
            "token_id_to_chunk_sizes": np.array(token_id_to_chunk_sizes, dtype=np.int64),
        }
        # Reference positions are the first three channels of ref_feat.
        features["ref_pos"] = features["ref_feat"][..., :3]
        return features
737
+
738
+ def pair_and_merge(self, all_chain_features, infer_meta_data):
739
+ CHAIN_CLASS = infer_meta_data["CHAIN_CLASS"] # Dict
740
+ CONF_META_DATA = infer_meta_data["CONF_META_DATA"]
741
+ ASYM_ID = infer_meta_data["ASYM_ID"]
742
+ homo_feats = {}
743
+
744
+ # Create Aug Pocket Feature
745
+ all_chain_features = self._make_pocket_features(all_chain_features)
746
+
747
+ all_chain_ids = list(all_chain_features.keys())
748
+ if len(all_chain_ids) == 1 and CHAIN_CLASS[all_chain_ids[0]] == "ligand":
749
+ ordered_chain_ids = all_chain_ids
750
+ raw_feats = all_chain_features[all_chain_ids[0]]
751
+ raw_feats["msa"] = np.repeat(raw_feats["msa"][:1], 256, axis=0)
752
+ raw_feats["deletion_matrix"] = np.repeat(raw_feats["msa"][:1], 256, axis=0)
753
+ keys = list(raw_feats.keys())
754
+
755
+ for feature_name in keys:
756
+ if feature_name not in ["x_gt", "atom_id_to_conformer_atom_id", "residue_index",
757
+ "conformer_id_to_chunk_sizes", "restype", "is_protein", "is_short_poly",
758
+ "is_ligand",
759
+ "asym_id", "sym_id", "entity_id", "msa", "deletion_matrix", "ccds",
760
+ "pocket_res_feat", "key_res_feat", "is_key_res"]:
761
+ raw_feats.pop(feature_name)
762
+
763
+ # Update Profile and Deletion Mean
764
+ msa_one_hot = F.one_hot(torch.from_numpy(raw_feats["msa"]).long(), 32).type(torch.float32)
765
+ raw_feats["profile"] = torch.mean(msa_one_hot, dim=-3).numpy()
766
+ del msa_one_hot
767
+ raw_feats["deletion_mean"] = (torch.atan(
768
+ torch.sum(torch.from_numpy(raw_feats["deletion_matrix"]), dim=0) / 3.0
769
+ ) * (2.0 / torch.pi)).numpy()
770
+ else:
771
+
772
+ for chain_id in list(all_chain_features.keys()):
773
+ homo_feats[chain_id] = {
774
+ "asym_id": copy.deepcopy(all_chain_features[chain_id]["asym_id"]),
775
+ "sym_id": copy.deepcopy(all_chain_features[chain_id]["sym_id"]),
776
+ "entity_id": copy.deepcopy(all_chain_features[chain_id]["entity_id"]),
777
+ }
778
+ for chain_id in list(all_chain_features.keys()):
779
+ homo_feats[chain_id]["chain_class"] = all_chain_features[chain_id].pop("chain_class")
780
+ homo_feats[chain_id]["sequence_3"] = all_chain_features[chain_id].pop("sequence_3")
781
+ homo_feats[chain_id]["msa"] = all_chain_features[chain_id].pop("msa")
782
+ homo_feats[chain_id]["deletion_matrix"] = all_chain_features[chain_id].pop("deletion_matrix")
783
+ if "msa_all_seq" in all_chain_features[chain_id]:
784
+ homo_feats[chain_id]["msa_all_seq"] = all_chain_features[chain_id].pop("msa_all_seq")
785
+ homo_feats[chain_id]["deletion_matrix_all_seq"] = all_chain_features[chain_id].pop(
786
+ "deletion_matrix_all_seq")
787
+ homo_feats[chain_id]["msa_species_identifiers_all_seq"] = all_chain_features[chain_id].pop(
788
+ "msa_species_identifiers_all_seq")
789
+
790
+ # Initial raw feats with merged homo feats
791
+ raw_feats = pair_and_merge(homo_feats, is_homomer_or_monomer=False)
792
+
793
+ # Update Profile and Deletion Mean
794
+ msa_one_hot = F.one_hot(torch.from_numpy(raw_feats["msa"]).long(), 32).type(torch.float32)
795
+ raw_feats["profile"] = torch.mean(msa_one_hot, dim=-3).numpy()
796
+ del msa_one_hot
797
+ raw_feats["deletion_mean"] = (torch.atan(
798
+ torch.sum(torch.from_numpy(raw_feats["deletion_matrix"]), dim=0) / 3.0
799
+ ) * (2.0 / torch.pi)).numpy()
800
+
801
+ # Merge no homo feats according to asym_id
802
+ ordered_asym_ids = []
803
+ for i in raw_feats["asym_id"]:
804
+ if i not in ordered_asym_ids:
805
+ ordered_asym_ids.append(i)
806
+ ordered_chain_ids = [ASYM_ID[i] for i in ordered_asym_ids]
807
+ for feature_name in ["chain_class", "sequence_3", "assembly_num_chains", "entity_mask", "seq_length",
808
+ "num_alignments"]:
809
+ raw_feats.pop(feature_name, None)
810
+ for feature_name in ["x_gt", "atom_id_to_conformer_atom_id", "residue_index", "conformer_id_to_chunk_sizes",
811
+ "restype", "is_protein", "is_short_poly", "is_ligand", "pocket_res_feat",
812
+ "key_res_feat", "is_key_res"]:
813
+ raw_feats[feature_name] = np.concatenate([
814
+ all_chain_features[chain_id].pop(feature_name) for chain_id in ordered_chain_ids
815
+ ], axis=0)
816
+
817
+ # Conformerwise Chain Class
818
+ CHAIN_CLASS_NEW = []
819
+ for chain_id in ordered_chain_ids:
820
+ CHAIN_CLASS_NEW += [CHAIN_CLASS[chain_id]] * len(all_chain_features[chain_id]["ccds"])
821
+ infer_meta_data["CHAIN_CLASS"] = CHAIN_CLASS_NEW
822
+
823
+ raw_feats["ccds"] = reduce(add, [all_chain_features[chain_id].pop("ccds") for chain_id in ordered_chain_ids])
824
+
825
+ # Create Atomwise and Tokenwise Features
826
+ raw_feats.update(self._make_ccd_features(raw_feats, infer_meta_data))
827
+
828
+ asym_id_conformerwise = copy.deepcopy(raw_feats["asym_id"])
829
+ residue_index_conformerwise = copy.deepcopy(raw_feats["residue_index"])
830
+
831
+ # Conformerwise to Tokenwise
832
+ token_id_to_conformer_id = raw_feats["token_id_to_conformer_id"]
833
+ for key in ["is_protein", "is_short_poly", "is_ligand", "residue_index", "restype", "asym_id", "entity_id",
834
+ "sym_id", "deletion_mean", "profile", "pocket_res_feat", "key_res_feat", "is_key_res"]:
835
+ raw_feats[key] = raw_feats[key][token_id_to_conformer_id]
836
+ for key in ["msa", "deletion_matrix"]:
837
+ if key in raw_feats:
838
+ raw_feats[key] = raw_feats[key][:, token_id_to_conformer_id]
839
+ ###################################################
840
+ # Centre Random Augmentation of ref pos #
841
+ ###################################################
842
+ raw_feats["ref_pos"] = centre_random_augmentation_np_apply(
843
+ raw_feats["ref_pos"], raw_feats["atom_id_to_token_id"]).astype(np.float32)
844
+ raw_feats["ref_feat"][:, :3] = raw_feats["ref_pos"]
845
+
846
+ ###################################################
847
+ # Create token pair features #
848
+ ###################################################
849
+ no_token = len(raw_feats["token_id_to_conformer_id"])
850
+ token_bonds = np.zeros([no_token, no_token], dtype=np.float32)
851
+ rel_tok_feat = np.zeros([no_token, no_token, 42], dtype=np.float32)
852
+ batch_ref_pos = np.zeros([32, no_token, 3], dtype=np.float32)
853
+ offset = 0
854
+ atom_offset = 0
855
+ for ccd, len_atoms in zip(
856
+ raw_feats["ccds"],
857
+ raw_feats["conformer_id_to_chunk_sizes"]
858
+ ):
859
+ if rc.is_standard(ccd) or rc.is_unk(ccd):
860
+ offset += 1
861
+ else:
862
+ len_atoms = len_atoms.item()
863
+ inner_atom_idx = raw_feats["atom_id_to_conformer_atom_id"][atom_offset:atom_offset + len_atoms]
864
+ batch_ref_pos[:, offset:offset + len_atoms] = CONF_META_DATA[ccd]["batch_ref_pos"][:, inner_atom_idx]
865
+ token_bonds[offset:offset + len_atoms, offset:offset + len_atoms] = \
866
+ CONF_META_DATA[ccd]["token_bonds"][inner_atom_idx][:, inner_atom_idx]
867
+ rel_tok_feat[offset:offset + len_atoms, offset:offset + len_atoms] = \
868
+ CONF_META_DATA[ccd]["rel_tok_feat"][inner_atom_idx][:, inner_atom_idx]
869
+ offset += len_atoms
870
+ atom_offset += len_atoms
871
+ raw_feats["token_bonds"] = token_bonds.astype(np.float32)
872
+ raw_feats["token_bonds_feature"] = token_bonds.astype(np.float32)
873
+ raw_feats["rel_tok_feat"] = rel_tok_feat.astype(np.float32)
874
+ raw_feats["batch_ref_pos"] = batch_ref_pos.astype(np.float32)
875
+ ###################################################
876
+ # Charility Augmentation #
877
+ ###################################################
878
+ if not self.inference_mode:
879
+ # TODO Charility probs
880
+ charility_seed = random.random()
881
+ if charility_seed < 0.1:
882
+ ref_chirality = raw_feats["ref_feat"][:, 158:161]
883
+ ref_chirality_replace = np.zeros_like(ref_chirality)
884
+ ref_chirality_replace[:, 2] = 1
885
+
886
+ is_ligand_atom = raw_feats["is_ligand"][raw_feats["atom_id_to_token_id"]]
887
+ remove_charility = (np.random.randint(0, 2, [len(is_ligand_atom)]) * is_ligand_atom).astype(
888
+ np.bool_)
889
+ ref_chirality = np.where(remove_charility[:, None], ref_chirality_replace, ref_chirality)
890
+ raw_feats["ref_feat"][:, 158:161] = ref_chirality
891
+
892
+ # MASKS
893
+ raw_feats["x_exists"] = np.ones_like(raw_feats["x_gt"][..., 0]).astype(np.float32)
894
+ raw_feats["a_mask"] = raw_feats["x_exists"]
895
+ raw_feats["s_mask"] = np.ones_like(raw_feats["asym_id"]).astype(np.float32)
896
+ raw_feats["ref_space_uid"] = raw_feats["atom_id_to_conformer_id"]
897
+
898
+ # Write Infer Meta Data
899
+ infer_meta_data["ccds"] = raw_feats.pop("ccds")
900
+ infer_meta_data["atom_id_to_conformer_atom_id"] = raw_feats.pop("atom_id_to_conformer_atom_id")
901
+ infer_meta_data["residue_index"] = residue_index_conformerwise
902
+ infer_meta_data["asym_id"] = asym_id_conformerwise
903
+ infer_meta_data["conformer_id_to_chunk_sizes"] = raw_feats.pop("conformer_id_to_chunk_sizes")
904
+
905
+ return raw_feats, infer_meta_data
906
+
907
+ def make_feats(self, tensors):
908
+ # Target Feat
909
+ tensors["target_feat"] = torch.cat([
910
+ F.one_hot(tensors["restype"].long(), 32).float(),
911
+ tensors["profile"].float(),
912
+ tensors["deletion_mean"][..., None].float()
913
+ ], dim=-1)
914
+
915
+ # MSA Feat
916
+ inds = [0] + torch.randperm(len(tensors["msa"]))[:127].tolist()
917
+
918
+ tensors["msa"] = tensors["msa"][inds]
919
+ tensors["deletion_matrix"] = tensors["deletion_matrix"][inds]
920
+
921
+ has_deletion = torch.clamp(tensors["deletion_matrix"].float(), min=0., max=1.)
922
+ pi = torch.acos(torch.zeros(1, device=tensors["deletion_matrix"].device)) * 2
923
+ deletion_value = (torch.atan(tensors["deletion_matrix"] / 3.) * (2. / pi))
924
+ tensors["msa_feat"] = torch.cat([
925
+ F.one_hot(tensors["msa"].long(), 32).float(),
926
+ has_deletion[..., None].float(),
927
+ deletion_value[..., None].float(),
928
+ ], dim=-1)
929
+ tensors.pop("msa", None)
930
+ tensors.pop("deletion_mean", None)
931
+ tensors.pop("profile", None)
932
+ tensors.pop("deletion_matrix", None)
933
+
934
+ return tensors
935
+
936
    def _make_token_bonds(self, tensors):
        """Add inter-conformer (polymer-ligand / ligand-ligand) token bonds.

        For every pair of chains where at least one side is a ligand, the
        closest resolved atom pair is located; if its distance is below
        ``self.token_bond_threshold`` a symmetric bond is recorded between the
        two corresponding tokens in ``token_bonds`` / ``token_bonds_feature``.
        Assumes atoms of one chain are contiguous in the atom-wise ordering.
        """
        # Get Polymer-Ligand & Ligand-Ligand Within Conformer Token Bond

        # Broadcast token-wise ids down to atoms.
        asym_id = tensors["asym_id"][tensors["atom_id_to_token_id"]]
        is_ligand = tensors["is_ligand"][tensors["atom_id_to_token_id"]]

        x_gt = tensors["x_gt"]
        a_mask = tensors["a_mask"]

        # Get
        atom_id_to_token_id = tensors["atom_id_to_token_id"]

        num_token = len(tensors["asym_id"])
        between_conformer_token_bonds = torch.zeros([num_token, num_token])

        # create chainwise feature: first atom offset and ligand flag for each
        # contiguous run of identical asym_id.
        asym_id_chain = []
        asym_id_atom_offset = []
        asym_id_is_ligand = []
        for atom_offset, (a_id, i_id) in enumerate(zip(asym_id.tolist(), is_ligand.tolist())):
            if len(asym_id_chain) == 0 or asym_id_chain[-1] != a_id:
                asym_id_chain.append(a_id)
                asym_id_atom_offset.append(atom_offset)
                asym_id_is_ligand.append(i_id)

        len_asym_id_chain = len(asym_id_chain)
        if len_asym_id_chain >= 2:
            for i in range(0, len_asym_id_chain - 1):
                asym_id_i = asym_id_chain[i]
                mask_i = asym_id == asym_id_i
                x_gt_i = x_gt[mask_i]
                a_mask_i = a_mask[mask_i]
                for j in range(i + 1, len_asym_id_chain):
                    # Only pairs involving at least one ligand chain are bonded.
                    if not bool(asym_id_is_ligand[i]) and not bool(asym_id_is_ligand[j]):
                        continue
                    asym_id_j = asym_id_chain[j]
                    mask_j = asym_id == asym_id_j
                    x_gt_j = x_gt[mask_j]
                    a_mask_j = a_mask[mask_j]
                    dis_ij = torch.norm(x_gt_i[:, None, :] - x_gt_j[None, :, :], dim=-1)
                    # Inflate distances of unresolved atoms so argmin below
                    # only ever picks a resolved-resolved pair.
                    dis_ij = dis_ij + (1 - a_mask_i[:, None] * a_mask_j[None]) * 1000
                    if torch.min(dis_ij) < self.token_bond_threshold:
                        # Recover (row, col) of the flattened argmin.
                        ij = torch.argmin(dis_ij).item()
                        l_j = len(x_gt_j)
                        atom_i = int(ij // l_j)  # row
                        atom_j = int(ij % l_j)  # col
                        global_atom_i = atom_i + asym_id_atom_offset[i]
                        global_atom_j = atom_j + asym_id_atom_offset[j]
                        token_i = atom_id_to_token_id[global_atom_i]
                        token_j = atom_id_to_token_id[global_atom_j]

                        between_conformer_token_bonds[token_i, token_j] = 1
                        between_conformer_token_bonds[token_j, token_i] = 1
        token_bond_seed = random.random()
        tensors["token_bonds"] = tensors["token_bonds"] + between_conformer_token_bonds
        # Docking Indicate Token Bond
        # NOTE(review): `token_bond_seed >= 0` is always true, so the feature
        # copy below runs unconditionally -- looks like a leftover ablation
        # switch; confirm whether a real probability was intended.
        if token_bond_seed >= 0:
            tensors["token_bonds_feature"] = tensors["token_bonds"]
        return tensors
996
+
997
+ def _pad_to_size(self, tensors):
998
+
999
+ to_pad_atom = self.atom_crop_size - len(tensors["x_gt"])
1000
+ to_pad_token = self.token_crop_size - len(tensors["residue_index"])
1001
+ if to_pad_token > 0:
1002
+ for k in ["restype", "residue_index", "is_protein", "is_short_poly", "is_ligand", "is_key_res",
1003
+ "asym_id", "entity_id", "sym_id", "token_id_to_conformer_id", "s_mask",
1004
+ "token_id_to_centre_atom_id", "token_id_to_pseudo_beta_atom_id", "token_id_to_chunk_sizes",
1005
+ "pocket_res_feat"]:
1006
+ tensors[k] = torch.nn.functional.pad(tensors[k], [0, to_pad_token])
1007
+ for k in ["target_feat", "msa_feat", "batch_ref_pos", "key_res_feat"]:
1008
+ if k in tensors:
1009
+ tensors[k] = torch.nn.functional.pad(tensors[k], [0, 0, 0, to_pad_token])
1010
+ for k in ["token_bonds", "token_bonds_feature"]:
1011
+ tensors[k] = torch.nn.functional.pad(tensors[k], [0, to_pad_token, 0, to_pad_token])
1012
+ for k in ["rel_tok_feat"]:
1013
+ tensors[k] = torch.nn.functional.pad(tensors[k], [0, 0, 0, to_pad_token, 0, to_pad_token])
1014
+ if to_pad_atom > 0:
1015
+ for k in ["a_mask", "x_exists", "atom_id_to_conformer_id", "atom_id_to_token_id", "ref_space_uid"]:
1016
+ tensors[k] = torch.nn.functional.pad(tensors[k], [0, to_pad_atom])
1017
+ for k in ["x_gt", "ref_feat", "ref_pos"]: # , "ref_pos_new"
1018
+ tensors[k] = torch.nn.functional.pad(tensors[k], [0, 0, 0, to_pad_atom])
1019
+ # for k in ["z_mask"]: # , "ref_pos_new"
1020
+ # tensors[k] = torch.nn.functional.pad(tensors[k], [0, to_pad_atom, 0, to_pad_atom])
1021
+ # for k in ["conformer_mask_atom"]:
1022
+ # tensors[k] = torch.nn.functional.pad(tensors[k], [0, to_pad_atom, 0, to_pad_atom])
1023
+ # for k in ["rel_token_feat_atom"]:
1024
+ # tensors[k] = torch.nn.functional.pad(tensors[k], [0,0,0, to_pad_atom, 0, to_pad_atom])
1025
+ # rel_token_feat_atom
1026
+ return tensors
1027
+
1028
    def get_template_feat(self, tensors):
        """Build a single pseudo-template feature from ground-truth coordinates.

        A distogram over pseudo-beta atom positions is computed for
        protein-protein token pairs only, masked by z_mask, and stored as one
        template (leading dim 1).  At inference the template is always enabled
        via t_mask; during training it is enabled with probability 0.1.
        """
        x_gt = tensors["x_gt"][tensors["token_id_to_pseudo_beta_atom_id"]]
        z_mask = tensors["z_mask"]
        asym_id = tensors["asym_id"]
        is_protein = tensors["is_protein"]
        chain_same = (asym_id[None] == asym_id[:, None]).float()
        protein2d = is_protein[None] * is_protein[:, None]
        dgram = dgram_from_positions(x_gt)
        dgram = dgram * protein2d[..., None] * z_mask[..., None]

        # Disabled experiment: per-chain BERT-style masking of template pairs.
        # if not self.inference_mode:
        #     bert_mask = torch.rand([len(x_gt)]) > random.random() * 0.4
        #     asym_ids = list(set(asym_id.tolist()))
        #     used_asym_ids = []
        #     for a in asym_ids:
        #         if random.random() > 0.6:
        #             used_asym_ids.append(a)
        #     if len(used_asym_ids) > 0:
        #         used_asym_ids = torch.tensor(used_asym_ids)
        #         chain_bert_mask = torch.any(asym_id[:, None] == used_asym_ids[None], dim=-1)
        #         bert_mask = chain_bert_mask * bert_mask
        #     else:
        #         bert_mask = bert_mask * 0
        #     template_pseudo_beta_mask = (bert_mask[None] * bert_mask[:, None]) * z_mask * protein2d
        # else:
        #     template_pseudo_beta_mask = z_mask * protein2d
        template_pseudo_beta_mask = z_mask * protein2d
        # template_pseudo_beta_mask = protein2d * z_mask
        dgram = dgram * template_pseudo_beta_mask[..., None]
        templ_feat = torch.cat([dgram, template_pseudo_beta_mask[..., None]], dim=-1)
        # Leading [None] makes a single-template stack.
        tensors["templ_feat"] = templ_feat.float()[None]
        t_mask_seed = random.random()
        # Template Augmentation: drop the template 90% of the time in training.
        if self.inference_mode or t_mask_seed < 0.1:
            tensors["t_mask"] = torch.ones([len(tensors["templ_feat"])], dtype=torch.float32)
        else:
            tensors["t_mask"] = torch.zeros([len(tensors["templ_feat"])], dtype=torch.float32)

        # TODO: No Template
        # if not self.inference_mode:
        #     if random.random() < 0.5:
        #         tensors["templ_feat"] *= 0
        return tensors
1071
+
1072
+ def transform(self, raw_feats):
1073
+ # np to tensor
1074
+ tensors = dict()
1075
+ for key in raw_feats.keys():
1076
+ tensors[key] = torch.from_numpy(raw_feats[key])
1077
+ # Make Target & MSA Feat
1078
+ tensors = self.make_feats(tensors)
1079
+
1080
+ # Make Token Bond Feat
1081
+ tensors = self._make_token_bonds(tensors)
1082
+
1083
+ # Padding
1084
+ if not self.inference_mode:
1085
+ tensors = self._pad_to_size(tensors)
1086
+
1087
+ # # Make Pocket Res Feat
1088
+ #
1089
+ # tensors["pocket_res_feat"] = torch.zeros([l], dtype=torch.float32)
1090
+
1091
+ # Make Key Res Feat
1092
+ # l = len(tensors["asym_id"])
1093
+ # tensors["key_res_feat"] = torch.zeros([l, 7], dtype=torch.float32)
1094
+ # tensors["key_res_feat"][:, 0] = 1.
1095
+
1096
+ # Mask
1097
+ tensors["z_mask"] = tensors["s_mask"][None] * tensors["s_mask"][:, None]
1098
+
1099
+ # Template
1100
+ tensors = self.get_template_feat(tensors)
1101
+
1102
+ # Correct Type
1103
+ is_short_poly = tensors.pop("is_short_poly")
1104
+ tensors["is_protein"] = tensors["is_protein"] + is_short_poly
1105
+ tensors["is_ligand"] = tensors["is_ligand"] - is_short_poly
1106
+ tensors["is_dna"] = torch.zeros_like(tensors["is_protein"])
1107
+ tensors["is_rna"] = torch.zeros_like(tensors["is_protein"])
1108
+ return tensors
1109
+
1110
+ def load(self, sample_id):
1111
+ all_chain_labels = load_pkl(os.path.join(self.samples_path, f"{sample_id}.pkl.gz"))
1112
+
1113
+ all_chain_features, infer_meta_data = self.load_all_chain_features(all_chain_labels)
1114
+ infer_meta_data["system_id"] = sample_id
1115
+ # if not self.inference_mode:
1116
+ all_chain_features, infer_meta_data = self.crop_all_chain_features(all_chain_features, infer_meta_data)
1117
+ raw_feats, infer_meta_data = self.pair_and_merge(all_chain_features, infer_meta_data)
1118
+
1119
+ tensors = self.transform(raw_feats)
1120
+ return tensors, infer_meta_data
1121
+
1122
+ def random_load(self):
1123
+
1124
+ sample_id = self.used_sample_ids[torch.multinomial(self.probabilities, 1).item()]
1125
+ print(sample_id)
1126
+ return self.load(sample_id)
1127
+
1128
+ def random_load_test(self):
1129
+
1130
+ sample_id = self.used_test_sample_ids[torch.multinomial(self.test_probabilities, 1).item()]
1131
+ print(sample_id)
1132
+ return self.load(sample_id)
1133
+
1134
+ def weighted_random_load(self):
1135
+ weight_seed = random.random()
1136
+ if weight_seed < 0.95:
1137
+ sample_id = self.used_sample_ids[torch.multinomial(self.probabilities, 1).item()]
1138
+ else:
1139
+ return self.random_load_mol_chunks()
1140
+ return self.load(sample_id)
1141
+
1142
    def load_ligand(self, sample_id, chain_features):
        """Build a single-ligand "system" from precomputed ligand chain features.

        Wraps one ligand (e.g. sampled from an SDF-derived database) into the
        same all_chain_features / infer_meta_data structures used for full
        protein-ligand systems, then runs pair_and_merge + transform on it.

        Args:
            sample_id: identifier of the ligand sample (used for chain naming
                and recorded as system_id).
            chain_features: dict with at least "restype",
                "all_atom_positions", "all_atom_mask" and
                "ref_atom_name_chars" for the ligand conformer.

        Returns:
            (tensors, infer_meta_data) as produced by ``transform``.
        """
        all_chain_features = {}
        CHAIN_META_DATA = {
            "spatial_crop_chain_ids": None,
            "chain_class": {},
            "chain_sequence_3s": {},
            "fake_ccds": [],
        }
        CONF_META_DATA = {}
        num_prev_fake_ccds = len(CHAIN_META_DATA["fake_ccds"])
        # Synthetic 3-char CCD code, '#'-padded (e.g. "##0"), so it can never
        # collide with a real CCD identifier.
        fake_ccd = f"{num_prev_fake_ccds:#>3}"

        # Single-row "MSA" is the ligand restype row itself.
        chain_features["msa"] = chain_features["restype"][None]
        # NOTE(review): deletion_matrix is initialised with ones, not zeros --
        # unusual for a single-sequence MSA; confirm this is intended.
        chain_features["deletion_matrix"] = np.ones_like(chain_features["msa"])
        chain_features["ccds"] = [fake_ccd]
        chain_features["chain_class"] = "ligand"
        # Add a leading conformer dimension expected downstream.
        chain_features["all_atom_positions"] = chain_features["all_atom_positions"][None]
        chain_features["all_atom_mask"] = chain_features["all_atom_mask"][None]
        # Update Chain and Conf
        CHAIN_META_DATA["fake_ccds"].append(fake_ccd)
        sequence_3 = fake_ccd
        chain_id = f"SDFM_{sample_id}"
        CHAIN_META_DATA["chain_sequence_3s"][chain_id] = sequence_3
        CHAIN_META_DATA["chain_class"][chain_id] = "ligand"
        CONF_META_DATA = self._update_CONF_META_DATA_ligand(
            CONF_META_DATA, sequence_3, chain_features)
        all_chain_features[chain_id] = chain_features
        all_chain_features[chain_id] = self._update_chain_feature(
            chain_features,
            CONF_META_DATA
        )
        SEQ3 = {}
        CHAIN_CLASS = {}
        SEQ3[chain_id] = "-".join([fake_ccd])
        all_chain_features, ASYM_ID = self._add_assembly_feature(all_chain_features, SEQ3)
        # One conformer whose chunk size is the ligand's atom count.
        all_chain_features[chain_id]["conformer_id_to_chunk_sizes"] = np.array(
            [len(chain_features["ref_atom_name_chars"])], dtype=np.int64)

        all_chain_features[chain_id]["x_gt"] = chain_features["all_atom_positions"][0]
        all_chain_features[chain_id]["x_exists"] = chain_features["all_atom_mask"][0]
        CHAIN_CLASS[chain_id] = "ligand"
        infer_meta_data = {
            "CONF_META_DATA": CONF_META_DATA,
            "SEQ3": SEQ3,
            "ASYM_ID": ASYM_ID,
            "CHAIN_CLASS": CHAIN_CLASS
        }

        # Cropping is skipped: a single ligand always fits within the crop.
        # if not self.inference_mode:
        #     all_chain_features, infer_meta_data = self.crop_all_chain_features(all_chain_features, infer_meta_data)

        raw_feats, infer_meta_data = self.pair_and_merge(all_chain_features, infer_meta_data)

        tensors = self.transform(raw_feats)
        infer_meta_data["system_id"] = sample_id
        return tensors, infer_meta_data
1198
+
1199
    def random_load_mol_chunks(self, ligand_db_name="1"):
        """Load a random small-molecule sample from one of the ligand databases.

        Args:
            ligand_db_name: "0", "1" (default, 374 pre-featurised shards) or
                "2"; selects which pickled ligand database to draw from.

        Returns:
            (tensors, infer_meta_data) for the sampled ligand-only system.

        NOTE(review): database locations are machine-specific absolute paths;
        they should be made configurable before release.
        """
        if ligand_db_name == "0":
            ligand_db = load_pkl("/2022133002/projects/stdock/stdock_v9.5/scripts/try_new.pkl.gz")
        elif ligand_db_name == "1":
            # Pick one of the 374 ligand shards at random.
            id = random.randint(1, 374)
            ligand_db = load_pkl(f"/2022133002/data/ligand_samples/samples_{id}.pkl.gz")
        elif ligand_db_name == "2":
            ligand_db = load_pkl("/2022133002/projects/stdock/stdock_v9.5/scripts/try_400k_2_new.pkl.gz")
        else:
            raise ValueError("MOL DB Name is Wrong!")
        sample_id = random.choice(list(ligand_db.keys()))
        sample_feature = ligand_db[sample_id]
        tensors, infer_meta_data = self.load_ligand(sample_id, sample_feature)
        return tensors, infer_meta_data
1213
+
1214
    def write_pdb(self, x_pred, fname, infer_meta_data):
        """Write predicted coordinates to a single-model PDB file.

        Args:
            x_pred: per-atom predicted coordinates, indexable so that
                ``x_pred[i].tolist()`` yields [x, y, z].
            fname: output PDB file path.
            infer_meta_data: conformer-wise bookkeeping written by
                ``pair_and_merge`` (ccds, atom_id_to_conformer_atom_id,
                conformer_id_to_chunk_sizes, residue_index, asym_id,
                CHAIN_CLASS, CONF_META_DATA).
        """
        ccds = infer_meta_data["ccds"]
        atom_id_to_conformer_atom_id = infer_meta_data["atom_id_to_conformer_atom_id"]
        ccd_chunk_sizes = infer_meta_data["conformer_id_to_chunk_sizes"].tolist()
        CHAIN_CLASS = infer_meta_data["CHAIN_CLASS"]
        conf_meta_data = infer_meta_data["CONF_META_DATA"]
        residue_index = infer_meta_data["residue_index"].tolist()
        asym_id = infer_meta_data["asym_id"].tolist()

        atom_lines = []
        atom_offset = 0
        for ccd_id, (ccd, chunk_size, res_id) in enumerate(zip(ccds, ccd_chunk_sizes, residue_index)):
            # Map this conformer's atoms back to reference atom names/elements.
            inner_atom_idx = atom_id_to_conformer_atom_id[atom_offset:atom_offset + chunk_size]
            atom_names = [conf_meta_data[ccd]["ref_atom_name_chars"][i] for i in inner_atom_idx]
            atom_elements = [PeriodicTable[conf_meta_data[ccd]["ref_element"][i]] for i in inner_atom_idx]
            chain_tag = PDB_CHAIN_IDS[int(asym_id[ccd_id])]
            # Ligand conformers are written as HETATM records.
            record_type = "HETATM" if CHAIN_CLASS[ccd_id] == "ligand" else "ATOM"

            for ccd_atom_idx, atom_name in enumerate(atom_names):
                x = x_pred[atom_offset]
                # PDB columns 13-16: 4-char names start at column 13, shorter
                # names are indented by one space.
                name = atom_name if len(atom_name) == 4 else f" {atom_name}"
                res_name_3 = ccd
                alt_loc = ""
                insertion_code = ""
                occupancy = 1.00
                element = atom_elements[ccd_atom_idx]
                # b_factor = torch.argmax(plddt[atom_offset],dim=-1).item()*2 +1
                # Constant placeholder B-factor; per-atom pLDDT is disabled above.
                b_factor = 70.
                charge = 0
                pos = x.tolist()
                atom_line = (
                    f"{record_type:<6}{atom_offset + 1:>5} {name:<4}{alt_loc:>1}"
                    f"{res_name_3.split()[0]:>3} {chain_tag:>1}"
                    f"{res_id + 1:>4}{insertion_code:>1}   "
                    f"{pos[0]:>8.3f}{pos[1]:>8.3f}{pos[2]:>8.3f}"
                    f"{occupancy:>6.2f}{b_factor:>6.2f}          "
                    f"{element:>2}{charge:>2}"
                )
                atom_lines.append(atom_line)
                atom_offset += 1
            # Stop once every atom has been written (padding-safe guard).
            if atom_offset == len(atom_id_to_conformer_atom_id):
                break
        out = "\n".join(atom_lines)
        out = f"MODEL     1\n{out}\nTER\nENDMDL\nEND"
        dump_txt(out, fname)
PhysDock/data/generate_system.py ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import os
3
+ import argparse
4
+ import warnings
5
+
6
+ import numpy as np
7
+ from Bio.PDB import PDBParser
8
+ from rdkit import Chem
9
+
10
+ from PhysDock.utils.io_utils import dump_pkl, load_pkl, convert_md5_string
11
+ from PhysDock.data.constants.PDBData import protein_letters_3to1_extended
12
+
13
+ warnings.filterwarnings("ignore")
14
+
15
+
16
def generate_system(
        receptor_pdb_path,
        ligand_sdf_path,
        ligand_ccd_id,
        systems_dir,
        ccd_id_meta_data=None,
):
    """
    Parse a receptor PDB and a ligand SDF into a protein-ligand system pickle.

    The receptor's chains are converted to per-residue atom position/mask
    arrays (gaps in author numbering are filled with 1-atom UNK placeholders),
    the ligand's heavy-atom coordinates are read from the SDF, and the merged
    features are saved to ``systems_dir``.  One FASTA file per protein chain
    is also written under ``systems_dir/fastas`` for homology search.

    Args:
        receptor_pdb_path (str): Path to the receptor PDB file.
        ligand_sdf_path (str): Path to the ligand SDF file.
        ligand_ccd_id (str): CCD ID of the ligand (stored upper-cased).
        systems_dir (str): Directory to save the system pickle and FASTA files.
        ccd_id_meta_data (dict, optional): Preloaded CCD metadata; when None it
            is loaded from the bundled params file.
    """
    # Load the bundled CCD metadata lazily when the caller did not supply it.
    if ccd_id_meta_data is None:
        print("Loading CCD meta data ...")
        ccd_id_meta_data = load_pkl(os.path.join(os.path.split(__file__)[0], "../../params/ccd_id_meta_data.pkl.gz"))
    os.makedirs(systems_dir, exist_ok=True)

    # Interaction channels (zero placeholders).  # TODO PLIP
    # BUGFIX: defined once here instead of inside the chain loop, so the ligand
    # section below no longer raises NameError for a receptor with zero chains.
    interaction_keys = ['salt bridges', 'pi-cation interactions', 'hydrophobic interactions',
                        'pi-stacking', 'hydrogen bonds', 'metal complexes']

    # Initialize parser and data containers
    pdb_parser = PDBParser()
    structure = pdb_parser.get_structure("", receptor_pdb_path)
    model = structure[0]

    all_chain_features = {}
    used_chain_ids = []

    # Extract protein chains from PDB
    for chain in model:
        chain_id = chain.id
        used_chain_ids.append(chain_id)
        all_chain_features[chain_id] = {
            "all_atom_positions": [],
            "all_atom_mask": [],
            "ccds": []
        }

        offset = None
        for residue in chain:
            # First residue number of the chain anchors the 0-based index.
            if offset is None:
                offset = int(residue.id[1])

            resname = residue.get_resname().strip().ljust(3)
            res_idx = int(residue.id[1]) - offset
            num_atoms = len(ccd_id_meta_data[resname]["ref_atom_name_chars"])

            # Fill gaps in author numbering with 1-atom UNK placeholders.
            # NOTE(review): assumes residue numbers are strictly increasing with
            # no insertion codes; duplicated or decreasing numbers would
            # mis-index the per-residue lists -- confirm inputs are renumbered.
            while len(all_chain_features[chain_id]["ccds"]) < res_idx:
                all_chain_features[chain_id]["ccds"].append("UNK")
                all_chain_features[chain_id]["all_atom_positions"].append(np.zeros([1, 3], dtype=np.float32))
                all_chain_features[chain_id]["all_atom_mask"].append(np.zeros([1], dtype=np.int8))

            # Initialize residue data
            all_chain_features[chain_id]["ccds"].append(resname)
            all_chain_features[chain_id]["all_atom_positions"].append(np.zeros([num_atoms, 3], dtype=np.float32))
            all_chain_features[chain_id]["all_atom_mask"].append(np.zeros([num_atoms], dtype=np.int8))

            # Copy coordinates for atoms known to the reference conformer.
            ref_atom_names = ccd_id_meta_data[resname]["ref_atom_name_chars"]
            for atom in residue:
                if atom.name in ref_atom_names:
                    atom_idx = ref_atom_names.index(atom.name)
                    all_chain_features[chain_id]["all_atom_positions"][res_idx][atom_idx] = atom.coord
                    all_chain_features[chain_id]["all_atom_mask"][res_idx][atom_idx] = 1

        # Zero-filled interaction features for this chain.
        for key in interaction_keys:
            all_chain_features[chain_id][key] = np.zeros(len(all_chain_features[chain_id]["ccds"]), dtype=np.int8)

    # Extract ligand from SDF (hydrogens removed, no sanitisation).
    supplier = Chem.SDMolSupplier(ligand_sdf_path, removeHs=True, sanitize=False)
    mol = supplier[0]
    mol = Chem.RemoveAllHs(mol)
    conf = mol.GetConformer()
    ligand_chain_id = "1"
    used_chain_ids.append(ligand_chain_id)

    ligand_atom_count = mol.GetNumAtoms()
    ligand_positions = np.zeros([ligand_atom_count, 3], dtype=np.float32)
    ligand_masks = np.ones([ligand_atom_count], dtype=np.int8)

    for atom in mol.GetAtoms():
        idx = atom.GetIdx()
        pos = conf.GetAtomPosition(idx)
        ligand_positions[idx] = [pos.x, pos.y, pos.z]

    all_chain_features[ligand_chain_id] = {
        "all_atom_positions": [ligand_positions],
        "all_atom_mask": [ligand_masks],
        "ccds": [ligand_ccd_id.upper()]
    }

    for key in interaction_keys:
        all_chain_features[ligand_chain_id][key] = np.zeros(1, dtype=np.int8)

    # System pickle name: <pdb basename>_<chain ids...>.pkl.gz
    save_name = os.path.basename(receptor_pdb_path).replace('.pdb', '')
    for cid in used_chain_ids:
        save_name += f"_{cid}"

    dump_pkl(all_chain_features, os.path.join(systems_dir, f"{save_name}.pkl.gz"))

    # Generate FASTA files (protein chains only) to run homology search;
    # file names are the md5 of the typed sequence so duplicates collapse.
    for cid, features in all_chain_features.items():
        if cid == ligand_chain_id:
            continue
        sequence = ''.join(protein_letters_3to1_extended.get(ccd, "X") for ccd in features["ccds"])
        md5_hash = convert_md5_string(f"protein:{sequence}")
        os.makedirs(os.path.join(systems_dir, "fastas"), exist_ok=True)
        with open(os.path.join(systems_dir, "fastas", f"{md5_hash}.fasta"), "w") as f:
            f.write(f">{md5_hash}\n{sequence}\n")
    print("Make system successfully!")
+ print("Make system successfully!")
PhysDock/data/relaxation.py ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import sys
3
+ import torch
4
+ import os
5
+ import tqdm
6
+ import pandas as pd
7
+ import argparse
8
+
9
+ sys.path.append("../")
10
+ from PhysDock.utils.io_utils import run_pool_tasks, load_txt, dump_txt
11
+ from pathlib import Path
12
+ import openmm.app as mm_app
13
+ import openmm.unit as mm_unit
14
+ import openmm as mm
15
+ import os.path
16
+ import sys
17
+ import mdtraj
18
+ from openmm.app import PDBFile, Modeller
19
+ import pdbfixer
20
+ from openmmforcefields.generators import SystemGenerator
21
+ from openff.toolkit import Molecule
22
+ from openff.toolkit.utils.exceptions import UndefinedStereochemistryError, RadicalsNotSupportedError
23
+ from openmm import CustomExternalForce
24
+ from posebusters import PoseBusters
25
+ from posebusters.posebusters import _dataframe_from_output
26
+ from posebusters.cli import _select_mode, _format_results
27
+
28
+
29
def get_bust_results(  # noqa: PLR0913
        mol_pred,
        mol_true,
        mol_cond,
        top_n: int | None = None,
):
    """Run PoseBusters on a single predicted pose and return the full report.

    Args:
        mol_pred: Path to the predicted ligand pose file.
        mol_true: Path to the reference (true) ligand file.
        mol_cond: Path to the conditioning receptor file; each bust run may
            use a different receptor.
        top_n: Optional limit on the number of poses checked per file.

    Returns:
        A pandas DataFrame with the full PoseBusters report for the first
        produced result, or ``None`` if PoseBusters yielded no output.
    """
    pred_paths = [Path(mol_pred)]
    true_path = Path(mol_true)
    cond_path = Path(mol_cond)  # Each bust running has a different receptor

    # Infer the PoseBusters mode from which inputs were provided.
    provided = {k for k, v in dict(mol_pred=pred_paths, mol_true=true_path, mol_cond=cond_path).items() if v}
    mode = _select_mode(None, provided)
    posebusters = PoseBusters(mode, top_n=top_n)
    cols = ["mol_pred", "mol_true", "mol_cond"]
    # FIX: the original comprehension shadowed the `mol_pred` list with its
    # own loop variable; use distinct names for the list and each element.
    posebusters.file_paths = pd.DataFrame(
        [[pred, true_path, cond_path] for pred in pred_paths], columns=cols
    )
    # Only the first PoseBusters output is converted to a DataFrame.
    results = None
    for results_dict in posebusters._run():
        results = _dataframe_from_output(results_dict, posebusters.config, full_report=True)
        break
    return results
51
+
52
+
53
def fix_pdb(pdbname, outdir, file_name):
    """Repair a PDB file with PDBFixer and return its topology and positions.

    Missing residues and atoms are rebuilt, non-standard residues are
    replaced by their standard equivalents, and hydrogens are added at
    pH 7.0. ``outdir`` and ``file_name`` are accepted for interface
    compatibility; the fixed structure is returned, not written to disk.
    """
    fixer = pdbfixer.PDBFixer(pdbname)
    # Run the canonical PDBFixer repair pipeline in order.
    fixer.findMissingResidues()
    fixer.findNonstandardResidues()
    fixer.replaceNonstandardResidues()
    fixer.findMissingAtoms()
    fixer.addMissingAtoms()
    fixer.addMissingHydrogens(7.0)
    return fixer.topology, fixer.positions
69
+
70
+
71
def set_system(topology):
    """Build an OpenMM System for ``topology`` with the Amber14 force field.

    The system uses no nonbonded cutoff, rigid water, and keeps
    center-of-mass motion (no CMMotion remover).
    """
    # Let a force field create the system so particles need not be added manually.
    force_field = mm_app.ForceField('amber14-all.xml', 'amber14/tip3pfb.xml')
    return force_field.createSystem(
        topology,
        removeCMMotion=False,
        nonbondedMethod=mm_app.NoCutoff,
        rigidWater=True,
    )
84
+
85
+
86
def minimize_energy(
        topology,
        system,
        positions,
        outdir,
        out_title
):
    """Minimize the energy of ``system`` and write the minimized structure.

    A Brownian integrator drives the minimization; the trajectory is logged
    to ``positions.dcd`` in the current working directory. The minimized
    structure is written to ``outdir`` when ``out_title`` contains
    "relaxed_complex", otherwise to the current working directory.

    Returns:
        (topology, minimized_positions, minimized_energy)
    """
    integrator = mm.BrownianIntegrator(
        100 * mm.unit.kelvin,
        100. / mm.unit.picoseconds,
        2.0 * mm.unit.femtoseconds
    )
    # CUDA platform selection is intentionally left to OpenMM's default.
    simulation = mm.app.Simulation(topology, system, integrator)

    # Log the trajectory every `report_interval` steps.
    report_interval = 100  # Adjust this value as needed
    dcd_reporter = mdtraj.reporters.DCDReporter('positions.dcd', report_interval)
    simulation.reporters.append(dcd_reporter)

    simulation.context.setPositions(positions)
    simulation.minimizeEnergy(1, 100)  # tolerance=1, maxIterations=100

    min_positions = simulation.context.getState(getPositions=True).getPositions()

    # Only "relaxed_complex" outputs are routed to the requested directory.
    if "relaxed_complex" in out_title:
        target_path = outdir + f'/{out_title}.pdb'
    else:
        target_path = f'{out_title}.pdb'
    mm_app.PDBFile.writeFile(topology, min_positions, open(target_path, 'w'))

    minimized_energy = simulation.context.getState(getEnergy=True).getPotentialEnergy()
    dcd_reporter.close()

    return topology, min_positions, minimized_energy
130
+
131
+
132
def add_restraints(
        system,
        topology,
        positions,
        restraint_type
):
    """Add positional harmonic restraints to a selected group of atoms.

    Code adapted from
    https://gist.github.com/peastman/ad8cda653242d731d75e18c836b2a3a5

    Args:
        system: OpenMM System the restraint force is added to (modified in place).
        topology: OpenMM Topology used to select atoms.
        positions: Reference positions the atoms are restrained towards.
        restraint_type: ``'protein'`` restrains every atom whose name does not
            contain ``'x'`` (ligand atoms are assumed to carry an ``'x'``);
            ``'CA+ligand'`` restrains ligand atoms plus protein CA atoms.
            Matched case-insensitively.

    Returns:
        The same ``system`` with the restraint force added.
    """
    restraint = CustomExternalForce('k*periodicdistance(x, y, z, x0, y0, z0)^2')
    system.addForce(restraint)
    restraint.addGlobalParameter('k', 100000000.0 * mm_unit.kilojoules_per_mole / mm_unit.nanometer ** 2)
    restraint.addPerParticleParameter('x0')
    restraint.addPerParticleParameter('y0')
    restraint.addPerParticleParameter('z0')

    # BUG FIX: callers pass e.g. "ca+ligand" (lower case, the default in
    # run()), which previously matched neither branch, so NO restraints were
    # ever added on the default path. Compare case-insensitively so either
    # spelling selects a branch.
    restraint_type = restraint_type.lower()
    for atom in topology.atoms():
        if restraint_type == 'protein':
            if 'x' not in atom.name:
                restraint.addParticle(atom.index, positions[atom.index])
        elif restraint_type == 'ca+ligand':
            if ('x' in atom.name) or (atom.name == "CA"):
                restraint.addParticle(atom.index, positions[atom.index])

    return system
159
+
160
+
161
def run(
        input_pdb,
        outdir,
        mol_in,
        file_name,
        restraint_type="ca+ligand",
        relax_protein_first=False,
        steps=100,
):
    """Relax a protein-ligand complex with restrained energy minimization.

    Loads the ligand with OpenFF, fixes/protonates the protein with PDBFixer,
    optionally pre-relaxes the protein alone, assembles the complex, adds
    positional restraints, and minimizes the complex. The minimized complex
    is written by ``minimize_energy`` as ``{file_name}_relaxed_complex.pdb``.

    Args:
        input_pdb: Path to the receptor PDB file.
        outdir: Directory where relaxed structures are written.
        mol_in: Path to the ligand file readable by ``Molecule.from_file``.
        file_name: Stem used for output file names.
        restraint_type: Restraint selection passed to ``add_restraints``.
        relax_protein_first: If True, minimize the protein alone first.
        steps: Unused; kept for interface compatibility.
    """
    try:
        ligand_mol = Molecule.from_file(mol_in)
    # Allow undefined stereochemistry to be loaded.
    except UndefinedStereochemistryError:
        print('Undefined Stereochemistry Error found! Trying with undefined stereo flag True')
        ligand_mol = Molecule.from_file(mol_in, allow_undefined_stereo=True)
    # Radicals are unsupported -- bail out of the script.
    except RadicalsNotSupportedError:
        print('OpenFF does not currently support radicals -- use unrelaxed structure')
        sys.exit()
    # The default charge method (am1bcc) does not work here; use gasteiger.
    ligand_mol.assign_partial_charges(partial_charge_method='gasteiger')

    # Read the protein PDB, repair it and add hydrogens.
    protein_topology, protein_positions = fix_pdb(input_pdb, outdir, file_name)

    system = set_system(protein_topology)
    if relax_protein_first:
        print('Relaxing ONLY protein structure...')
        # BUG FIX: minimize_energy returns THREE values; the original
        # two-target unpacking raised ValueError whenever this branch ran.
        protein_topology, protein_positions, _ = minimize_energy(
            protein_topology,
            system,
            protein_positions,
            outdir,
            f'{file_name}_relaxed_protein'
        )

    # Assemble the complex: protein first, then the ligand.
    modeller = Modeller(protein_topology, protein_positions)
    lig_top = ligand_mol.to_topology()
    modeller.add(lig_top.to_openmm(), lig_top.get_positions().to_openmm())

    # GAFF for the ligand, Amber14 + GBn2 implicit solvent for the protein.
    system_generator = SystemGenerator(
        forcefields=['amber14-all.xml', 'implicit/gbn2.xml'],
        small_molecule_forcefield='gaff-2.11',
        molecules=[ligand_mol],
    )
    system = system_generator.create_system(modeller.topology, molecules=ligand_mol)

    system = add_restraints(system, modeller.topology, modeller.positions, restraint_type=restraint_type)

    # Minimize the restrained complex; minimize_energy writes the structure.
    _, _, minimized_energy = minimize_energy(
        modeller.topology,
        system,
        modeller.positions,
        outdir,
        f'{file_name}_relaxed_complex'
    )
241
+
242
+
243
def relax(receptor_pdb, ligand_mol_sdf):
    """Relax a receptor/ligand pair and write a ligand-free relaxed receptor.

    Runs the full restrained minimization via ``run``, then strips HETATM
    (ligand) records from the relaxed complex and writes the remaining
    receptor as ``{file_name}_relaxed_complex.pdb`` next to the input.
    Failures are logged, not raised.

    Args:
        receptor_pdb: Path to the receptor PDB; its name must contain
            "receptor" (used to derive the system file stem).
        ligand_mol_sdf: Path to the ligand SDF file.
    """
    output_dir = os.path.split(receptor_pdb)[0]
    file_name = os.path.split(receptor_pdb)[1].split(".")[0]
    system_file_name = "system" + file_name.split("receptor")[1]
    try:
        run(
            input_pdb=receptor_pdb,
            outdir=output_dir,
            mol_in=ligand_mol_sdf,
            file_name=system_file_name
        )
        lines = load_txt(
            os.path.join(output_dir, f"{system_file_name}_relaxed_complex.pdb")).split("\n")
        # Keep only receptor records; HETATM lines belong to the ligand.
        receptor = "\n".join([i for i in lines if "HETATM" not in i])
        dump_txt(receptor, os.path.join(output_dir, f"{file_name}_relaxed_complex.pdb"))
    except Exception as e:
        # BUG FIX: the original printed the builtin `dir` function instead of
        # the failing input path.
        print(receptor_pdb, "can't relax,", e)
PhysDock/data/tools/PDBData.py ADDED
@@ -0,0 +1,348 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2000 Andrew Dalke. All rights reserved.
2
+ #
3
+ # This file is part of the Biopython distribution and governed by your
4
+ # choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
5
+ # Please see the LICENSE file that should have been included as part of this
6
+ # package.
7
+ """Information about the IUPAC alphabets."""
8
+
9
+ protein_letters = "ACDEFGHIKLMNPQRSTVWY"
10
+ extended_protein_letters = "ACDEFGHIKLMNPQRSTVWYBXZJUO"
11
+
12
+ # B = "Asx"; aspartic acid or asparagine (D or N)
13
+ # X = "Xxx"; unknown or 'other' amino acid
14
+ # Z = "Glx"; glutamic acid or glutamine (E or Q)
15
+ # http://www.chem.qmul.ac.uk/iupac/AminoAcid/A2021.html#AA212
16
+ #
17
+ # J = "Xle"; leucine or isoleucine (L or I, used in NMR)
18
+ # Mentioned in http://www.chem.qmul.ac.uk/iubmb/newsletter/1999/item3.html
19
+ # Also the International Nucleotide Sequence Database Collaboration (INSDC)
20
+ # (i.e. GenBank, EMBL, DDBJ) adopted this in 2006
21
+ # http://www.ddbj.nig.ac.jp/insdc/icm2006-e.html
22
+ #
23
+ # Xle (J); Leucine or Isoleucine
24
+ # The residue abbreviations, Xle (the three-letter abbreviation) and J
25
+ # (the one-letter abbreviation) are reserved for the case that cannot
26
+ # experimentally distinguish leucine from isoleucine.
27
+ #
28
+ # U = "Sec"; selenocysteine
29
+ # http://www.chem.qmul.ac.uk/iubmb/newsletter/1999/item3.html
30
+ #
31
+ # O = "Pyl"; pyrrolysine
32
+ # http://www.chem.qmul.ac.uk/iubmb/newsletter/2009.html#item35
33
+
34
# Canonical 1-letter -> 3-letter codes for the 20 standard amino acids.
# Values start in title case and are upper-cased below to match PDB
# residue-name conventions.
protein_letters_1to3 = {
    "A": "Ala",
    "C": "Cys",
    "D": "Asp",
    "E": "Glu",
    "F": "Phe",
    "G": "Gly",
    "H": "His",
    "I": "Ile",
    "K": "Lys",
    "L": "Leu",
    "M": "Met",
    "N": "Asn",
    "P": "Pro",
    "Q": "Gln",
    "R": "Arg",
    "S": "Ser",
    "T": "Thr",
    "V": "Val",
    "W": "Trp",
    "Y": "Tyr",
}
# Normalize both sides to upper case (PDB residue names are upper case).
protein_letters_1to3 = {k.upper(): v.upper() for k, v in protein_letters_1to3.items()}
# Inverse mapping: upper-case 3-letter code -> 1-letter code.
protein_letters_3to1 = {v: k for k, v in protein_letters_1to3.items()}
58
+
59
+ protein_letters_3to1_extended = {
60
+ "A5N": "N", "A8E": "V", "A9D": "S", "AA3": "A", "AA4": "A", "AAR": "R",
61
+ "ABA": "A", "ACL": "R", "AEA": "C", "AEI": "D", "AFA": "N", "AGM": "R",
62
+ "AGQ": "Y", "AGT": "C", "AHB": "N", "AHL": "R", "AHO": "A", "AHP": "A",
63
+ "AIB": "A", "AKL": "D", "AKZ": "D", "ALA": "A", "ALC": "A", "ALM": "A",
64
+ "ALN": "A", "ALO": "T", "ALS": "A", "ALT": "A", "ALV": "A", "ALY": "K",
65
+ "AME": "M", "AN6": "L", "AN8": "A", "API": "K", "APK": "K", "AR2": "R",
66
+ "AR4": "E", "AR7": "R", "ARG": "R", "ARM": "R", "ARO": "R", "AS7": "N",
67
+ "ASA": "D", "ASB": "D", "ASI": "D", "ASK": "D", "ASL": "D", "ASN": "N",
68
+ "ASP": "D", "ASQ": "D", "AYA": "A", "AZH": "A", "AZK": "K", "AZS": "S",
69
+ "AZY": "Y", "AVJ": "H", "A30": "Y", "A3U": "F", "ECC": "Q", "ECX": "C",
70
+ "EFC": "C", "EHP": "F", "ELY": "K", "EME": "E", "EPM": "M", "EPQ": "Q",
71
+ "ESB": "Y", "ESC": "M", "EXY": "L", "EXA": "K", "E0Y": "P", "E9V": "H",
72
+ "E9M": "W", "EJA": "C", "EUP": "T", "EZY": "G", "E9C": "Y", "EW6": "S",
73
+ "EXL": "W", "I2M": "I", "I4G": "G", "I58": "K", "IAM": "A", "IAR": "R",
74
+ "ICY": "C", "IEL": "K", "IGL": "G", "IIL": "I", "ILE": "I", "ILG": "E",
75
+ "ILM": "I", "ILX": "I", "ILY": "K", "IML": "I", "IOR": "R", "IPG": "G",
76
+ "IT1": "K", "IYR": "Y", "IZO": "M", "IC0": "G", "M0H": "C", "M2L": "K",
77
+ "M2S": "M", "M30": "G", "M3L": "K", "M3R": "K", "MA ": "A", "MAA": "A",
78
+ "MAI": "R", "MBQ": "Y", "MC1": "S", "MCL": "K", "MCS": "C", "MD3": "C",
79
+ "MD5": "C", "MD6": "G", "MDF": "Y", "ME0": "M", "MEA": "F", "MEG": "E",
80
+ "MEN": "N", "MEQ": "Q", "MET": "M", "MEU": "G", "MFN": "E", "MGG": "R",
81
+ "MGN": "Q", "MGY": "G", "MH1": "H", "MH6": "S", "MHL": "L", "MHO": "M",
82
+ "MHS": "H", "MHU": "F", "MIR": "S", "MIS": "S", "MK8": "L", "ML3": "K",
83
+ "MLE": "L", "MLL": "L", "MLY": "K", "MLZ": "K", "MME": "M", "MMO": "R",
84
+ "MNL": "L", "MNV": "V", "MP8": "P", "MPQ": "G", "MSA": "G", "MSE": "M",
85
+ "MSL": "M", "MSO": "M", "MT2": "M", "MTY": "Y", "MVA": "V", "MYK": "K",
86
+ "MYN": "R", "QCS": "C", "QIL": "I", "QMM": "Q", "QPA": "C", "QPH": "F",
87
+ "Q3P": "K", "QVA": "C", "QX7": "A", "Q2E": "W", "Q75": "M", "Q78": "F",
88
+ "QM8": "L", "QMB": "A", "QNQ": "C", "QNT": "C", "QNW": "C", "QO2": "C",
89
+ "QO5": "C", "QO8": "C", "QQ8": "Q", "U2X": "Y", "U3X": "F", "UF0": "S",
90
+ "UGY": "G", "UM1": "A", "UM2": "A", "UMA": "A", "UQK": "A", "UX8": "W",
91
+ "UXQ": "F", "YCM": "C", "YOF": "Y", "YPR": "P", "YPZ": "Y", "YTH": "T",
92
+ "Y1V": "L", "Y57": "K", "YHA": "K", "200": "F", "23F": "F", "23P": "A",
93
+ "26B": "T", "28X": "T", "2AG": "A", "2CO": "C", "2FM": "M", "2GX": "F",
94
+ "2HF": "H", "2JG": "S", "2KK": "K", "2KP": "K", "2LT": "Y", "2LU": "L",
95
+ "2ML": "L", "2MR": "R", "2MT": "P", "2OR": "R", "2P0": "P", "2QZ": "T",
96
+ "2R3": "Y", "2RA": "A", "2RX": "S", "2SO": "H", "2TY": "Y", "2VA": "V",
97
+ "2XA": "C", "2ZC": "S", "6CL": "K", "6CW": "W", "6GL": "A", "6HN": "K",
98
+ "60F": "C", "66D": "I", "6CV": "A", "6M6": "C", "6V1": "C", "6WK": "C",
99
+ "6Y9": "P", "6DN": "K", "DA2": "R", "DAB": "A", "DAH": "F", "DBS": "S",
100
+ "DBU": "T", "DBY": "Y", "DBZ": "A", "DC2": "C", "DDE": "H", "DDZ": "A",
101
+ "DI7": "Y", "DHA": "S", "DHN": "V", "DIR": "R", "DLS": "K", "DM0": "K",
102
+ "DMH": "N", "DMK": "D", "DNL": "K", "DNP": "A", "DNS": "K", "DNW": "A",
103
+ "DOH": "D", "DON": "L", "DP1": "R", "DPL": "P", "DPP": "A", "DPQ": "Y",
104
+ "DYS": "C", "D2T": "D", "DYA": "D", "DJD": "F", "DYJ": "P", "DV9": "E",
105
+ "H14": "F", "H1D": "M", "H5M": "P", "HAC": "A", "HAR": "R", "HBN": "H",
106
+ "HCM": "C", "HGY": "G", "HHI": "H", "HIA": "H", "HIC": "H", "HIP": "H",
107
+ "HIQ": "H", "HIS": "H", "HL2": "L", "HLU": "L", "HMR": "R", "HNC": "C",
108
+ "HOX": "F", "HPC": "F", "HPE": "F", "HPH": "F", "HPQ": "F", "HQA": "A",
109
+ "HR7": "R", "HRG": "R", "HRP": "W", "HS8": "H", "HS9": "H", "HSE": "S",
110
+ "HSK": "H", "HSL": "S", "HSO": "H", "HT7": "W", "HTI": "C", "HTR": "W",
111
+ "HV5": "A", "HVA": "V", "HY3": "P", "HYI": "M", "HYP": "P", "HZP": "P",
112
+ "HIX": "A", "HSV": "H", "HLY": "K", "HOO": "H", "H7V": "A", "L5P": "K",
113
+ "LRK": "K", "L3O": "L", "LA2": "K", "LAA": "D", "LAL": "A", "LBY": "K",
114
+ "LCK": "K", "LCX": "K", "LDH": "K", "LE1": "V", "LED": "L", "LEF": "L",
115
+ "LEH": "L", "LEM": "L", "LEN": "L", "LET": "K", "LEU": "L", "LEX": "L",
116
+ "LGY": "K", "LLO": "K", "LLP": "K", "LLY": "K", "LLZ": "K", "LME": "E",
117
+ "LMF": "K", "LMQ": "Q", "LNE": "L", "LNM": "L", "LP6": "K", "LPD": "P",
118
+ "LPG": "G", "LPS": "S", "LSO": "K", "LTR": "W", "LVG": "G", "LVN": "V",
119
+ "LWY": "P", "LYF": "K", "LYK": "K", "LYM": "K", "LYN": "K", "LYO": "K",
120
+ "LYP": "K", "LYR": "K", "LYS": "K", "LYU": "K", "LYX": "K", "LYZ": "K",
121
+ "LAY": "L", "LWI": "F", "LBZ": "K", "P1L": "C", "P2Q": "Y", "P2Y": "P",
122
+ "P3Q": "Y", "PAQ": "Y", "PAS": "D", "PAT": "W", "PBB": "C", "PBF": "F",
123
+ "PCA": "Q", "PCC": "P", "PCS": "F", "PE1": "K", "PEC": "C", "PF5": "F",
124
+ "PFF": "F", "PG1": "S", "PGY": "G", "PHA": "F", "PHD": "D", "PHE": "F",
125
+ "PHI": "F", "PHL": "F", "PHM": "F", "PKR": "P", "PLJ": "P", "PM3": "F",
126
+ "POM": "P", "PPN": "F", "PR3": "C", "PR4": "P", "PR7": "P", "PR9": "P",
127
+ "PRJ": "P", "PRK": "K", "PRO": "P", "PRS": "P", "PRV": "G", "PSA": "F",
128
+ "PSH": "H", "PTH": "Y", "PTM": "Y", "PTR": "Y", "PVH": "H", "PXU": "P",
129
+ "PYA": "A", "PYH": "K", "PYX": "C", "PH6": "P", "P9S": "C", "P5U": "S",
130
+ "POK": "R", "T0I": "Y", "T11": "F", "TAV": "D", "TBG": "V", "TBM": "T",
131
+ "TCQ": "Y", "TCR": "W", "TEF": "F", "TFQ": "F", "TH5": "T", "TH6": "T",
132
+ "THC": "T", "THR": "T", "THZ": "R", "TIH": "A", "TIS": "S", "TLY": "K",
133
+ "TMB": "T", "TMD": "T", "TNB": "C", "TNR": "S", "TNY": "T", "TOQ": "W",
134
+ "TOX": "W", "TPJ": "P", "TPK": "P", "TPL": "W", "TPO": "T", "TPQ": "Y",
135
+ "TQI": "W", "TQQ": "W", "TQZ": "C", "TRF": "W", "TRG": "K", "TRN": "W",
136
+ "TRO": "W", "TRP": "W", "TRQ": "W", "TRW": "W", "TRX": "W", "TRY": "W",
137
+ "TS9": "I", "TSY": "C", "TTQ": "W", "TTS": "Y", "TXY": "Y", "TY1": "Y",
138
+ "TY2": "Y", "TY3": "Y", "TY5": "Y", "TY8": "Y", "TY9": "Y", "TYB": "Y",
139
+ "TYC": "Y", "TYE": "Y", "TYI": "Y", "TYJ": "Y", "TYN": "Y", "TYO": "Y",
140
+ "TYQ": "Y", "TYR": "Y", "TYS": "Y", "TYT": "Y", "TYW": "Y", "TYY": "Y",
141
+ "T8L": "T", "T9E": "T", "TNQ": "W", "TSQ": "F", "TGH": "W", "X2W": "E",
142
+ "XCN": "C", "XPR": "P", "XSN": "N", "XW1": "A", "XX1": "K", "XYC": "A",
143
+ "XA6": "F", "11Q": "P", "11W": "E", "12L": "P", "12X": "P", "12Y": "P",
144
+ "143": "C", "1AC": "A", "1L1": "A", "1OP": "Y", "1PA": "F", "1PI": "A",
145
+ "1TQ": "W", "1TY": "Y", "1X6": "S", "56A": "H", "5AB": "A", "5CS": "C",
146
+ "5CW": "W", "5HP": "E", "5OH": "A", "5PG": "G", "51T": "Y", "54C": "W",
147
+ "5CR": "F", "5CT": "K", "5FQ": "A", "5GM": "I", "5JP": "S", "5T3": "K",
148
+ "5MW": "K", "5OW": "K", "5R5": "S", "5VV": "N", "5XU": "A", "55I": "F",
149
+ "999": "D", "9DN": "N", "9NE": "E", "9NF": "F", "9NR": "R", "9NV": "V",
150
+ "9E7": "K", "9KP": "K", "9WV": "A", "9TR": "K", "9TU": "K", "9TX": "K",
151
+ "9U0": "K", "9IJ": "F", "B1F": "F", "B27": "T", "B2A": "A", "B2F": "F",
152
+ "B2I": "I", "B2V": "V", "B3A": "A", "B3D": "D", "B3E": "E", "B3K": "K",
153
+ "B3U": "H", "B3X": "N", "B3Y": "Y", "BB6": "C", "BB7": "C", "BB8": "F",
154
+ "BB9": "C", "BBC": "C", "BCS": "C", "BCX": "C", "BFD": "D", "BG1": "S",
155
+ "BH2": "D", "BHD": "D", "BIF": "F", "BIU": "I", "BL2": "L", "BLE": "L",
156
+ "BLY": "K", "BMT": "T", "BNN": "F", "BOR": "R", "BP5": "A", "BPE": "C",
157
+ "BSE": "S", "BTA": "L", "BTC": "C", "BTK": "K", "BTR": "W", "BUC": "C",
158
+ "BUG": "V", "BYR": "Y", "BWV": "R", "BWB": "S", "BXT": "S", "F2F": "F",
159
+ "F2Y": "Y", "FAK": "K", "FB5": "A", "FB6": "A", "FC0": "F", "FCL": "F",
160
+ "FDL": "K", "FFM": "C", "FGL": "G", "FGP": "S", "FH7": "K", "FHL": "K",
161
+ "FHO": "K", "FIO": "R", "FLA": "A", "FLE": "L", "FLT": "Y", "FME": "M",
162
+ "FOE": "C", "FP9": "P", "FPK": "P", "FT6": "W", "FTR": "W", "FTY": "Y",
163
+ "FVA": "V", "FZN": "K", "FY3": "Y", "F7W": "W", "FY2": "Y", "FQA": "K",
164
+ "F7Q": "Y", "FF9": "K", "FL6": "D", "JJJ": "C", "JJK": "C", "JJL": "C",
165
+ "JLP": "K", "J3D": "C", "J9Y": "R", "J8W": "S", "JKH": "P", "N10": "S",
166
+ "N7P": "P", "NA8": "A", "NAL": "A", "NAM": "A", "NBQ": "Y", "NC1": "S",
167
+ "NCB": "A", "NEM": "H", "NEP": "H", "NFA": "F", "NIY": "Y", "NLB": "L",
168
+ "NLE": "L", "NLN": "L", "NLO": "L", "NLP": "L", "NLQ": "Q", "NLY": "G",
169
+ "NMC": "G", "NMM": "R", "NNH": "R", "NOT": "L", "NPH": "C", "NPI": "A",
170
+ "NTR": "Y", "NTY": "Y", "NVA": "V", "NWD": "A", "NYB": "C", "NYS": "C",
171
+ "NZH": "H", "N80": "P", "NZC": "T", "NLW": "L", "N0A": "F", "N9P": "A",
172
+ "N65": "K", "R1A": "C", "R4K": "W", "RE0": "W", "RE3": "W", "RGL": "R",
173
+ "RGP": "E", "RT0": "P", "RVX": "S", "RZ4": "S", "RPI": "R", "RVJ": "A",
174
+ "VAD": "V", "VAF": "V", "VAH": "V", "VAI": "V", "VAL": "V", "VB1": "K",
175
+ "VH0": "P", "VR0": "R", "V44": "C", "V61": "F", "VPV": "K", "V5N": "H",
176
+ "V7T": "K", "Z01": "A", "Z3E": "T", "Z70": "H", "ZBZ": "C", "ZCL": "F",
177
+ "ZU0": "T", "ZYJ": "P", "ZYK": "P", "ZZD": "C", "ZZJ": "A", "ZIQ": "W",
178
+ "ZPO": "P", "ZDJ": "Y", "ZT1": "K", "30V": "C", "31Q": "C", "33S": "F",
179
+ "33W": "A", "34E": "V", "3AH": "H", "3BY": "P", "3CF": "F", "3CT": "Y",
180
+ "3GA": "A", "3GL": "E", "3MD": "D", "3MY": "Y", "3NF": "Y", "3O3": "E",
181
+ "3PX": "P", "3QN": "K", "3TT": "P", "3XH": "G", "3YM": "Y", "3WS": "A",
182
+ "3WX": "P", "3X9": "C", "3ZH": "H", "7JA": "I", "73C": "S", "73N": "R",
183
+ "73O": "Y", "73P": "K", "74P": "K", "7N8": "F", "7O5": "A", "7XC": "F",
184
+ "7ID": "D", "7OZ": "A", "C1S": "C", "C1T": "C", "C1X": "K", "C22": "A",
185
+ "C3Y": "C", "C4R": "C", "C5C": "C", "C6C": "C", "CAF": "C", "CAS": "C",
186
+ "CAY": "C", "CCS": "C", "CEA": "C", "CGA": "E", "CGU": "E", "CGV": "C",
187
+ "CHP": "G", "CIR": "R", "CLE": "L", "CLG": "K", "CLH": "K", "CME": "C",
188
+ "CMH": "C", "CML": "C", "CMT": "C", "CR5": "G", "CS0": "C", "CS1": "C",
189
+ "CS3": "C", "CS4": "C", "CSA": "C", "CSB": "C", "CSD": "C", "CSE": "C",
190
+ "CSJ": "C", "CSO": "C", "CSP": "C", "CSR": "C", "CSS": "C", "CSU": "C",
191
+ "CSW": "C", "CSX": "C", "CSZ": "C", "CTE": "W", "CTH": "T", "CWD": "A",
192
+ "CWR": "S", "CXM": "M", "CY0": "C", "CY1": "C", "CY3": "C", "CY4": "C",
193
+ "CYA": "C", "CYD": "C", "CYF": "C", "CYG": "C", "CYJ": "K", "CYM": "C",
194
+ "CYQ": "C", "CYR": "C", "CYS": "C", "CYW": "C", "CZ2": "C", "CZZ": "C",
195
+ "CG6": "C", "C1J": "R", "C4G": "R", "C67": "R", "C6D": "R", "CE7": "N",
196
+ "CZS": "A", "G01": "E", "G8M": "E", "GAU": "E", "GEE": "G", "GFT": "S",
197
+ "GHC": "E", "GHG": "Q", "GHW": "E", "GL3": "G", "GLH": "Q", "GLJ": "E",
198
+ "GLK": "E", "GLN": "Q", "GLQ": "E", "GLU": "E", "GLY": "G", "GLZ": "G",
199
+ "GMA": "E", "GME": "E", "GNC": "Q", "GPL": "K", "GSC": "G", "GSU": "E",
200
+ "GT9": "C", "GVL": "S", "G3M": "R", "G5G": "L", "G1X": "Y", "G8X": "P",
201
+ "K1R": "C", "KBE": "K", "KCX": "K", "KFP": "K", "KGC": "K", "KNB": "A",
202
+ "KOR": "M", "KPI": "K", "KPY": "K", "KST": "K", "KYN": "W", "KYQ": "K",
203
+ "KCR": "K", "KPF": "K", "K5L": "S", "KEO": "K", "KHB": "K", "KKD": "D",
204
+ "K5H": "C", "K7K": "S", "OAR": "R", "OAS": "S", "OBS": "K", "OCS": "C",
205
+ "OCY": "C", "OHI": "H", "OHS": "D", "OLD": "H", "OLT": "T", "OLZ": "S",
206
+ "OMH": "S", "OMT": "M", "OMX": "Y", "OMY": "Y", "ONH": "A", "ORN": "A",
207
+ "ORQ": "R", "OSE": "S", "OTH": "T", "OXX": "D", "OYL": "H", "O7A": "T",
208
+ "O7D": "W", "O7G": "V", "O2E": "S", "O6H": "W", "OZW": "F", "S12": "S",
209
+ "S1H": "S", "S2C": "C", "S2P": "A", "SAC": "S", "SAH": "C", "SAR": "G",
210
+ "SBG": "S", "SBL": "S", "SCH": "C", "SCS": "C", "SCY": "C", "SD4": "N",
211
+ "SDB": "S", "SDP": "S", "SEB": "S", "SEE": "S", "SEG": "A", "SEL": "S",
212
+ "SEM": "S", "SEN": "S", "SEP": "S", "SER": "S", "SET": "S", "SGB": "S",
213
+ "SHC": "C", "SHP": "G", "SHR": "K", "SIB": "C", "SLL": "K", "SLZ": "K",
214
+ "SMC": "C", "SME": "M", "SMF": "F", "SNC": "C", "SNN": "N", "SOY": "S",
215
+ "SRZ": "S", "STY": "Y", "SUN": "S", "SVA": "S", "SVV": "S", "SVW": "S",
216
+ "SVX": "S", "SVY": "S", "SVZ": "S", "SXE": "S", "SKH": "K", "SNM": "S",
217
+ "SNK": "H", "SWW": "S", "WFP": "F", "WLU": "L", "WPA": "F", "WRP": "W",
218
+ "WVL": "V", "02K": "A", "02L": "N", "02O": "A", "02Y": "A", "033": "V",
219
+ "037": "P", "03Y": "C", "04U": "P", "04V": "P", "05N": "P", "07O": "C",
220
+ "0A0": "D", "0A1": "Y", "0A2": "K", "0A8": "C", "0A9": "F", "0AA": "V",
221
+ "0AB": "V", "0AC": "G", "0AF": "W", "0AG": "L", "0AH": "S", "0AK": "D",
222
+ "0AR": "R", "0BN": "F", "0CS": "A", "0E5": "T", "0EA": "Y", "0FL": "A",
223
+ "0LF": "P", "0NC": "A", "0PR": "Y", "0QL": "C", "0TD": "D", "0UO": "W",
224
+ "0WZ": "Y", "0X9": "R", "0Y8": "P", "4AF": "F", "4AR": "R", "4AW": "W",
225
+ "4BF": "F", "4CF": "F", "4CY": "M", "4DP": "W", "4FB": "P", "4FW": "W",
226
+ "4HL": "Y", "4HT": "W", "4IN": "W", "4MM": "M", "4PH": "F", "4U7": "A",
227
+ "41H": "F", "41Q": "N", "42Y": "S", "432": "S", "45F": "P", "4AK": "K",
228
+ "4D4": "R", "4GJ": "C", "4KY": "P", "4L0": "P", "4LZ": "Y", "4N7": "P",
229
+ "4N8": "P", "4N9": "P", "4OG": "W", "4OU": "F", "4OV": "S", "4OZ": "S",
230
+ "4PQ": "W", "4SJ": "F", "4WQ": "A", "4HH": "S", "4HJ": "S", "4J4": "C",
231
+ "4J5": "R", "4II": "F", "4VI": "R", "823": "N", "8SP": "S", "8AY": "A",
232
+ }
233
+
234
# Nucleic Acids
# Standard nucleotide residue codes (space-padded to 3 characters, as stored
# in mmCIF/PDB component IDs) -> 1-letter codes. RNA and DNA combined.
nucleic_letters_3to1 = {
    "A ": "A", "C ": "C", "G ": "G", "U ": "U",
    "DA ": "A", "DC ": "C", "DG ": "G", "DT ": "T",
}

# RNA-only subset of the standard codes.
rna_letters_3to1 = {
    "A ": "A", "C ": "C", "G ": "G", "U ": "U",
}

# DNA-only subset of the standard codes.
dna_letters_3to1 = {
    "DA ": "A", "DC ": "C", "DG ": "G", "DT ": "T",
}
247
+
248
+ # fmt: off
249
+ nucleic_letters_3to1_extended = {
250
+ "A ": "A", "A23": "A", "A2L": "A", "A2M": "A", "A34": "A", "A35": "A",
251
+ "A38": "A", "A39": "A", "A3A": "A", "A3P": "A", "A40": "A", "A43": "A",
252
+ "A44": "A", "A47": "A", "A5L": "A", "A5M": "C", "A5O": "A", "A6A": "A",
253
+ "A6C": "C", "A6G": "G", "A6U": "U", "A7E": "A", "A9Z": "A", "ABR": "A",
254
+ "ABS": "A", "AD2": "A", "ADI": "A", "ADP": "A", "AET": "A", "AF2": "A",
255
+ "AFG": "G", "AMD": "A", "AMO": "A", "AP7": "A", "AS ": "A", "ATD": "T",
256
+ "ATL": "T", "ATM": "T", "AVC": "A", "AI5": "C", "E ": "A", "E1X": "A",
257
+ "EDA": "A", "EFG": "G", "EHG": "G", "EIT": "T", "EXC": "C", "E3C": "C",
258
+ "E6G": "G", "E7G": "G", "EQ4": "G", "EAN": "T", "I5C": "C", "IC ": "C",
259
+ "IG ": "G", "IGU": "G", "IMC": "C", "IMP": "G", "IU ": "U", "I4U": "U",
260
+ "IOO": "G", "M1G": "G", "M2G": "G", "M4C": "C", "M5M": "C", "MA6": "A",
261
+ "MA7": "A", "MAD": "A", "MCY": "C", "ME6": "C", "MEP": "U", "MG1": "G",
262
+ "MGQ": "A", "MGT": "G", "MGV": "G", "MIA": "A", "MMT": "T", "MNU": "U",
263
+ "MRG": "G", "MTR": "T", "MTU": "A", "MFO": "G", "M7A": "A", "MHG": "G",
264
+ "MMX": "C", "QUO": "G", "QCK": "T", "QSQ": "A", "U ": "U", "U25": "U",
265
+ "U2L": "U", "U2P": "U", "U31": "U", "U34": "U", "U36": "U", "U37": "U",
266
+ "U8U": "U", "UAR": "U", "UBB": "U", "UBD": "U", "UD5": "U", "UPV": "U",
267
+ "UR3": "U", "URD": "U", "US3": "T", "US5": "U", "UZR": "U", "UMO": "U",
268
+ "U23": "U", "U48": "C", "U7B": "C", "Y ": "A", "YCO": "C", "YG ": "G",
269
+ "YYG": "G", "23G": "G", "26A": "A", "2AR": "A", "2AT": "T", "2AU": "U",
270
+ "2BT": "T", "2BU": "A", "2DA": "A", "2DT": "T", "2EG": "G", "2GT": "T",
271
+ "2JV": "G", "2MA": "A", "2MG": "G", "2MU": "U", "2NT": "T", "2OM": "U",
272
+ "2OT": "T", "2PR": "G", "2SG": "G", "2ST": "T", "63G": "G", "63H": "G",
273
+ "64T": "T", "68Z": "G", "6CT": "T", "6HA": "A", "6HB": "A", "6HC": "C",
274
+ "6HG": "G", "6HT": "T", "6IA": "A", "6MA": "A", "6MC": "A", "6MP": "A",
275
+ "6MT": "A", "6MZ": "A", "6OG": "G", "6PO": "G", "6FK": "G", "6NW": "A",
276
+ "6OO": "C", "D00": "C", "D3T": "T", "D4M": "T", "DA ": "A", "DC ": "C",
277
+ "DCG": "G", "DCT": "C", "DDG": "G", "DFC": "C", "DFG": "G", "DG ": "G",
278
+ "DG8": "G", "DGI": "G", "DGP": "G", "DHU": "U", "DNR": "C", "DOC": "C",
279
+ "DPB": "T", "DRT": "T", "DT ": "T", "DZM": "A", "D4B": "C", "H2U": "U",
280
+ "HN0": "G", "HN1": "G", "LC ": "C", "LCA": "A", "LCG": "G", "LG ": "G",
281
+ "LGP": "G", "LHU": "U", "LSH": "T", "LST": "T", "LDG": "G", "L3X": "A",
282
+ "LHH": "C", "LV2": "C", "L1J": "G", "P ": "G", "P2T": "T", "P5P": "A",
283
+ "PG7": "G", "PGN": "G", "PGP": "G", "PMT": "C", "PPU": "A", "PPW": "G",
284
+ "PR5": "A", "PRN": "A", "PST": "T", "PSU": "U", "PU ": "A", "PVX": "C",
285
+ "PYO": "U", "PZG": "G", "P4U": "U", "P7G": "G", "T ": "T", "T2S": "T",
286
+ "T31": "U", "T32": "T", "T36": "T", "T37": "T", "T38": "T", "T39": "T",
287
+ "T3P": "T", "T41": "T", "T48": "T", "T49": "T", "T4S": "T", "T5S": "T",
288
+ "T64": "T", "T6A": "A", "TA3": "T", "TAF": "T", "TBN": "A", "TC1": "C",
289
+ "TCP": "T", "TCY": "A", "TDY": "T", "TED": "T", "TFE": "T", "TFF": "T",
290
+ "TFO": "A", "TFT": "T", "TGP": "G", "TCJ": "C", "TLC": "T", "TP1": "T",
291
+ "TPC": "C", "TPG": "G", "TSP": "T", "TTD": "T", "TTM": "T", "TXD": "A",
292
+ "TXP": "A", "TC ": "C", "TG ": "G", "T0N": "G", "T0Q": "G", "X ": "G",
293
+ "XAD": "A", "XAL": "A", "XCL": "C", "XCR": "C", "XCT": "C", "XCY": "C",
294
+ "XGL": "G", "XGR": "G", "XGU": "G", "XPB": "G", "XTF": "T", "XTH": "T",
295
+ "XTL": "T", "XTR": "T", "XTS": "G", "XUA": "A", "XUG": "G", "102": "G",
296
+ "10C": "C", "125": "U", "126": "U", "127": "U", "12A": "A", "16B": "C",
297
+ "18M": "G", "1AP": "A", "1CC": "C", "1FC": "C", "1MA": "A", "1MG": "G",
298
+ "1RN": "U", "1SC": "C", "5AA": "A", "5AT": "T", "5BU": "U", "5CG": "G",
299
+ "5CM": "C", "5FA": "A", "5FC": "C", "5FU": "U", "5HC": "C", "5HM": "C",
300
+ "5HT": "T", "5IC": "C", "5IT": "T", "5MC": "C", "5MU": "U", "5NC": "C",
301
+ "5PC": "C", "5PY": "T", "9QV": "U", "94O": "T", "9SI": "A", "9SY": "A",
302
+ "B7C": "C", "BGM": "G", "BOE": "T", "B8H": "U", "B8K": "G", "B8Q": "C",
303
+ "B8T": "C", "B8W": "G", "B9B": "G", "B9H": "C", "BGH": "G", "F3H": "T",
304
+ "F3N": "A", "F4H": "T", "FA2": "A", "FDG": "G", "FHU": "U", "FMG": "G",
305
+ "FNU": "U", "FOX": "G", "F2T": "U", "F74": "G", "F4Q": "G", "F7H": "C",
306
+ "F7K": "G", "JDT": "T", "JMH": "C", "J0X": "C", "N5M": "C", "N6G": "G",
307
+ "N79": "A", "NCU": "C", "NMS": "T", "NMT": "T", "NTT": "T", "N7X": "C",
308
+ "R ": "A", "RBD": "A", "RDG": "G", "RIA": "A", "RMP": "A", "RPC": "C",
309
+ "RSP": "C", "RSQ": "C", "RT ": "T", "RUS": "U", "RFJ": "G", "V3L": "A",
310
+ "VC7": "G", "Z ": "C", "ZAD": "A", "ZBC": "C", "ZBU": "U", "ZCY": "C",
311
+ "ZGU": "G", "31H": "A", "31M": "A", "3AU": "U", "3DA": "A", "3ME": "U",
312
+ "3MU": "U", "3TD": "U", "70U": "U", "7AT": "A", "7DA": "A", "7GU": "G",
313
+ "7MG": "G", "7BG": "G", "73W": "C", "75B": "U", "7OK": "C", "7S3": "G",
314
+ "7SN": "G", "C ": "C", "C25": "C", "C2L": "C", "C2S": "C", "C31": "C",
315
+ "C32": "C", "C34": "C", "C36": "C", "C37": "C", "C38": "C", "C42": "C",
316
+ "C43": "C", "C45": "C", "C46": "C", "C49": "C", "C4S": "C", "C5L": "C",
317
+ "C6G": "G", "CAR": "C", "CB2": "C", "CBR": "C", "CBV": "C", "CCC": "C",
318
+ "CDW": "C", "CFL": "C", "CFZ": "C", "CG1": "G", "CH ": "C", "CMR": "C",
319
+ "CNU": "U", "CP1": "C", "CSF": "C", "CSL": "C", "CTG": "T", "CX2": "C",
320
+ "C7S": "C", "C7R": "C", "G ": "G", "G1G": "G", "G25": "G", "G2L": "G",
321
+ "G2S": "G", "G31": "G", "G32": "G", "G33": "G", "G36": "G", "G38": "G",
322
+ "G42": "G", "G46": "G", "G47": "G", "G48": "G", "G49": "G", "G7M": "G",
323
+ "GAO": "G", "GCK": "C", "GDO": "G", "GDP": "G", "GDR": "G", "GF2": "G",
324
+ "GFL": "G", "GH3": "G", "GMS": "G", "GN7": "G", "GNG": "G", "GOM": "G",
325
+ "GRB": "G", "GS ": "G", "GSR": "G", "GSS": "G", "GTP": "G", "GX1": "G",
326
+ "KAG": "G", "KAK": "G", "O2G": "G", "OGX": "G", "OMC": "C", "OMG": "G",
327
+ "OMU": "U", "ONE": "U", "O2Z": "A", "OKN": "C", "OKQ": "C", "S2M": "T",
328
+ "S4A": "A", "S4C": "C", "S4G": "G", "S4U": "U", "S6G": "G", "SC ": "C",
329
+ "SDE": "A", "SDG": "G", "SDH": "G", "SMP": "A", "SMT": "T", "SPT": "T",
330
+ "SRA": "A", "SSU": "U", "SUR": "U", "00A": "A", "0AD": "G", "0AM": "A",
331
+ "0AP": "C", "0AV": "A", "0R8": "C", "0SP": "A", "0UH": "G", "47C": "C",
332
+ "4OC": "C", "4PC": "C", "4PD": "C", "4PE": "C", "4SC": "C", "4SU": "U",
333
+ "45A": "A", "4U3": "C", "8AG": "G", "8AN": "A", "8BA": "A", "8FG": "G",
334
+ "8MG": "G", "8OG": "G", "8PY": "G", "8AA": "G", "85Y": "U", "8OS": "G",
335
+ "UNK": "X", # DEBUG
336
+ }
337
+
338
# Aliases for the canonical (standard-residue) mappings.
standard_protein_letters_3to1 = protein_letters_3to1
standard_protein_letters_1to3 = protein_letters_1to3
# Non-standard residues only: the extended table minus canonical entries.
nonstandard_protein_letters_3to1 = {k: v for k, v in protein_letters_3to1_extended.items() if
                                    k not in standard_protein_letters_3to1}

standard_nucleic_letters_3to1 = nucleic_letters_3to1
# NOTE(review): RNA and DNA codes collide on inversion ("A" maps from both
# "A " and "DA "; the later entry, "DA ", wins) -- confirm this is intended.
standard_nucleic_letters_1to3 = {v: k for k, v in standard_nucleic_letters_3to1.items()}
# Non-standard nucleotides only: extended table minus canonical entries.
nonstandard_nucleic_letters_3to1 = {k: v for k, v in nucleic_letters_3to1_extended.items() if
                                    k not in standard_nucleic_letters_3to1}

# Combined protein + nucleic extended 3-letter -> 1-letter table.
letters_3to1_extended = {**protein_letters_3to1_extended, **nucleic_letters_3to1_extended}
PhysDock/data/tools/__init__.py ADDED
File without changes
PhysDock/data/tools/alignment_runner.py ADDED
@@ -0,0 +1,588 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os.path
3
+ from functools import partial
4
+ import tqdm
5
+ from typing import Optional, Mapping, Any, Union
6
+
7
+ from PhysDock.data.tools import jackhmmer, nhmmer, hhblits, kalign, hmmalign, parsers, hmmbuild, hhsearch, templates
8
+ from PhysDock.utils.io_utils import load_pkl, load_txt, load_json, run_pool_tasks, convert_md5_string, dump_pkl
9
+ from PhysDock.data.tools.parsers import parse_fasta
10
+
11
+ TemplateSearcher = Union[hhsearch.HHSearch]
12
+
13
+
14
class AlignmentRunner:
    """Runs homology and template searches for a single query fasta.

    Each search-tool runner is created in ``__init__`` only when both its
    binary and its database path(s) exist on disk; otherwise the runner stays
    ``None`` and that particular search is skipped silently in :meth:`run`,
    so a partially configured environment still produces whatever alignments
    it can.
    """

    def __init__(
        self,
        # Homology search tools
        jackhmmer_binary_path: Optional[str] = None,
        hhblits_binary_path: Optional[str] = None,
        nhmmer_binary_path: Optional[str] = None,
        hmmbuild_binary_path: Optional[str] = None,
        hmmalign_binary_path: Optional[str] = None,
        kalign_binary_path: Optional[str] = None,

        # Template search tools
        hhsearch_binary_path: Optional[str] = None,
        template_searcher: Optional[TemplateSearcher] = None,
        template_featurizer: Optional[templates.TemplateHitFeaturizer] = None,

        # Databases
        uniref90_database_path: Optional[str] = None,
        uniprot_database_path: Optional[str] = None,
        uniclust30_database_path: Optional[str] = None,
        uniref30_database_path: Optional[str] = None,
        bfd_database_path: Optional[str] = None,
        reduced_bfd_database_path: Optional[str] = None,
        mgnify_database_path: Optional[str] = None,
        rfam_database_path: Optional[str] = None,
        rnacentral_database_path: Optional[str] = None,
        nt_database_path: Optional[str] = None,
        #
        no_cpus: int = 8,
        # Limitations
        uniref90_seq_limit: int = 100000,
        uniprot_seq_limit: int = 500000,
        reduced_bfd_seq_limit: int = 50000,
        mgnify_seq_limit: int = 50000,
        uniref90_max_hits: int = 10000,
        uniprot_max_hits: int = 50000,
        reduced_bfd_max_hits: int = 5000,
        mgnify_max_hits: int = 5000,
        rfam_max_hits: int = 10000,
        rnacentral_max_hits: int = 10000,
        nt_max_hits: int = 10000,
    ):
        # Runners default to None; each is filled in below only when the
        # corresponding binary + database are present.
        self.uniref90_jackhmmer_runner = None
        self.uniprot_jackhmmer_runner = None
        self.reduced_bfd_jackhmmer_runner = None
        self.mgnify_jackhmmer_runner = None
        self.bfd_uniref30_hhblits_runner = None
        self.bfd_uniclust30_hhblits_runner = None
        self.rfam_nhmmer_runner = None
        self.rnacentral_nhmmer_runner = None
        self.nt_nhmmer_runner = None
        self.rna_realign_runner = None
        self.template_searcher = template_searcher
        self.template_featurizer = template_featurizer

        def _all_exists(*objs, hhblits_mode=False):
            # hhblits "databases" are path prefixes shared by several files,
            # so in hhblits mode only the parent directory can be checked.
            if not hhblits_mode:
                for obj in objs:
                    if obj is None or not os.path.exists(obj):
                        return False
            else:
                for obj in objs:
                    if obj is None or not os.path.exists(os.path.split(obj)[0]):
                        return False
            return True

        def _run_msa_tool(
            fasta_path: str,
            msa_out_path: str,
            msa_runner,
            msa_format: str,
            max_sto_sequences: Optional[int] = None,
        ) -> Mapping[str, Any]:
            """Runs an MSA tool and writes its output in ``msa_format``."""
            if msa_format == "sto" and max_sto_sequences is not None:
                result = msa_runner.query(fasta_path, max_sto_sequences)[0]
            else:
                result = msa_runner.query(fasta_path)[0]

            # Output filename extension must agree with the declared format.
            assert msa_out_path.split('.')[-1] == msa_format
            with open(msa_out_path, "w") as f:
                f.write(result[msa_format])

            return result

        def _run_rna_realign_tool(
            fasta_path: str,
            msa_in_path: str,
            msa_out_path: str,
            use_precompute=True,
        ):
            """Realigns an RNA .sto MSA against the query with hmmalign."""
            runner = hmmalign.Hmmalign(
                hmmbuild_binary_path=hmmbuild_binary_path,
                hmmalign_binary_path=hmmalign_binary_path,
            )
            # An empty input MSA simply yields an empty output file.
            if os.path.exists(msa_in_path) and os.path.getsize(msa_in_path) == 0:
                with open(msa_out_path, "w"):
                    pass
                return
            if use_precompute:
                if os.path.exists(msa_in_path) and os.path.exists(msa_out_path):
                    if os.path.getsize(msa_in_path) > 0 and os.path.getsize(msa_out_path) == 0:
                        # Non-empty input with an empty precomputed output is
                        # suspicious: warn and redo the realignment.
                        logging.warning(f"The msa realign file size is zero but the origin file size is over 0! "
                                        f"fasta: {fasta_path} msa_in_file: {msa_in_path}")
                        runner.realign_sto_with_fasta(fasta_path, msa_in_path, msa_out_path)
                else:
                    runner.realign_sto_with_fasta(fasta_path, msa_in_path, msa_out_path)
            else:
                runner.realign_sto_with_fasta(fasta_path, msa_in_path, msa_out_path)

        # uniclust30 and uniref30 are alternative hhblits companions.
        assert uniclust30_database_path is None or uniref30_database_path is None, "Only one used"

        # Jackhmmer
        if _all_exists(jackhmmer_binary_path, uniref90_database_path):
            self.uniref90_jackhmmer_runner = partial(
                _run_msa_tool,
                msa_runner=jackhmmer.Jackhmmer(
                    binary_path=jackhmmer_binary_path,
                    database_path=uniref90_database_path,
                    seq_limit=uniref90_seq_limit,
                    n_cpu=no_cpus,
                ),
                msa_format="sto",
                max_sto_sequences=uniref90_max_hits
            )

        if _all_exists(jackhmmer_binary_path, uniprot_database_path):
            self.uniprot_jackhmmer_runner = partial(
                _run_msa_tool,
                msa_runner=jackhmmer.Jackhmmer(
                    binary_path=jackhmmer_binary_path,
                    database_path=uniprot_database_path,
                    seq_limit=uniprot_seq_limit,
                    n_cpu=no_cpus,
                ),
                msa_format="sto",
                max_sto_sequences=uniprot_max_hits
            )
        if _all_exists(jackhmmer_binary_path, reduced_bfd_database_path):
            self.reduced_bfd_jackhmmer_runner = partial(
                _run_msa_tool,
                msa_runner=jackhmmer.Jackhmmer(
                    binary_path=jackhmmer_binary_path,
                    database_path=reduced_bfd_database_path,
                    seq_limit=reduced_bfd_seq_limit,
                    n_cpu=no_cpus,
                ),
                msa_format="sto",
                max_sto_sequences=reduced_bfd_max_hits
            )

        if _all_exists(jackhmmer_binary_path, mgnify_database_path):
            self.mgnify_jackhmmer_runner = partial(
                _run_msa_tool,
                msa_runner=jackhmmer.Jackhmmer(
                    binary_path=jackhmmer_binary_path,
                    database_path=mgnify_database_path,
                    seq_limit=mgnify_seq_limit,
                    n_cpu=no_cpus,
                ),
                msa_format="sto",
                max_sto_sequences=mgnify_max_hits
            )

        # HHblits (uniref30 preferred over uniclust30 when both configured)
        if _all_exists(hhblits_binary_path, bfd_database_path, uniref30_database_path, hhblits_mode=True):
            self.bfd_uniref30_hhblits_runner = partial(
                _run_msa_tool,
                msa_runner=hhblits.HHBlits(
                    binary_path=hhblits_binary_path,
                    databases=[bfd_database_path, uniref30_database_path],
                    n_cpu=no_cpus,
                ),
                msa_format="a3m",
            )
        elif _all_exists(hhblits_binary_path, bfd_database_path, uniclust30_database_path, hhblits_mode=True):
            self.bfd_uniclust30_hhblits_runner = partial(
                _run_msa_tool,
                msa_runner=hhblits.HHBlits(
                    binary_path=hhblits_binary_path,
                    databases=[bfd_database_path, uniclust30_database_path],
                    n_cpu=no_cpus,
                ),
                msa_format="a3m",
            )

        # Nhmmer (RNA databases)
        if _all_exists(nhmmer_binary_path, rfam_database_path):
            self.rfam_nhmmer_runner = partial(
                _run_msa_tool,
                msa_runner=nhmmer.Nhmmer(
                    binary_path=nhmmer_binary_path,
                    database_path=rfam_database_path,
                    n_cpu=no_cpus
                ),
                msa_format="sto",
                max_sto_sequences=rfam_max_hits
            )
        if _all_exists(nhmmer_binary_path, rnacentral_database_path):
            self.rnacentral_nhmmer_runner = partial(
                _run_msa_tool,
                msa_runner=nhmmer.Nhmmer(
                    binary_path=nhmmer_binary_path,
                    database_path=rnacentral_database_path,
                    n_cpu=no_cpus
                ),
                msa_format="sto",
                max_sto_sequences=rnacentral_max_hits
            )
        if _all_exists(nhmmer_binary_path, nt_database_path):
            self.nt_nhmmer_runner = partial(
                _run_msa_tool,
                msa_runner=nhmmer.Nhmmer(
                    binary_path=nhmmer_binary_path,
                    database_path=nt_database_path,
                    n_cpu=no_cpus
                ),
                msa_format="sto",
                max_sto_sequences=nt_max_hits
            )

        if _all_exists(hmmbuild_binary_path, hmmalign_binary_path):
            self.rna_realign_runner = _run_rna_realign_tool

    def run(self, input_fasta_path, output_msas_dir, use_precompute=True):
        """Runs every configured search for one fasta.

        Searches whose downstream feature pickle (under the sibling
        ``*_features`` directories) already exists are skipped entirely;
        raw search outputs that already exist are reused when
        ``use_precompute`` is True.
        """
        os.makedirs(output_msas_dir, exist_ok=True)
        templates_out_path = os.path.join(output_msas_dir, "templates")
        uniref90_out_path = os.path.join(output_msas_dir, "uniref90_hits.sto")
        uniprot_out_path = os.path.join(output_msas_dir, "uniprot_hits.sto")
        reduced_bfd_out_path = os.path.join(output_msas_dir, "reduced_bfd_hits.sto")
        mgnify_out_path = os.path.join(output_msas_dir, "mgnify_hits.sto")
        bfd_uniref30_out_path = os.path.join(output_msas_dir, "bfd_uniref30_hits.a3m")
        bfd_uniclust30_out_path = os.path.join(output_msas_dir, "bfd_uniclust30_hits.a3m")

        seqs, decs = parse_fasta(load_txt(input_fasta_path))
        prefix = "protein"
        md5 = convert_md5_string(f"{prefix}:{seqs[0]}")
        # Feature pickles live two directory levels above the per-query dir.
        output_feature = os.path.dirname(os.path.dirname(output_msas_dir))
        pkl_save_path_msa = os.path.join(output_feature, "msa_features", f"{md5}.pkl.gz")
        pkl_save_path_msa_uni = os.path.join(output_feature, "uniprot_msa_features", f"{md5}.pkl.gz")
        pkl_save_path_temp = os.path.join(output_feature, "template_features", f"{md5}.pkl.gz")

        if self.uniref90_jackhmmer_runner is not None and not os.path.exists(pkl_save_path_temp):
            # pkl_save_path_temp is known absent here, so the raw uniref90
            # search is (re)run whenever its output file is missing.
            if not os.path.exists(uniref90_out_path):
                print(uniref90_out_path)
                self.uniref90_jackhmmer_runner(input_fasta_path, uniref90_out_path)

            print("begin templates")
            if self.template_searcher is not None and self.template_featurizer is not None:
                try:
                    os.makedirs(templates_out_path, exist_ok=True)
                    input_sequence = seqs[0]
                    msa_for_templates = parsers.truncate_stockholm_msa(
                        uniref90_out_path, max_sequences=10000
                    )
                    msa_for_templates = parsers.deduplicate_stockholm_msa(msa_for_templates)
                    msa_for_templates = parsers.remove_empty_columns_from_stockholm_msa(
                        msa_for_templates
                    )
                    if self.template_searcher.input_format == "sto":
                        pdb_templates_result = self.template_searcher.query(msa_for_templates)
                    elif self.template_searcher.input_format == "a3m":
                        uniref90_msa_as_a3m = parsers.convert_stockholm_to_a3m(msa_for_templates)
                        pdb_templates_result = self.template_searcher.query(uniref90_msa_as_a3m)
                    else:
                        raise ValueError(
                            "Unrecognized template input format: "
                            f"{self.template_searcher.input_format}"
                        )

                    pdb_hits_out_path = os.path.join(
                        templates_out_path, f"pdb_hits.{self.template_searcher.output_format}.pkl.gz"
                    )
                    with open(os.path.join(
                            templates_out_path, f"pdb_hits.{self.template_searcher.output_format}"
                    ), "w") as f:
                        f.write(pdb_templates_result)

                    pdb_template_hits = self.template_searcher.get_template_hits(
                        output_string=pdb_templates_result, input_sequence=input_sequence
                    )
                    templates_result = self.template_featurizer.get_templates(
                        query_sequence=input_sequence, hits=pdb_template_hits
                    )
                    # BUG FIX: this dump used to sit *after* the except clause,
                    # so any failure above raised NameError on
                    # `templates_result`; dump only on success instead.
                    dump_pkl(templates_result.features, pdb_hits_out_path, compress=True)
                except Exception:
                    logging.exception("An error in template searching")

        if self.uniprot_jackhmmer_runner is not None and not os.path.exists(pkl_save_path_msa_uni):
            if not os.path.exists(uniprot_out_path) or not use_precompute:
                self.uniprot_jackhmmer_runner(input_fasta_path, uniprot_out_path)
        if self.reduced_bfd_jackhmmer_runner is not None and not os.path.exists(pkl_save_path_msa):
            if not os.path.exists(reduced_bfd_out_path) or not use_precompute:
                self.reduced_bfd_jackhmmer_runner(input_fasta_path, reduced_bfd_out_path)
        if self.mgnify_jackhmmer_runner is not None and not os.path.exists(pkl_save_path_msa):
            if not os.path.exists(mgnify_out_path) or not use_precompute:
                self.mgnify_jackhmmer_runner(input_fasta_path, mgnify_out_path)
        if self.bfd_uniref30_hhblits_runner is not None and not os.path.exists(pkl_save_path_msa):
            if not os.path.exists(bfd_uniref30_out_path) or not use_precompute:
                self.bfd_uniref30_hhblits_runner(input_fasta_path, bfd_uniref30_out_path)
        if self.bfd_uniclust30_hhblits_runner is not None and not os.path.exists(pkl_save_path_msa):
            if not os.path.exists(bfd_uniclust30_out_path) or not use_precompute:
                self.bfd_uniclust30_hhblits_runner(input_fasta_path, bfd_uniclust30_out_path)
351
+
352
class DataProcessor:
    """Drives AlignmentRunner over one or many fasta files.

    Database paths are derived from a single ``alphafold3_database_path``
    root; the ``runner_args_map`` selects which subset of binaries/databases
    a given ``msas_type`` pipeline uses.
    """

    def __init__(
        self,
        alphafold3_database_path,
        jackhmmer_binary_path: Optional[str] = None,
        hhblits_binary_path: Optional[str] = None,
        nhmmer_binary_path: Optional[str] = None,
        kalign_binary_path: Optional[str] = None,
        hmmbuild_binary_path: Optional[str] = None,
        hmmalign_binary_path: Optional[str] = None,
        hhsearch_binary_path: Optional[str] = None,
        template_searcher: Optional[TemplateSearcher] = None,
        template_featurizer: Optional[templates.TemplateHitFeaturizer] = None,
        n_cpus: int = 8,
        n_workers: int = 1,
    ):
        """
        Database versions (as used by the pipelines):
            Training:  uniref90 v2022_05, uniclust30 v2018_08, uniprot v2020_05,
                       mgnify v2022_05, rfam v14.9, rnacentral v21.0, nt v2023_02_23
            Inference: same, except uniprot v2021_04
            Inference (ligand): uniref90 v2020_01, mgnify v2018_12, others as above

        Args:
            alphafold3_database_path: root dir containing all databases.
            *_binary_path: optional paths to the search-tool executables.
            template_searcher / template_featurizer: optional template pipeline.
            n_cpus: CPUs handed to each search tool.
            n_workers: parallel AlignmentRunner workers.
        """
        self.jackhmmer_binary_path = jackhmmer_binary_path
        self.hhblits_binary_path = hhblits_binary_path
        self.nhmmer_binary_path = nhmmer_binary_path
        self.hmmbuild_binary_path = hmmbuild_binary_path
        self.hmmalign_binary_path = hmmalign_binary_path
        self.hhsearch_binary_path = hhsearch_binary_path

        self.template_searcher = template_searcher
        self.template_featurizer = template_featurizer

        self.n_cpus = n_cpus
        self.n_workers = n_workers

        self.uniref90_database_path = os.path.join(
            alphafold3_database_path, "uniref90", "uniref90.fasta"
        )
        self.uniprot_database_path = os.path.join(
            alphafold3_database_path, "uniprot", "uniprot.fasta"
        )
        self.bfd_database_path = os.path.join(
            alphafold3_database_path, "bfd", "bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt"
        )
        self.uniclust30_database_path = os.path.join(
            alphafold3_database_path, "uniclust30", "uniclust30_2018_08", "uniclust30_2018_08"
        )
        # TODO(review): confirm the uniref30 version matches the one used by
        # AlphaFold2-Multimer.
        self.uniref_30_database_path = os.path.join(
            alphafold3_database_path, "uniref30", "v2020_06"
        )

        self.mgnify_database_path = os.path.join(
            alphafold3_database_path, "mgnify", "mgnify", "mgy_clusters.fa"
        )
        self.rfam_database_path = os.path.join(
            alphafold3_database_path, "rfam", "v14.9", "Rfam_af3_clustered_rep_seq.fasta"
        )
        self.rnacentral_database_path = os.path.join(
            alphafold3_database_path, "rnacentral", "v21.0", "rnacentral_db_rep_seq.fasta"
        )

        self.nt_database_path = os.path.join(
            alphafold3_database_path, "nt", "v2023_02_23", "nt.fasta"
        )

        # Named pipeline presets: each entry is the kwargs subset handed to
        # AlignmentRunner for that msas_type.
        self.runner_args_map = {
            "uniref90": {
                "jackhmmer_binary_path": self.jackhmmer_binary_path,
                "uniref90_database_path": self.uniref90_database_path,
            },
            "bfd_uniclust30": {
                "hhblits_binary_path": self.hhblits_binary_path,
                "bfd_database_path": self.bfd_database_path,
                "uniclust30_database_path": self.uniclust30_database_path
            },
            "bfd_uniref30": {
                "hhblits_binary_path": self.hhblits_binary_path,
                "bfd_database_path": self.bfd_database_path,
                "uniref_30_database_path": self.uniref_30_database_path
            },

            "mgnify": {
                "jackhmmer_binary_path": self.jackhmmer_binary_path,
                "mgnify_database_path": self.mgnify_database_path,
            },
            "uniprot": {
                "jackhmmer_binary_path": self.jackhmmer_binary_path,
                "uniprot_database_path": self.uniprot_database_path,
            },
            ###################### RNA ########################
            "rfam": {
                "nhmmer_binary_path": self.nhmmer_binary_path,
                "rfam_database_path": self.rfam_database_path,
                "hmmbuild_binary_path": self.hmmbuild_binary_path,
                "hmmalign_binary_path": self.hmmalign_binary_path,
            },
            "rnacentral": {
                "nhmmer_binary_path": self.nhmmer_binary_path,
                "rnacentral_database_path": self.rnacentral_database_path,
                "hmmbuild_binary_path": self.hmmbuild_binary_path,
                "hmmalign_binary_path": self.hmmalign_binary_path,
            },
            "nt": {
                "nhmmer_binary_path": self.nhmmer_binary_path,
                "nt_database_path": self.nt_database_path,
                "hmmbuild_binary_path": self.hmmbuild_binary_path,
                "hmmalign_binary_path": self.hmmalign_binary_path,
            },

            ###################################################
            "alphafold2": {
                "jackhmmer_binary_path": self.jackhmmer_binary_path,
                "hhblits_binary_path": self.hhblits_binary_path,
                "uniref90_database_path": self.uniref90_database_path,
                "bfd_database_path": self.bfd_database_path,
                "uniclust30_database_path": self.uniclust30_database_path,
                "mgnify_database_path": self.mgnify_database_path,
            },
            "alphafold2_multimer": {
                "jackhmmer_binary_path": self.jackhmmer_binary_path,
                "hhblits_binary_path": self.hhblits_binary_path,
                "uniref90_database_path": self.uniref90_database_path,
                "bfd_database_path": self.bfd_database_path,
                "uniref_30_database_path": self.uniref_30_database_path,
                "mgnify_database_path": self.mgnify_database_path,
                "uniprot_database_path": self.uniprot_database_path,
            },
            "alphafold3": {
                "jackhmmer_binary_path": self.jackhmmer_binary_path,
                "hhblits_binary_path": self.hhblits_binary_path,
                "template_searcher": self.template_searcher,
                "template_featurizer": self.template_featurizer,
                "uniref90_database_path": self.uniref90_database_path,
                "bfd_database_path": self.bfd_database_path,
                "uniclust30_database_path": self.uniclust30_database_path,
                "mgnify_database_path": self.mgnify_database_path,
                "uniprot_database_path": self.uniprot_database_path,
            },

            "rna": {
                "nhmmer_binary_path": self.nhmmer_binary_path,
                "rfam_database_path": self.rfam_database_path,
                "rnacentral_database_path": self.rnacentral_database_path,
                "hmmbuild_binary_path": self.hmmbuild_binary_path,
                "hmmalign_binary_path": self.hmmalign_binary_path,
            },
        }

    def _parse_io_tuples(self, input_fasta_path, output_dir, convert_md5=True, prefix="protein"):
        """Expands ``input_fasta_path`` (file, dir, or list) into
        (fasta_path, output_msas_dir) tuples; output dirs are named either by
        sequence md5 or by fasta basename."""
        os.makedirs(output_dir, exist_ok=True)
        if isinstance(input_fasta_path, list):
            input_fasta_paths = input_fasta_path
        elif os.path.isdir(input_fasta_path):
            input_fasta_paths = [os.path.join(input_fasta_path, i) for i in os.listdir(input_fasta_path)]
        elif os.path.isfile(input_fasta_path):
            input_fasta_paths = [input_fasta_path]
        else:
            # BUG FIX: the exception was previously instantiated but never
            # raised, silently yielding an empty task list.
            raise Exception("Can't parse input fasta path!")
        seqs = [parse_fasta(load_txt(i))[0][0] for i in input_fasta_paths]
        if convert_md5:
            output_msas_dirs = [os.path.join(output_dir, convert_md5_string(f"{prefix}:{i}")) for i in
                                seqs]
        else:
            output_msas_dirs = [os.path.join(output_dir, os.path.split(i)[1].split(".")[0]) for i in input_fasta_paths]
        io_tuples = [(i, o) for i, o in zip(input_fasta_paths, output_msas_dirs)]
        return io_tuples

    def _process_iotuple(self, io_tuple, msas_type):
        """Runs one AlignmentRunner task; failures are logged, not raised, so
        a pool of tasks keeps going."""
        i, o = io_tuple
        alignment_runner = AlignmentRunner(
            **self.runner_args_map[msas_type],
            no_cpus=self.n_cpus
        )
        try:
            alignment_runner.run(i, o)
        except Exception:
            # BUG FIX: was a bare `except:` that also swallowed
            # KeyboardInterrupt and dropped the traceback.
            logging.exception(f"{i}:{o} task failed!")

    def process(self, input_fasta_path, output_dir, msas_type="rfam", convert_md5=True):
        """Runs the ``msas_type`` pipeline over all input fastas in parallel."""
        prefix = "rna" if msas_type in ["rfam", "rnacentral", "nt", "rna"] else "protein"
        io_tuples = self._parse_io_tuples(input_fasta_path, output_dir, convert_md5=convert_md5, prefix=prefix)
        run_pool_tasks(partial(self._process_iotuple, msas_type=msas_type), io_tuples, num_workers=self.n_workers,
                       return_dict=False)

    def convert_output_to_md5(self, input_fasta_path, output_dir, md5_output_dir, prefix="protein"):
        """Copies name-keyed alignment dirs to md5-keyed dirs."""
        io_tuples = self._parse_io_tuples(input_fasta_path, output_dir, convert_md5=False, prefix=prefix)
        io_tuples_md5 = self._parse_io_tuples(input_fasta_path, md5_output_dir, convert_md5=True, prefix=prefix)

        for io0, io1 in tqdm.tqdm(zip(io_tuples, io_tuples_md5)):
            o, o_md5 = io0[1], io1[1]
            # NOTE(review): shell `cp -r` breaks on paths with spaces; consider
            # shutil.copytree if such paths can occur.
            os.system(f"cp -r {os.path.abspath(o)} {os.path.abspath(o_md5)}")
PhysDock/data/tools/convert_unifold_template_to_stfold.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import os
3
+
4
+ import numpy as np
5
+ import torch
6
+
7
+ sys.path.append("../")
8
+
9
+ from PhysDock.utils.io_utils import load_pkl, dump_pkl
10
+ from PhysDock.data.tools.residue_constants import \
11
+ hhblits_id_to_standard_residue_id_np, af3_if_to_residue_id
12
+
13
def dgram_from_positions(
    pos: torch.Tensor,
    min_bin: float = 3.25,
    max_bin: float = 50.75,
    no_bins: int = 39,  # FIX: bin count is a linspace step count, so int, not float
    inf: float = 1e8,
) -> torch.Tensor:
    """One-hot distance histogram ("distogram") over pairwise distances.

    Bin edges are ``linspace(min_bin, max_bin, no_bins)`` squared, compared
    against squared pairwise distances (no sqrt is taken). Distances below
    ``min_bin`` fall into no bin (an all-zero row); distances beyond
    ``max_bin`` fall into the last bin, whose upper edge is ``inf``.

    Args:
        pos: [..., N, 3] coordinates.
        min_bin: lower edge of the first bin.
        max_bin: lower edge of the last bin.
        no_bins: number of distance bins.
        inf: upper edge of the last bin.

    Returns:
        [..., N, N, no_bins] one-hot tensor with ``pos``'s dtype.
    """
    dist2 = torch.sum(
        (pos[..., None, :] - pos[..., None, :, :]) ** 2, dim=-1, keepdim=True
    )
    lower = torch.linspace(min_bin, max_bin, no_bins, device=pos.device) ** 2
    upper = torch.cat([lower[1:], lower.new_tensor([inf])], dim=-1)
    return ((dist2 > lower) * (dist2 < upper)).type(dist2.dtype)
27
+
28
+
29
def convert_unifold_template_feature_to_stfold_unifold_feature(unifold_template_feature):
    """Converts a Uni-Fold template-feature pickle to stfold/af3 format.

    ``unifold_template_feature`` is a path of the form
    ``<root>/.../<md5>.pkl.gz``; the converted features are dumped to
    ``<root>/msas/template_features/<md5>.pkl.gz``. Conversion is
    best-effort: any failure is swallowed and an empty dict is returned
    (the useful result is the dumped file, not the return value).
    """
    md5_string = None  # FIX: predefine so the final print can't NameError
    try:
        print(unifold_template_feature)
        md5_string = os.path.basename(unifold_template_feature)[:-6]  # strip ".pkl.gz" remainder
        out_path = os.path.dirname(os.path.dirname(unifold_template_feature))
        unifold_template_feature = os.path.join(out_path, "msas", md5_string)
        out_path_final = os.path.join(out_path, "msas", "template_features")
        final = os.path.join(out_path_final, f"{md5_string}.pkl.gz")
        if os.path.exists(final):
            # Already converted.
            return dict()

        unifold_template_feature = os.path.join(unifold_template_feature, "templates", "pdb_hits.hhr.pkl.gz")
        if isinstance(unifold_template_feature, str):
            data = load_pkl(unifold_template_feature)
        else:
            data = unifold_template_feature

        # hhblits one-hot aatype -> standard residue ids -> af3 residue ids.
        template_restype = af3_if_to_residue_id[
            hhblits_id_to_standard_residue_id_np[np.argmax(data["template_aatype"], axis=-1)]]
        assert np.all(template_restype != -1)
        assert len(template_restype) >= 1
        assert len(template_restype[0, :]) >= 4

        # Backbone frame = first three atoms (N, CA, C).
        bb_x_gt = torch.from_numpy(data["template_all_atom_positions"][..., :3, :])
        bb_x_mask = torch.from_numpy(data["template_all_atom_masks"][..., :3])

        # Pseudo-beta: CB (index 3) normally, CA (index 1) for glycine.
        bb_x_gt_beta1 = data["template_all_atom_positions"][..., 3, :]
        bb_x_gt_beta_mask1 = data["template_all_atom_masks"][..., 3]
        bb_x_gt_beta2 = data["template_all_atom_positions"][..., 1, :]
        bb_x_gt_beta_mask2 = data["template_all_atom_masks"][..., 1]

        # NOTE(review): assumes residue id 7 is glycine in the af3 id space —
        # confirm against residue_constants.
        is_gly = template_restype == 7
        template_pseudo_beta = np.where(is_gly[..., None], bb_x_gt_beta2, bb_x_gt_beta1)
        template_pseudo_beta_mask = np.where(is_gly, bb_x_gt_beta_mask2, bb_x_gt_beta_mask1)
        template_backbone_frame_mask = bb_x_mask[..., 0] * bb_x_mask[..., 1] * bb_x_mask[..., 2]
        out = {
            "template_restype": template_restype.astype(np.int8),
            "template_backbone_frame_mask": template_backbone_frame_mask.numpy().astype(np.int8),
            "template_backbone_frame": bb_x_gt.numpy().astype(np.float32),
            "template_pseudo_beta": template_pseudo_beta.astype(np.float32),
            "template_pseudo_beta_mask": template_pseudo_beta_mask.astype(np.int8),
        }
        dump_pkl(out, final, compress=True)  # `final` is the same path as before
    except Exception:
        # FIX: was a bare `except:` which also swallowed KeyboardInterrupt.
        pass
    out = dict()
    print(f"dump templ feats to {md5_string}.pkl.gz")
    return out
82
+
83
+
84
+ # HHBLITS_ID_TO_AA = {
85
+ # 0: "ALA",
86
+ # 1: "CYS", # Also U.
87
+ # 2: "ASP", # Also B.
88
+ # 3: "GLU", # Also Z.
89
+ # 4: "PHE",
90
+ # 5: "GLY",
91
+ # 6: "HIS",
92
+ # 7: "ILE",
93
+ # 8: "LYS",
94
+ # 9: "LEU",
95
+ # 10: "MET",
96
+ # 11: "ASN",
97
+ # 12: "PRO",
98
+ # 13: "GLN",
99
+ # 14: "ARG",
100
+ # 15: "SER",
101
+ # 16: "THR",
102
+ # 17: "VAL",
103
+ # 18: "TRP",
104
+ # 19: "TYR",
105
+ # 20: "UNK", # Includes J and O.
106
+ # 21: "GAP",
107
+ # }
108
+ #
109
+ # # Usage: Convert hhblits msa to af3 aatype
110
+ # # msa = hhblits_id_to_standard_residue_id_np[hhblits_msa.astype(np.int64)]
111
+ # hhblits_id_to_standard_residue_id_np = np.array(
112
+ # [standard_ccds.index(ccd) for id, ccd in HHBLITS_ID_TO_AA.items()]
113
+ # )
114
+ #
115
+ # of_restypes = [
116
+ # "A", "R", "N", "D", "C", "Q", "E", "G", "H", "I",
117
+ # "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V", "X", "-"
118
+ # ]
119
+ #
120
+ # af3_restypes = [amino_acid_3to1[ccd] if ccd in amino_acid_3to1 else "-" if ccd == "GAP" else "None" for ccd in
121
+ # standard_ccds
122
+ # ]
123
+ #
124
+ # af3_if_to_residue_id = np.array(
125
+ # [af3_restypes.index(restype) if restype in of_restypes else -1 for restype in af3_restypes])
126
+
127
+
PhysDock/data/tools/dataset_manager.py ADDED
@@ -0,0 +1,570 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os.path
2
+ from typing import Optional
3
+ from functools import partial
4
+ import numpy as np
5
+
6
+ from PhysDock.utils.io_utils import run_pool_tasks, load_json, load_txt, dump_pkl, \
7
+ convert_md5_string
8
+ from PhysDock.data.tools.parsers import parse_fasta
9
+ from PhysDock.data.tools.parse_msas import parse_protein_alignment_dir, parse_uniprot_alignment_dir, \
10
+ parse_rna_alignment_dir
11
+ from PhysDock.data.alignment_runner import DataProcessor
12
+ from PhysDock.data.tools.residue_constants import standard_ccds, amino_acid_3to1
13
+ from PhysDock.data.tools.PDBData import protein_letters_3to1_extended, nucleic_letters_3to1_extended
14
+
15
+
16
def get_protein_md5(sequence_3):
    """Return candidate 1-letter sequences and md5 keys for a '-'-joined CCD string.

    Four decodings are attempted: the extended and the standard 3->1 tables,
    each applied to the full token list and to the list with leading/trailing
    unknown tokens stripped. Duplicates are dropped while preserving order.

    Returns:
        (unique_sequences, md5s) where each md5 hashes "protein:<sequence>".
    """
    ccds = sequence_3.split("-")
    unknown_tokens = ("UNK", "N ", "DN ", "GAP")

    # Index of the first informative (non-unknown) token; 0 if none exists.
    lead = 0
    for idx, ccd in enumerate(ccds):
        if ccd not in unknown_tokens:
            lead = idx
            break
    # Distance of the last informative token from the end; 0 if none exists.
    tail = 0
    for idx, ccd in enumerate(reversed(ccds)):
        if ccd not in unknown_tokens:
            tail = idx
            break

    stripped = ccds[lead:-tail] if tail > 0 else ccds[lead:]

    def _decode(tokens, table):
        # Unknown CCDs decode to 'X'.
        return "".join(table.get(ccd, "X") for ccd in tokens)

    candidates = [
        _decode(ccds, protein_letters_3to1_extended),
        _decode(ccds, amino_acid_3to1),
        _decode(stripped, protein_letters_3to1_extended),
        _decode(stripped, amino_acid_3to1),
    ]
    # dict.fromkeys dedupes while keeping first-seen order.
    unique_seqs = list(dict.fromkeys(candidates))
    return unique_seqs, [convert_md5_string(f"protein:{seq}") for seq in unique_seqs]
43
+
44
+
45
def get_rna_md5(sequence_3):
    """Return (1-letter RNA sequence, md5 key) for a '-'-joined CCD string.

    Unknown/gap tokens map to 'N'; every other token is looked up in the
    extended nucleic 3->1 table (KeyError on unmapped tokens, as before).
    The md5 hashes the string "rna:<sequence>".
    """
    one_letter = []
    for ccd in sequence_3.split("-"):
        if ccd in ("UNK", "GAP", "N ", "DN "):
            one_letter.append("N")
        else:
            one_letter.append(nucleic_letters_3to1_extended[ccd])
    sequence = "".join(one_letter)
    return sequence, convert_md5_string(f"rna:{sequence}")
51
+
52
+
53
class DatasetManager:
    """Bookkeeping for the PhysDock training dataset.

    Loads the per-chain / per-PDB / per-CCD metadata json maps from
    ``dataset_path`` and offers static helpers to (1) run homology searches,
    (2) find fasta inputs whose searches have not run yet, and (3) convert
    raw alignment directories into compact ``.pkl.gz`` MSA feature files
    keyed by an md5 of ``"<prefix>:<sequence>"``.
    """

    def __init__(self, dataset_path):
        self.dataset_path = dataset_path
        # Metadata maps produced by the dataset build pipeline.
        self.chain_id_to_meta_info = load_json(os.path.join(dataset_path, "chain_id_to_meta_info.json"))
        self.pdb_id_to_meta_info = load_json(os.path.join(dataset_path, "pdb_id_to_meta_info.json"))
        self.ccd_id_to_meta_info = load_json(os.path.join(dataset_path, "ccd_id_to_meta_info.json"))

    @staticmethod
    def _resolve_input_fasta_paths(input_fasta_path):
        """Normalize a list / directory / single-file argument into a list of paths.

        Bug fix: the original code instantiated ``Exception(...)`` without
        raising it and silently continued with an empty task list; an
        unparseable argument now fails loudly.
        """
        if isinstance(input_fasta_path, list):
            return input_fasta_path
        if os.path.isdir(input_fasta_path):
            return [os.path.join(input_fasta_path, name) for name in os.listdir(input_fasta_path)]
        if os.path.isfile(input_fasta_path):
            return [input_fasta_path]
        raise ValueError("Can't parse input fasta path!")

    @staticmethod
    def homo_search(
            input_fasta_path,
            output_dir,
            msas_type,
            convert_md5,
            alphafold3_database_path,
            jackhmmer_binary_path: Optional[str] = None,
            hhblits_binary_path: Optional[str] = None,
            nhmmer_binary_path: Optional[str] = None,
            kalign_binary_path: Optional[str] = None,
            hmmbuild_binary_path: Optional[str] = None,
            hmmalign_binary_path: Optional[str] = None,
            n_cpus: int = 8,
            n_workers: int = 1,
    ):
        """Run a homology search of type ``msas_type`` for the given fasta input(s).

        Thin wrapper that configures a :class:`DataProcessor` with the search
        binaries/databases and delegates to its ``process`` method.
        """
        data_processor = DataProcessor(
            alphafold3_database_path=alphafold3_database_path,
            jackhmmer_binary_path=jackhmmer_binary_path,
            hhblits_binary_path=hhblits_binary_path,
            nhmmer_binary_path=nhmmer_binary_path,
            kalign_binary_path=kalign_binary_path,
            hmmbuild_binary_path=hmmbuild_binary_path,
            hmmalign_binary_path=hmmalign_binary_path,
            n_cpus=n_cpus,
            n_workers=n_workers,
        )
        data_processor.process(
            input_fasta_path=input_fasta_path,
            output_dir=output_dir,
            msas_type=msas_type,
            convert_md5=convert_md5
        )

    @staticmethod
    def get_unsearched_input_fasta_path(input_fasta_path, output_dir, msas_type, convert_md5, num_workers=128):
        """Return the fasta paths for which no ``msas_type`` hits file exists yet."""
        input_fasta_paths = DatasetManager._resolve_input_fasta_paths(input_fasta_path)

        # Sequence-class prefix used when hashing the query sequence.
        prefix = {
            "uniref90": "protein",
            "bfd_uniclust30": "protein",
            "bfd_uniref30": "protein",
            "uniprot": "protein",
            "mgnify": "protein",
            "rfam": "rna",
            "rnacentral": "rna",
            "nt": "rna",
        }[msas_type]
        # Registered at module level so multiprocessing can pickle the worker.
        global _get_unsearched_input_fasta_path

        def _get_unsearched_input_fasta_path(input_fasta_path, convert_md5, prefix, output_dir, msas_type):
            if convert_md5:
                # Bug fix: the original referenced an undefined ``seqs``
                # (NameError); parse the fasta to obtain the query sequence.
                seqs, decs = parse_fasta(load_txt(input_fasta_path))
                dec = convert_md5_string(f"{prefix}:{seqs[0]}")
            else:
                dec = os.path.split(input_fasta_path)[1].split(".")[0]

            if os.path.exists(os.path.join(output_dir, dec, f"{msas_type}_hits.sto")) or \
                    os.path.exists(os.path.join(output_dir, dec, f"{msas_type}_hits.a3m")):
                return dict()
            return {input_fasta_path: False}

        out = run_pool_tasks(partial(
            _get_unsearched_input_fasta_path,
            convert_md5=convert_md5,
            prefix=prefix,
            output_dir=output_dir,
            msas_type=msas_type,
        ), input_fasta_paths, num_workers=num_workers, return_dict=True)
        return list(out.keys())

    @staticmethod
    def convert_msas_out_to_msa_features(
            input_fasta_path,
            output_dir,
            msa_feature_dir,
            convert_md5=True,
            num_workers=128
    ):
        """Convert protein alignment directories into ``<md5>.pkl.gz`` features.

        Returns a dict of failures (fasta path -> reason); empty on full success.
        """
        input_fasta_paths = DatasetManager._resolve_input_fasta_paths(input_fasta_path)

        # Registered at module level so multiprocessing can pickle the worker.
        global _convert_msas_out_to_msa_features

        def _convert_msas_out_to_msa_features(
                input_fasta_path,
                output_dir,
                msa_feature_dir,
                convert_md5=True,
        ):
            prefix = "protein"
            max_seq = 16384
            seqs, decs = parse_fasta(load_txt(input_fasta_path))
            md5 = convert_md5_string(f"{prefix}:{seqs[0]}")
            dec = md5 if convert_md5 else os.path.split(input_fasta_path)[1].split(".")[0]

            # Some legacy search runs nest their hits under an extra
            # "msas" sub-directory.
            if os.path.exists(os.path.join(output_dir, dec, "msas")):
                dec = dec + "/msas"

            pkl_save_path = os.path.join(msa_feature_dir, f"{md5}.pkl.gz")
            if os.path.exists(pkl_save_path):
                return dict()

            alignment_dir = os.path.join(output_dir, dec)

            def _has(*file_names):
                # True when every named hits file exists in the alignment dir.
                return all(os.path.exists(os.path.join(alignment_dir, name)) for name in file_names)

            def _parse_check_and_dump(require_full_msa=False):
                # Parse, verify the query-sequence hash, truncate and dump.
                msa_feature = parse_protein_alignment_dir(alignment_dir)
                if require_full_msa and len(msa_feature["msa"]) < max_seq:
                    return {input_fasta_path: f"MSA is not enough!"}
                sequence = "".join([amino_acid_3to1[standard_ccds[i]] for i in msa_feature["msa"][0]])
                md5_string = convert_md5_string(f"protein:{sequence}")
                if md5 != md5_string:
                    return {input_fasta_path: f"seqs not equal, asset [{sequence}], but found [{seqs[0]}]"}
                feature = {
                    "msa": msa_feature["msa"][:max_seq].astype(np.int8),
                    "deletion_matrix": msa_feature["deletion_matrix"][:max_seq].astype(np.int8),
                    "msa_species_identifiers": msa_feature["msa_species_identifiers"][:max_seq]
                }
                dump_pkl(feature, pkl_save_path)
                return dict()

            # Accepted database combinations, deduplicated from the original
            # copy-pasted branches. Only the uniref90+bfd combination without
            # mgnify additionally requires a full-depth MSA.
            if _has("uniref90_hits.sto", "bfd_uniclust30_hits.a3m", "mgnify_hits.sto"):
                return _parse_check_and_dump()
            if _has("uniref90_hits.sto", "bfd_uniref_hits.a3m", "mgnify_hits.sto"):
                return _parse_check_and_dump()
            if _has("uniref90_hits.sto", "bfd_uniclust30_hits.a3m"):
                return _parse_check_and_dump(require_full_msa=True)
            if _has("uniref90_hits.sto", "mgnify_hits.sto"):
                return _parse_check_and_dump()
            return {
                input_fasta_path: f"MSA is not enough!"
            }

        out = run_pool_tasks(partial(
            _convert_msas_out_to_msa_features,
            output_dir=output_dir,
            msa_feature_dir=msa_feature_dir,
            convert_md5=convert_md5
        ), input_fasta_paths, num_workers=num_workers, return_dict=True)
        return out

    @staticmethod
    def convert_msas_out_to_uniprot_msa_features(
            input_fasta_path,
            output_dir,
            uniprot_msa_feature_dir,
            convert_md5=True,
            num_workers=128
    ):
        """Convert uniprot alignment hits into paired-MSA (``*_all_seq``) features.

        Returns a dict of failures (fasta path -> reason); empty on full success.
        """
        input_fasta_paths = DatasetManager._resolve_input_fasta_paths(input_fasta_path)

        global _convert_msas_out_to_uniprot_msa_features

        def _convert_msas_out_to_uniprot_msa_features(
                input_fasta_path,
                output_dir,
                uniprot_msa_feature_dir,
                convert_md5=True,
        ):
            prefix = "protein"
            # Uniprot MSAs keep more rows because they feed cross-chain pairing.
            max_seq = 50000
            seqs, decs = parse_fasta(load_txt(input_fasta_path))
            md5 = convert_md5_string(f"{prefix}:{seqs[0]}")
            dec = md5 if convert_md5 else os.path.split(input_fasta_path)[1].split(".")[0]

            pkl_save_path = os.path.join(uniprot_msa_feature_dir, f"{md5}.pkl.gz")
            if os.path.exists(pkl_save_path):
                return dict()
            if os.path.exists(os.path.join(output_dir, dec, "uniprot_hits.sto")):
                msa_feature = parse_uniprot_alignment_dir(os.path.join(output_dir, dec))
                # Verify the parsed query row hashes back to the fasta query.
                sequence = "".join([amino_acid_3to1[standard_ccds[i]] for i in msa_feature["msa_all_seq"][0]])
                md5_string = convert_md5_string(f"protein:{sequence}")
                if md5 == md5_string:
                    feature = {
                        "msa_all_seq": msa_feature["msa_all_seq"][:max_seq].astype(np.int8),
                        "deletion_matrix_all_seq": msa_feature["deletion_matrix_all_seq"][:max_seq].astype(np.int8),
                        "msa_species_identifiers_all_seq": msa_feature["msa_species_identifiers_all_seq"][:max_seq]
                    }
                    dump_pkl(feature, pkl_save_path)
                    return dict()
                return {input_fasta_path: f"seqs not equal, asset [{sequence}], but found [{seqs[0]}]"}
            return {
                input_fasta_path: f"MSA is not enough!"
            }

        out = run_pool_tasks(partial(
            _convert_msas_out_to_uniprot_msa_features,
            output_dir=output_dir,
            uniprot_msa_feature_dir=uniprot_msa_feature_dir,
            convert_md5=convert_md5
        ), input_fasta_paths, num_workers=num_workers, return_dict=True)
        return out

    @staticmethod
    def convert_msas_out_to_rna_msa_features(
            input_fasta_path,
            output_dir,
            rna_msa_feature_dir,
            convert_md5=True,
            num_workers=128
    ):
        """Convert RNA alignment directories into ``<md5>.pkl.gz`` features.

        Returns a dict of failures (fasta path -> reason); empty on full success.
        """
        os.makedirs(rna_msa_feature_dir, exist_ok=True)
        input_fasta_paths = DatasetManager._resolve_input_fasta_paths(input_fasta_path)

        global _convert_msas_out_to_rna_msa_features

        def _convert_msas_out_to_rna_msa_features(
                input_fasta_path,
                output_dir,
                rna_msa_feature_dir,
                convert_md5=True,
        ):
            prefix = "rna"
            max_seq = 16384
            seqs, decs = parse_fasta(load_txt(input_fasta_path))
            md5 = convert_md5_string(f"{prefix}:{seqs[0]}")
            dec = md5 if convert_md5 else os.path.split(input_fasta_path)[1].split(".")[0]

            # Some legacy search runs nest their hits under an extra
            # "msas" sub-directory.
            if os.path.exists(os.path.join(output_dir, dec, "msas")):
                dec = dec + "/msas"

            pkl_save_path = os.path.join(rna_msa_feature_dir, f"{md5}.pkl.gz")
            if os.path.exists(pkl_save_path):
                return dict()
            rna_msa_feature = parse_rna_alignment_dir(
                os.path.join(output_dir, dec),
                input_fasta_path
            )

            feature = {
                "msa": rna_msa_feature["msa"][:max_seq].astype(np.int8),
                "deletion_matrix": rna_msa_feature["deletion_matrix"][:max_seq].astype(np.int8),
                # RNA alignments carry no species annotation.
                "msa_species_identifiers": None
            }
            dump_pkl(feature, pkl_save_path)

            return dict()

        out = run_pool_tasks(partial(
            _convert_msas_out_to_rna_msa_features,
            output_dir=output_dir,
            rna_msa_feature_dir=rna_msa_feature_dir,
            convert_md5=convert_md5
        ), input_fasta_paths, num_workers=num_workers, return_dict=True)
        return out

    @staticmethod
    def find_chain_ids_without_msa_features(
            polymer_filtering_out_json,
            chain_id_to_meta_info_path,
            dataset_dir,
            uniprot=False,
            num_workers=256,
    ):
        """List protein chain ids whose MSA feature file exists in none of the
        known feature directories (historical variants included)."""
        if not isinstance(polymer_filtering_out_json, list):
            polymer_filtering_out_json = [polymer_filtering_out_json]
        polymer_filtering_out = dict()
        for json_path in polymer_filtering_out_json:
            polymer_filtering_out.update(load_json(json_path))
        chain_id_to_meta_info = load_json(chain_id_to_meta_info_path)
        # Registered at module level so multiprocessing can pickle the worker.
        global _find_chain_ids_without_msa_features

        def _find_chain_ids_without_msa_features(chain_id):
            sequence_3 = chain_id_to_meta_info[chain_id]["sequence_3"]
            seqs, md5s = get_protein_md5(sequence_3)
            # Historical feature directories that may hold the file.
            if uniprot:
                feature_dirs = ["uniprot_msa_features", "uniprot_msa_features_zkx", "uniprot_msa_features_unifold"]
            else:
                feature_dirs = ["msa_features", "msa_features_zkx", "msa_features_whc", "msa_features_unifold"]
            for feature_dir in feature_dirs:
                for md5 in md5s:
                    if os.path.exists(os.path.join(dataset_dir, "features/", feature_dir, f"{md5}.pkl.gz")):
                        return dict()
            return {chain_id: {"state": False, "seqs": seqs}}

        # Only polymer chains that passed filtering and are proteins.
        chain_ids = [k for k, v in polymer_filtering_out.items() if
                     v["state"] and chain_id_to_meta_info[k]["chain_class"] == "protein"]

        out = run_pool_tasks(
            _find_chain_ids_without_msa_features, chain_ids, num_workers=num_workers, return_dict=True)
        return out

    def find_chain_ids_without_rna_msa_features(
            self,
            polymer_filtering_out_json,
    ):
        """Not implemented yet."""
        pass

    @staticmethod
    def check_msa_md5(msa_feature_dir):
        """Not implemented yet."""
        pass

    @staticmethod
    def check_uniprot_msa_md5(uniprot_msa_feature_dir):
        """Not implemented yet."""
        pass

    @staticmethod
    def check_rna_msa_md5(rna_msa_feature_dir):
        """Not implemented yet."""
        pass

    def get_training_pdbs(self):
        """Not implemented yet."""
        pass
507
+
508
+
509
+
510
+
511
+
512
+
513
+
514
+
515
class DataPipeline:
    """Entry point tying the dataset manager into the training-data pipeline.

    Bug fix: the original constructed ``DatasetManager()`` with no arguments
    even though ``DatasetManager.__init__`` requires ``dataset_path``, so
    instantiating ``DataPipeline`` always raised ``TypeError``. The path is
    now accepted and forwarded; when omitted the manager is left unset
    instead of crashing.
    """

    def __init__(self, dataset_path=None):
        super().__init__()
        # Forward the dataset path to the manager when one is supplied.
        self.data_manager = DatasetManager(dataset_path) if dataset_path is not None else None
519
+
520
+ # PDB:
521
+ # polymer_chain_id:
522
+ # weight_chain: contiguous_crop:1/3 spatial_crop: 2/3
523
+ # Interface
524
+ # weight_interface:
525
+ # [chain_id1, chain_id2] 0.2 contiguous_crop
526
+ # [chain_id1, chain_id2] 0.4 spatial_crop_interface
527
+ # [ < 20 chains] 0.4 spatial crop
528
+ #
529
+ #
530
+ #
531
+ #
532
+ # polymer chain contiguous crop sample weight w_chain*1/3 [chain_id]
533
+ # polymer chain spatial crop sample weight w_chain*2/3 [chain_id]
534
+ #
535
+ # interface contiguous crop sample weight w_interface * 0.2 [chain_id, chain_id]
536
+ # interface spatial crop sample weight w_interface * 0.4 >[chain_id, chain_id]
537
+ # interface spatial crop interface sample weight w_interface * 0.4 [chain_id, chain_id]
538
+ #
539
+ #
540
+ # pdb:
541
+ # chain:
542
+ # [chain_id]: 0.14
543
+ # [chain_id]: 0.23
544
+
545
+
546
+ # @staticmethod
547
+ # def get_pdb_info(pdb_id):
548
+ # all_chain_ids = pdb_id_to_meta_info[pdb_id]["chain_ids"]
549
+ #
550
+ # chain_ids_info = {
551
+ # "protein": [],
552
+ # "rna": [],
553
+ # "dna": [],
554
+ # "ligand": []
555
+ # }
556
+ # for chain_id_ in all_chain_ids:
557
+ # chain_id = f"{pdb_id}_{chain_id_}"
558
+ # if chain_id in chain_id_to_meta_info:
559
+ # chain_class = chain_id_to_meta_info[chain_id]["chain_class"].split("_")[0]
560
+ # if chain_id in chain_ids and os.path.exists(os.path.join(stfold_data_path, f"{chain_id}.pkl.gz")):
561
+ # if chain_class == "protein":
562
+ # if check_protein_msa_features(chain_id, chain_id_to_meta_info)[chain_id]["state"]:
563
+ # chain_ids_info[chain_class].append(chain_id)
564
+ # elif chain_class == "rna":
565
+ # if check_rna_msa_features(chain_id, chain_id_to_meta_info)[chain_id]["state"]:
566
+ # chain_ids_info[chain_class].append(chain_id)
567
+ #
568
+ # elif chain_class == "ligand" and os.path.exists(os.path.join(stfold_data_path, f"{chain_id}.pkl.gz")):
569
+ # chain_ids_info[chain_class].append(chain_id)
570
+ # return {pdb_id: chain_ids_info}
PhysDock/data/tools/feature_processing_multimer.py ADDED
@@ -0,0 +1,257 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 DeepMind Technologies Limited
2
+ # Copyright 2022 AlQuraishi Laboratory
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Feature processing logic for multimer data pipeline."""
17
+ from typing import Iterable, MutableMapping, List, Mapping, Dict, Any, Union
18
+ from scipy.sparse import coo_matrix
19
+ import numpy as np
20
+
21
+ from . import msa_pairing
22
+
23
# Per-chain feature dictionary: feature name -> dense array, sparse matrix,
# or other metadata value (None allowed).
FeatureDict = Dict[str, Union[np.ndarray, coo_matrix, None, Any]]
# TODO: Move this into the config
# Feature keys retained by ``_filter_features`` after chains are merged.
REQUIRED_FEATURES = frozenset({
    'aatype', 'all_atom_mask', 'all_atom_positions', 'all_chains_entity_ids',
    'all_crops_all_chains_mask', 'all_crops_all_chains_positions',
    'all_crops_all_chains_residue_ids', 'assembly_num_chains', 'asym_id',
    'bert_mask', 'cluster_bias_mask', 'deletion_matrix', 'deletion_mean',
    'entity_id', 'entity_mask', 'mem_peak', 'msa', 'msa_mask', 'num_alignments',
    'num_templates', 'queue_size', 'residue_index', 'resolution',
    'seq_length', 'seq_mask', 'sym_id', 'template_aatype',
    'template_all_atom_mask', 'template_all_atom_positions'
})

# Maximum number of structural templates retained per chain.
MAX_TEMPLATES = 4
# Total number of MSA rows (paired + unpaired) retained after cropping.
MSA_CROP_SIZE = 16384
38
+
39
+
40
def _is_homomer_or_monomer(chains: Iterable[Mapping[str, np.ndarray]]) -> bool:
    """Checks if a list of chains represents a homomer/monomer example.

    Current heuristic: exactly one chain counts as a monomer/homomer.
    (The entity_id-uniqueness check used upstream was deliberately replaced
    by this simple chain count.)

    NOTE(review): despite the ``Iterable`` annotation, the argument must be
    sized (``len`` is applied) — confirm callers always pass a list.
    """
    return len(chains) == 1
50
+
51
+
52
def pair_and_merge(
        all_chain_features: MutableMapping[str, Mapping[str, np.ndarray]],
        is_homomer_or_monomer,
) -> FeatureDict:
    """Runs processing on features to augment, pair and merge.

    Args:
      all_chain_features: A MutableMap of dictionaries of features for each chain.
      is_homomer_or_monomer: Caller-supplied flag; when False, cross-chain MSA
        pairing is attempted for the protein chains.

    Returns:
      A dictionary of features.
    """

    # Adds deletion_mean, assembly_num_chains and entity_mask to every chain.
    process_unmerged_features(all_chain_features)

    np_chains_list = list(all_chain_features.values())
    # Partition chains by molecular class; only protein MSAs are paired.
    np_chains_list_prot = [chain for chain in np_chains_list if
                           chain['chain_class'] in ['protein']]
    np_chains_list_dna = [chain for chain in np_chains_list if
                          chain['chain_class'] in ['dna']]
    np_chains_list_rna = [chain for chain in np_chains_list if
                          chain['chain_class'] in ['rna', ]]
    # TODO: ligand?
    np_chains_list_ligand = [chain for chain in np_chains_list if chain['chain_class'] in ['ligand']]

    # NOTE(review): two pairing flags are in play — ``pair_msa_sequences``
    # is derived from the total chain count and only forwarded to
    # merge_chain_features, while the caller-supplied flag drives protein
    # pairing and cropping. Confirm this asymmetry is intended.
    pair_msa_sequences = not _is_homomer_or_monomer(np_chains_list)
    pair_msa_sequences_prot = not is_homomer_or_monomer
    if pair_msa_sequences_prot and np_chains_list_prot:
        # uniprot : all_seq pairs
        np_chains_list_prot = msa_pairing.create_paired_features(
            chains=np_chains_list_prot
        )
    else:
        if np_chains_list_prot:
            # Unpaired mode: a single-row alignment count placeholder.
            for prot in np_chains_list_prot:
                prot["num_alignments"] = np.ones([], dtype=np.int32)

    # Species identifiers were only needed to build the pairing; drop them.
    for chain in np_chains_list_prot:
        chain.pop("msa_species_identifiers", None)
        chain.pop("msa_species_identifiers_all_seq", None)

    # Reassemble in the order: protein, rna, dna, ligand.
    np_chains_list_prot.extend(np_chains_list_rna)
    np_chains_list_prot.extend(np_chains_list_dna)
    np_chains_list_prot.extend(np_chains_list_ligand)

    np_chains_list = np_chains_list_prot

    np_chains_list = crop_chains(
        np_chains_list,
        msa_crop_size=MSA_CROP_SIZE,
        pair_msa_sequences=pair_msa_sequences_prot,
        max_templates=MAX_TEMPLATES
    )

    np_example = msa_pairing.merge_chain_features(
        np_chains_list=np_chains_list, pair_msa_sequences=pair_msa_sequences,
        max_templates=MAX_TEMPLATES
    )

    return np_example
119
+
120
+
121
def crop_chains(
        chains_list: List[Mapping[str, np.ndarray]],
        msa_crop_size: int,
        pair_msa_sequences: bool,
        max_templates: int
) -> List[Mapping[str, np.ndarray]]:
    """Crops the MSAs for a set of chains.

    Protein chains are delegated to ``_crop_single_chain`` (which also handles
    paired MSAs and templates). Non-protein chains are truncated to
    ``msa_crop_size`` rows, or upsampled with replacement up to that size;
    they are mutated in place.

    Args:
      chains_list: A list of chains to be cropped.
      msa_crop_size: The total number of sequences to crop from the MSA.
      pair_msa_sequences: Whether we are operating in sequence-pairing mode.
      max_templates: The maximum templates to use per chain.

    Returns:
      The chains cropped.
    """
    cropped_chains = []
    for chain in chains_list:
        if chain['chain_class'] in ['protein']:
            cropped_chain = _crop_single_chain(
                chain,
                msa_crop_size=msa_crop_size,
                pair_msa_sequences=pair_msa_sequences,
                max_templates=max_templates)
        else:
            msa_size = chain['msa'].shape[0]
            # Bug fix: this branch previously ignored the ``msa_crop_size``
            # parameter and used the module constant MSA_CROP_SIZE instead;
            # existing callers pass MSA_CROP_SIZE, so behavior is unchanged
            # for them.
            target_size = msa_crop_size
            if msa_size < target_size:
                # Upsample rows with replacement so every chain reaches the
                # same fixed MSA depth.
                sample_msa_id = np.random.choice(np.arange(msa_size), target_size - msa_size, replace=True)
                chain['msa'] = np.concatenate([chain['msa'], chain['msa'][sample_msa_id, :]], axis=0)
                chain['deletion_matrix'] = np.concatenate(
                    [chain['deletion_matrix'], chain['deletion_matrix'][sample_msa_id, :]], axis=0)
            else:
                chain['msa'] = chain['msa'][:target_size, :]
                chain['deletion_matrix'] = chain['deletion_matrix'][:target_size, :]

            cropped_chain = chain
        cropped_chains.append(cropped_chain)

    return cropped_chains
171
+
172
+
173
def _crop_single_chain(chain: Mapping[str, np.ndarray],
                       msa_crop_size: int,
                       pair_msa_sequences: bool,
                       max_templates: int) -> Mapping[str, np.ndarray]:
    """Crops msa sequences to `msa_crop_size`.

    Protein-only helper: truncates paired (``*_all_seq``) MSA features,
    truncates or upsamples (with replacement) unpaired MSA features to a
    fixed row count, and truncates template features. Mutates ``chain`` in
    place and returns it.

    NOTE(review): ``target_size`` below uses the module constant
    MSA_CROP_SIZE, not the ``msa_crop_size`` parameter (the parameter only
    bounds the paired half) — confirm this asymmetry is intended.
    """
    msa_size = len(chain['msa'])

    if pair_msa_sequences:
        # Reserve up to half of the crop budget for the paired MSA.
        msa_size_all_seq = chain['msa_all_seq'].shape[0]
        msa_crop_size_all_seq = np.minimum(msa_size_all_seq, msa_crop_size // 2)
    else:
        msa_crop_size_all_seq = 0

    include_templates = 'template_aatype' in chain and max_templates
    if include_templates:
        num_templates = chain['template_aatype'].shape[0]
        templates_crop_size = np.minimum(num_templates, max_templates)
    # NOTE(review): if a chain carries template features while
    # ``include_templates`` is falsy (e.g. max_templates == 0), the loop
    # below would reference an undefined ``templates_crop_size``.

    # Unpaired rows fill whatever budget the paired MSA did not use.
    target_size = MSA_CROP_SIZE - msa_crop_size_all_seq
    if msa_size < target_size:
        # Upsample existing rows with replacement to reach the target depth.
        sample_msa_id = np.random.choice(np.arange(msa_size), target_size - msa_size, replace=True)
    for k in chain:
        k_split = k.split('_all_seq')[0]
        if k_split in msa_pairing.TEMPLATE_FEATURES:
            chain[k] = chain[k][:templates_crop_size, :]
        elif k_split in msa_pairing.MSA_FEATURES:
            if '_all_seq' in k:
                # Paired features: plain truncation.
                chain[k] = chain[k][:msa_crop_size_all_seq, :]
            else:
                if msa_size < target_size:
                    sample_msa = chain[k][sample_msa_id, :]
                    chain[k] = np.concatenate([chain[k], sample_msa], axis=0)
                else:
                    chain[k] = chain[k][:target_size, :]

    chain['num_alignments'] = np.asarray(len(chain['msa']), dtype=np.int32)
    if include_templates:
        chain['num_templates'] = np.asarray(templates_crop_size, dtype=np.int32)
    if pair_msa_sequences:
        chain['num_alignments_all_seq'] = np.asarray(
            len(chain['msa_all_seq']), dtype=np.int32)

    return chain
225
+
226
+
227
def print_final(
        np_example: Mapping[str, np.ndarray]
) -> Mapping[str, np.ndarray]:
    """Pass-through hook for inspecting the merged example; returns it unchanged."""
    final_example = np_example
    return final_example
231
+
232
+
233
def _filter_features(
        np_example: Mapping[str, np.ndarray]
) -> Mapping[str, np.ndarray]:
    """Filters features of example to only those requested.

    Keeps exactly the keys listed in the module-level REQUIRED_FEATURES set.
    """
    filtered = {}
    for key, value in np_example.items():
        if key in REQUIRED_FEATURES:
            filtered[key] = value
    return filtered
238
+
239
+
240
def process_unmerged_features(
        all_chain_features: MutableMapping[str, Mapping[str, np.ndarray]]
):
    """Postprocessing stage for per-chain features before merging.

    Mutates each chain's feature dict in place, adding:
      - ``deletion_mean``: column-wise mean of the deletion matrix;
      - ``assembly_num_chains``: total number of chains in the assembly;
      - ``entity_mask``: 1 where entity_id is non-zero (0 marks padding).
    """
    total_chains = len(all_chain_features)
    for feats in all_chain_features.values():
        feats['deletion_mean'] = np.mean(feats['deletion_matrix'], axis=0)
        feats['assembly_num_chains'] = np.asarray(total_chains)
        feats['entity_mask'] = (feats['entity_id'] != 0).astype(np.int32)
PhysDock/data/tools/get_metrics.py ADDED
@@ -0,0 +1,294 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+
3
+
4
+ sys.path.append("../")
5
+ import os
6
+ import torch
7
+ import torch.nn.functional as F
8
+ import numpy as np
9
+ from typing import Optional
10
+
11
+ import scipy
12
+ import random
13
+ import logging
14
+ import collections
15
+ from functools import partial
16
+ from typing import Union, Tuple, Dict
17
+ import itertools
18
+ from PhysDock.utils.tensor_utils import tensor_tree_map
19
+ from PhysDock.utils.io_utils import load_json, load_txt,dump_json,dump_txt,dump_pkl, load_pkl
20
+
21
+
22
+
23
+ def _calculate_bin_centers(breaks: np.ndarray):
24
+ """Gets the bin centers from the bin edges.
25
+
26
+ Args:
27
+ breaks: [num_bins - 1] the error bin edges.
28
+
29
+ Returns:
30
+ bin_centers: [num_bins] the error bin centers.
31
+ """
32
+ step = (breaks[1] - breaks[0])
33
+
34
+ # Add half-step to get the center
35
+ bin_centers = breaks + step / 2
36
+ # Add a catch-all bin at the end.
37
+ bin_centers = np.concatenate([bin_centers, [bin_centers[-1] + step]],
38
+ axis=0)
39
+ return bin_centers
40
+
41
def _calculate_expected_aligned_error(
    alignment_confidence_breaks: np.ndarray,
    aligned_distance_error_probs: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    """Expected aligned distance error for every residue pair.

    Args:
        alignment_confidence_breaks: [num_bins - 1] error bin edges.
        aligned_distance_error_probs: [num_res, num_res, num_bins] predicted
            per-bin probabilities for each residue pair.

    Returns:
        A pair of:
          * [num_res, num_res] expected aligned distance error, and
          * scalar maximum predicted error (center of the last bin).
    """
    centers = _calculate_bin_centers(alignment_confidence_breaks)
    expected_error = np.sum(aligned_distance_error_probs * centers, axis=-1)
    max_error = np.asarray(centers[-1])
    return expected_error, max_error
61
+
62
+
63
def compute_plddt(logits: np.ndarray) -> np.ndarray:
    """Per-residue pLDDT from the PredictedLDDTHead logits.

    Args:
        logits: [num_res, num_bins] raw head outputs.

    Returns:
        plddt: [num_res] per-residue pLDDT on the 0-100 scale.
    """
    n_bins = logits.shape[-1]
    width = 1.0 / n_bins
    # Bin centers evenly tile (0, 1).
    centers = np.arange(start=0.5 * width, stop=1.0, step=width)
    probs = scipy.special.softmax(logits, axis=-1)
    # Expectation over bins, rescaled from [0, 1] to [0, 100].
    return np.sum(probs * centers[None, :], axis=-1) * 100
78
+
79
def predicted_tm_score(
    logits: np.ndarray,
    breaks: np.ndarray,
    residue_weights: Optional[np.ndarray] = None,
    asym_id: Optional[np.ndarray] = None,
    interface: bool = False) -> np.ndarray:
    """Computes predicted TM alignment or predicted interface TM alignment score.

    Args:
      logits: [num_res, num_res, num_bins] the logits output from
        PredictedAlignedErrorHead.
      breaks: [num_bins] the error bins.
      residue_weights: [num_res] the per residue weights to use for the
        expectation.
      asym_id: [num_res] the asymmetric unit ID - the chain ID. Only needed for
        ipTM calculation, i.e. when interface=True.
      interface: If True, interface predicted TM score is computed.

    Returns:
      ptm_score: The predicted TM alignment or the predicted iTM score.
    """

    # residue_weights has to be in [0, 1], but can be floating-point, i.e. the
    # exp. resolved head's probability.
    if residue_weights is None:
        residue_weights = np.ones(logits.shape[0])

    bin_centers = _calculate_bin_centers(breaks)

    num_res = int(np.sum(residue_weights))
    # Clip num_res to avoid negative/undefined d0.
    clipped_num_res = max(num_res, 19)

    # Compute d_0(num_res) as defined by TM-score, eqn. (5) in Yang & Skolnick
    # "Scoring function for automated assessment of protein structure template
    # quality", 2004: http://zhanglab.ccmb.med.umich.edu/papers/2004_3.pdf
    d0 = 1.24 * (clipped_num_res - 15) ** (1./3) - 1.8

    # Convert logits to probs.
    probs = scipy.special.softmax(logits, axis=-1)

    # TM-Score term for every bin.
    tm_per_bin = 1. / (1 + np.square(bin_centers) / np.square(d0))
    # E_distances tm(distance).
    predicted_tm_term = np.sum(probs * tm_per_bin, axis=-1)

    pair_mask = np.ones_like(predicted_tm_term, dtype=bool)
    if interface:
        # Restrict the expectation to inter-chain pairs. asym_id must be
        # provided in this branch (None would raise a TypeError here).
        pair_mask *= asym_id[:, None] != asym_id[None, :]

    predicted_tm_term *= pair_mask

    # Normalize weights row-wise so each candidate alignment anchor averages
    # over its (masked, weighted) partner residues.
    pair_residue_weights = pair_mask * (
        residue_weights[None, :] * residue_weights[:, None])
    normed_residue_mask = pair_residue_weights / (1e-8 + np.sum(
        pair_residue_weights, axis=-1, keepdims=True))
    per_alignment = np.sum(predicted_tm_term * normed_residue_mask, axis=-1)
    # Return the score of the best alignment anchor residue (weighted argmax).
    return np.asarray(per_alignment[(per_alignment * residue_weights).argmax()])
137
+
138
+
139
def compute_predicted_aligned_error(
    logits: np.ndarray,
    breaks: np.ndarray) -> Dict[str, np.ndarray]:
    """Aligned-confidence metrics from the PredictedAlignedErrorHead logits.

    Args:
        logits: [num_res, num_res, num_bins] raw head outputs.
        breaks: [num_bins - 1] error bin edges.

    Returns:
        Dict with:
          * 'aligned_confidence_probs': [num_res, num_res, num_bins] per-bin
            probabilities for each residue pair,
          * 'predicted_aligned_error': [num_res, num_res] expected error,
          * 'max_predicted_aligned_error': scalar maximum possible error.
    """
    probs = scipy.special.softmax(logits, axis=-1)
    expected_error, max_error = _calculate_expected_aligned_error(
        alignment_confidence_breaks=breaks,
        aligned_distance_error_probs=probs)
    return {
        'aligned_confidence_probs': probs,
        'predicted_aligned_error': expected_error,
        'max_predicted_aligned_error': max_error,
    }
168
+
169
def get_has_clash(atom_pos, atom_mask, asym_id, is_polymer_chain):
    """Return 1 if any pair of polymer chains clashes, else 0.

    A structure is marked as having a clash (has_clash) if for any two
    polymer chains A,B in the prediction clashes(A,B) > 100 or
    clashes(A,B) / min(NA,NB) > 0.5 where NA is the number of atoms in
    chain A. A clash is an inter-chain atom pair closer than 1.1 (same
    units as atom_pos).

    Args:
        atom_pos: [N_atom, 3] predicted atom coordinates.
        atom_mask: [N_atom] 1 for resolved/valid atoms.
        asym_id: [N_atom] per-atom chain id.
        is_polymer_chain: [N_atom] 1 for atoms of polymer chains.

    Returns:
        int: 1 if a clash is detected, 0 otherwise.
    """
    valid = np.logical_and(atom_mask == 1, is_polymer_chain == 1)
    atom_pos = atom_pos[valid]
    asym_id = asym_id[valid]
    uniq_asym_ids = np.unique(asym_id)
    if len(uniq_asym_ids) <= 1:
        return 0
    # BUGFIX: the previous loop (`uniq[:-1]` x `uniq[1:]`) compared a chain
    # with itself whenever there were 3+ chains; the zero self-distances were
    # then counted as clashes. Iterate distinct unordered pairs instead.
    for aid1, aid2 in itertools.combinations(uniq_asym_ids, 2):
        pos1 = atom_pos[asym_id == aid1]
        pos2 = atom_pos[asym_id == aid2]
        dist = np.sqrt(np.sum((pos1[None] - pos2[:, None]) ** 2, -1))
        n_clash = np.sum(dist < 1.1).astype('float32')
        if n_clash > 100 or n_clash / min(len(pos1), len(pos2)) > 0.5:
            return 1
    return 0
197
+
198
+
199
+
200
+
201
def get_metrics(output, batch):
    """
    Args:
        logits_plddt: (B, N_atom, b_plddt)
        logits_pae: (B, N_token, N_token, b_pae)

    Returns:
        atom_plddts: (B, N_atom)
        mean_plddt: (B,)
        pae: (B, N_token, N_token)
        ptm: (B,)
        iptm: (B,)
        has_clash: (B,)
        ranking_confidence: (B,)
    """
    logit_value = output

    # B = logit_value['p_pae'].shape[0]
    # 63 edges spanning [0, 0.5 * 64] = [0, 32] for the 64 PAE bins
    # (presumably Angstrom — confirm against the PAE head configuration).
    breaks_pae = torch.linspace(0.,
                                0.5 * 64,
                                64 - 1)
    inputs = {
        's_mask': batch['s_mask'],
        'asym_id': batch['asym_id'],
        'breaks_pae': torch.tile(breaks_pae, [1]),
        # 'perm_asym_id': batch['perm_asym_id'],
        # A token counts as polymer if it is protein, DNA, or RNA.
        'is_polymer_chain': ((batch['is_protein'] +
                              batch['is_dna'] + batch['is_rna']) > 0),
        # NOTE(review): **batch is unpacked last, so any key also present in
        # `batch` (e.g. 's_mask', 'asym_id', potentially 'breaks_pae') takes
        # the batch's value — confirm no accidental shadowing.
        **logit_value,
        **batch

    }

    ret_list = []
    # for i in range(B):
    # Downstream confidence math is numpy-based, so convert every torch
    # tensor first (requires CPU tensors without grad).
    cur_input = tensor_tree_map(lambda x: x.numpy(), inputs)
    # cur_input = inputs
    # Only sample 0 is scored; the per-batch loop above is disabled, so the
    # effective batch size here is 1.
    ret = get_all_atom_confidence_metrics(cur_input, 0)
    ret_list.append(ret)

    metrics = {}
    # Stack per-sample results back into torch tensors with a leading batch
    # dimension (currently always 1).
    for k, v in ret_list[0].items():
        metrics[k] = torch.from_numpy(np.stack([r[k] for r in ret_list]))
    return metrics
245
+
246
+
247
+
248
def get_all_atom_confidence_metrics(
    prediction_result, b):
    """Computes pLDDT/PAE/pTM/ipTM/clash metrics for one sample.

    Args:
        prediction_result: dict of numpy arrays. Must contain the logits
            'p_plddt' and 'p_pae', plus 'breaks_pae', 's_mask', 'asym_id',
            'x_pred', 'a_mask', 'atom_id_to_token_id' and 'is_ligand'.
        b: index into 'x_pred' selecting which predicted sample to score.

    Returns:
        dict with keys: atom_plddts, mean_plddt, pae, ptm, iptm, has_clash,
        ranking_confidence.
    """
    metrics = {}
    metrics['atom_plddts'] = compute_plddt(
        prediction_result['p_plddt'])
    metrics['mean_plddt'] = metrics['atom_plddts'].mean()
    metrics['pae'] = compute_predicted_aligned_error(
        logits=prediction_result['p_pae'],
        breaks=prediction_result['breaks_pae'])['predicted_aligned_error']
    metrics['ptm'] = predicted_tm_score(
        logits=prediction_result['p_pae'],
        breaks=prediction_result['breaks_pae'],
        residue_weights=prediction_result['s_mask'],
        asym_id=None)
    # ipTM restricts the pTM expectation to inter-chain residue pairs.
    metrics['iptm'] = predicted_tm_score(
        logits=prediction_result['p_pae'],
        breaks=prediction_result['breaks_pae'],
        residue_weights=prediction_result['s_mask'],
        asym_id=prediction_result['asym_id'],
        interface=True)
    # Per-token annotations are broadcast to atoms via atom_id_to_token_id;
    # non-ligand tokens are treated as polymer chains for the clash check.
    metrics['has_clash'] = get_has_clash(
        prediction_result['x_pred'][b],
        prediction_result['a_mask'],
        prediction_result['asym_id'][prediction_result["atom_id_to_token_id"]],
        ~prediction_result['is_ligand'][prediction_result["atom_id_to_token_id"]])
    # Ranking score: ipTM-dominated blend with a hard penalty for clashes.
    metrics['ranking_confidence'] = (
        0.8 * metrics['iptm'] + 0.2 * metrics['ptm']
        - 1.0 * metrics['has_clash'])
    return metrics
278
+
279
+
280
+ # output = load_pkl("../output.pkl.gz")
281
+ # feats = load_pkl("../feats.pkl.gz")
282
+ # for k,v in output.items():
283
+ # print(k,v.shape)
284
+ # # output[k] = torch.from_numpy(v)
285
+ # for k,v in feats.items():
286
+ # print(k,v.shape)
287
+ # # feats[k] = torch.from_numpy(v)
288
+ # # dump_pkl(output,"../output.pkl.gz")
289
+ # # dump_pkl(feats,"../feats.pkl.gz")
290
+ #
291
+ # metrics = get_metrics(output,feats)
292
+ #
293
+ # for k,v in metrics.items():
294
+ # print(k,v)
PhysDock/data/tools/hhblits.py ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 AlQuraishi Laboratory
2
+ # Copyright 2021 DeepMind Technologies Limited
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Library to run HHblits from Python."""
17
+ import glob
18
+ import logging
19
+ import os
20
+ import subprocess
21
+ from typing import Any, List, Mapping, Optional, Sequence
22
+
23
+ from . import utils
24
+
25
+
26
+ _HHBLITS_DEFAULT_P = 20
27
+ _HHBLITS_DEFAULT_Z = 500
28
+
29
+
30
class HHBlits:
    """Python wrapper of the HHblits binary."""

    def __init__(
        self,
        *,
        binary_path: str,
        databases: Sequence[str],
        n_cpu: int = 4,
        n_iter: int = 3,
        e_value: float = 0.001,
        maxseq: int = 1_000_000,
        realign_max: int = 100_000,
        maxfilt: int = 100_000,
        min_prefilter_hits: int = 1000,
        all_seqs: bool = False,
        alt: Optional[int] = None,
        p: int = _HHBLITS_DEFAULT_P,
        z: int = _HHBLITS_DEFAULT_Z,
    ):
        """Initializes the Python HHblits wrapper.

        Args:
          binary_path: The path to the HHblits executable.
          databases: A sequence of HHblits database paths. This should be the
            common prefix for the database files (i.e. up to but not including
            _hhm.ffindex etc.)
          n_cpu: The number of CPUs to give HHblits.
          n_iter: The number of HHblits iterations.
          e_value: The E-value, see HHblits docs for more details.
          maxseq: The maximum number of rows in an input alignment. Note that this
            parameter is only supported in HHBlits version 3.1 and higher.
          realign_max: Max number of HMM-HMM hits to realign. HHblits default: 500.
          maxfilt: Max number of hits allowed to pass the 2nd prefilter.
            HHblits default: 20000.
          min_prefilter_hits: Min number of hits to pass prefilter.
            HHblits default: 100.
          all_seqs: Return all sequences in the MSA / Do not filter the result MSA.
            HHblits default: False.
          alt: Show up to this many alternative alignments.
          p: Minimum Prob for a hit to be included in the output hhr file.
            HHblits default: 20.
          z: Hard cap on number of hits reported in the hhr file.
            HHblits default: 500. NB: The relevant HHblits flag is -Z not -z.

        Raises:
          RuntimeError: If HHblits binary not found within the path.
        """
        self.binary_path = binary_path
        self.databases = databases

        # Fail fast if any database prefix has no matching index files on disk.
        for database_path in self.databases:
            if not glob.glob(database_path + "_*"):
                logging.error(
                    "Could not find HHBlits database %s", database_path
                )
                raise ValueError(
                    f"Could not find HHBlits database {database_path}"
                )

        self.n_cpu = n_cpu
        self.n_iter = n_iter
        self.e_value = e_value
        self.maxseq = maxseq
        self.realign_max = realign_max
        self.maxfilt = maxfilt
        self.min_prefilter_hits = min_prefilter_hits
        self.all_seqs = all_seqs
        self.alt = alt
        self.p = p
        self.z = z

    def query(self, input_fasta_path: str) -> List[Mapping[str, Any]]:
        """Queries the database using HHblits.

        Args:
            input_fasta_path: Path to the query fasta file.

        Returns:
            A single-element list with a dict holding the resulting a3m text,
            raw stdout/stderr, and the n_iter/e_value settings used.

        Raises:
            RuntimeError: If HHblits exits with a non-zero status.
        """
        with utils.tmpdir_manager() as query_tmp_dir:
            a3m_path = os.path.join(query_tmp_dir, "output.a3m")

            # Each database contributes its own "-d <path>" pair.
            db_cmd = []
            for db_path in self.databases:
                db_cmd.append("-d")
                db_cmd.append(db_path)
            cmd = [
                self.binary_path,
                "-i",
                input_fasta_path,
                "-cpu",
                str(self.n_cpu),
                "-oa3m",
                a3m_path,
                "-o",
                "/dev/null",
                "-n",
                str(self.n_iter),
                "-e",
                str(self.e_value),
                "-maxseq",
                str(self.maxseq),
                "-realign_max",
                str(self.realign_max),
                "-maxfilt",
                str(self.maxfilt),
                "-min_prefilter_hits",
                str(self.min_prefilter_hits),
            ]
            # Optional flags are appended only when they differ from the
            # tool's defaults, keeping the command line minimal.
            if self.all_seqs:
                cmd += ["-all"]
            if self.alt:
                cmd += ["-alt", str(self.alt)]
            if self.p != _HHBLITS_DEFAULT_P:
                cmd += ["-p", str(self.p)]
            if self.z != _HHBLITS_DEFAULT_Z:
                cmd += ["-Z", str(self.z)]
            cmd += db_cmd

            logging.info('Launching subprocess "%s"', " ".join(cmd))
            process = subprocess.Popen(
                cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
            )

            with utils.timing("HHblits query"):
                stdout, stderr = process.communicate()
                retcode = process.wait()

            if retcode:
                # Logs have a 15k character limit, so log HHblits error line by line.
                logging.error("HHblits failed. HHblits stderr begin:")
                for error_line in stderr.decode("utf-8").splitlines():
                    if error_line.strip():
                        logging.error(error_line.strip())
                logging.error("HHblits stderr end")
                # stderr is truncated to keep the raised message manageable.
                raise RuntimeError(
                    "HHblits failed\nstdout:\n%s\n\nstderr:\n%s\n"
                    % (stdout.decode("utf-8"), stderr[:500_000].decode("utf-8"))
                )

            with open(a3m_path) as f:
                a3m = f.read()

            raw_output = dict(
                a3m=a3m,
                output=stdout,
                stderr=stderr,
                n_iter=self.n_iter,
                e_value=self.e_value,
            )
        # Wrapped in a list — presumably to match a multi-chunk runner
        # interface (cf. Jackhmmer) — TODO confirm.
        return [raw_output]
PhysDock/data/tools/hhsearch.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 AlQuraishi Laboratory
2
+ # Copyright 2021 DeepMind Technologies Limited
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Library to run HHsearch from Python."""
17
+ import glob
18
+ import logging
19
+ import os
20
+ import subprocess
21
+ from typing import Sequence, Optional
22
+
23
+ from . import parsers
24
+ from . import utils
25
+
26
+
27
class HHSearch:
    """Python wrapper of the HHsearch binary."""

    def __init__(
        self,
        *,
        binary_path: str,
        databases: Sequence[str],
        n_cpu: int = 2,
        maxseq: int = 1_000_000,
    ):
        """Initializes the Python HHsearch wrapper.

        Args:
          binary_path: The path to the HHsearch executable.
          databases: A sequence of HHsearch database paths. This should be the
            common prefix for the database files (i.e. up to but not including
            _hhm.ffindex etc.)
          n_cpu: The number of CPUs to use
          maxseq: The maximum number of rows in an input alignment. Note that this
            parameter is only supported in HHBlits version 3.1 and higher.

        Raises:
          RuntimeError: If HHsearch binary not found within the path.
        """
        self.binary_path = binary_path
        self.databases = databases
        self.n_cpu = n_cpu
        self.maxseq = maxseq

        # Fail fast if any database prefix has no matching index files on disk.
        for database_path in self.databases:
            if not glob.glob(database_path + "_*"):
                logging.error(
                    "Could not find HHsearch database %s", database_path
                )
                raise ValueError(
                    f"Could not find HHsearch database {database_path}"
                )

    @property
    def output_format(self) -> str:
        return 'hhr'

    @property
    def input_format(self) -> str:
        # Queries are fed as a3m rather than Stockholm.
        # return 'sto'
        return 'a3m'

    def query(self, a3m: str, output_dir: Optional[str] = None) -> str:
        """Queries the database using HHsearch using a given a3m.

        Args:
            a3m: Query alignment text in a3m format.
            output_dir: Optional directory to keep the .hhr result file in;
                defaults to a temporary directory that is deleted afterwards.

        Returns:
            The raw .hhr output text.
        """
        with utils.tmpdir_manager() as query_tmp_dir:
            input_path = os.path.join(query_tmp_dir, "query.a3m")
            output_dir = query_tmp_dir if output_dir is None else output_dir
            hhr_path = os.path.join(output_dir, "hhsearch_output.hhr")
            with open(input_path, "w") as f:
                f.write(a3m)

            # Each database contributes its own "-d <path>" pair.
            db_cmd = []
            for db_path in self.databases:
                db_cmd.append("-d")
                db_cmd.append(db_path)
            cmd = [
                self.binary_path,
                "-i",
                input_path,
                "-o",
                hhr_path,
                "-maxseq",
                str(self.maxseq),
                "-cpu",
                str(self.n_cpu),
            ] + db_cmd

            logging.info('Launching subprocess "%s"', " ".join(cmd))
            process = subprocess.Popen(
                cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
            )
            with utils.timing("HHsearch query"):
                stdout, stderr = process.communicate()
                retcode = process.wait()

            # NOTE(review): the failure check below is disabled. If hhsearch
            # exits non-zero without writing the .hhr file, the open() below
            # raises FileNotFoundError instead of a descriptive RuntimeError —
            # confirm this tolerance is intentional.
            # if retcode:
            #     # Stderr is truncated to prevent proto size errors in Beam.
            #     raise RuntimeError(
            #         "HHSearch failed:\nstdout:\n%s\n\nstderr:\n%s\n"
            #         % (stdout.decode("utf-8"), stderr[:100_000].decode("utf-8"))
            #     )

            with open(hhr_path) as f:
                hhr = f.read()
            return hhr

    @staticmethod
    def get_template_hits(
        output_string: str,
        input_sequence: str
    ) -> Sequence[parsers.TemplateHit]:
        """Gets parsed template hits from the raw string output by the tool"""
        del input_sequence  # Used by hmmsearch but not needed for hhsearch
        return parsers.parse_hhr(output_string)
PhysDock/data/tools/hmmalign.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import subprocess
3
+ from typing import Optional, Sequence
4
+ import logging
5
+
6
+ from . import parsers
7
+ from . import hmmbuild
8
+ from . import utils
9
+
10
+
11
class Hmmalign(object):
    """Python wrapper of the hmmalign binary (with hmmbuild for the profile)."""

    def __init__(
        self,
        *,
        hmmbuild_binary_path: str,
        hmmalign_binary_path: str,
    ):
        """Initializes the wrapper.

        Args:
            hmmbuild_binary_path: Path to the hmmbuild executable, used to
                build an RNA profile from the query fasta.
            hmmalign_binary_path: Path to the hmmalign executable.
        """
        self.binary_path = hmmalign_binary_path
        self.hmmbuild_runner = hmmbuild.Hmmbuild(binary_path=hmmbuild_binary_path)

    @property
    def output_format(self) -> str:
        return 'sto'

    @property
    def input_format(self) -> str:
        return 'sto'

    def realign_sto_with_fasta(
        self,
        input_fasta_path: str,
        input_sto_path: str,
        output_sto_path: Optional[str] = None,
    ) -> str:
        """Realigns a Stockholm MSA against an RNA profile built from a fasta.

        Args:
            input_fasta_path: Query fasta used to build the RNA HMM profile.
            input_sto_path: Stockholm alignment to realign.
            output_sto_path: Optional path to also write the realigned
                alignment to. If omitted, a temporary file is used.

        Returns:
            The realigned alignment in Stockholm format. (BUGFIX: previously
            returned None whenever output_sto_path was provided.)

        Raises:
            RuntimeError: If hmmalign fails.
        """
        with utils.tmpdir_manager() as query_tmp_dir:
            hmm_output_path = os.path.join(query_tmp_dir, 'query.hmm')
            if output_sto_path is None:
                output_sto_path = os.path.join(query_tmp_dir, "realigned.sto")
            with open(input_fasta_path, "r") as f:
                hmm = self.hmmbuild_runner.build_rna_profile_from_fasta(f.read())
            with open(hmm_output_path, 'w') as f:
                f.write(hmm)

            cmd = [
                self.binary_path,
                '--rna',  # Treat the profile and sequences as RNA.
                '--mapali', input_fasta_path,  # Merge the fasta into the output alignment.
                "-o", output_sto_path,
                hmm_output_path,
                input_sto_path
            ]
            logging.info('Launching sub-process %s', cmd)
            process = subprocess.Popen(
                cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            with utils.timing(f'hmmalign query'):
                stdout, stderr = process.communicate()
                retcode = process.wait()

            if retcode:
                # BUGFIX: the message previously said "hmmsearch failed".
                raise RuntimeError(
                    'hmmalign failed:\nstdout:\n%s\n\nstderr:\n%s\n' % (
                        stdout.decode('utf-8'), stderr.decode('utf-8')))

            # Read inside the tmpdir context so a temporary output still exists.
            with open(output_sto_path) as f:
                out_msa = f.read()
        return out_msa
PhysDock/data/tools/hmmbuild.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 DeepMind Technologies Limited
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """A Python wrapper for hmmbuild - construct HMM profiles from MSA."""
16
+
17
+ import os
18
+ import re
19
+ import subprocess
20
+ import logging
21
+
22
+ from . import utils
23
+
24
+
25
class Hmmbuild(object):
    """Python wrapper of the hmmbuild binary."""

    def __init__(self,
                 *,
                 binary_path: str,
                 singlemx: bool = False):
        """Initializes the Python hmmbuild wrapper.

        Args:
          binary_path: The path to the hmmbuild executable.
          singlemx: Whether to use --singlemx flag. If True, it forces HMMBuild to
            just use a common substitution score matrix.

        Raises:
          RuntimeError: If hmmbuild binary not found within the path.
        """
        self.binary_path = binary_path
        self.singlemx = singlemx

    def build_profile_from_sto(self, sto: str, model_construction='fast') -> str:
        """Builds a HHM for the aligned sequences given as a Stockholm string.

        Args:
          sto: A string with the aligned sequences in the Stockholm format.
          model_construction: Whether to use reference annotation in the msa to
            determine consensus columns ('hand') or default ('fast').

        Returns:
          A string with the profile in the HMM format.

        Raises:
          RuntimeError: If hmmbuild fails.
        """
        return self._build_profile(sto, model_construction=model_construction)

    def build_profile_from_a3m(self, a3m: str) -> str:
        """Builds a HHM for the aligned sequences given as an A3M string.

        Args:
          a3m: A string with the aligned sequences in the A3M format.

        Returns:
          A string with the profile in the HMM format.

        Raises:
          RuntimeError: If hmmbuild fails.
        """
        lines = []
        for line in a3m.splitlines():
            if not line.startswith('>'):
                line = re.sub('[a-z]+', '', line)  # Remove inserted residues.
            lines.append(line + '\n')
        msa = ''.join(lines)
        return self._build_profile(msa, model_construction='fast')

    def _build_profile(self, msa: str, model_construction: str = 'fast') -> str:
        """Builds a HMM for the aligned sequences given as an MSA string.

        Args:
          msa: A string with the aligned sequences, in A3M or STO format.
          model_construction: Whether to use reference annotation in the msa to
            determine consensus columns ('hand') or default ('fast').

        Returns:
          A string with the profile in the HMM format.

        Raises:
          RuntimeError: If hmmbuild fails.
          ValueError: If unspecified arguments are provided.
        """
        if model_construction not in {'hand', 'fast'}:
            # BUGFIX: the two adjacent literals previously concatenated
            # without a space ("...onlyhand and fast supported.").
            raise ValueError(f'Invalid model_construction {model_construction} - '
                             f'only hand and fast supported.')

        with utils.tmpdir_manager() as query_tmp_dir:
            input_query = os.path.join(query_tmp_dir, 'query.msa')
            output_hmm_path = os.path.join(query_tmp_dir, 'output.hmm')

            with open(input_query, 'w') as f:
                f.write(msa)

            cmd = [self.binary_path]
            # If adding flags, we have to do so before the output and input:
            if model_construction == 'hand':
                cmd.append(f'--{model_construction}')
            if self.singlemx:
                cmd.append('--singlemx')
            cmd.extend([
                '--amino',
                output_hmm_path,
                input_query,
            ])

            logging.info('Launching subprocess %s', cmd)
            process = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE)

            with utils.timing('hmmbuild query'):
                stdout, stderr = process.communicate()
                retcode = process.wait()
                logging.info('hmmbuild stdout:\n%s\n\nstderr:\n%s\n',
                             stdout.decode('utf-8'), stderr.decode('utf-8'))

            if retcode:
                raise RuntimeError('hmmbuild failed\nstdout:\n%s\n\nstderr:\n%s\n'
                                   % (stdout.decode('utf-8'), stderr.decode('utf-8')))

            with open(output_hmm_path, encoding='utf-8') as f:
                hmm = f.read()

            return hmm

    def build_rna_profile_from_fasta(self, fasta: str):
        """Builds an RNA HMM profile from a fasta string (hmmbuild --rna).

        Args:
          fasta: A string with the (aligned) sequences in FASTA format.

        Returns:
          A string with the profile in the HMM format.

        Raises:
          RuntimeError: If hmmbuild fails.
        """
        with utils.tmpdir_manager() as query_tmp_dir:
            input_query = os.path.join(query_tmp_dir, 'query.fasta')
            output_hmm_path = os.path.join(query_tmp_dir, 'query.hmm')
            with open(input_query, 'w') as f:
                f.write(fasta)
            cmd = [self.binary_path]
            cmd.extend([
                '--rna',
                output_hmm_path,
                input_query,
            ])
            logging.info('Launching subprocess %s', cmd)
            process = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE)
            with utils.timing('hmmbuild query'):
                stdout, stderr = process.communicate()
                retcode = process.wait()
                logging.info('hmmbuild stdout:\n%s\n\nstderr:\n%s\n',
                             stdout.decode('utf-8'), stderr.decode('utf-8'))
            if retcode:
                raise RuntimeError('hmmbuild failed\nstdout:\n%s\n\nstderr:\n%s\n'
                                   % (stdout.decode('utf-8'), stderr.decode('utf-8')))

            with open(output_hmm_path, encoding='utf-8') as f:
                hmm = f.read()
            return hmm
PhysDock/data/tools/hmmsearch.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 DeepMind Technologies Limited
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """A Python wrapper for hmmsearch - search profile against a sequence db."""
16
+
17
+ import os
18
+ import subprocess
19
+ from typing import Optional, Sequence
20
+ import logging
21
+
22
+ from . import parsers
23
+ from . import hmmbuild
24
+ from . import utils
25
+
26
+
27
class Hmmsearch(object):
    """Python wrapper of the hmmsearch binary."""

    def __init__(self,
                 *,
                 binary_path: str,
                 hmmbuild_binary_path: str,
                 database_path: str,
                 flags: Optional[Sequence[str]] = None
                 ):
        """Initializes the Python hmmsearch wrapper.

        Args:
          binary_path: The path to the hmmsearch executable.
          hmmbuild_binary_path: The path to the hmmbuild executable. Used to build
            an hmm from an input a3m.
          database_path: The path to the hmmsearch database (FASTA format).
          flags: List of flags to be used by hmmsearch.

        Raises:
          RuntimeError: If hmmsearch binary not found within the path.
        """
        self.binary_path = binary_path
        self.hmmbuild_runner = hmmbuild.Hmmbuild(binary_path=hmmbuild_binary_path)
        self.database_path = database_path
        if flags is None:
            # Default hmmsearch run settings.
            # Very permissive filter/E-value thresholds — presumably so that
            # hits are filtered downstream rather than by hmmsearch itself.
            flags = ['--F1', '0.1',
                     '--F2', '0.1',
                     '--F3', '0.1',
                     '--incE', '100',
                     '-E', '100',
                     '--domE', '100',
                     '--incdomE', '100']
        self.flags = flags

        if not os.path.exists(self.database_path):
            logging.error('Could not find hmmsearch database %s', database_path)
            raise ValueError(f'Could not find hmmsearch database {database_path}')

    @property
    def output_format(self) -> str:
        return 'sto'

    @property
    def input_format(self) -> str:
        return 'sto'

    def query(self, msa_sto: str, output_dir: Optional[str] = None) -> str:
        """Queries the database using hmmsearch using a given stockholm msa."""
        # Build the query profile from the MSA first ('hand' uses the MSA's
        # reference annotation to pick consensus columns).
        hmm = self.hmmbuild_runner.build_profile_from_sto(
            msa_sto,
            model_construction='hand'
        )
        return self.query_with_hmm(hmm, output_dir)

    def query_with_hmm(self,
                       hmm: str,
                       output_dir: Optional[str] = None
                       ) -> str:
        """Queries the database using hmmsearch using a given hmm.

        Args:
            hmm: Query profile text in HMMER format.
            output_dir: Optional directory to keep the .sto result in;
                defaults to a temporary directory that is deleted afterwards.

        Returns:
            The resulting alignment in Stockholm format.

        Raises:
            RuntimeError: If hmmsearch exits with a non-zero status.
        """
        with utils.tmpdir_manager() as query_tmp_dir:
            hmm_input_path = os.path.join(query_tmp_dir, 'query.hmm')
            output_dir = query_tmp_dir if output_dir is None else output_dir
            out_path = os.path.join(output_dir, 'hmm_output.sto')
            with open(hmm_input_path, 'w') as f:
                f.write(hmm)

            # NOTE(review): CPU count is hard-coded to 8 here; consider
            # exposing it as a constructor parameter.
            cmd = [
                self.binary_path,
                '--noali',  # Don't include the alignment in stdout.
                '--cpu', '8'
            ]
            # If adding flags, we have to do so before the output and input:
            if self.flags:
                cmd.extend(self.flags)
            cmd.extend([
                '-A', out_path,
                hmm_input_path,
                self.database_path,
            ])

            logging.info('Launching sub-process %s', cmd)
            process = subprocess.Popen(
                cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            with utils.timing(
                    f'hmmsearch ({os.path.basename(self.database_path)}) query'):
                stdout, stderr = process.communicate()
                retcode = process.wait()

            if retcode:
                raise RuntimeError(
                    'hmmsearch failed:\nstdout:\n%s\n\nstderr:\n%s\n' % (
                        stdout.decode('utf-8'), stderr.decode('utf-8')))

            with open(out_path) as f:
                out_msa = f.read()

        return out_msa

    @staticmethod
    def get_template_hits(
            output_string: str,
            input_sequence: str
    ) -> Sequence[parsers.TemplateHit]:
        """Gets parsed template hits from the raw string output by the tool."""
        template_hits = parsers.parse_hmmsearch_sto(
            output_string,
            input_sequence,
        )
        return template_hits
PhysDock/data/tools/jackhmmer.py ADDED
@@ -0,0 +1,262 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 AlQuraishi Laboratory
2
+ # Copyright 2021 DeepMind Technologies Limited
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Library to run Jackhmmer from Python."""
17
+
18
+ from concurrent import futures
19
+ import glob
20
+ import logging
21
+ import os
22
+ import subprocess
23
+ from typing import Any, Callable, Mapping, Optional, Sequence
24
+ from urllib import request
25
+
26
+ from . import parsers
27
+ from . import utils
28
+
29
+
30
class Jackhmmer:
    """Python wrapper of the Jackhmmer binary."""

    def __init__(
        self,
        *,
        binary_path: str,
        database_path: str,
        n_cpu: int = 8,
        n_iter: int = 1,
        e_value: float = 0.0001,
        z_value: Optional[int] = None,
        get_tblout: bool = False,
        filter_f1: float = 0.0005,
        filter_f2: float = 0.00005,
        filter_f3: float = 0.0000005,
        seq_limit: int = 50000,
        incdom_e: Optional[float] = None,
        dom_e: Optional[float] = None,
        num_streamed_chunks: Optional[int] = None,
        streaming_callback: Optional[Callable[[int], None]] = None,
    ):
        """Initializes the Python Jackhmmer wrapper.

        Args:
          binary_path: The path to the jackhmmer executable.
          database_path: The path to the jackhmmer database (FASTA format).
          n_cpu: The number of CPUs to give Jackhmmer.
          n_iter: The number of Jackhmmer iterations.
          e_value: The E-value, see Jackhmmer docs for more details.
          z_value: The Z-value, see Jackhmmer docs for more details.
          get_tblout: Whether to save tblout string.
          filter_f1: MSV and biased composition pre-filter, set to >1.0 to turn off.
          filter_f2: Viterbi pre-filter, set to >1.0 to turn off.
          filter_f3: Forward pre-filter, set to >1.0 to turn off.
          seq_limit: Stored but currently unused — the corresponding
            '--seq_limit' flag is commented out in `_query_chunk`.
          incdom_e: Domain e-value criteria for inclusion of domains in MSA/next
            round.
          dom_e: Domain e-value criteria for inclusion in tblout.
          num_streamed_chunks: Number of database chunks to stream over.
          streaming_callback: Callback function run after each chunk iteration with
            the iteration number as argument.
        """
        self.binary_path = binary_path
        self.database_path = database_path
        self.num_streamed_chunks = num_streamed_chunks

        # In streamed mode the database chunks are fetched by URL, so a
        # missing local path is only fatal for the non-streamed mode.
        if (
            not os.path.exists(self.database_path)
            and num_streamed_chunks is None
        ):
            logging.error("Could not find Jackhmmer database %s", database_path)
            raise ValueError(
                f"Could not find Jackhmmer database {database_path}"
            )

        self.n_cpu = n_cpu
        self.n_iter = n_iter
        self.e_value = e_value
        self.z_value = z_value
        self.filter_f1 = filter_f1
        self.filter_f2 = filter_f2
        self.filter_f3 = filter_f3
        self.seq_limit = seq_limit
        self.incdom_e = incdom_e
        self.dom_e = dom_e
        self.get_tblout = get_tblout
        self.streaming_callback = streaming_callback

    def _query_chunk(
        self,
        input_fasta_path: str,
        database_path: str,
        max_sequences: Optional[int] = None
    ) -> Mapping[str, Any]:
        """Queries one database chunk using Jackhmmer.

        Args:
            input_fasta_path: Path to the query fasta file.
            database_path: Path to the (chunk of the) sequence database.
            max_sequences: If given, truncate the resulting stockholm MSA to
                at most this many sequences.

        Returns:
            Dict with keys 'sto' (stockholm MSA text), 'tbl' (tblout text or
            ''), 'stderr' (raw bytes), 'n_iter' and 'e_value'.

        Raises:
            RuntimeError: If the jackhmmer subprocess fails.
        """
        with utils.tmpdir_manager() as query_tmp_dir:
            sto_path = os.path.join(query_tmp_dir, "output.sto")

            # The F1/F2/F3 are the expected proportion to pass each of the filtering
            # stages (which get progressively more expensive), reducing these
            # speeds up the pipeline at the expensive of sensitivity. They are
            # currently set very low to make querying Mgnify run in a reasonable
            # amount of time.
            cmd_flags = [
                # Don't pollute stdout with Jackhmmer output.
                "-o",
                "/dev/null",
                "-A",
                sto_path,
                "--noali",
                "--F1",
                str(self.filter_f1),
                "--F2",
                str(self.filter_f2),
                "--F3",
                str(self.filter_f3),
                # "--seq_limit",
                # str(self.seq_limit),
                "--incE",
                str(self.e_value),
                # Report only sequences with E-values <= x in per-sequence output.
                "-E",
                str(self.e_value),
                "--cpu",
                str(self.n_cpu),
                "-N",
                str(self.n_iter),
            ]
            if self.get_tblout:
                tblout_path = os.path.join(query_tmp_dir, "tblout.txt")
                cmd_flags.extend(["--tblout", tblout_path])

            if self.z_value:
                cmd_flags.extend(["-Z", str(self.z_value)])

            if self.dom_e is not None:
                cmd_flags.extend(["--domE", str(self.dom_e)])

            if self.incdom_e is not None:
                cmd_flags.extend(["--incdomE", str(self.incdom_e)])

            cmd = (
                [self.binary_path]
                + cmd_flags
                + [input_fasta_path, database_path]
            )
            # print(cmd)
            logging.info('Launching subprocess "%s"', " ".join(cmd))
            process = subprocess.Popen(
                cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
            )
            with utils.timing(
                f"Jackhmmer ({os.path.basename(database_path)}) query"
            ):
                _, stderr = process.communicate()
                retcode = process.wait()

            if retcode:
                raise RuntimeError(
                    "Jackhmmer failed\nstderr:\n%s\n" % stderr.decode("utf-8")
                )

            # Get e-values for each target name
            tbl = ""
            if self.get_tblout:
                with open(tblout_path) as f:
                    tbl = f.read()

            # Read (and optionally truncate) the MSA before the tmpdir holding
            # sto_path is removed.
            if (max_sequences is None):
                with open(sto_path) as f:
                    sto = f.read()
            else:
                sto = parsers.truncate_stockholm_msa(sto_path, max_sequences)

        raw_output = dict(
            sto=sto,
            tbl=tbl,
            stderr=stderr,
            n_iter=self.n_iter,
            e_value=self.e_value,
        )

        return raw_output

    def query(self,
              input_fasta_path: str,
              max_sequences: Optional[int] = None
              ) -> Sequence[Sequence[Mapping[str, Any]]]:
        """Convenience wrapper around `query_multiple` for a single fasta."""
        return self.query_multiple([input_fasta_path], max_sequences)

    def query_multiple(self,
                       input_fasta_paths: Sequence[str],
                       max_sequences: Optional[int] = None
                       ) -> Sequence[Sequence[Mapping[str, Any]]]:
        """Queries the database using Jackhmmer."""
        if self.num_streamed_chunks is None:
            # NOTE(review): this branch returns a flat list of result
            # mappings (one per fasta), while the streamed branch below
            # returns a list of lists — the declared return type matches
            # only the streamed shape. Confirm what callers expect.
            single_chunk_results = []
            for input_fasta_path in input_fasta_paths:
                single_chunk_result = self._query_chunk(
                    input_fasta_path, self.database_path, max_sequences,
                )
                single_chunk_results.append(single_chunk_result)
            return single_chunk_results

        db_basename = os.path.basename(self.database_path)
        db_remote_chunk = lambda db_idx: f"{self.database_path}.{db_idx}"
        # NOTE(review): local chunk location is hard-coded to /tmp/ramdisk —
        # confirm that this directory exists on the deployment hosts.
        db_local_chunk = lambda db_idx: f"/tmp/ramdisk/{db_basename}.{db_idx}"

        # Remove existing files to prevent OOM
        for f in glob.glob(db_local_chunk("[0-9]*")):
            try:
                os.remove(f)
            except OSError:
                print(f"OSError while deleting {f}")

        # Download the (i+1)-th chunk while Jackhmmer is running on the i-th chunk
        with futures.ThreadPoolExecutor(max_workers=2) as executor:
            chunked_outputs = [[] for _ in range(len(input_fasta_paths))]
            for i in range(1, self.num_streamed_chunks + 1):
                # Copy the chunk locally
                if i == 1:
                    future = executor.submit(
                        request.urlretrieve,
                        db_remote_chunk(i),
                        db_local_chunk(i),
                    )
                if i < self.num_streamed_chunks:
                    next_future = executor.submit(
                        request.urlretrieve,
                        db_remote_chunk(i + 1),
                        db_local_chunk(i + 1),
                    )

                # Run Jackhmmer with the chunk; block until the download of
                # the current chunk has finished.
                future.result()
                for fasta_idx, input_fasta_path in enumerate(input_fasta_paths):
                    chunked_outputs[fasta_idx].append(
                        self._query_chunk(
                            input_fasta_path,
                            db_local_chunk(i),
                            max_sequences
                        )
                    )

                # Remove the local copy of the chunk
                os.remove(db_local_chunk(i))
                # Do not set next_future for the last chunk so that this works
                # even for databases with only 1 chunk
                if (i < self.num_streamed_chunks):
                    future = next_future
                if self.streaming_callback:
                    self.streaming_callback(i)
        return chunked_outputs
PhysDock/data/tools/kalign.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 AlQuraishi Laboratory
2
+ # Copyright 2021 DeepMind Technologies Limited
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """A Python wrapper for Kalign."""
17
+ import os
18
+ import subprocess
19
+ from typing import Sequence
20
+ import logging
21
+
22
+ from . import utils
23
+
24
+
25
+ def _to_a3m(sequences: Sequence[str]) -> str:
26
+ """Converts sequences to an a3m file."""
27
+ names = ["sequence %d" % i for i in range(1, len(sequences) + 1)]
28
+ a3m = []
29
+ for sequence, name in zip(sequences, names):
30
+ a3m.append(u">" + name + u"\n")
31
+ a3m.append(sequence + u"\n")
32
+ return "".join(a3m)
33
+
34
+
35
class Kalign:
    """Thin Python wrapper around the Kalign alignment binary."""

    def __init__(self, *, binary_path: str):
        """Records the location of the Kalign executable.

        Args:
            binary_path: Filesystem path to the Kalign binary.

        Raises:
            RuntimeError: If Kalign binary not found within the path.
        """
        self.binary_path = binary_path

    def align(self, sequences: Sequence[str]) -> str:
        """Runs Kalign over the given sequences and returns an a3m alignment.

        Args:
            sequences: Query sequences; each must be at least 6 residues long
                (a Kalign requirement). Input order may slightly affect the
                alignment tree Kalign builds, and hence the output.

        Returns:
            The alignment, as an a3m-format string.

        Raises:
            ValueError: If any sequence is shorter than 6 residues.
            RuntimeError: If the Kalign subprocess exits with an error.
        """
        logging.info("Aligning %d sequences", len(sequences))

        # Validate lengths up front so we fail before touching the filesystem.
        for seq in sequences:
            if len(seq) < 6:
                raise ValueError(
                    "Kalign requires all sequences to be at least 6 "
                    "residues long. Got %s (%d residues)." % (seq, len(seq))
                )

        with utils.tmpdir_manager() as tmp_dir:
            in_fasta = os.path.join(tmp_dir, "input.fasta")
            out_a3m = os.path.join(tmp_dir, "output.a3m")

            with open(in_fasta, "w") as fasta_file:
                fasta_file.write(_to_a3m(sequences))

            cmd = [
                self.binary_path,
                "-i", in_fasta,
                "-o", out_a3m,
                "-format", "fasta",
            ]

            logging.info('Launching subprocess "%s"', " ".join(cmd))
            proc = subprocess.Popen(
                cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
            )

            with utils.timing("Kalign query"):
                stdout, stderr = proc.communicate()
                retcode = proc.wait()
                logging.info(
                    "Kalign stdout:\n%s\n\nstderr:\n%s\n",
                    stdout.decode("utf-8"),
                    stderr.decode("utf-8"),
                )

            if retcode:
                raise RuntimeError(
                    "Kalign failed\nstdout:\n%s\n\nstderr:\n%s\n"
                    % (stdout.decode("utf-8"), stderr.decode("utf-8"))
                )

            # Read the alignment before the temporary directory is removed.
            with open(out_a3m) as a3m_file:
                return a3m_file.read()
PhysDock/data/tools/mmcif_parsing.py ADDED
@@ -0,0 +1,519 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 AlQuraishi Laboratory
2
+ # Copyright 2021 DeepMind Technologies Limited
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Parses the mmCIF file format."""
17
+ import collections
18
+ import dataclasses
19
+ import functools
20
+ import io
21
+ import json
22
+ import logging
23
+ import os
24
+ from typing import Any, Mapping, Optional, Sequence, Tuple
25
+ import numpy as np
26
+ from Bio import PDB
27
+
28
+ from . import PDBData
29
+
30
@dataclasses.dataclass
class residue_constants:
    # The fixed 37-atom vocabulary for protein heavy atoms. The list order
    # defines the atom37 index of each atom name, so it must not change.
    atom_types = [
        "N", "CA", "C", "CB", "O", "CG", "CG1", "CG2", "OG", "OG1", "SG",
        "CD", "CD1", "CD2", "ND1", "ND2", "OD1", "OD2", "SD", "CE", "CE1",
        "CE2", "CE3", "NE", "NE1", "NE2", "OE1", "OE2", "CH2", "NH1", "NH2",
        "OH", "CZ", "CZ2", "CZ3", "NZ", "OXT",
    ]
    # Number of entries in the atom vocabulary (37).
    atom_type_num = len(atom_types)
    # Maps atom name -> index in atom_types.
    atom_order = {name: index for index, name in enumerate(atom_types)}
39
+
40
+
41
+ """General-purpose errors used throughout the data pipeline"""
42
+
43
+
44
class Error(Exception):
    """Base class for exceptions raised by this module."""
46
+
47
+
48
class MultipleChainsError(Error):
    """An error indicating that multiple chains were found for a given ID."""
50
+
51
+
52
+ # Type aliases:
53
+ ChainId = str
54
+ PdbHeader = Mapping[str, Any]
55
+ PdbStructure = PDB.Structure.Structure
56
+ SeqRes = str
57
+ MmCIFDict = Mapping[str, Sequence[str]]
58
+
59
+
60
@dataclasses.dataclass(frozen=True)
class Monomer:
    """One residue entry from the mmCIF ``_entity_poly_seq`` loop."""
    id: str   # Chemical component id (``_entity_poly_seq.mon_id``).
    num: int  # Position within the polymer (``_entity_poly_seq.num``).
64
+
65
+
66
+ # Note - mmCIF format provides no guarantees on the type of author-assigned
67
+ # sequence numbers. They need not be integers.
68
@dataclasses.dataclass(frozen=True)
class AtomSite:
    """One row of the mmCIF ``_atom_site`` loop.

    All fields are raw strings from the parsed mmCIF dict; numeric conversion
    (e.g. ``int(mmcif_seq_num)``) happens later in ``parse``.
    """
    residue_name: str     # _atom_site.label_comp_id
    author_chain_id: str  # _atom_site.auth_asym_id
    mmcif_chain_id: str   # _atom_site.label_asym_id
    author_seq_num: str   # _atom_site.auth_seq_id
    # _atom_site.label_seq_id; a string at runtime (parse() applies int()).
    mmcif_seq_num: str
    insertion_code: str   # _atom_site.pdbx_PDB_ins_code
    hetatm_atom: str      # _atom_site.group_PDB ("ATOM" or "HETATM")
    # _atom_site.pdbx_PDB_model_num; a string at runtime (parse() compares
    # it against "1").
    model_num: str
78
+
79
+
80
+ # Used to map SEQRES index to a residue in the structure.
81
@dataclasses.dataclass(frozen=True)
class ResiduePosition:
    """Author-assigned location of a residue within the structure."""
    chain_id: str        # Author chain id.
    residue_number: int  # Author sequence number (int(auth_seq_id)).
    insertion_code: str  # PDB insertion code; " " when unset.
86
+
87
+
88
@dataclasses.dataclass(frozen=True)
class ResidueAtPosition:
    """A SEQRES residue that may or may not be resolved in the structure."""
    position: Optional[ResiduePosition]  # None when the residue is missing.
    name: str                            # Residue (chemical component) name.
    is_missing: bool                     # True if no coordinates exist.
    hetflag: str                         # Biopython hetflag: " ", "W", or "H_<name>".
94
+
95
+
96
@dataclasses.dataclass(frozen=True)
class MmcifObject:
    """Representation of a parsed mmCIF file.

    Contains:
      file_id: A meaningful name, e.g. a pdb_id. Should be unique amongst all
        files being processed.
      header: Biopython header.
      structure: Biopython structure (first model only; see ``parse``).
      chain_to_seqres: Dict mapping chain_id to 1 letter amino acid sequence. E.g.
        {'A': 'ABCDEFG'}
      seqres_to_structure: Dict; for each chain_id contains a mapping between
        SEQRES index and a ResidueAtPosition. e.g. {'A': {0: ResidueAtPosition,
                                                          1: ResidueAtPosition,
                                                          ...}}
      raw_string: The parsed mmCIF dict (``_mmcif_dict``) used to construct
        the MmcifObject (despite the name, not the raw file text).
    """

    file_id: str
    header: PdbHeader
    structure: PdbStructure
    chain_to_seqres: Mapping[ChainId, SeqRes]
    seqres_to_structure: Mapping[ChainId, Mapping[int, ResidueAtPosition]]
    raw_string: Any
120
+
121
+
122
@dataclasses.dataclass(frozen=True)
class ParsingResult:
    """Returned by the parse function.

    Contains:
      mmcif_object: A MmcifObject, may be None if no chain could be successfully
        parsed.
      errors: A dict mapping (file_id, chain_id) to any exception generated.
        Empty on full success.
    """

    mmcif_object: Optional[MmcifObject]
    errors: Mapping[Tuple[str, str], Any]
134
+
135
+
136
class ParseError(Exception):
    """An error indicating that an mmCIF file could not be parsed."""
    # NOTE(review): subclasses Exception directly rather than this module's
    # Error base class — confirm that is intentional.
138
+
139
+
140
def mmcif_loop_to_list(
    prefix: str, parsed_info: MmCIFDict
) -> Sequence[Mapping[str, str]]:
    """Extracts one mmCIF loop_ as a list of per-row dicts.

    Reference for loop_ in mmCIF:
    http://mmcif.wwpdb.org/docs/tutorials/mechanics/pdbx-mmcif-syntax.html

    Args:
      prefix: Prefix shared by each of the data items in the loop, including
        the trailing period (e.g. '_entity_poly_seq.').
      parsed_info: A dict of parsed mmCIF data, e.g. _mmcif_dict from a
        Biopython parser.

    Returns:
      A list of dicts, one per row of the loop, each mapping the full column
      name to that row's value.
    """
    cols = [key for key in parsed_info if key.startswith(prefix)]
    data = [parsed_info[key] for key in cols]

    # Every column of a loop must have the same number of rows.
    assert all(len(column) == len(data[0]) for column in data), (
        "mmCIF error: Not all loops are the same length: %s" % cols
    )

    return [dict(zip(cols, row)) for row in zip(*data)]
170
+
171
+
172
def mmcif_loop_to_dict(
    prefix: str,
    index: str,
    parsed_info: MmCIFDict,
) -> Mapping[str, Mapping[str, str]]:
    """Extracts one mmCIF loop_ as a dict of per-row dicts keyed by a column.

    Args:
      prefix: Prefix shared by each of the data items in the loop, including
        the trailing period (e.g. '_entity_poly_seq.').
      index: Which item of loop data should serve as the key.
      parsed_info: A dict of parsed mmCIF data, e.g. _mmcif_dict from a
        Biopython parser.

    Returns:
      A dict mapping each row's value in the index column to that row's dict.
      Later rows with a duplicate key overwrite earlier ones.
    """
    keyed = {}
    for row in mmcif_loop_to_list(prefix, parsed_info):
        keyed[row[index]] = row
    return keyed
193
+
194
+
195
# Results are memoized on (file_id, mmcif_string, catch_all_errors); note the
# cache holds up to 16 full ParsingResults (including Biopython structures).
@functools.lru_cache(16, typed=False)
def parse(
    *, file_id: str, mmcif_string: str, catch_all_errors: bool = True
) -> ParsingResult:
    """Entry point, parses an mmcif_string.

    Args:
      file_id: A string identifier for this file. Should be unique within the
        collection of files being processed.
      mmcif_string: Contents of an mmCIF file.
      catch_all_errors: If True, all exceptions are caught and error messages are
        returned as part of the ParsingResult. If False exceptions will be allowed
        to propagate.

    Returns:
      A ParsingResult.
    """
    errors = {}
    try:
        parser = PDB.MMCIFParser(QUIET=True)
        handle = io.StringIO(mmcif_string)
        full_structure = parser.get_structure("", handle)
        first_model_structure = _get_first_model(full_structure)
        # Extract the _mmcif_dict from the parser, which contains useful fields not
        # reflected in the Biopython structure.
        parsed_info = parser._mmcif_dict  # pylint:disable=protected-access

        # Ensure all values are lists, even if singletons. (This mutates the
        # parser's dict in place.)
        for key, value in parsed_info.items():
            if not isinstance(value, list):
                parsed_info[key] = [value]

        header = _get_header(parsed_info)

        # Determine the protein chains, and their start numbers according to the
        # internal mmCIF numbering scheme (likely but not guaranteed to be 1).
        valid_chains = _get_protein_chains(parsed_info=parsed_info)
        if not valid_chains:
            return ParsingResult(
                None, {(file_id, ""): "No protein chains found in this file."}
            )
        seq_start_num = {
            chain_id: min([monomer.num for monomer in seq])
            for chain_id, seq in valid_chains.items()
        }

        # Loop over the atoms for which we have coordinates. Populate two mappings:
        # -mmcif_to_author_chain_id (maps internal mmCIF chain ids to chain ids used
        # the authors / Biopython).
        # -seq_to_structure_mappings (maps idx into sequence to ResidueAtPosition).
        mmcif_to_author_chain_id = {}
        seq_to_structure_mappings = {}
        for atom in _get_atom_site_list(parsed_info):
            # model_num is a raw string from the mmCIF dict, hence the string
            # comparison.
            if atom.model_num != "1":
                # We only process the first model at the moment.
                continue

            mmcif_to_author_chain_id[atom.mmcif_chain_id] = atom.author_chain_id

            if atom.mmcif_chain_id in valid_chains:
                hetflag = " "
                if atom.hetatm_atom == "HETATM":
                    # Water atoms are assigned a special hetflag of W in Biopython. We
                    # need to do the same, so that this hetflag can be used to fetch
                    # a residue from the Biopython structure by id.
                    if atom.residue_name in ("HOH", "WAT"):
                        hetflag = "W"
                    else:
                        hetflag = "H_" + atom.residue_name
                insertion_code = atom.insertion_code
                if not _is_set(atom.insertion_code):
                    insertion_code = " "
                position = ResiduePosition(
                    chain_id=atom.author_chain_id,
                    residue_number=int(atom.author_seq_num),
                    insertion_code=insertion_code,
                )
                # 0-based index into the SEQRES sequence of this chain.
                seq_idx = (
                    int(atom.mmcif_seq_num) - seq_start_num[atom.mmcif_chain_id]
                )
                current = seq_to_structure_mappings.get(
                    atom.author_chain_id, {}
                )
                current[seq_idx] = ResidueAtPosition(
                    position=position,
                    name=atom.residue_name,
                    is_missing=False,
                    hetflag=hetflag,
                )
                seq_to_structure_mappings[atom.author_chain_id] = current

        # Add missing residue information to seq_to_structure_mappings.
        for chain_id, seq_info in valid_chains.items():
            author_chain = mmcif_to_author_chain_id[chain_id]
            current_mapping = seq_to_structure_mappings[author_chain]
            for idx, monomer in enumerate(seq_info):
                if idx not in current_mapping:
                    current_mapping[idx] = ResidueAtPosition(
                        position=None,
                        name=monomer.id,
                        is_missing=True,
                        hetflag=" ",
                    )

        # Build the one-letter SEQRES sequence per author chain; unknown
        # residues (and multi-letter codes) map to 'X'.
        author_chain_to_sequence = {}
        for chain_id, seq_info in valid_chains.items():
            author_chain = mmcif_to_author_chain_id[chain_id]
            seq = []
            for monomer in seq_info:
                code = PDBData.protein_letters_3to1_extended.get(monomer.id, "X")
                seq.append(code if len(code) == 1 else "X")
            seq = "".join(seq)
            author_chain_to_sequence[author_chain] = seq

        mmcif_object = MmcifObject(
            file_id=file_id,
            header=header,
            structure=first_model_structure,
            chain_to_seqres=author_chain_to_sequence,
            seqres_to_structure=seq_to_structure_mappings,
            raw_string=parsed_info,
        )

        return ParsingResult(mmcif_object=mmcif_object, errors=errors)
    except Exception as e:  # pylint:disable=broad-except
        errors[(file_id, "")] = e
        if not catch_all_errors:
            raise
        return ParsingResult(mmcif_object=None, errors=errors)
324
+
325
+
326
def _get_first_model(structure: PdbStructure) -> PdbStructure:
    """Returns the first model in a Biopython structure."""
    models = structure.get_models()
    return next(models)
329
+
330
+
331
+ _MIN_LENGTH_OF_CHAIN_TO_BE_COUNTED_AS_PEPTIDE = 21
332
+
333
+
334
def get_release_date(parsed_info: MmCIFDict) -> str:
    """Returns the oldest date in the mmCIF revision history."""
    dates = parsed_info["_pdbx_audit_revision_history.revision_date"]
    return min(dates)
338
+
339
+
340
def _get_header(parsed_info: MmCIFDict) -> PdbHeader:
    """Returns a basic header containing method, release date and resolution.

    The returned dict has keys 'structure_method', 'resolution' and, when
    available, 'release_date'.
    """
    header = {}

    # Comma-join all experimental methods (there can be more than one).
    experiments = mmcif_loop_to_list("_exptl.", parsed_info)
    header["structure_method"] = ",".join(
        [experiment["_exptl.method"].lower() for experiment in experiments]
    )

    # Note: The release_date here corresponds to the oldest revision. We prefer to
    # use this for dataset filtering over the deposition_date.
    if "_pdbx_audit_revision_history.revision_date" in parsed_info:
        header["release_date"] = get_release_date(parsed_info)
    else:
        logging.warning(
            "Could not determine release_date: %s", parsed_info["_entry.id"]
        )

    # Default resolution when no field parses; note that if several of the
    # candidate fields are present, the last parsable one wins (no break).
    header["resolution"] = 0.00
    for res_key in (
        "_refine.ls_d_res_high",
        "_em_3d_reconstruction.resolution",
        "_reflns.d_resolution_high",
    ):
        if res_key in parsed_info:
            try:
                raw_resolution = parsed_info[res_key][0]
                header["resolution"] = float(raw_resolution)
            except ValueError:
                logging.debug(
                    "Invalid resolution format: %s", parsed_info[res_key]
                )

    return header
374
+
375
+
376
def _get_atom_site_list(parsed_info: MmCIFDict) -> Sequence[AtomSite]:
    """Builds one AtomSite per row of the mmCIF ``_atom_site`` loop.

    The fields are taken column-wise from the parsed dict; the column order
    must match the AtomSite field order.
    """
    columns = (
        parsed_info["_atom_site.label_comp_id"],
        parsed_info["_atom_site.auth_asym_id"],
        parsed_info["_atom_site.label_asym_id"],
        parsed_info["_atom_site.auth_seq_id"],
        parsed_info["_atom_site.label_seq_id"],
        parsed_info["_atom_site.pdbx_PDB_ins_code"],
        parsed_info["_atom_site.group_PDB"],
        parsed_info["_atom_site.pdbx_PDB_model_num"],
    )
    return [AtomSite(*row) for row in zip(*columns)]
391
+
392
+
393
def _get_protein_chains(
    *, parsed_info: Mapping[str, Any]
) -> Mapping[ChainId, Sequence[Monomer]]:
    """Extracts polymer information for protein chains only.

    Args:
      parsed_info: _mmcif_dict produced by the Biopython parser.

    Returns:
      A dict mapping mmcif chain id to a list of Monomers.
    """
    # Get polymer information for each entity in the structure.
    entity_poly_seqs = mmcif_loop_to_list("_entity_poly_seq.", parsed_info)

    # Group the monomers of each polymer by entity id, in loop order.
    polymers = collections.defaultdict(list)
    for entity_poly_seq in entity_poly_seqs:
        polymers[entity_poly_seq["_entity_poly_seq.entity_id"]].append(
            Monomer(
                id=entity_poly_seq["_entity_poly_seq.mon_id"],
                num=int(entity_poly_seq["_entity_poly_seq.num"]),
            )
        )

    # Get chemical compositions. Will allow us to identify which of these polymers
    # are proteins.
    chem_comps = mmcif_loop_to_dict("_chem_comp.", "_chem_comp.id", parsed_info)

    # Get chains information for each entity. Necessary so that we can return a
    # dict keyed on chain id rather than entity.
    struct_asyms = mmcif_loop_to_list("_struct_asym.", parsed_info)

    entity_to_mmcif_chains = collections.defaultdict(list)
    for struct_asym in struct_asyms:
        chain_id = struct_asym["_struct_asym.id"]
        entity_id = struct_asym["_struct_asym.entity_id"]
        entity_to_mmcif_chains[entity_id].append(chain_id)

    # Identify and return the valid protein chains. A polymer counts as
    # protein if ANY of its monomers has a peptide-like chem_comp type.
    valid_chains = {}
    for entity_id, seq_info in polymers.items():
        chain_ids = entity_to_mmcif_chains[entity_id]

        # Reject polymers without any peptide-like components, such as DNA/RNA.
        if any(
            [
                "peptide" in chem_comps[monomer.id]["_chem_comp.type"]
                for monomer in seq_info
            ]
        ):
            for chain_id in chain_ids:
                valid_chains[chain_id] = seq_info
    return valid_chains
445
+
446
+
447
+ def _is_set(data: str) -> bool:
448
+ """Returns False if data is a special mmCIF character indicating 'unset'."""
449
+ return data not in (".", "?")
450
+
451
+
452
def get_atom_coords(
    mmcif_object: MmcifObject,
    chain_id: str,
    _zero_center_positions: bool = False
) -> Tuple[np.ndarray, np.ndarray]:
    """Extracts atom37 coordinates for one chain of a parsed mmCIF.

    Args:
      mmcif_object: A parsed mmCIF structure (output of ``parse``).
      chain_id: Author chain id whose coordinates should be extracted.
      _zero_center_positions: If True, translate the resolved atoms so that
        their mean position is at the origin.

    Returns:
      Tuple of:
        * float32 array of shape [num_res, 37, 3] with atom positions,
        * float32 mask of shape [num_res, 37], 1.0 where an atom is present.

    Raises:
      MultipleChainsError: If the structure does not contain exactly one
        chain with the given id.
    """
    # Locate the right chain
    chains = list(mmcif_object.structure.get_chains())
    relevant_chains = [c for c in chains if c.id == chain_id]
    if len(relevant_chains) != 1:
        raise MultipleChainsError(
            f"Expected exactly one chain in structure with id {chain_id}."
        )
    chain = relevant_chains[0]

    # Extract the coordinates
    num_res = len(mmcif_object.chain_to_seqres[chain_id])
    all_atom_positions = np.zeros(
        [num_res, residue_constants.atom_type_num, 3], dtype=np.float32
    )
    all_atom_mask = np.zeros(
        [num_res, residue_constants.atom_type_num], dtype=np.float32
    )
    for res_index in range(num_res):
        pos = np.zeros([residue_constants.atom_type_num, 3], dtype=np.float32)
        mask = np.zeros([residue_constants.atom_type_num], dtype=np.float32)
        res_at_position = mmcif_object.seqres_to_structure[chain_id][res_index]
        if not res_at_position.is_missing:
            # Biopython residues are addressed by (hetflag, resseq, icode).
            res = chain[
                (
                    res_at_position.hetflag,
                    res_at_position.position.residue_number,
                    res_at_position.position.insertion_code,
                )
            ]
            for atom in res.get_atoms():
                atom_name = atom.get_name()
                x, y, z = atom.get_coord()
                if atom_name in residue_constants.atom_order.keys():
                    pos[residue_constants.atom_order[atom_name]] = [x, y, z]
                    mask[residue_constants.atom_order[atom_name]] = 1.0
                elif atom_name.upper() == "SE" and res.get_resname() == "MSE":
                    # Put the coords of the selenium atom in the sulphur column
                    pos[residue_constants.atom_order["SD"]] = [x, y, z]
                    mask[residue_constants.atom_order["SD"]] = 1.0

            # Fix naming errors in arginine residues where NH2 is incorrectly
            # assigned to be closer to CD than NH1
            cd = residue_constants.atom_order['CD']
            nh1 = residue_constants.atom_order['NH1']
            nh2 = residue_constants.atom_order['NH2']
            if (
                res.get_resname() == 'ARG' and
                all(mask[atom_index] for atom_index in (cd, nh1, nh2)) and
                (np.linalg.norm(pos[nh1] - pos[cd]) >
                 np.linalg.norm(pos[nh2] - pos[cd]))
            ):
                # Swap both coordinates and masks to keep them consistent.
                pos[nh1], pos[nh2] = pos[nh2].copy(), pos[nh1].copy()
                mask[nh1], mask[nh2] = mask[nh2].copy(), mask[nh1].copy()

        all_atom_positions[res_index] = pos
        all_atom_mask[res_index] = mask

    if _zero_center_positions:
        # Translate so that the mean of all resolved atoms sits at the origin.
        binary_mask = all_atom_mask.astype(bool)
        translation_vec = all_atom_positions[binary_mask].mean(axis=0)
        all_atom_positions[binary_mask] -= translation_vec

    return all_atom_positions, all_atom_mask
PhysDock/data/tools/msa_identifiers.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 DeepMind Technologies Limited
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Utilities for extracting identifiers from MSA sequence descriptions."""
16
+
17
+ import dataclasses
18
+ import re
19
+ from typing import Optional
20
+
21
+
22
+ # Sequences coming from UniProtKB database come in the
23
+ # `db|UniqueIdentifier|EntryName` format, e.g. `tr|A0A146SKV9|A0A146SKV9_FUNHE`
24
+ # or `sp|P0C2L1|A3X1_LOXLA` (for TREMBL/Swiss-Prot respectively).
25
+ _UNIPROT_PATTERN = re.compile(
26
+ r"""
27
+ ^
28
+ # UniProtKB/TrEMBL or UniProtKB/Swiss-Prot
29
+ (?:tr|sp)
30
+ \|
31
+ # A primary accession number of the UniProtKB entry.
32
+ (?P<AccessionIdentifier>[A-Za-z0-9]{6,10})
33
+ # Occasionally there is a _0 or _1 isoform suffix, which we ignore.
34
+ (?:_\d)?
35
+ \|
36
+ # TREMBL repeats the accession ID here. Swiss-Prot has a mnemonic
37
+ # protein ID code.
38
+ (?:[A-Za-z0-9]+)
39
+ _
40
+ # A mnemonic species identification code.
41
+ (?P<SpeciesIdentifier>([A-Za-z0-9]){1,5})
42
+ # Small BFD uses a final value after an underscore, which we ignore.
43
+ (?:_\d+)?
44
+ $
45
+ """,
46
+ re.VERBOSE,
47
+ )
48
+
49
+
50
@dataclasses.dataclass(frozen=True)
class Identifiers:
    # Mnemonic species code parsed from a UniProtKB entry name
    # (e.g. "FUNHE" from "tr|A0A146SKV9|A0A146SKV9_FUNHE");
    # empty string when no species could be determined.
    species_id: str = ""
53
+
54
+
55
def _parse_sequence_identifier(msa_sequence_identifier: str) -> Identifiers:
    """Extract the species code from a UniProtKB-style sequence identifier.

    The identifier is expected to match _UNIPROT_PATTERN, e.g.
    `tr|A0A146SKV9|A0A146SKV9_FUNHE`.

    Args:
      msa_sequence_identifier: a sequence identifier.

    Returns:
      An `Identifiers` instance. Its species_id is empty when the identifier
      does not match the UniProtKB format.
    """
    match = _UNIPROT_PATTERN.search(msa_sequence_identifier.strip())
    if match is None:
        return Identifiers()
    return Identifiers(species_id=match.group("SpeciesIdentifier"))
73
+
74
+
75
+ def _extract_sequence_identifier(description: str) -> Optional[str]:
76
+ """Extracts sequence identifier from description. Returns None if no match."""
77
+ split_description = description.split()
78
+ if split_description:
79
+ return split_description[0].partition("/")[0]
80
+ else:
81
+ return None
82
+
83
+
84
def get_identifiers(description: str) -> Identifiers:
    """Compute MSA species identifiers from an MSA sequence description line."""
    seq_id = _extract_sequence_identifier(description)
    return Identifiers() if seq_id is None else _parse_sequence_identifier(seq_id)
PhysDock/data/tools/msa_pairing.py ADDED
@@ -0,0 +1,496 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 DeepMind Technologies Limited
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Pairing logic for multimer data pipeline."""
16
+
17
+ import collections
18
+ import functools
19
+ import string
20
+ from typing import Any, Dict, Iterable, List, Sequence, Mapping
21
+ import numpy as np
22
+ import pandas as pd
23
+ import scipy.linalg
24
+ from scipy.linalg import block_diag
25
+
26
+ # TODO: This stuff should probably also be in a config
27
+
28
+ MSA_GAP_IDX = 31
29
+ SEQUENCE_GAP_CUTOFF = 0.5
30
+ SEQUENCE_SIMILARITY_CUTOFF = 0.9
31
+
32
+ MAX_MSA_SIZE = 16384
33
+
34
+ MSA_PAD_VALUES = {'msa_all_seq': MSA_GAP_IDX,
35
+ 'msa_mask_all_seq': 1,
36
+ 'deletion_matrix_all_seq': 0,
37
+ 'deletion_matrix_int_all_seq': 0,
38
+ 'msa': MSA_GAP_IDX,
39
+ 'msa_mask': 1,
40
+ 'deletion_matrix': 0,
41
+ 'deletion_matrix_int': 0}
42
+
43
+ MSA_FEATURES = ('msa', 'msa_mask', 'deletion_matrix')
44
+ SEQ_FEATURES = ('residue_index', 'aatype', 'all_atom_positions',
45
+ 'all_atom_mask', 'seq_mask', 'between_segment_residues',
46
+ 'has_alt_locations', 'has_hetatoms', 'asym_id', 'entity_id',
47
+ 'sym_id', 'entity_mask', 'deletion_mean',
48
+ 'prediction_atom_mask',
49
+ 'literature_positions', 'atom_indices_to_group_indices',
50
+ 'rigid_group_default_frame',
51
+ 'restype', 'token_index', 'token_exists', 's_mask',
52
+ "token_id_to_centre_atom_id",
53
+ "token_id_to_pseudo_beta_atom_id",
54
+ "token_id_to_chunk_sizes",
55
+ "token_id_to_conformer_id",
56
+ "is_protein",
57
+ "is_dna",
58
+ "is_rna",
59
+ "is_ligand",
60
+ "atom_index",
61
+ "atom_id_to_token_id",
62
+ "ref_space_uid",
63
+ "ref_pos",
64
+ "ref_feat",
65
+ "x_gt",
66
+ "x_exists",
67
+ "b_factors",
68
+ "a_mask"
69
+
70
+ )
71
+ TEMPLATE_FEATURES = ('template_aatype', 'template_all_atom_positions',
72
+ 'template_all_atom_mask')
73
+ CHAIN_FEATURES = ('num_alignments', 'seq_length')
74
+
75
+
76
def create_paired_features(
        chains: Iterable[Mapping[str, np.ndarray]],
) -> List[Mapping[str, np.ndarray]]:
    """Returns the original chains with paired NUM_SEQ features.

    Args:
      chains: A list of feature dictionaries for each chain
        (e.g. list(all_chain_features.values())).

    Returns:
      A list of feature dictionaries whose `_all_seq` features contain only
      the rows selected by cross-chain pairing.
    """
    chains = list(chains)
    # A single chain has nothing to pair against.
    if len(chains) < 2:
        return chains

    feature_names = chains[0].keys()
    paired_rows = reorder_paired_rows(pair_sequences(chains))

    updated_chains = []
    for chain_idx, chain in enumerate(chains):
        row_selection = paired_rows[:, chain_idx]
        # Start from all non-paired features, then overwrite the paired ones.
        new_chain = {k: v for k, v in chain.items() if '_all_seq' not in k}
        for name in feature_names:
            if name.endswith('_all_seq'):
                # Row index -1 selects the padding row appended here.
                padded = pad_features(chain[name], name)
                new_chain[name] = padded[row_selection]
        new_chain['num_alignments_all_seq'] = np.asarray(len(row_selection))
        updated_chains.append(new_chain)
    return updated_chains
114
+
115
+
116
def pad_features(feature: np.ndarray, feature_name: str) -> np.ndarray:
    """Add a 'padding' row at the end of the features list.

    The padding row will be selected as a 'paired' row in the case of partial
    alignment - for the chain that doesn't have paired alignment.

    Args:
      feature: The feature to be padded.
      feature_name: The name of the feature to be padded.

    Returns:
      The feature with an additional padding row, or the original array
      unchanged for feature names that need no padding.
    """
    # BUG FIX: `np.string_` was removed in NumPy 2.0; `np.bytes_` is the
    # equivalent alias available in both NumPy 1.x and 2.x.
    assert feature.dtype != np.dtype(np.bytes_)
    if feature_name in ('msa_all_seq',
                        'deletion_matrix_all_seq'):
        num_res = feature.shape[1]
        padding = MSA_PAD_VALUES[feature_name] * np.ones([1, num_res],
                                                         feature.dtype)
    elif feature_name == 'msa_species_identifiers_all_seq':
        padding = [b'']
    else:
        # Features without NUM_SEQ leading dimension are returned untouched.
        return feature
    feats_padded = np.concatenate([feature, padding], axis=0)
    return feats_padded
141
+
142
+
143
+ def _make_msa_df(chain_features: Mapping[str, np.ndarray]) -> pd.DataFrame:
144
+ """Makes dataframe with msa features needed for msa pairing."""
145
+ chain_msa = chain_features['msa_all_seq']
146
+ query_seq = chain_msa[0]
147
+
148
+ per_seq_similarity = np.sum(
149
+ query_seq[None] == chain_msa, axis=-1) / float(len(query_seq))
150
+ per_seq_gap = np.sum(chain_msa == 31, axis=-1) / float(len(query_seq))
151
+ msa_df = pd.DataFrame({
152
+ 'msa_species_identifiers':
153
+ chain_features['msa_species_identifiers_all_seq'],
154
+ 'msa_row':
155
+ np.arange(len(
156
+ chain_features['msa_species_identifiers_all_seq'])),
157
+ 'msa_similarity': per_seq_similarity,
158
+ 'gap': per_seq_gap
159
+ })
160
+ return msa_df
161
+
162
+
163
+ def _create_species_dict(msa_df: pd.DataFrame) -> Dict[bytes, pd.DataFrame]:
164
+ """Creates mapping from species to msa dataframe of that species."""
165
+ species_lookup = {}
166
+ for species, species_df in msa_df.groupby('msa_species_identifiers'):
167
+ species_lookup[species] = species_df
168
+ return species_lookup
169
+
170
+
171
+ def _match_rows_by_sequence_similarity(this_species_msa_dfs: List[pd.DataFrame]
172
+ ) -> List[List[int]]:
173
+ """Finds MSA sequence pairings across chains based on sequence similarity.
174
+
175
+ Each chain's MSA sequences are first sorted by their sequence similarity to
176
+ their respective target sequence. The sequences are then paired, starting
177
+ from the sequences most similar to their target sequence.
178
+
179
+ Args:
180
+ this_species_msa_dfs: a list of dataframes containing MSA features for
181
+ sequences for a specific species.
182
+
183
+ Returns:
184
+ A list of lists, each containing M indices corresponding to paired MSA rows,
185
+ where M is the number of chains.
186
+ """
187
+ all_paired_msa_rows = []
188
+
189
+ num_seqs = [len(species_df) for species_df in this_species_msa_dfs
190
+ if species_df is not None]
191
+ take_num_seqs = np.min(num_seqs)
192
+
193
+ sort_by_similarity = (
194
+ lambda x: x.sort_values('msa_similarity', axis=0, ascending=False))
195
+
196
+ for species_df in this_species_msa_dfs:
197
+ if species_df is not None:
198
+ species_df_sorted = sort_by_similarity(species_df)
199
+ msa_rows = species_df_sorted.msa_row.iloc[:take_num_seqs].values
200
+ else:
201
+ msa_rows = [-1] * take_num_seqs # take the last 'padding' row
202
+ all_paired_msa_rows.append(msa_rows)
203
+ all_paired_msa_rows = list(np.array(all_paired_msa_rows).transpose())
204
+ return all_paired_msa_rows
205
+
206
+
207
def pair_sequences(
        examples: List[Mapping[str, np.ndarray]],
) -> Dict[int, np.ndarray]:
    """Returns indices for paired MSA sequences across chains.

    Args:
      examples: one feature dict per chain; each must contain 'msa_all_seq'
        and 'msa_species_identifiers_all_seq'.

    Returns:
      Mapping from "number of chains matched" to an array of paired row
      indices (shape [num_pairings, num_chains]); index 0 always pairs the
      query rows of all chains.
    """

    num_examples = len(examples)

    # Per-chain species -> dataframe lookups, plus the union of all species.
    all_chain_species_dict = []
    common_species = set()
    for chain_features in examples:
        msa_df = _make_msa_df(chain_features)
        species_dict = _create_species_dict(msa_df)
        all_chain_species_dict.append(species_dict)
        common_species.update(set(species_dict))

    common_species = sorted(common_species)
    # The query rows carry an empty species id; they are paired explicitly
    # below. NOTE(review): raises ValueError if b'' is absent — assumes every
    # chain's query row has an empty species identifier; confirm upstream.
    common_species.remove(b'')  # Remove target sequence species.

    # NOTE(review): this local is extended but never returned or read again —
    # only all_paired_msa_rows_dict reaches the caller.
    all_paired_msa_rows = [np.zeros(len(examples), int)]

    # Bucket pairings by how many chains actually matched; the all-chains
    # bucket starts with the query-row pairing (row 0 in every chain).
    all_paired_msa_rows_dict = {k: [] for k in range(num_examples)}
    all_paired_msa_rows_dict[num_examples] = [np.zeros(len(examples), int)]

    for species in common_species:
        if not species:
            continue
        this_species_msa_dfs = []
        species_dfs_present = 0
        for species_dict in all_chain_species_dict:
            if species in species_dict:
                this_species_msa_dfs.append(species_dict[species])
                species_dfs_present += 1
            else:
                this_species_msa_dfs.append(None)

        # Skip species that are present in only one chain.
        if species_dfs_present <= 1:
            continue

        # Skip species with very deep per-chain MSAs (cost guard).
        if np.any(
                np.array([len(species_df) for species_df in
                          this_species_msa_dfs if
                          isinstance(species_df, pd.DataFrame)]) > 600):
            continue

        paired_msa_rows = _match_rows_by_sequence_similarity(this_species_msa_dfs)
        all_paired_msa_rows.extend(paired_msa_rows)
        all_paired_msa_rows_dict[species_dfs_present].extend(paired_msa_rows)
    # Convert each bucket's list of row tuples to a single 2-D array.
    all_paired_msa_rows_dict = {
        num_examples: np.array(paired_msa_rows) for
        num_examples, paired_msa_rows in all_paired_msa_rows_dict.items()
    }
    return all_paired_msa_rows_dict
260
+
261
+
262
def reorder_paired_rows(all_paired_msa_rows_dict: Dict[int, np.ndarray]
                        ) -> np.ndarray:
    """Creates an array of paired MSA row indices across chains.

    Args:
      all_paired_msa_rows_dict: mapping from the number of paired chains to
        the paired row indices.

    Returns:
      An array of paired row index tuples, ordered by (1) number of chains in
      the pairing, all-chain pairings first, then (2) the magnitude of the
      row-index product within each group.
    """
    ordered_rows = []
    for num_pairings in sorted(all_paired_msa_rows_dict, reverse=True):
        group = all_paired_msa_rows_dict[num_pairings]
        # Rank pairings inside the group by |product of row indices|.
        sort_keys = abs(np.array([np.prod(rows) for rows in group]))
        ordered_rows.extend(group[np.argsort(sort_keys)])
    return np.array(ordered_rows)
286
+
287
+
288
+ # def block_diag(*arrs: np.ndarray, pad_value: float = 0.0) -> np.ndarray:
289
+ # """Like scipy.linalg.block_diag but with an optional padding value."""
290
+ # ones_arrs = [np.ones_like(x) for x in arrs]
291
+ # off_diag_mask = 1.0 - scipy.linalg.block_diag(*ones_arrs)
292
+ # diag = scipy.linalg.block_diag(*arrs)
293
+ # diag += (off_diag_mask * pad_value).astype(diag.dtype)
294
+ # return diag
295
+
296
+
297
+ def _correct_post_merged_feats(
298
+ np_example: Mapping[str, np.ndarray],
299
+ np_chains_list: Sequence[Mapping[str, np.ndarray]],
300
+ pair_msa_sequences: bool
301
+ ) -> Mapping[str, np.ndarray]:
302
+ """Adds features that need to be computed/recomputed post merging."""
303
+ np_example['seq_length'] = np.asarray(
304
+ len(np_example['sequence_3'].split("-")),
305
+ dtype=np.int32
306
+ )
307
+ np_example['num_alignments'] = np.asarray(
308
+ np_example['msa'].shape[0],
309
+ dtype=np.int32
310
+ )
311
+
312
+ return np_example
313
+
314
+
315
def _pad_templates(chains: Sequence[Mapping[str, np.ndarray]],
                   max_templates: int) -> Sequence[Mapping[str, np.ndarray]]:
    """For each chain pad the number of templates to a fixed size.

    Args:
      chains: A list of protein chains.
      max_templates: Each chain will be padded to have this many templates.

    Returns:
      The same list of chains, with template features padded (in place) along
      the leading template axis to max_templates.
    """
    for chain in chains:
        for name, value in chain.items():
            if name not in TEMPLATE_FEATURES:
                continue
            # Pad only axis 0 (template count); every other axis is kept.
            pad_widths = [(0, 0)] * value.ndim
            pad_widths[0] = (0, max_templates - value.shape[0])
            chain[name] = np.pad(value, pad_widths, mode='constant')
    return chains
335
+
336
+
337
def _merge_features_from_multiple_chains(
        chains: Sequence[Mapping[str, np.ndarray]],
        pair_msa_sequences: bool) -> Mapping[str, np.ndarray]:
    """Merge features from multiple chains.

    Args:
      chains: A list of feature dictionaries that we want to merge.
      pair_msa_sequences: Whether to concatenate MSA features along the
        num_res dimension (if True), or to block diagonalize them (if False).

    Returns:
      A feature dictionary for the merged example.
    """
    merged_example = {}

    for feature_name in chains[0]:
        # These are recomputed (or dropped) after merging.
        if feature_name in ("msa_species_identifiers",
                            "num_alignments_all_seq",
                            "num_alignments"):
            continue
        feats = [x[feature_name] for x in chains]

        feature_name_split = feature_name.split('_all_seq')[0]
        if feature_name_split in MSA_FEATURES:
            merged_example[feature_name] = np.concatenate(feats, axis=1)
        elif feature_name_split == "templ_feat":
            # Block-diagonal merge: each chain's pairwise template features go
            # on the diagonal; off-diagonal (cross-chain) entries stay zero.
            num_templ = feats[0].shape[0]
            total_len = sum(feat.shape[1] for feat in feats)
            out_mat = np.zeros([num_templ, total_len, total_len, 108],
                               dtype=np.float32)
            start = 0
            end = 0
            for feat in feats:
                end += feat.shape[1]
                out_mat[:, start:end, start:end] = feat
                start = end
            merged_example[feature_name] = out_mat
        elif feature_name_split in SEQ_FEATURES:
            merged_example[feature_name] = np.concatenate(feats, axis=0)
        elif feature_name_split in TEMPLATE_FEATURES:
            merged_example[feature_name] = np.concatenate(feats, axis=1)
        elif feature_name_split in CHAIN_FEATURES:
            # BUG FIX: the original called np.sum on a generator
            # (np.sum(x for x in feats)), which is deprecated in NumPy and
            # slated to raise; summing the list directly is equivalent.
            merged_example[feature_name] = np.sum(feats).astype(np.int32)
        else:
            # Identical across chains by construction; keep the first copy.
            merged_example[feature_name] = feats[0]

    return merged_example
382
+
383
+
384
def _merge_homomers_dense_msa(
        chains: Iterable[Mapping[str, np.ndarray]]) -> Sequence[Mapping[str, np.ndarray]]:
    """Merge all identical chains, making the resulting MSA dense.

    Args:
      chains: An iterable of features for each chain.

    Returns:
      A list of feature dictionaries, one per entity_id: chains sharing an
      entity_id are merged with their MSA features concatenated along the
      num_res dimension (dense).
    """
    by_entity = collections.defaultdict(list)
    for chain in chains:
        by_entity[chain['entity_id'][0]].append(chain)

    return [
        _merge_features_from_multiple_chains(by_entity[entity_id],
                                             pair_msa_sequences=True)
        for entity_id in sorted(by_entity)
    ]
411
+
412
+
413
def _concatenate_paired_and_unpaired_features(
        example: Mapping[str, np.ndarray]) -> Mapping[str, np.ndarray]:
    """Stacks paired ('_all_seq') MSA rows on top of the unpaired rows.

    Mutates and returns *example*: for each MSA feature present in both
    paired and unpaired form, the merged array replaces the unpaired one and
    the '_all_seq' entry is removed.
    """
    for feature_name in MSA_FEATURES:
        paired_name = feature_name + '_all_seq'
        if feature_name not in example or paired_name not in example:
            continue
        # Paired rows come first.
        example[feature_name] = np.concatenate(
            [example[paired_name], example[feature_name]], axis=0)
        example.pop(paired_name, None)
    return example
430
+
431
+
432
def merge_chain_features(np_chains_list: List[Mapping[str, np.ndarray]],
                         pair_msa_sequences: bool,
                         max_templates: int) -> Mapping[str, np.ndarray]:
    """Merges features for multiple chains to single FeatureDict.

    Args:
      np_chains_list: List of FeatureDicts for each chain.
      pair_msa_sequences: Whether to merge paired MSAs.
      max_templates: The maximum number of templates to include.

    Returns:
      Single FeatureDict for entire complex.
    """
    # Equalize the template count across chains first.
    np_chains_list = _pad_templates(np_chains_list,
                                    max_templates=max_templates)
    # Densify per-entity (homomer) MSAs before cross-chain merging.
    np_chains_list = _merge_homomers_dense_msa(np_chains_list)
    # Stack each chain's paired rows on top of its unpaired rows.
    np_chains_list = [
        _concatenate_paired_and_unpaired_features(np_chain)
        for np_chain in np_chains_list
    ]

    assert len(np_chains_list) > 0, f"np_chain_list, error, {np_chains_list}"
    np_example = _merge_features_from_multiple_chains(
        np_chains_list, pair_msa_sequences=False)

    return _correct_post_merged_feats(
        np_example=np_example,
        np_chains_list=np_chains_list,
        pair_msa_sequences=pair_msa_sequences)
472
+
473
+
474
def deduplicate_unpaired_sequences(
        np_chains: List[Mapping[str, np.ndarray]]) -> List[Mapping[str, np.ndarray]]:
    """Removes unpaired sequences which duplicate a paired sequence.

    Mutates each chain dict in place and returns the list. 'num_alignments'
    is refreshed from the (possibly filtered) 'msa' feature.
    """
    feature_names = np_chains[0].keys()
    msa_features = MSA_FEATURES

    for chain in np_chains:
        # Convert each paired MSA row to a tuple so it is hashable.
        sequence_set = set(tuple(s) for s in chain['msa_all_seq'])
        keep_rows = []
        # Keep only unpaired rows that do not duplicate a paired sequence.
        for row_num, seq in enumerate(chain['msa']):
            if tuple(seq) not in sequence_set:
                keep_rows.append(row_num)
        # BUG FIX: the original tested `keep_rows is not None`, which is
        # always true for a list; when every unpaired row was a duplicate
        # this sliced the MSA features down to zero rows. Only filter when
        # at least one row survives (matches upstream AlphaFold behavior).
        if keep_rows:
            for feature_name in feature_names:
                if feature_name in msa_features:
                    chain[feature_name] = chain[feature_name][keep_rows]
        chain['num_alignments'] = np.array(chain['msa'].shape[0],
                                           dtype=np.int32)

    return np_chains
PhysDock/data/tools/nhmmer.py ADDED
@@ -0,0 +1,257 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 AlQuraishi Laboratory
2
+ # Copyright 2021 DeepMind Technologies Limited
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Library to run Jackhmmer from Python."""
17
+
18
+ from concurrent import futures
19
+ import glob
20
+ import logging
21
+ import os
22
+ import subprocess
23
+ from typing import Any, Callable, Mapping, Optional, Sequence
24
+ from urllib import request
25
+
26
+ from . import parsers
27
+ from . import utils
28
+
29
+
30
class Nhmmer:
    """Python wrapper of the nhmmer binary (HMMER nucleotide search).

    Adapted from the Jackhmmer wrapper; several log/error messages still
    mention "Jackhmmer".
    """

    def __init__(
        self,
        *,
        binary_path: str,
        database_path: str,
        n_cpu: int = 8,  # --cpu
        e_value: float = 0.001,  # -E --incE | Z-value is not used in AF3
        filter_f3: float = 0.00005,  # --F3; 0.02 for sequences shorter than 50 nucleotides
        get_tblout: bool = False,
        incdom_e: Optional[float] = None,
        dom_e: Optional[float] = None,
        num_streamed_chunks: Optional[int] = None,
        streaming_callback: Optional[Callable[[int], None]] = None,
    ):
        """Initializes the Python Nhmmer wrapper.

        Args:
          binary_path: The path to the nhmmer executable.
          database_path: The path to the nucleotide database (FASTA format).
          n_cpu: The number of CPUs to give nhmmer.
          e_value: E-value threshold, used for both -E and --incE.
          filter_f3: Forward pre-filter threshold (--F3), set to >1.0 to turn
            off. NOTE: _query_chunk overrides this per query based on the
            query sequence length.
          get_tblout: Whether to save tblout string.
          incdom_e: Domain e-value criteria for inclusion of domains in
            MSA/next round (--incdomE).
          dom_e: Domain e-value criteria for inclusion in tblout (--domE).
          num_streamed_chunks: Number of database chunks to stream over.
          streaming_callback: Callback function run after each chunk iteration
            with the iteration number as argument.
        """
        self.binary_path = binary_path
        self.database_path = database_path
        self.num_streamed_chunks = num_streamed_chunks

        # The database only has to exist locally when it is not streamed in
        # remote chunks.
        if (
            not os.path.exists(self.database_path)
            and num_streamed_chunks is None
        ):
            logging.error("Could not find Jackhmmer database %s", database_path)
            raise ValueError(
                f"Could not find Jackhmmer database {database_path}"
            )

        self.n_cpu = n_cpu
        self.e_value = e_value
        self.filter_f3 = filter_f3
        self.incdom_e = incdom_e
        self.dom_e = dom_e
        self.get_tblout = get_tblout
        self.streaming_callback = streaming_callback

    def _query_chunk(
        self,
        input_fasta_path: str,
        database_path: str,
        max_sequences: Optional[int] = None
    ) -> Mapping[str, Any]:
        """Queries a single database chunk with nhmmer.

        Returns a dict with keys 'sto' (Stockholm alignment), 'tbl'
        (tblout text, empty unless get_tblout), 'stderr' and 'e_value'.
        """

        with open(input_fasta_path, "r") as f:
            # `desc` is parsed but unused.
            sequences, desc = parsers.parse_fasta(f.read())
        assert len(sequences) == 1, f"Parse Fasta File with only 1 Sequence, but found {len(sequences)}"
        # Relax the forward filter for very short queries.
        # NOTE(review): mutating self.filter_f3 makes the instance stateful
        # per query and not thread-safe — consider a local variable.
        if len(sequences[0]) < 50:
            self.filter_f3 = 0.02
        else:
            self.filter_f3 = 0.00005
        with utils.tmpdir_manager() as query_tmp_dir:
            sto_path = os.path.join(query_tmp_dir, "output.sto")

            # The F1/F2/F3 are the expected proportion to pass each of the
            # filtering stages (which get progressively more expensive);
            # reducing these speeds up the pipeline at the expense of
            # sensitivity.
            cmd_flags = [
                # Don't pollute stdout with nhmmer output.
                "-o",
                "/dev/null",
                "-A",
                sto_path,
                "--noali",
                # Report only sequences with E-values <= x in per-sequence output.
                "-E",
                str(self.e_value),
                "--incE",
                str(self.e_value),
                "--rna",
                "--watson",
                "--F3",  # Only F3 is used
                str(self.filter_f3),
                "--cpu",
                str(self.n_cpu),
            ]
            if self.get_tblout:
                tblout_path = os.path.join(query_tmp_dir, "tblout.txt")
                cmd_flags.extend(["--tblout", tblout_path])

            if self.dom_e is not None:
                cmd_flags.extend(["--domE", str(self.dom_e)])

            if self.incdom_e is not None:
                cmd_flags.extend(["--incdomE", str(self.incdom_e)])

            cmd = (
                [self.binary_path]
                + cmd_flags
                + [input_fasta_path, database_path]
            )
            logging.info('Launching subprocess "%s"', " ".join(cmd))
            process = subprocess.Popen(
                cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
            )
            with utils.timing(
                f"Nhmmer ({os.path.basename(database_path)}) query"
            ):
                _, stderr = process.communicate()
                retcode = process.wait()

            if retcode:
                raise RuntimeError(
                    "Nhmmer failed\nstderr:\n%s\n" % stderr.decode("utf-8")
                )

            # Get e-values for each target name
            tbl = ""
            if self.get_tblout:
                with open(tblout_path) as f:
                    tbl = f.read()

            # Optionally truncate the alignment to cap memory use downstream.
            if (max_sequences is None):
                with open(sto_path) as f:
                    sto = f.read()
            else:
                sto = parsers.truncate_stockholm_msa(sto_path, max_sequences)

            raw_output = dict(
                sto=sto,
                tbl=tbl,
                stderr=stderr,
                e_value=self.e_value,
            )

            return raw_output

    def query(self,
              input_fasta_path: str,
              max_sequences: Optional[int] = None
              ) -> Sequence[Sequence[Mapping[str, Any]]]:
        """Queries the database with a single FASTA file; see query_multiple."""
        return self.query_multiple([input_fasta_path], max_sequences)

    def query_multiple(self,
                       input_fasta_paths: Sequence[str],
                       max_sequences: Optional[int] = None
                       ) -> Sequence[Sequence[Mapping[str, Any]]]:
        """Queries the database using Nhmmer for every input FASTA path."""
        # Non-streamed case: one result dict per input file.
        if self.num_streamed_chunks is None:
            single_chunk_results = []
            for input_fasta_path in input_fasta_paths:
                single_chunk_result = self._query_chunk(
                    input_fasta_path, self.database_path, max_sequences,
                )
                single_chunk_results.append(single_chunk_result)
            return single_chunk_results

        db_basename = os.path.basename(self.database_path)
        db_remote_chunk = lambda db_idx: f"{self.database_path}.{db_idx}"
        db_local_chunk = lambda db_idx: f"/tmp/ramdisk/{db_basename}.{db_idx}"

        # Remove existing files to prevent OOM
        for f in glob.glob(db_local_chunk("[0-9]*")):
            try:
                os.remove(f)
            except OSError:
                print(f"OSError while deleting {f}")

        # Download the (i+1)-th chunk while nhmmer is running on the i-th chunk
        with futures.ThreadPoolExecutor(max_workers=2) as executor:
            chunked_outputs = [[] for _ in range(len(input_fasta_paths))]
            for i in range(1, self.num_streamed_chunks + 1):
                # Copy the chunk locally
                if i == 1:
                    future = executor.submit(
                        request.urlretrieve,
                        db_remote_chunk(i),
                        db_local_chunk(i),
                    )
                if i < self.num_streamed_chunks:
                    next_future = executor.submit(
                        request.urlretrieve,
                        db_remote_chunk(i + 1),
                        db_local_chunk(i + 1),
                    )

                # Run nhmmer with the chunk once its download has finished.
                future.result()
                for fasta_idx, input_fasta_path in enumerate(input_fasta_paths):
                    chunked_outputs[fasta_idx].append(
                        self._query_chunk(
                            input_fasta_path,
                            db_local_chunk(i),
                            max_sequences
                        )
                    )

                # Remove the local copy of the chunk
                os.remove(db_local_chunk(i))
                # Do not set next_future for the last chunk so that this works
                # even for databases with only 1 chunk
                if (i < self.num_streamed_chunks):
                    future = next_future
                if self.streaming_callback:
                    self.streaming_callback(i)
        return chunked_outputs
249
+
250
+
251
if __name__ == '__main__':
    # Ad-hoc smoke test with site-specific database paths — not portable
    # outside the original cluster environment.
    nhmmer = Nhmmer(binary_path="/usr/bin/nhmmer",
                    # database_path="/group1/share01/data/alphafold3/rnacentral/v21.0/rnacentral.fasta")
                    database_path="/group1/share01/data/alphafold3/rfam/v14.9/Rfam_af3_clustered_all_seqs.fasta")
    out = nhmmer.query(input_fasta_path="./test.fa")
    print(out[0]["sto"])
PhysDock/data/tools/parse_msas.py ADDED
@@ -0,0 +1,328 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os.path
2
+ import numpy as np
3
+ from typing import Optional, Sequence, Dict, OrderedDict, Any, Union
4
+ from scipy.sparse import coo_matrix
5
+
6
+ from .parsers import parse_fasta, parse_hhr, parse_stockholm, parse_a3m, parse_hmmsearch_a3m, \
7
+ parse_hmmsearch_sto, Msa, parse_stockholm_file, Msa
8
+ from . import msa_identifiers
9
+
10
# Feature-dictionary alias: values may be dense numpy arrays, scipy COO
# sparse matrices, None, or arbitrary objects.
FeatureDict = Dict[str, Union[np.ndarray, coo_matrix, None, Any]]
11
+
12
+
13
def load_txt(fname):
    """Read a text file and return its entire contents as one string."""
    with open(fname, "r") as fh:
        return fh.read()
17
+
18
+
19
# Three-letter codes of the 20 standard amino acids plus UNK; the position
# of each code in this list is its integer residue-type ID.
amino_acids = ["ALA", "ARG", "ASN", "ASP", "CYS", "GLN", "GLU", "GLY", "HIS", "ILE",
               "LEU", "LYS", "MET", "PHE", "PRO", "SER", "THR", "TRP", "TYR", "VAL", "UNK", ]

# Folds the extended HHblits amino-acid alphabet into canonical one-letter
# codes: ambiguity codes B->D, Z->E; selenocysteine U->C; rare/unknown
# letters J and O -> X; the gap "-" maps to itself.
HHBLITS_AA_TO_AA = {
    "A": "A",
    "B": "D",
    "C": "C",
    "D": "D",
    "E": "E",
    "F": "F",
    "G": "G",
    "H": "H",
    "I": "I",
    "J": "X",
    "K": "K",
    "L": "L",
    "M": "M",
    "N": "N",
    "O": "X",
    "P": "P",
    "Q": "Q",
    "R": "R",
    "S": "S",
    "T": "T",
    "U": "C",
    "V": "V",
    "W": "W",
    "X": "X",
    "Y": "Y",
    "Z": "E",
    "-": "-",
}
# Same content as `amino_acids`; used as the iteration order when building
# AA_TO_ID below.
standard_protein = ["ALA", "ARG", "ASN", "ASP", "CYS", "GLN", "GLU", "GLY", "HIS", "ILE",
                    "LEU", "LYS", "MET", "PHE", "PRO", "SER", "THR", "TRP", "TYR", "VAL", "UNK", ]
# One-letter -> three-letter amino-acid code mapping (X -> UNK).
amino_acid_1to3 = {
    "A": "ALA",
    "R": "ARG",
    "N": "ASN",
    "D": "ASP",
    "C": "CYS",
    "Q": "GLN",
    "E": "GLU",
    "G": "GLY",
    "H": "HIS",
    "I": "ILE",
    "L": "LEU",
    "K": "LYS",
    "M": "MET",
    "F": "PHE",
    "P": "PRO",
    "S": "SER",
    "T": "THR",
    "W": "TRP",
    "Y": "TYR",
    "V": "VAL",
    "X": "UNK",
}

# Inverse mapping: three-letter -> one-letter code.
amino_acid_3to1 = {v: k for k, v in amino_acid_1to3.items()}

# One-letter amino-acid code -> integer residue-type ID (index in `amino_acids`).
AA_TO_ID = {
    amino_acid_3to1[ccd]: amino_acids.index(ccd) for ccd in standard_protein
}
# Gap token. 31 is presumably the shared gap ID of the global residue-type
# vocabulary (also used by RNA_TO_ID below) - TODO confirm against the
# project's restype constants.
AA_TO_ID["-"] = 31

# RNA bases plus unknown "N"; IDs are offset by 21 so they follow the
# 21 protein residue types.
robon_nucleic_acids = ["A", "G", "C", "U", "N", ]

RNA_TO_ID = {ch: robon_nucleic_acids.index(ch) + 21 for ch in robon_nucleic_acids}
RNA_TO_ID["-"] = 31


# DEBUG
# RNA_TO_ID["."] = 31
92
+
93
+
94
def make_msa_features(msas: Sequence[Msa], is_rna=False) -> FeatureDict:
    """Constructs a feature dict of MSA features.

    Sequences are deduplicated across all provided MSAs (first occurrence
    wins) and encoded into integer residue IDs; per-row deletion counts and
    species identifiers are collected alongside.
    """
    if not msas:
        raise ValueError("At least one MSA must be provided.")

    int_msa = []
    deletion_matrix = []
    species_ids = []
    seen_sequences = set()

    for msa_index, msa in enumerate(msas):
        if not msa:
            raise ValueError(f"MSA {msa_index} must contain at least one sequence.")
        for row_idx, (seq, row_deletions) in enumerate(
                zip(msa.sequences, msa.deletion_matrix)):
            if seq in seen_sequences:
                continue
            seen_sequences.add(seq)
            if is_rna:
                # Unknown RNA characters fall back to the generic "N" base ID.
                encoded = [RNA_TO_ID.get(res, RNA_TO_ID["N"]) for res in seq]
            else:
                encoded = [AA_TO_ID[HHBLITS_AA_TO_AA[res]] for res in seq]
            int_msa.append(encoded)
            deletion_matrix.append(row_deletions)
            identifiers = msa_identifiers.get_identifiers(
                msa.descriptions[row_idx]
            )
            species_ids.append(identifiers.species_id.encode("utf-8"))

    return {
        "deletion_matrix": np.array(deletion_matrix, dtype=np.int8),
        "msa": np.array(int_msa, dtype=np.int8),
        "msa_species_identifiers": np.array(species_ids, dtype=np.object_),
    }
134
+
135
+
136
def parse_alignment_dir(
        alignment_dir,
):
    """Parse every known MSA search result found in ``alignment_dir``.

    Protein hits populate "msa"/"deletion_matrix"/"msa_species_identifiers";
    UniProt hits populate the "_all_seq" variants; RNA hits (only allowed
    when no protein MSAs are present) populate the plain keys with RNA IDs.
    """
    # MSA Order: uniref90 bfd_uniclust30/bfd_uniref30 mgnify

    def _sto(name):
        path = os.path.join(alignment_dir, name)
        return parse_stockholm(load_txt(path)) if os.path.exists(path) else None

    def _a3m(name):
        path = os.path.join(alignment_dir, name)
        return parse_a3m(load_txt(path)) if os.path.exists(path) else None

    def _sto_file(name):
        path = os.path.join(alignment_dir, name)
        return parse_stockholm_file(path) if os.path.exists(path) else None

    protein_msas = [
        _sto("uniref90_hits.sto"),
        _a3m("bfd_uniclust30_hits.a3m"),
        _a3m("bfd_uniref30_hits.a3m"),
        _sto("reduced_bfd_hits.sto"),
        _sto("mgnify_hits.sto"),
    ]
    uniprot_msas = [_sto("uniprot_hits.sto")]
    rna_msas = [
        _sto_file("rfam_hits2.sto"),
        _sto_file("rnacentral_hits.sto"),
        _sto_file("nt_hits.sto"),
    ]

    protein_msas = [m for m in protein_msas if m is not None]
    uniprot_msas = [m for m in uniprot_msas if m is not None]
    rna_msas = [m for m in rna_msas if m is not None]

    output = dict()
    if len(uniprot_msas) > 0:
        uniprot_msa_features = make_msa_features(uniprot_msas)
        output["msa_all_seq"] = uniprot_msa_features.pop("msa")
        output["deletion_matrix_all_seq"] = uniprot_msa_features.pop("deletion_matrix")
        output["msa_species_identifiers_all_seq"] = uniprot_msa_features.pop("msa_species_identifiers")
    if len(protein_msas) > 0:
        msa_features = make_msa_features(protein_msas)
        output["msa"] = msa_features.pop("msa")
        output["deletion_matrix"] = msa_features.pop("deletion_matrix")
        output["msa_species_identifiers"] = msa_features.pop("msa_species_identifiers")

    if len(rna_msas) > 0:
        # A chain is either protein or RNA; both at once is a caller error.
        assert len(protein_msas) == 0
        msa_features = make_msa_features(rna_msas, is_rna=True)
        output["msa"] = msa_features.pop("msa")
        output["deletion_matrix"] = msa_features.pop("deletion_matrix")
        output["msa_species_identifiers"] = msa_features.pop("msa_species_identifiers")

    return output
214
+
215
+
216
def parse_protein_alignment_dir(alignment_dir):
    """Parse only the protein MSA search results from ``alignment_dir``.

    Returns a dict with "msa", "deletion_matrix" and
    "msa_species_identifiers" keys, or an empty dict when no hit file exists.
    """
    # MSA Order: uniref90 bfd_uniclust30/bfd_uniref30 mgnify

    def _sto(name):
        path = os.path.join(alignment_dir, name)
        return parse_stockholm(load_txt(path)) if os.path.exists(path) else None

    def _a3m(name):
        path = os.path.join(alignment_dir, name)
        return parse_a3m(load_txt(path)) if os.path.exists(path) else None

    protein_msas = [
        _sto("uniref90_hits.sto"),
        _a3m("bfd_uniclust30_hits.a3m"),
        _a3m("bfd_uniref_hits.a3m"),
        _sto("reduced_bfd_hits.sto"),
        _sto("mgnify_hits.sto"),
    ]
    protein_msas = [m for m in protein_msas if m is not None]

    output = dict()
    if len(protein_msas) > 0:
        msa_features = make_msa_features(protein_msas)
        output["msa"] = msa_features.pop("msa")
        output["deletion_matrix"] = msa_features.pop("deletion_matrix")
        output["msa_species_identifiers"] = msa_features.pop("msa_species_identifiers")

    return output
253
+
254
+
255
def parse_uniprot_alignment_dir(
        alignment_dir,
):
    """Parse uniprot_hits.sto (if present) into paired-MSA "_all_seq" features."""
    output = dict()
    uniprot_out_path = os.path.join(alignment_dir, "uniprot_hits.sto")
    if os.path.exists(uniprot_out_path):
        feats = make_msa_features([parse_stockholm(load_txt(uniprot_out_path))])
        output["msa_all_seq"] = feats.pop("msa")
        output["deletion_matrix_all_seq"] = feats.pop("deletion_matrix")
        output["msa_species_identifiers_all_seq"] = feats.pop("msa_species_identifiers")
    return output
271
+
272
+
273
def parse_rna_from_input_fasta_path(input_fasta_path):
    """Build an Msa from a raw query FASTA file (unaligned, zero deletions).

    Args:
        input_fasta_path: Path to a FASTA file; each record becomes one MSA row.

    Returns:
        An Msa whose rows are the FASTA sequences with all-zero deletion rows.

    Bug fix: the previous version built a single deletion row
    (``[[0] * len(query_sequence[0])]``) regardless of how many records the
    FASTA contained, so any multi-record file made ``Msa.__post_init__``
    raise its length-mismatch ValueError. One all-zero row is now built per
    sequence; single-record behavior is unchanged.
    """
    with open(input_fasta_path, "r") as f:
        sequences, descriptions = parse_fasta(f.read())
    # One all-zero deletion row per sequence: the raw query is unaligned.
    deletion_matrix = [[0] * len(seq) for seq in sequences]

    return Msa(
        sequences=sequences,
        deletion_matrix=deletion_matrix,
        descriptions=descriptions,
    )
284
+
285
+
286
def parse_rna_single_alignment(input_fasta_path):
    """Encode just the query FASTA as a single-row RNA MSA feature dict."""
    feats = make_msa_features(
        [parse_rna_from_input_fasta_path(input_fasta_path)], is_rna=True
    )
    return {
        "msa": feats.pop("msa"),
        "deletion_matrix": feats.pop("deletion_matrix"),
    }
294
+
295
+
296
def parse_rna_alignment_dir(
        alignment_dir,
        input_fasta_path,
):
    """Collect the query plus realigned RNA search hits into MSA features."""

    def _load(name):
        path = os.path.join(alignment_dir, name)
        return parse_stockholm_file(path) if os.path.exists(path) else None

    candidates = [
        parse_rna_from_input_fasta_path(input_fasta_path),
        _load("rfam_hits_realigned.sto"),
        _load("rnacentral_hits_realigned.sto"),
        _load("nt_hits_realigned.sto"),
    ]
    # Drop hit files that are missing or parsed to an empty alignment.
    rna_msas = [m for m in candidates if m is not None and len(m) > 0]

    feats = make_msa_features(rna_msas, is_rna=True)
    return {
        "msa": feats.pop("msa"),
        "deletion_matrix": feats.pop("deletion_matrix"),
    }
PhysDock/data/tools/parsers.py ADDED
@@ -0,0 +1,727 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 AlQuraishi Laboratory
2
+ # Copyright 2021 DeepMind Technologies Limited
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Functions for parsing various file formats."""
17
+ import collections
18
+ import dataclasses
19
+ import itertools
20
+ import re
21
+ import string
22
+ from typing import Dict, Iterable, List, Optional, Sequence, Tuple, Set
23
+
24
+ DeletionMatrix = Sequence[Sequence[int]]
25
+
26
+
27
@dataclasses.dataclass(frozen=True)
class Msa:
    """Class representing a parsed MSA file"""
    sequences: Sequence[str]
    deletion_matrix: DeletionMatrix
    descriptions: Optional[Sequence[str]]

    def __post_init__(self):
        # The three parallel lists must describe the same number of rows.
        row_counts = {
            len(self.sequences),
            len(self.deletion_matrix),
            len(self.descriptions),
        }
        if len(row_counts) != 1:
            raise ValueError(
                "All fields for an MSA must have the same length"
            )

    def __len__(self):
        return len(self.sequences)

    def truncate(self, max_seqs: int):
        """Return a copy keeping only the first ``max_seqs`` rows."""
        return Msa(
            sequences=self.sequences[:max_seqs],
            deletion_matrix=self.deletion_matrix[:max_seqs],
            descriptions=self.descriptions[:max_seqs],
        )
53
+
54
+
55
@dataclasses.dataclass(frozen=True)
class TemplateHit:
    """Class representing a template hit (one aligned template from an .hhr file)."""

    # Rank of the hit as reported by the search tool (parsed from the "No N" line).
    index: int
    # Identifier line of the hit template.
    name: str
    # "Aligned_cols" value from the hit summary line.
    aligned_cols: int
    # "Sum_probs" value from the hit summary line, if present.
    sum_probs: Optional[float]
    # Query sequence as aligned in this hit (may contain "-" gaps).
    query: str
    # Template sequence as aligned to the query (may contain "-" gaps).
    hit_sequence: str
    # Per alignment column: 0-based index into the original query sequence,
    # or -1 where the query has a gap.
    indices_query: List[int]
    # Per alignment column: 0-based index into the template sequence,
    # or -1 where the template has a gap.
    indices_hit: List[int]
67
+
68
+
69
def parse_fasta(fasta_string: str) -> Tuple[Sequence[str], Sequence[str]]:
    """Split a FASTA string into sequences and their descriptions.

    Arguments:
      fasta_string: The string contents of a FASTA file.

    Returns:
      A tuple (sequences, descriptions) in file order; descriptions are the
      header lines with the leading '>' removed. Blank lines and lines
      starting with '#' are ignored.
    """
    sequences: List[str] = []
    descriptions: List[str] = []
    for raw_line in fasta_string.splitlines():
        stripped = raw_line.strip()
        if stripped.startswith(">"):
            descriptions.append(stripped[1:])  # Drop the leading '>'.
            sequences.append("")
        elif not stripped or stripped.startswith("#"):
            continue  # Skip blanks and comment lines.
        else:
            # Continuation of the most recent record's sequence.
            sequences[-1] += stripped

    return sequences, descriptions
98
+
99
+
100
+
101
+
102
def parse_stockholm(stockholm_string: str) -> Msa:
    """Parse a Stockholm-format alignment string into an Msa.

    The first alignment row is treated as the query. Columns that are gaps
    in the query are dropped from every row, and each row's deletion vector
    records how many residues were dropped before each kept column.

    Args:
      stockholm_string: Contents of a Stockholm file; the first sequence is
        assumed to be the query.

    Returns:
      An Msa of query-aligned sequences (may contain duplicates), their
      deletion matrix, and the matched target names.
    """
    name_to_sequence = collections.OrderedDict()
    for raw_line in stockholm_string.splitlines():
        raw_line = raw_line.strip()
        # Skip blanks, markup ("#...") and the terminator ("//").
        if not raw_line or raw_line.startswith(("#", "//")):
            continue
        name, chunk = raw_line.split()
        name_to_sequence[name] = name_to_sequence.get(name, "") + chunk

    msa = []
    deletion_matrix = []

    query = ""
    keep_columns = []
    for row_idx, aligned in enumerate(name_to_sequence.values()):
        if row_idx == 0:
            # The query defines which columns survive.
            query = aligned
            keep_columns = [c for c, res in enumerate(query) if res != "-"]

        msa.append("".join(aligned[c] for c in keep_columns))

        # Deletion count = residues of this row falling in query-gap columns
        # immediately before each kept column.
        deletions = []
        running = 0
        for res, query_res in zip(aligned, query):
            if res == "-" and query_res == "-":
                continue
            if query_res == "-":
                running += 1
            else:
                deletions.append(running)
                running = 0
        deletion_matrix.append(deletions)

    return Msa(
        sequences=msa,
        deletion_matrix=deletion_matrix,
        descriptions=list(name_to_sequence.keys()),
    )
162
+
163
+
164
def parse_stockholm_file(stockholm_file: str) -> Msa:
    """Parses sequences and deletion matrix from a Stockholm-format file.

    Unlike `parse_stockholm`, this reads from a file path (streaming line by
    line) and strips lowercase insertion characters plus "*" and "." from
    every alignment row before accumulating it.

    Args:
      stockholm_file: Path to a Stockholm file. The first sequence in the
        file should be the query sequence.

    Returns:
      An Msa containing:
      * A list of sequences that have been aligned to the query. These
        might contain duplicates.
      * The deletion matrix for the alignment as a list of lists. The element
        at `deletion_matrix[i][j]` is the number of residues deleted from
        the aligned sequence i at residue position j.
      * The names of the targets matched.
    """
    name_to_sequence = collections.OrderedDict()
    with open(stockholm_file, "r") as f:
        for line in f:
            line = line.strip()

            # Skip blanks, markup ("#...") and the terminator ("//").
            if not line or line.startswith(("#", "//")):
                continue
            name, sequence = line.split()
            # Drop lowercase insertion states and "*"/"." markers so rows
            # align on match columns only.
            sequence = "".join([c for c in sequence if not c.islower() and c not in ["*","."]])
            if name not in name_to_sequence:
                name_to_sequence[name] = ""
            name_to_sequence[name] += sequence

    msa = []
    deletion_matrix = []

    query = ""
    keep_columns = []
    for seq_index, sequence in enumerate(name_to_sequence.values()):
        if seq_index == 0:
            # Gather the columns with gaps from the query
            query = sequence
            keep_columns = [i for i, res in enumerate(query) if res != "-"]

        # Remove the columns with gaps in the query from all sequences.
        aligned_sequence = "".join([sequence[c] for c in keep_columns])

        msa.append(aligned_sequence)

        # Count the number of deletions w.r.t. query.
        deletion_vec = []
        deletion_count = 0
        for seq_res, query_res in zip(sequence, query):
            if seq_res != "-" or query_res != "-":
                if query_res == "-":
                    deletion_count += 1
                else:
                    deletion_vec.append(deletion_count)
                    deletion_count = 0
        deletion_matrix.append(deletion_vec)
    return Msa(
        sequences=msa,
        deletion_matrix=deletion_matrix,
        descriptions=list(name_to_sequence.keys())
    )
226
+
227
+
228
def parse_a3m(a3m_string: str) -> Msa:
    """Parse an A3M-format alignment string into an Msa.

    Lowercase letters mark insertions relative to the query; each run of
    lowercase characters is counted into the deletion matrix and then
    stripped from the sequences.

    Args:
      a3m_string: Contents of an a3m file; the first sequence is the query.

    Returns:
      An Msa of aligned (deletion-free) sequences, their deletion matrix,
      and the FASTA descriptions.
    """
    sequences, descriptions = parse_fasta(a3m_string)

    deletion_matrix = []
    for msa_sequence in sequences:
        row = []
        run = 0
        for ch in msa_sequence:
            if ch.islower():
                run += 1
            else:
                row.append(run)
                run = 0
        deletion_matrix.append(row)

    # Strip lowercase (insertion) characters to get the aligned rows.
    lowercase_remover = str.maketrans("", "", string.ascii_lowercase)
    return Msa(
        sequences=[s.translate(lowercase_remover) for s in sequences],
        deletion_matrix=deletion_matrix,
        descriptions=descriptions,
    )
264
+
265
+
266
+ def _convert_sto_seq_to_a3m(
267
+ query_non_gaps: Sequence[bool], sto_seq: str
268
+ ) -> Iterable[str]:
269
+ for is_query_res_non_gap, sequence_res in zip(query_non_gaps, sto_seq):
270
+ if is_query_res_non_gap:
271
+ yield sequence_res
272
+ elif sequence_res != "-":
273
+ yield sequence_res.lower()
274
+
275
+
276
def convert_stockholm_to_a3m(
    stockholm_format: str,
    max_sequences: Optional[int] = None,
    remove_first_row_gaps: bool = True,
) -> str:
    """Converts MSA in Stockholm format to the A3M format.

    Args:
        stockholm_format: Contents of a Stockholm file; the first alignment
            row is assumed to be the query.
        max_sequences: If set, keep at most this many distinct sequences.
        remove_first_row_gaps: If True, columns that are gaps in the query
            are removed from match state; residues of other rows in those
            columns are lowercased (the a3m insertion convention).

    Returns:
        The alignment as an a3m-format string with a terminating newline.
    """
    descriptions = {}
    sequences = {}
    reached_max_sequences = False

    # First pass: accumulate aligned sequence chunks per name.
    for line in stockholm_format.splitlines():
        reached_max_sequences = (
            max_sequences and len(sequences) >= max_sequences
        )
        if line.strip() and not line.startswith(("#", "//")):
            # Ignore blank lines, markup and end symbols - remainder are alignment
            # sequence parts.
            seqname, aligned_seq = line.split(maxsplit=1)
            if seqname not in sequences:
                if reached_max_sequences:
                    continue
                sequences[seqname] = ""
            sequences[seqname] += aligned_seq

    # Second pass: pick up "#=GS ... DE ..." description rows.
    # NOTE(review): `reached_max_sequences` below still holds whatever was
    # computed on the *last* line of the first pass - presumably intended to
    # skip descriptions of sequences dropped by the cap; confirm.
    for line in stockholm_format.splitlines():
        if line[:4] == "#=GS":
            # Description row - example format is:
            # #=GS UniRef90_Q9H5Z4/4-78 DE [subseq from] cDNA: FLJ22755 ...
            columns = line.split(maxsplit=3)
            seqname, feature = columns[1:3]
            value = columns[3] if len(columns) == 4 else ""
            if feature != "DE":
                continue
            if reached_max_sequences and seqname not in sequences:
                continue
            descriptions[seqname] = value
            # Every retained sequence already has a description - stop early.
            if len(descriptions) == len(sequences):
                break

    # Convert sto format to a3m line by line
    a3m_sequences = {}
    if (remove_first_row_gaps):
        # query_sequence is assumed to be the first sequence
        query_sequence = next(iter(sequences.values()))
        query_non_gaps = [res != "-" for res in query_sequence]
    for seqname, sto_sequence in sequences.items():
        # Dots are optional in a3m format and are commonly removed.
        out_sequence = sto_sequence.replace('.', '')
        if (remove_first_row_gaps):
            out_sequence = ''.join(
                _convert_sto_seq_to_a3m(query_non_gaps, out_sequence)
            )
        a3m_sequences[seqname] = out_sequence

    fasta_chunks = (
        f">{k} {descriptions.get(k, '')}\n{a3m_sequences[k]}"
        for k in a3m_sequences
    )
    return "\n".join(fasta_chunks) + "\n"  # Include terminating newline.
335
+
336
+
337
+ def _keep_line(line: str, seqnames: Set[str]) -> bool:
338
+ """Function to decide which lines to keep."""
339
+ if not line.strip():
340
+ return True
341
+ if line.strip() == '//': # End tag
342
+ return True
343
+ if line.startswith('# STOCKHOLM'): # Start tag
344
+ return True
345
+ if line.startswith('#=GC RF'): # Reference Annotation Line
346
+ return True
347
+ if line[:4] == '#=GS': # Description lines - keep if sequence in list.
348
+ _, seqname, _ = line.split(maxsplit=2)
349
+ return seqname in seqnames
350
+ elif line.startswith('#'): # Other markup - filter out
351
+ return False
352
+ else: # Alignment data - keep if sequence in list.
353
+ seqname = line.partition(' ')[0]
354
+ return seqname in seqnames
355
+
356
+
357
def truncate_stockholm_msa(stockholm_msa_path: str, max_sequences: int) -> str:
    """Read and truncate a Stockholm file to at most ``max_sequences``
    sequences while preventing excessive RAM usage."""
    keep_names = set()
    with open(stockholm_msa_path) as fh:
        # Pass 1: collect the first `max_sequences` distinct sequence names.
        for line in fh:
            if line.strip() and not line.startswith(('#', '//')):
                keep_names.add(line.partition(' ')[0])
                if len(keep_names) >= max_sequences:
                    break

        # Pass 2: re-read, keeping only lines relevant to those names.
        fh.seek(0)
        kept = [line for line in fh if _keep_line(line, keep_names)]

    return ''.join(kept)
378
+
379
+
380
def remove_empty_columns_from_stockholm_msa(stockholm_msa: str) -> str:
    """Removes empty columns (dashes-only) from a Stockholm MSA.

    The alignment may be wrapped into several chunks, each terminated by a
    "#=GC RF" reference-annotation line; each chunk is masked independently.
    Lines are keyed by their original index so the output preserves order.
    """
    processed_lines = {}
    unprocessed_lines = {}
    for i, line in enumerate(stockholm_msa.splitlines()):
        if line.startswith('#=GC RF'):
            reference_annotation_i = i
            reference_annotation_line = line
            # Reached the end of this chunk of the alignment. Process chunk.
            _, _, first_alignment = line.rpartition(' ')
            mask = []
            # A column is kept when any buffered row has a non-dash in it.
            for j in range(len(first_alignment)):
                for _, unprocessed_line in unprocessed_lines.items():
                    prefix, _, alignment = unprocessed_line.rpartition(' ')
                    if alignment[j] != '-':
                        mask.append(True)
                        break
                else:  # Every row contained a hyphen - empty column.
                    mask.append(False)
            # Add reference annotation for processing with mask.
            unprocessed_lines[reference_annotation_i] = reference_annotation_line

            if not any(mask):  # All columns were empty. Output empty lines for chunk.
                for line_index in unprocessed_lines:
                    processed_lines[line_index] = ''
            else:
                for line_index, unprocessed_line in unprocessed_lines.items():
                    prefix, _, alignment = unprocessed_line.rpartition(' ')
                    masked_alignment = ''.join(itertools.compress(alignment, mask))
                    processed_lines[line_index] = f'{prefix} {masked_alignment}'

            # Clear raw_alignments.
            unprocessed_lines = {}
        elif line.strip() and not line.startswith(('#', '//')):
            # Alignment row: buffer it until the chunk's "#=GC RF" line arrives.
            unprocessed_lines[i] = line
        else:
            # Markup / blank / terminator lines pass through unchanged.
            processed_lines[i] = line
    return '\n'.join((processed_lines[i] for i in range(len(processed_lines))))
418
+
419
+
420
def deduplicate_stockholm_msa(stockholm_msa: str) -> str:
    """Remove duplicate sequences (ignoring insertions wrt query)."""
    sequence_dict = collections.defaultdict(str)

    # Accumulate the full alignment string per sequence name, skipping
    # markup, blank lines and the "//" terminator.
    for line in stockholm_msa.splitlines():
        if line.strip() and not line.startswith(('#', '//')):
            seqname, alignment = line.strip().split()
            sequence_dict[seqname] += alignment

    # The first row is the query; False marks insertion columns.
    query_align = next(iter(sequence_dict.values()))
    mask = [c != '-' for c in query_align]

    seen_sequences = set()
    seqnames = set()
    for seqname, alignment in sequence_dict.items():
        # Compare sequences with insertions removed; first occurrence wins.
        masked_alignment = ''.join(itertools.compress(alignment, mask))
        if masked_alignment not in seen_sequences:
            seen_sequences.add(masked_alignment)
            seqnames.add(seqname)

    kept = [
        line for line in stockholm_msa.splitlines()
        if _keep_line(line, seqnames)
    ]
    return '\n'.join(kept) + '\n'
453
+
454
+
455
+ def _get_hhr_line_regex_groups(
456
+ regex_pattern: str, line: str
457
+ ) -> Sequence[Optional[str]]:
458
+ match = re.match(regex_pattern, line)
459
+ if match is None:
460
+ raise RuntimeError(f"Could not parse query line {line}")
461
+ return match.groups()
462
+
463
+
464
+ def _update_hhr_residue_indices_list(
465
+ sequence: str, start_index: int, indices_list: List[int]
466
+ ):
467
+ """Computes the relative indices for each residue with respect to the original sequence."""
468
+ counter = start_index
469
+ for symbol in sequence:
470
+ if symbol == "-":
471
+ indices_list.append(-1)
472
+ else:
473
+ indices_list.append(counter)
474
+ counter += 1
475
+
476
+
477
def _parse_hhr_hit(detailed_lines: Sequence[str]) -> TemplateHit:
    """Parses the detailed HMM HMM comparison section for a single Hit.

    This works on .hhr files generated from both HHBlits and HHSearch.

    Args:
      detailed_lines: A list of lines from a single comparison section between 2
        sequences (which each have their own HMM's)

    Returns:
      A TemplateHit with the information from that detailed comparison section

    Raises:
      RuntimeError: If a certain line cannot be processed
    """
    # Parse first 2 lines.
    # Line 0 is "No <hit number>"; line 1 is ">" followed by the hit name.
    number_of_hit = int(detailed_lines[0].split()[-1])
    name_hit = detailed_lines[1][1:]

    # Parse the summary line.
    # NOTE: the pattern is a non-raw string, so "\t" is a literal tab
    # character inside the character classes.
    pattern = (
        "Probab=(.*)[\t ]*E-value=(.*)[\t ]*Score=(.*)[\t ]*Aligned_cols=(.*)[\t"
        " ]*Identities=(.*)%[\t ]*Similarity=(.*)[\t ]*Sum_probs=(.*)[\t "
        "]*Template_Neff=(.*)"
    )
    match = re.match(pattern, detailed_lines[2])
    if match is None:
        raise RuntimeError(
            "Could not parse section: %s. Expected this: \n%s to contain summary."
            % (detailed_lines, detailed_lines[2])
        )
    # Only Aligned_cols and Sum_probs are kept; the other captured fields are
    # parsed (validating the line) but discarded.
    (_, _, _, aligned_cols, _, _, sum_probs, _) = [
        float(x) for x in match.groups()
    ]

    # The next section reads the detailed comparisons. These are in a 'human
    # readable' format which has a fixed length. The strategy employed is to
    # assume that each block starts with the query sequence line, and to parse
    # that with a regexp in order to deduce the fixed length used for that block.
    query = ""
    hit_sequence = ""
    indices_query = []
    indices_hit = []
    length_block = None

    for line in detailed_lines[3:]:
        # Parse the query sequence line (skipping secondary-structure and
        # consensus annotation lines, which also start with "Q ").
        if (
            line.startswith("Q ")
            and not line.startswith("Q ss_dssp")
            and not line.startswith("Q ss_pred")
            and not line.startswith("Q Consensus")
        ):
            # Thus the first 17 characters must be 'Q <query_name> ', and we can parse
            # everything after that.
            #   groups: start, sequence, end, (total_sequence_length)
            patt = r"[\t ]*([0-9]*) ([A-Z-]*)[\t ]*([0-9]*) \([0-9]*\)"
            groups = _get_hhr_line_regex_groups(patt, line[17:])

            # Get the length of the parsed block using the start and finish indices,
            # and ensure it is the same as the actual block length.
            start = int(groups[0]) - 1  # Make index zero based.
            delta_query = groups[1]
            end = int(groups[2])
            # Gap columns ("-") do not consume query residues, so they are
            # added back when computing the expected block length.
            num_insertions = len([x for x in delta_query if x == "-"])
            length_block = end - start + num_insertions
            assert length_block == len(delta_query)

            # Update the query sequence and indices list.
            query += delta_query
            _update_hhr_residue_indices_list(delta_query, start, indices_query)

        elif line.startswith("T "):
            # Parse the hit sequence.
            if (
                not line.startswith("T ss_dssp")
                and not line.startswith("T ss_pred")
                and not line.startswith("T Consensus")
            ):
                # Thus the first 17 characters must be 'T <hit_name> ', and we can
                # parse everything after that.
                #   groups: start, sequence; end and total length are matched
                #   but not captured.
                patt = r"[\t ]*([0-9]*) ([A-Z-]*)[\t ]*[0-9]* \([0-9]*\)"
                groups = _get_hhr_line_regex_groups(patt, line[17:])
                start = int(groups[0]) - 1  # Make index zero based.
                delta_hit_sequence = groups[1]
                # The hit block must be exactly as long as the query block
                # parsed just before it.
                assert length_block == len(delta_hit_sequence)

                # Update the hit sequence and indices list.
                hit_sequence += delta_hit_sequence
                _update_hhr_residue_indices_list(
                    delta_hit_sequence, start, indices_hit
                )

    return TemplateHit(
        index=number_of_hit,
        name=name_hit,
        aligned_cols=int(aligned_cols),
        sum_probs=sum_probs,
        query=query,
        hit_sequence=hit_sequence,
        indices_query=indices_query,
        indices_hit=indices_hit,
    )
581
+
582
+
583
def parse_hhr(hhr_string: str) -> Sequence[TemplateHit]:
    """Parses the content of an entire HHR file."""
    lines = hhr_string.splitlines()

    # An .hhr file opens with a summary table, followed by one "paragraph"
    # per hit, each introduced by a line of the form "No <hit number>".
    starts = [idx for idx, content in enumerate(lines) if content.startswith("No ")]

    if not starts:
        return []
    # Sentinel marking the end of the final paragraph; consecutive start
    # indices then delimit each hit's section.
    starts.append(len(lines))
    return [
        _parse_hhr_hit(lines[begin:end])
        for begin, end in zip(starts[:-1], starts[1:])
    ]
601
+
602
+
603
def parse_e_values_from_tblout(tblout: str) -> Dict[str, float]:
    """Parse target to e-value mapping parsed from Jackhmmer tblout string.

    Args:
        tblout: Raw contents of a Jackhmmer --tblout file.

    Returns:
        Mapping from target sequence name to full-sequence E-value. The
        query itself is always present with an E-value of 0.
    """
    e_values = {"query": 0}
    # Skip comment lines. Also skip blank lines, which previously raised
    # IndexError when the first character was inspected via line[0].
    lines = [line for line in tblout.splitlines()
             if line and not line.startswith("#")]
    # As per http://eddylab.org/software/hmmer/Userguide.pdf fields are
    # space-delimited. Relevant fields are (1) target name: and
    # (5) E-value (full sequence) (numbering from 1).
    for line in lines:
        fields = line.split()
        e_value = fields[4]
        target_name = fields[0]
        e_values[target_name] = float(e_value)
    return e_values
616
+
617
+
618
+ def _get_indices(sequence: str, start: int) -> List[int]:
619
+ """Returns indices for non-gap/insert residues starting at the given index."""
620
+ indices = []
621
+ counter = start
622
+ for symbol in sequence:
623
+ # Skip gaps but add a placeholder so that the alignment is preserved.
624
+ if symbol == '-':
625
+ indices.append(-1)
626
+ # Skip deleted residues, but increase the counter.
627
+ elif symbol.islower():
628
+ counter += 1
629
+ # Normal aligned residue. Increase the counter and append to indices.
630
+ else:
631
+ indices.append(counter)
632
+ counter += 1
633
+ return indices
634
+
635
+
636
@dataclasses.dataclass(frozen=True)
class HitMetadata:
    """Metadata parsed from one hmmsearch A3M description line.

    E.g. ">4pqx_A/2-217 [subseq from] mol:protein length:217 Free text".
    """
    # Lower-case PDB entry id (first field of the description line).
    pdb_id: str
    # Chain identifier within the PDB entry.
    chain: str
    # First residue of the hit sub-sequence (1-based, per the "/2-217" range).
    start: int
    # Last residue of the hit sub-sequence (inclusive).
    end: int
    # Full template chain length, from the "length:" field.
    length: int
    # Free-form trailing text of the description (may be empty).
    text: str
644
+
645
+
646
def _parse_hmmsearch_description(description: str) -> HitMetadata:
    """Parses the hmmsearch A3M sequence description line."""
    # Example 1: >4pqx_A/2-217 [subseq from] mol:protein length:217 Free text
    # Example 2: >5g3r_A/1-55 [subseq from] mol:protein length:352
    pattern = (r'^>?([a-z0-9]+)_(\w+)/([0-9]+)-([0-9]+)'
               r'.*protein length:([0-9]+) *(.*)$')
    match = re.match(pattern, description.strip())

    if match is None:
        raise ValueError(f'Could not parse description: "{description}".')

    pdb_id, chain, start, end, length, text = match.groups()
    return HitMetadata(
        pdb_id=pdb_id,
        chain=chain,
        start=int(start),
        end=int(end),
        length=int(length),
        text=text,
    )
665
+
666
+
667
def parse_hmmsearch_a3m(
    query_sequence: str,
    a3m_string: str,
    skip_first: bool = True
) -> Sequence[TemplateHit]:
    """Parses an a3m string produced by hmmsearch.

    Args:
      query_sequence: The query sequence.
      a3m_string: The a3m string produced by hmmsearch.
      skip_first: Whether to skip the first sequence in the a3m string.

    Returns:
      A sequence of `TemplateHit` results.
    """
    # Pair each aligned sequence with its description line.
    paired = list(zip(*parse_fasta(a3m_string)))
    if skip_first:
        # Drop the first entry (the query sequence itself).
        paired = paired[1:]

    indices_query = _get_indices(query_sequence, start=0)

    hits = []
    for hit_index, (hit_sequence, hit_description) in enumerate(paired, start=1):
        if 'mol:protein' not in hit_description:
            continue  # Skip non-protein chains.
        metadata = _parse_hmmsearch_description(hit_description)
        # Aligned columns are only the match states (upper case, non-gap).
        aligned_cols = sum(1 for r in hit_sequence if r.isupper() and r != '-')
        # Convert the 1-based metadata start to a 0-based index.
        indices_hit = _get_indices(hit_sequence, start=metadata.start - 1)

        hits.append(
            TemplateHit(
                index=hit_index,
                name=f'{metadata.pdb_id}_{metadata.chain}',
                aligned_cols=aligned_cols,
                sum_probs=None,
                query=query_sequence,
                hit_sequence=hit_sequence.upper(),
                indices_query=indices_query,
                indices_hit=indices_hit,
            )
        )

    return hits
711
+
712
+
713
def parse_hmmsearch_sto(
    output_string: str,
    input_sequence: str
) -> Sequence[TemplateHit]:
    """Gets parsed template hits from the raw string output by the tool."""
    # Convert the Stockholm output to a3m (without removing the first row's
    # gap columns), then parse every entry — including the first.
    a3m = convert_stockholm_to_a3m(
        output_string,
        remove_first_row_gaps=False
    )
    return parse_hmmsearch_a3m(
        query_sequence=input_sequence,
        a3m_string=a3m,
        skip_first=False
    )
PhysDock/data/tools/rdkit.py ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import rdkit
3
+ from rdkit import Chem
4
+ from rdkit.Chem import AllChem, rdmolops
5
+ from rdkit.Chem.rdchem import ChiralType, BondType
6
+ import numpy as np
7
+ import copy
8
+
9
+ from PhysDock.data.constants.periodic_table import PeriodicTable
10
+ from PhysDock.utils.io_utils import load_txt
11
+
12
+
13
def get_ref_mol(string):
    """Builds an RDKit molecule with a 3D conformer and hydrogens removed.

    Args:
        string: Either a SMILES string, or a path to a ".smi" file whose
            contents are a SMILES string.

    Returns:
        An RDKit Mol with an embedded conformer and all hydrogens removed,
        or None if the input could not be parsed.
    """
    # Try to interpret the input directly as SMILES; previously the string
    # was parsed twice (once for the check, once for the assignment).
    mol = Chem.MolFromSmiles(string)
    if mol is None and os.path.isfile(string) and string.split(".")[-1] == "smi":
        mol = Chem.MolFromSmiles(load_txt(string).strip())
    if mol is not None:
        # Large attempt budget so embedding succeeds for awkward molecules.
        AllChem.EmbedMolecule(mol, maxAttempts=100000)
        # mol2 = Chem.MolFromPDBBlock(Chem.MolToPDBBlock(mol))
        # for atom in mol2.GetAtoms():
        #     # if atom.GetChiralTag() != ChiralType.CHI_UNSPECIFIED:
        #     print(f"Atom {atom.GetIdx()} has chiral tag: {atom.GetChiralTag()}")
        # mol = Chem.RemoveAllHs(mol2)
        mol = Chem.RemoveAllHs(mol)
    return mol
29
+
30
+
31
# RDKit hybridization states -> small integer feature codes; states not
# listed here fall back to code 6 at lookup time (via dict.get(..., 6)).
Hybridization = {
    Chem.rdchem.HybridizationType.S: 0,
    Chem.rdchem.HybridizationType.SP: 1,
    Chem.rdchem.HybridizationType.SP2: 2,
    Chem.rdchem.HybridizationType.SP3: 3,
    Chem.rdchem.HybridizationType.SP3D: 4,
    Chem.rdchem.HybridizationType.SP3D2: 5,
}

# Chirality codes: distinct codes for CW/CCW tetrahedral centres; anything
# unspecified or exotic collapses to 2.
Chirality = {ChiralType.CHI_TETRAHEDRAL_CW: 0,
             ChiralType.CHI_TETRAHEDRAL_CCW: 1,
             ChiralType.CHI_UNSPECIFIED: 2,
             ChiralType.CHI_OTHER: 2}
# Add None
# Bond-type codes; unknown bond types map to 4 via Bonds.get(..., 4).
Bonds = {BondType.SINGLE: 0, BondType.DOUBLE: 1, BondType.TRIPLE: 2, BondType.AROMATIC: 3}

# SMARTS matching any chain of four connected atoms (a torsion pattern).
dihedral_pattern = Chem.MolFromSmarts('[*]~[*]~[*]~[*]')
48
+
49
+
50
+ # Feats From SMI
51
+ # Feats From MOL
52
+ # Feats From SDF
53
+
54
+
55
def get_features_from_ref_mol(
    ref_mol,
    remove_hs=True
):
    """Extracts label, per-atom, and pairwise features from an RDKit molecule.

    The conformer the molecule arrives with supplies the "ground truth"
    coordinates (x_gt). A fresh conformer is then embedded and
    MMFF-optimized, and all remaining reference features are computed on
    that new geometry.

    Args:
        ref_mol: RDKit Mol that already carries a conformer.
        remove_hs: If True, strip all hydrogens before extracting features.

    Returns:
        Tuple (label_feature, conf_feature, ref_mol); the whole ligand is
        represented as a single token (restype 20, residue_index array of
        length 1) whose atoms map one-to-one onto conformer atoms.
    """
    if remove_hs:
        ref_mol = Chem.RemoveAllHs(ref_mol)
    # print(ref_mol)
    # if ref_mol.GetNumConformers()==0:
    #     AllChem.EmbedMolecule(ref_mol,useExpTorsionAnglePrefs=True, useBasicKnowledge=True,maxAttempts=100000)

    # Ground-truth coordinates from the conformer the caller provided.
    ref_conf = ref_mol.GetConformer()
    x_gt = []
    for atom_id, atom in enumerate(ref_mol.GetAtoms()):
        atom_pos = ref_conf.GetAtomPosition(atom_id)
        x_gt.append(np.array([atom_pos.x, atom_pos.y, atom_pos.z]))
    x_gt = np.stack(x_gt, axis=0).astype(np.float32)
    # Every atom is treated as present/resolved.
    x_exists = np.ones_like(x_gt[:, 0]).astype(np.int64)
    a_mask = np.ones_like(x_gt[:, 0]).astype(np.int64)

    # Ref Mol
    # NOTE: EmbedMolecule generates a new conformer, so `conf` below is a
    # freshly embedded, MMFF-optimized geometry — not the one used for x_gt.
    AllChem.EmbedMolecule(ref_mol,maxAttempts=100000)
    AllChem.MMFFOptimizeMolecule(ref_mol)
    num_atoms = ref_mol.GetNumAtoms()
    conf = ref_mol.GetConformer()
    ring = ref_mol.GetRingInfo()

    # Filtering Conditions
    # if ref_mol.GetNumAtoms() < 4:
    #     return None
    # if ref_mol.GetNumBonds() < 4:
    #     return None
    #
    # k = 0
    # for conf in [conf]:
    #     # skip mols with atoms with more than 4 neighbors for now
    #     n_neighbors = [len(a.GetNeighbors()) for a in ref_mol.GetAtoms()]
    #     if np.max(n_neighbors) > 4:
    #         continue
    #     try:
    #         conf_canonical_smi = Chem.MolToSmiles(Chem.RemoveHs(ref_mol))
    #     except Exception as e:
    #         continue
    #     k += 1
    # if k == 0:
    #     return None

    # Per-atom descriptor accumulators, filled in one pass over the atoms.
    ref_pos = []
    ref_charge = []
    ref_element = []
    ref_is_aromatic = []
    ref_degree = []
    ref_hybridization = []
    ref_implicit_valence = []
    ref_chirality = []
    ref_in_ring_of_3 = []
    ref_in_ring_of_4 = []
    ref_in_ring_of_5 = []
    ref_in_ring_of_6 = []
    ref_in_ring_of_7 = []
    ref_in_ring_of_8 = []
    for atom_id, atom in enumerate(ref_mol.GetAtoms()):
        atom_pos = conf.GetAtomPosition(atom_id)
        ref_pos.append(np.array([atom_pos.x, atom_pos.y, atom_pos.z]))
        ref_charge.append(atom.GetFormalCharge())
        # Zero-based element index (atomic number - 1).
        ref_element.append(atom.GetAtomicNum() - 1)
        ref_is_aromatic.append(int(atom.GetIsAromatic()))
        # Degree and implicit valence are clipped to 8 to bound the vocabulary.
        ref_degree.append(min(atom.GetDegree(), 8))
        # Unlisted hybridization states fall back to code 6.
        ref_hybridization.append(Hybridization.get(atom.GetHybridization(), 6))
        ref_implicit_valence.append(min(atom.GetImplicitValence(), 8))
        ref_chirality.append(Chirality.get(atom.GetChiralTag(), 2))
        # Ring-membership indicators for ring sizes 3 through 8.
        ref_in_ring_of_3.append(int(ring.IsAtomInRingOfSize(atom_id, 3)))
        ref_in_ring_of_4.append(int(ring.IsAtomInRingOfSize(atom_id, 4)))
        ref_in_ring_of_5.append(int(ring.IsAtomInRingOfSize(atom_id, 5)))
        ref_in_ring_of_6.append(int(ring.IsAtomInRingOfSize(atom_id, 6)))
        ref_in_ring_of_7.append(int(ring.IsAtomInRingOfSize(atom_id, 7)))
        ref_in_ring_of_8.append(int(ring.IsAtomInRingOfSize(atom_id, 8)))

    ref_pos = np.stack(ref_pos, axis=0).astype(np.float32)
    ref_charge = np.array(ref_charge).astype(np.float32)
    ref_element = np.array(ref_element).astype(np.int8)
    ref_is_aromatic = np.array(ref_is_aromatic).astype(np.int8)
    ref_degree = np.array(ref_degree).astype(np.int8)
    ref_hybridization = np.array(ref_hybridization).astype(np.int8)
    ref_implicit_valence = np.array(ref_implicit_valence).astype(np.int8)
    ref_chirality = np.array(ref_chirality).astype(np.int8)
    ref_in_ring_of_3 = np.array(ref_in_ring_of_3).astype(np.int8)
    ref_in_ring_of_4 = np.array(ref_in_ring_of_4).astype(np.int8)
    ref_in_ring_of_5 = np.array(ref_in_ring_of_5).astype(np.int8)
    ref_in_ring_of_6 = np.array(ref_in_ring_of_6).astype(np.int8)
    ref_in_ring_of_7 = np.array(ref_in_ring_of_7).astype(np.int8)
    ref_in_ring_of_8 = np.array(ref_in_ring_of_8).astype(np.int8)

    # Pairwise (atom x atom) feature matrices over the molecular graph.
    d_token = np.zeros([num_atoms, num_atoms], dtype=np.int8)
    token_bonds = np.zeros([num_atoms, num_atoms], dtype=np.int8)
    bond_type = np.zeros([num_atoms, num_atoms], dtype=np.int8)
    bond_as_double = np.zeros([num_atoms, num_atoms], dtype=np.int8)
    bond_in_ring = np.zeros([num_atoms, num_atoms], dtype=np.int8)
    bond_is_aromatic = np.zeros([num_atoms, num_atoms], dtype=np.int8)
    bond_is_conjugated = np.zeros([num_atoms, num_atoms], dtype=np.int8)
    # Topological (bond-count) distance for every atom pair, capped at 30.
    for i in range(num_atoms - 1):
        for j in range(i + 1, num_atoms):
            dist = len(rdmolops.GetShortestPath(ref_mol, i, j)) - 1
            dist = min(30, dist)
            d_token[i, j] = dist
            d_token[j, i] = dist
    # Symmetric bond-level features; unknown bond types map to code 4.
    for bond_id, bond in enumerate(ref_mol.GetBonds()):
        i = bond.GetBeginAtomIdx()
        j = bond.GetEndAtomIdx()
        token_bonds[i, j] = 1
        token_bonds[j, i] = 1
        bond_type[i, j] = Bonds.get(bond.GetBondType(), 4)
        bond_type[j, i] = Bonds.get(bond.GetBondType(), 4)
        bond_as_double[i, j] = bond.GetBondTypeAsDouble()
        bond_as_double[j, i] = bond.GetBondTypeAsDouble()
        bond_in_ring[i, j] = bond.IsInRing()
        bond_in_ring[j, i] = bond.IsInRing()
        bond_is_conjugated[i, j] = bond.GetIsConjugated()
        bond_is_conjugated[j, i] = bond.GetIsConjugated()
        bond_is_aromatic[i, j] = bond.GetIsAromatic()
        bond_is_aromatic[j, i] = bond.GetIsAromatic()

    # Element symbols serve as surrogate atom names.
    ref_atom_name_chars = [PeriodicTable[e] for e in ref_element.tolist()]
    ref_mask_in_polymer = [1] * len(ref_pos)

    # ccds, restype, residue_index, atom_id_to_conformer_atom_id, a_mask, x_gt, x_exists
    num_atoms = len(x_gt)
    # The whole ligand is a single token: restype 20 (presumably the UNK
    # code in the standard residue ordering — confirm against
    # residue_constants) with a single residue index.
    label_feature = {
        "x_gt": x_gt,
        "x_exists": x_exists,
        "a_mask": a_mask,
        "restype": np.array([20]).astype(np.int64),
        "residue_index": np.arange(1).astype(np.int64),
        "atom_id_to_conformer_atom_id": np.arange(num_atoms).astype(np.int64),
        "conformer_id_to_chunk_sizes": np.array([num_atoms]).astype(np.int64)
    }
    conf_feature = {
        "ref_pos": ref_pos,
        "ref_charge": ref_charge,
        "ref_element": ref_element,
        "ref_is_aromatic": ref_is_aromatic,
        "ref_degree": ref_degree,
        "ref_hybridization": ref_hybridization,
        "ref_implicit_valence": ref_implicit_valence,
        "ref_chirality": ref_chirality,
        "ref_in_ring_of_3": ref_in_ring_of_3,
        "ref_in_ring_of_4": ref_in_ring_of_4,
        "ref_in_ring_of_5": ref_in_ring_of_5,
        "ref_in_ring_of_6": ref_in_ring_of_6,
        "ref_in_ring_of_7": ref_in_ring_of_7,
        "ref_in_ring_of_8": ref_in_ring_of_8,
        "d_token": d_token,
        "token_bonds": token_bonds,
        "bond_type": bond_type,
        "bond_as_double": bond_as_double,
        "bond_in_ring": bond_in_ring,
        "bond_is_conjugated": bond_is_conjugated,
        "bond_is_aromatic": bond_is_aromatic,
        "ref_atom_name_chars": ref_atom_name_chars,
        "ref_mask_in_polymer": ref_mask_in_polymer,
    }
    return label_feature, conf_feature, ref_mol
215
+
216
+
217
def get_features_from_smi(smi, remove_hs=True):
    """Builds a reference molecule from a SMILES string (or .smi file path)
    and returns its (label_feature, conf_feature, ref_mol) triple."""
    return get_features_from_ref_mol(get_ref_mol(smi), remove_hs=remove_hs)
PhysDock/data/tools/residue_constants.py ADDED
@@ -0,0 +1,604 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+
3
# One-letter -> three-letter amino-acid code map ("X" maps to "UNK").
amino_acid_1to3 = {
    "A": "ALA",
    "R": "ARG",
    "N": "ASN",
    "D": "ASP",
    "C": "CYS",
    "Q": "GLN",
    "E": "GLU",
    "G": "GLY",
    "H": "HIS",
    "I": "ILE",
    "L": "LEU",
    "K": "LYS",
    "M": "MET",
    "F": "PHE",
    "P": "PRO",
    "S": "SER",
    "T": "THR",
    "W": "TRP",
    "Y": "TYR",
    "V": "VAL",
    "X": "UNK",
}

# Inverse map: three-letter -> one-letter.
amino_acid_3to1 = {v: k for k, v in amino_acid_1to3.items()}
28
+
29
# Ligand atoms are represented as "UNK" in tokens.
# standard_residue entries double as CCD codes.
standard_protein = ["ALA", "ARG", "ASN", "ASP", "CYS", "GLN", "GLU", "GLY", "HIS", "ILE",
                    "LEU", "LYS", "MET", "PHE", "PRO", "SER", "THR", "TRP", "TYR", "VAL", "UNK", ]
standard_rna = ["A ", "G ", "C ", "U ", "N ", ]
standard_dna = ["DA ", "DG ", "DC ", "DT ", "DN ", ]
standard_nucleics = standard_rna + standard_dna
standard_ccds_without_gap = standard_protein + standard_nucleics
GAP = ["GAP"]  # used in msa one-hot
standard_ccds = standard_protein + standard_nucleics + GAP

# CCD code -> index in the canonical residue-type ordering.
# (Loop variable renamed so it no longer shadows the builtin `id`.)
standard_ccd_to_order = {ccd: order for order, ccd in enumerate(standard_ccds)}

standard_purines = ["A ", "G ", "DA ", "DG "]
standard_pyrimidines = ["C ", "U ", "DC ", "DT "]


# Predicates over CCD codes. Written as named functions rather than
# assigned lambdas (PEP 8 E731); behavior is unchanged.
def is_standard(x):
    """True if `x` is one of the standard CCD codes (including GAP)."""
    return x in standard_ccds


def is_unk(x):
    """True if `x` is an unknown residue/nucleotide, a gap, or an unknown ligand."""
    return x in ["UNK", "N ", "DN ", "GAP", "UNL"]


def is_protein(x):
    """True if `x` is a concrete (non-unknown) standard amino acid."""
    return x in standard_protein and not is_unk(x)


def is_rna(x):
    """True if `x` is a concrete (non-unknown) standard RNA nucleotide."""
    return x in standard_rna and not is_unk(x)


def is_dna(x):
    """True if `x` is a concrete (non-unknown) standard DNA nucleotide."""
    return x in standard_dna and not is_unk(x)


def is_nucleics(x):
    """True if `x` is a concrete (non-unknown) standard nucleotide."""
    return x in standard_nucleics and not is_unk(x)


def is_purines(x):
    """True if `x` is a purine nucleotide code."""
    return x in standard_purines


def is_pyrimidines(x):
    """True if `x` is a pyrimidine nucleotide code."""
    return x in standard_pyrimidines
53
+
54
# Per-CCD atom counts, aligned with the ordering of standard_ccds.
# NOTE(review): presumably the number of reference atoms per residue with
# terminal variants (e.g. OXT) excluded — confirm against the reference
# conformer table. Wildcard/unknown codes have no fixed count (None).
standard_ccd_to_atoms_num = {s: n for s, n in zip(standard_ccds, [
    5, 11, 8, 8, 6, 9, 9, 4, 10, 8,
    8, 9, 8, 11, 7, 6, 7, 14, 12, 7, None,
    22, 23, 20, 20, None,
    21, 22, 19, 20, None,
    None,
])}

# Token centre atom: CA for amino acids, C1' for nucleotides.
standard_ccd_to_token_centre_atom_name = {
    **{residue: "CA" for residue in standard_protein},
    **{residue: "C1'" for residue in standard_nucleics},
}

# The three atoms (indices 0, 1, 2) that define each token's local frame.
standard_ccd_to_frame_atom_name_0 = {
    **{residue: "N" for residue in standard_protein},
    **{residue: "C1'" for residue in standard_nucleics},
}

standard_ccd_to_frame_atom_name_1 = {
    **{residue: "CA" for residue in standard_protein},
    **{residue: "C3'" for residue in standard_nucleics},
}

standard_ccd_to_frame_atom_name_2 = {
    **{residue: "C" for residue in standard_protein},
    **{residue: "C4'" for residue in standard_nucleics},
}

# Pseudo-beta atom: CB for amino acids, C4 for purines, C2 for
# pyrimidines; glycine has no CB and falls back to CA via the update below.
standard_ccd_to_token_pseudo_beta_atom_name = {
    **{residue: "CB" for residue in standard_protein},
    **{residue: "C4" for residue in standard_purines},
    **{residue: "C2" for residue in standard_pyrimidines},
}
standard_ccd_to_token_pseudo_beta_atom_name.update({"GLY": "CA"})
88
+
89
# HHBlits amino-acid id -> three-letter code (ids 0..21 are consecutive).
HHBLITS_ID_TO_AA = {
    0: "ALA",
    1: "CYS",  # Also U.
    2: "ASP",  # Also B.
    3: "GLU",  # Also Z.
    4: "PHE",
    5: "GLY",
    6: "HIS",
    7: "ILE",
    8: "LYS",
    9: "LEU",
    10: "MET",
    11: "ASN",
    12: "PRO",
    13: "GLN",
    14: "ARG",
    15: "SER",
    16: "THR",
    17: "VAL",
    18: "TRP",
    19: "TYR",
    20: "UNK",  # Includes J and O.
    21: "GAP",
}

# Usage: Convert hhblits msa to af3 aatype
#   msa = hhblits_id_to_standard_residue_id_np[hhblits_msa.astype(np.int64)]
# The dict's keys are the consecutive ids 0..21 in insertion order, so
# iterating .values() preserves the id -> CCD row mapping; the previous
# .items() form discarded the key and shadowed the builtin `id`.
hhblits_id_to_standard_residue_id_np = np.array(
    [standard_ccds.index(ccd) for ccd in HHBLITS_ID_TO_AA.values()]
)
119
+
120
# OpenFold-style one-letter residue alphabet ("X" = unknown, "-" = gap).
of_restypes = [
    "A", "R", "N", "D", "C", "Q", "E", "G", "H", "I",
    "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V", "X", "-"
]

# One-letter view of standard_ccds: amino acids map through
# amino_acid_3to1, "GAP" maps to "-", and nucleotide codes (which have no
# one-letter amino-acid equivalent) become the literal string "None".
af3_restypes = [
    amino_acid_3to1[ccd] if ccd in amino_acid_3to1 else "-" if ccd == "GAP" else "None"
    for ccd in standard_ccds
]

# For each position: index of the first occurrence of that letter in
# af3_restypes if the letter also exists in of_restypes, else -1 (so all
# nucleotide positions map to -1).
af3_if_to_residue_id = np.array(
    [af3_restypes.index(restype) if restype in of_restypes else -1 for restype in af3_restypes])

########################################################
# periodic table that used to encode elements #
########################################################
# Lower-case element symbols ordered by atomic number (index = Z - 1).
periodic_table = [
    "h", "he",
    "li", "be", "b", "c", "n", "o", "f", "ne",
    "na", "mg", "al", "si", "p", "s", "cl", "ar",
    "k", "ca", "sc", "ti", "v", "cr", "mn", "fe", "co", "ni", "cu", "zn", "ga", "ge", "as", "se", "br", "kr",
    "rb", "sr", "y", "zr", "nb", "mo", "tc", "ru", "rh", "pd", "ag", "cd", "in", "sn", "sb", "te", "i", "xe",
    "cs", "ba",
    "la", "ce", "pr", "nd", "pm", "sm", "eu", "gd", "tb", "dy", "ho", "er", "tm", "yb", "lu",
    "hf", "ta", "w", "re", "os", "ir", "pt", "au", "hg", "tl", "pb", "bi", "po", "at", "rn",
    "fr", "ra",
    "ac", "th", "pa", "u", "np", "pu", "am", "cm", "bk", "cf", "es", "fm", "md", "no", "lr",
    "rf", "db", "sg", "bh", "hs", "mt", "ds", "rg", "cn", "nh", "fl", "mc", "lv", "ts", "og"
]

# Element symbol -> zero-based atomic index (Z - 1).
get_element_id = {ele: ele_id for ele_id, ele in enumerate(periodic_table)}
151
+
152
+ ##########################################################
153
+
154
+ standard_ccd_to_reference_features_table = {
155
+ # letters_3: [ref_pos,ref_charge, ref_mask, ref_elements, ref_atom_name_chars]
156
+ "ALA": [
157
+ [-0.966, 0.493, 1.500, 0., 1, "N", "N"],
158
+ [0.257, 0.418, 0.692, 0., 1, "C", "CA"],
159
+ [-0.094, 0.017, -0.716, 0., 1, "C", "C"],
160
+ [-1.056, -0.682, -0.923, 0., 1, "O", "O"],
161
+ [1.204, -0.620, 1.296, 0., 1, "C", "CB"],
162
+ [0.661, 0.439, -1.742, 0., 0, "O", "OXT"],
163
+ ],
164
+ "ARG": [
165
+ [-0.469, 1.110, -0.993, 0., 1, "N", "N"],
166
+ [0.004, 2.294, -1.708, 0., 1, "C", "CA"],
167
+ [-0.907, 2.521, -2.901, 0., 1, "C", "C"],
168
+ [-1.827, 1.789, -3.242, 0., 1, "O", "O"],
169
+ [1.475, 2.150, -2.127, 0., 1, "C", "CB"],
170
+ [1.745, 1.017, -3.130, 0., 1, "C", "CG"],
171
+ [3.210, 0.954, -3.557, 0., 1, "C", "CD"],
172
+ [4.071, 0.726, -2.421, 0., 1, "N", "NE"],
173
+ [5.469, 0.624, -2.528, 0., 1, "C", "CZ"],
174
+ [6.259, 0.404, -1.405, 0., 1, "N", "NH1"],
175
+ [6.078, 0.744, -3.773, 0., 1, "N", "NH2"],
176
+ [-0.588, 3.659, -3.574, 0., 0, "O", "OXT"],
177
+ ],
178
+ "ASN": [
179
+ [-0.293, 1.686, 0.094, 0., 1, "N", "N"],
180
+ [-0.448, 0.292, -0.340, 0., 1, "C", "CA"],
181
+ [-1.846, -0.179, -0.031, 0., 1, "C", "C"],
182
+ [-2.510, 0.402, 0.794, 0., 1, "O", "O"],
183
+ [0.562, -0.588, 0.401, 0., 1, "C", "CB"],
184
+ [1.960, -0.197, -0.002, 0., 1, "C", "CG"],
185
+ [2.132, 0.697, -0.804, 0., 1, "O", "OD1"],
186
+ [3.019, -0.841, 0.527, 0., 1, "N", "ND2"],
187
+ [-2.353, -1.243, -0.673, 0., 0, "O", "OXT"],
188
+ ],
189
+ "ASP": [
190
+ [-0.317, 1.688, 0.066, 0., 1, "N", "N"],
191
+ [-0.470, 0.286, -0.344, 0., 1, "C", "CA"],
192
+ [-1.868, -0.180, -0.029, 0., 1, "C", "C"],
193
+ [-2.534, 0.415, 0.786, 0., 1, "O", "O"],
194
+ [0.539, -0.580, 0.413, 0., 1, "C", "CB"],
195
+ [1.938, -0.195, 0.004, 0., 1, "C", "CG"],
196
+ [2.109, 0.681, -0.810, 0., 1, "O", "OD1"],
197
+ [2.992, -0.826, 0.543, 0., 1, "O", "OD2"],
198
+ [-2.374, -1.256, -0.652, 0., 0, "O", "OXT"],
199
+ ],
200
+ "CYS": [
201
+ [1.585, 0.483, -0.081, 0., 1, "N", "N"],
202
+ [0.141, 0.450, 0.186, 0., 1, "C", "CA"],
203
+ [-0.095, 0.006, 1.606, 0., 1, "C", "C"],
204
+ [0.685, -0.742, 2.143, 0., 1, "O", "O"],
205
+ [-0.533, -0.530, -0.774, 0., 1, "C", "CB"],
206
+ [-0.247, 0.004, -2.484, 0., 1, "S", "SG"],
207
+ [-1.174, 0.443, 2.275, 0., 0, "O", "OXT"],
208
+ ],
209
+ "GLN": [
210
+ [1.858, -0.148, 1.125, 0., 1, "N", "N"],
211
+ [0.517, 0.451, 1.112, 0., 1, "C", "CA"],
212
+ [-0.236, 0.022, 2.344, 0., 1, "C", "C"],
213
+ [-0.005, -1.049, 2.851, 0., 1, "O", "O"],
214
+ [-0.236, -0.013, -0.135, 0., 1, "C", "CB"],
215
+ [0.529, 0.421, -1.385, 0., 1, "C", "CG"],
216
+ [-0.213, -0.036, -2.614, 0., 1, "C", "CD"],
217
+ [-1.252, -0.650, -2.500, 0., 1, "O", "OE1"],
218
+ [0.277, 0.236, -3.839, 0., 1, "N", "NE2"],
219
+ [-1.165, 0.831, 2.878, 0., 0, "O", "OXT"],
220
+ ],
221
+ "GLU": [
222
+ [1.199, 1.867, -0.117, 0., 1, "N", "N"],
223
+ [1.138, 0.515, 0.453, 0., 1, "C", "CA"],
224
+ [2.364, -0.260, 0.041, 0., 1, "C", "C"],
225
+ [3.010, 0.096, -0.916, 0., 1, "O", "O"],
226
+ [-0.113, -0.200, -0.062, 0., 1, "C", "CB"],
227
+ [-1.360, 0.517, 0.461, 0., 1, "C", "CG"],
228
+ [-2.593, -0.187, -0.046, 0., 1, "C", "CD"],
229
+ [-2.485, -1.161, -0.753, 0., 1, "O", "OE1"],
230
+ [-3.811, 0.269, 0.287, 0., 1, "O", "OE2"],
231
+ [2.737, -1.345, 0.737, 0., 0, "O", "OXT"],
232
+ ],
233
+ "GLY": [
234
+ [1.931, 0.090, -0.034, 0., 1, "N", "N"],
235
+ [0.761, -0.799, -0.008, 0., 1, "C", "CA"],
236
+ [-0.498, 0.029, -0.005, 0., 1, "C", "C"],
237
+ [-0.429, 1.235, -0.023, 0., 1, "O", "O"],
238
+ [-1.697, -0.574, 0.018, 0., 0, "O", "OXT"],
239
+ ],
240
+ "HIS": [
241
+ [-0.040, -1.210, 0.053, 0., 1, "N", "N"],
242
+ [1.172, -1.709, 0.652, 0., 1, "C", "CA"],
243
+ [1.083, -3.207, 0.905, 0., 1, "C", "C"],
244
+ [0.040, -3.770, 1.222, 0., 1, "O", "O"],
245
+ [1.484, -0.975, 1.962, 0., 1, "C", "CB"],
246
+ [2.940, -1.060, 2.353, 0., 1, "C", "CG"],
247
+ [3.380, -2.075, 3.129, 0., 1, "N", "ND1"],
248
+ [3.960, -0.251, 2.046, 0., 1, "C", "CD2"],
249
+ [4.693, -1.908, 3.317, 0., 1, "C", "CE1"],
250
+ [5.058, -0.801, 2.662, 0., 1, "N", "NE2"],
251
+ [2.247, -3.882, 0.744, 0., 0, "O", "OXT"],
252
+ ],
253
+ "ILE": [
254
+ [-1.944, 0.335, -0.343, 0., 1, "N", "N"],
255
+ [-0.487, 0.519, -0.369, 0., 1, "C", "CA"],
256
+ [0.066, -0.032, -1.657, 0., 1, "C", "C"],
257
+ [-0.484, -0.958, -2.203, 0., 1, "O", "O"],
258
+ [0.140, -0.219, 0.814, 0., 1, "C", "CB"],
259
+ [-0.421, 0.341, 2.122, 0., 1, "C", "CG1"],
260
+ [1.658, -0.027, 0.788, 0., 1, "C", "CG2"],
261
+ [0.206, -0.397, 3.305, 0., 1, "C", "CD1"],
262
+ [1.171, 0.504, -2.197, 0., 0, "O", "OXT"],
263
+ ],
264
+ "LEU": [
265
+ [-1.661, 0.627, -0.406, 0., 1, "N", "N"],
266
+ [-0.205, 0.441, -0.467, 0., 1, "C", "CA"],
267
+ [0.180, -0.055, -1.836, 0., 1, "C", "C"],
268
+ [-0.591, -0.731, -2.474, 0., 1, "O", "O"],
269
+ [0.221, -0.583, 0.585, 0., 1, "C", "CB"],
270
+ [-0.170, -0.079, 1.976, 0., 1, "C", "CG"],
271
+ [0.256, -1.104, 3.029, 0., 1, "C", "CD1"],
272
+ [0.526, 1.254, 2.250, 0., 1, "C", "CD2"],
273
+ [1.382, 0.254, -2.348, 0., 0, "O", "OXT"],
274
+ ],
275
+ "LYS": [
276
+ [1.422, 1.796, 0.198, 0., 1, "N", "N"],
277
+ [1.394, 0.355, 0.484, 0., 1, "C", "CA"],
278
+ [2.657, -0.284, -0.032, 0., 1, "C", "C"],
279
+ [3.316, 0.275, -0.876, 0., 1, "O", "O"],
280
+ [0.184, -0.278, -0.206, 0., 1, "C", "CB"],
281
+ [-1.102, 0.282, 0.407, 0., 1, "C", "CG"],
282
+ [-2.313, -0.351, -0.283, 0., 1, "C", "CD"],
283
+ [-3.598, 0.208, 0.329, 0., 1, "C", "CE"],
284
+ [-4.761, -0.400, -0.332, 0., 1, "N", "NZ"],
285
+ [3.050, -1.476, 0.446, 0., 0, "O", "OXT"],
286
+ ],
287
+ "MET": [
288
+ [-1.816, 0.142, -1.166, 0., 1, "N", "N"],
289
+ [-0.392, 0.499, -1.214, 0., 1, "C", "CA"],
290
+ [0.206, 0.002, -2.504, 0., 1, "C", "C"],
291
+ [-0.236, -0.989, -3.033, 0., 1, "O", "O"],
292
+ [0.334, -0.145, -0.032, 0., 1, "C", "CB"],
293
+ [-0.273, 0.359, 1.277, 0., 1, "C", "CG"],
294
+ [0.589, -0.405, 2.678, 0., 1, "S", "SD"],
295
+ [-0.314, 0.353, 4.056, 0., 1, "C", "CE"],
296
+ [1.232, 0.661, -3.066, 0., 0, "O", "OXT"],
297
+ ],
298
+ "PHE": [
299
+ [1.317, 0.962, 1.014, 0., 1, "N", "N"],
300
+ [-0.020, 0.426, 1.300, 0., 1, "C", "CA"],
301
+ [-0.109, 0.047, 2.756, 0., 1, "C", "C"],
302
+ [0.879, -0.317, 3.346, 0., 1, "O", "O"],
303
+ [-0.270, -0.809, 0.434, 0., 1, "C", "CB"],
304
+ [-0.181, -0.430, -1.020, 0., 1, "C", "CG"],
305
+ [1.031, -0.498, -1.680, 0., 1, "C", "CD1"],
306
+ [-1.314, -0.018, -1.698, 0., 1, "C", "CD2"],
307
+ [1.112, -0.150, -3.015, 0., 1, "C", "CE1"],
308
+ [-1.231, 0.333, -3.032, 0., 1, "C", "CE2"],
309
+ [-0.018, 0.265, -3.691, 0., 1, "C", "CZ"],
310
+ [-1.286, 0.113, 3.396, 0., 0, "O", "OXT"],
311
+ ],
312
+ "PRO": [
313
+ [-0.816, 1.108, 0.254, 0., 1, "N", "N"],
314
+ [0.001, -0.107, 0.509, 0., 1, "C", "CA"],
315
+ [1.408, 0.091, 0.005, 0., 1, "C", "C"],
316
+ [1.650, 0.980, -0.777, 0., 1, "O", "O"],
317
+ [-0.703, -1.227, -0.286, 0., 1, "C", "CB"],
318
+ [-2.163, -0.753, -0.439, 0., 1, "C", "CG"],
319
+ [-2.218, 0.614, 0.276, 0., 1, "C", "CD"],
320
+ [2.391, -0.721, 0.424, 0., 0, "O", "OXT"],
321
+ ],
322
+ "SER": [
323
+ [1.525, 0.493, -0.608, 0., 1, "N", "N"],
324
+ [0.100, 0.469, -0.252, 0., 1, "C", "CA"],
325
+ [-0.053, 0.004, 1.173, 0., 1, "C", "C"],
326
+ [0.751, -0.760, 1.649, 0., 1, "O", "O"],
327
+ [-0.642, -0.489, -1.184, 0., 1, "C", "CB"],
328
+ [-0.496, -0.049, -2.535, 0., 1, "O", "OG"],
329
+ [-1.084, 0.440, 1.913, 0., 0, "O", "OXT"],
330
+ ],
331
+ "THR": [
332
+ [1.543, -0.702, 0.430, 0., 1, "N", "N"],
333
+ [0.122, -0.706, 0.056, 0., 1, "C", "CA"],
334
+ [-0.038, -0.090, -1.309, 0., 1, "C", "C"],
335
+ [0.732, 0.761, -1.683, 0., 1, "O", "O"],
336
+ [-0.675, 0.104, 1.079, 0., 1, "C", "CB"],
337
+ [-0.193, 1.448, 1.103, 0., 1, "O", "OG1"],
338
+ [-0.511, -0.521, 2.466, 0., 1, "C", "CG2"],
339
+ [-1.039, -0.488, -2.110, 0., 0, "O", "OXT"],
340
+ ],
341
+ "TRP": [
342
+ [1.278, 1.121, 2.059, 0., 1, "N", "N"],
343
+ [-0.008, 0.417, 1.970, 0., 1, "C", "CA"],
344
+ [-0.490, 0.076, 3.357, 0., 1, "C", "C"],
345
+ [0.308, -0.130, 4.240, 0., 1, "O", "O"],
346
+ [0.168, -0.868, 1.161, 0., 1, "C", "CB"],
347
+ [0.650, -0.526, -0.225, 0., 1, "C", "CG"],
348
+ [1.928, -0.418, -0.622, 0., 1, "C", "CD1"],
349
+ [-0.186, -0.256, -1.396, 0., 1, "C", "CD2"],
350
+ [1.978, -0.095, -1.951, 0., 1, "N", "NE1"],
351
+ [0.701, 0.014, -2.454, 0., 1, "C", "CE2"],
352
+ [-1.564, -0.210, -1.615, 0., 1, "C", "CE3"],
353
+ [0.190, 0.314, -3.712, 0., 1, "C", "CZ2"],
354
+ [-2.044, 0.086, -2.859, 0., 1, "C", "CZ3"],
355
+ [-1.173, 0.348, -3.907, 0., 1, "C", "CH2"],
356
+ [-1.806, 0.001, 3.610, 0., 0, "O", "OXT"],
357
+ ],
358
+ "TYR": [
359
+ [1.320, 0.952, 1.428, 0., 1, "N", "N"],
360
+ [-0.018, 0.429, 1.734, 0., 1, "C", "CA"],
361
+ [-0.103, 0.094, 3.201, 0., 1, "C", "C"],
362
+ [0.886, -0.254, 3.799, 0., 1, "O", "O"],
363
+ [-0.274, -0.831, 0.907, 0., 1, "C", "CB"],
364
+ [-0.189, -0.496, -0.559, 0., 1, "C", "CG"],
365
+ [1.022, -0.589, -1.219, 0., 1, "C", "CD1"],
366
+ [-1.324, -0.102, -1.244, 0., 1, "C", "CD2"],
367
+ [1.103, -0.282, -2.563, 0., 1, "C", "CE1"],
368
+ [-1.247, 0.210, -2.587, 0., 1, "C", "CE2"],
369
+ [-0.032, 0.118, -3.252, 0., 1, "C", "CZ"],
370
+ [0.044, 0.420, -4.574, 0., 1, "O", "OH"],
371
+ [-1.279, 0.184, 3.842, 0., 0, "O", "OXT"],
372
+ ],
373
+ "VAL": [
374
+ [1.564, -0.642, 0.454, 0., 1, "N", "N"],
375
+ [0.145, -0.698, 0.079, 0., 1, "C", "CA"],
376
+ [-0.037, -0.093, -1.288, 0., 1, "C", "C"],
377
+ [0.703, 0.784, -1.664, 0., 1, "O", "O"],
378
+ [-0.682, 0.086, 1.098, 0., 1, "C", "CB"],
379
+ [-0.497, -0.528, 2.487, 0., 1, "C", "CG1"],
380
+ [-0.218, 1.543, 1.119, 0., 1, "C", "CG2"],
381
+ [-1.022, -0.529, -2.089, 0., 0, "O", "OXT"],
382
+ ],
383
+ "A ": [
384
+ [2.135, -1.141, -5.313, 0., 0, "O", "OP3"],
385
+ [1.024, -0.137, -4.723, 0., 1, "P", "P"],
386
+ [1.633, 1.190, -4.488, 0., 1, "O", "OP1"],
387
+ [-0.183, 0.005, -5.778, 0., 1, "O", "OP2"],
388
+ [0.456, -0.720, -3.334, 0., 1, "O", "O5'"],
389
+ [-0.520, 0.209, -2.863, 0., 1, "C", "C5'"],
390
+ [-1.101, -0.287, -1.538, 0., 1, "C", "C4'"],
391
+ [-0.064, -0.383, -0.538, 0., 1, "O", "O4'"],
392
+ [-2.105, 0.739, -0.969, 0., 1, "C", "C3'"],
393
+ [-3.445, 0.360, -1.287, 0., 1, "O", "O3'"],
394
+ [-1.874, 0.684, 0.558, 0., 1, "C", "C2'"],
395
+ [-3.065, 0.271, 1.231, 0., 1, "O", "O2'"],
396
+ [-0.755, -0.367, 0.729, 0., 1, "C", "C1'"],
397
+ [0.158, 0.029, 1.803, 0., 1, "N", "N9"],
398
+ [1.265, 0.813, 1.672, 0., 1, "C", "C8"],
399
+ [1.843, 0.963, 2.828, 0., 1, "N", "N7"],
400
+ [1.143, 0.292, 3.773, 0., 1, "C", "C5"],
401
+ [1.290, 0.091, 5.156, 0., 1, "C", "C6"],
402
+ [2.344, 0.664, 5.846, 0., 1, "N", "N6"],
403
+ [0.391, -0.656, 5.787, 0., 1, "N", "N1"],
404
+ [-0.617, -1.206, 5.136, 0., 1, "C", "C2"],
405
+ [-0.792, -1.051, 3.841, 0., 1, "N", "N3"],
406
+ [0.056, -0.320, 3.126, 0., 1, "C", "C4"],
407
+ ],
408
+ "G ": [
409
+ [-1.945, -1.360, 5.599, 0., 0, "O", "OP3"],
410
+ [-0.911, -0.277, 5.008, 0., 1, "P", "P"],
411
+ [-1.598, 1.022, 4.844, 0., 1, "O", "OP1"],
412
+ [0.325, -0.105, 6.025, 0., 1, "O", "OP2"],
413
+ [-0.365, -0.780, 3.580, 0., 1, "O", "O5'"],
414
+ [0.542, 0.217, 3.109, 0., 1, "C", "C5'"],
415
+ [1.100, -0.200, 1.748, 0., 1, "C", "C4'"],
416
+ [0.033, -0.318, 0.782, 0., 1, "O", "O4'"],
417
+ [2.025, 0.898, 1.182, 0., 1, "C", "C3'"],
418
+ [3.395, 0.582, 1.439, 0., 1, "O", "O3'"],
419
+ [1.741, 0.884, -0.338, 0., 1, "C", "C2'"],
420
+ [2.927, 0.560, -1.066, 0., 1, "O", "O2'"],
421
+ [0.675, -0.220, -0.507, 0., 1, "C", "C1'"],
422
+ [-0.297, 0.162, -1.534, 0., 1, "N", "N9"],
423
+ [-1.440, 0.880, -1.334, 0., 1, "C", "C8"],
424
+ [-2.066, 1.037, -2.464, 0., 1, "N", "N7"],
425
+ [-1.364, 0.431, -3.453, 0., 1, "C", "C5"],
426
+ [-1.556, 0.279, -4.846, 0., 1, "C", "C6"],
427
+ [-2.534, 0.755, -5.397, 0., 1, "O", "O6"],
428
+ [-0.626, -0.401, -5.551, 0., 1, "N", "N1"],
429
+ [0.459, -0.934, -4.923, 0., 1, "C", "C2"],
430
+ [1.384, -1.626, -5.664, 0., 1, "N", "N2"],
431
+ [0.649, -0.800, -3.630, 0., 1, "N", "N3"],
432
+ [-0.226, -0.134, -2.868, 0., 1, "C", "C4"],
433
+ ],
434
+ "C ": [
435
+ [2.147, -1.021, -4.678, 0., 0, "O", "OP3"],
436
+ [1.049, -0.039, -4.028, 0., 1, "P", "P"],
437
+ [1.692, 1.237, -3.646, 0., 1, "O", "OP1"],
438
+ [-0.116, 0.246, -5.102, 0., 1, "O", "OP2"],
439
+ [0.415, -0.733, -2.721, 0., 1, "O", "O5'"],
440
+ [-0.546, 0.181, -2.193, 0., 1, "C", "C5'"],
441
+ [-1.189, -0.419, -0.942, 0., 1, "C", "C4'"],
442
+ [-0.190, -0.648, 0.076, 0., 1, "O", "O4'"],
443
+ [-2.178, 0.583, -0.307, 0., 1, "C", "C3'"],
444
+ [-3.518, 0.283, -0.703, 0., 1, "O", "O3'"],
445
+ [-2.001, 0.373, 1.215, 0., 1, "C", "C2'"],
446
+ [-3.228, -0.059, 1.806, 0., 1, "O", "O2'"],
447
+ [-0.924, -0.729, 1.317, 0., 1, "C", "C1'"],
448
+ [-0.036, -0.470, 2.453, 0., 1, "N", "N1"],
449
+ [0.652, 0.683, 2.514, 0., 1, "C", "C2"],
450
+ [0.529, 1.504, 1.620, 0., 1, "O", "O2"],
451
+ [1.467, 0.945, 3.535, 0., 1, "N", "N3"],
452
+ [1.620, 0.070, 4.520, 0., 1, "C", "C4"],
453
+ [2.464, 0.350, 5.569, 0., 1, "N", "N4"],
454
+ [0.916, -1.151, 4.483, 0., 1, "C", "C5"],
455
+ [0.087, -1.399, 3.442, 0., 1, "C", "C6"],
456
+ ],
457
+ "U ": [
458
+ [-2.122, 1.033, -4.690, 0., 0, "O", "OP3"],
459
+ [-1.030, 0.047, -4.037, 0., 1, "P", "P"],
460
+ [-1.679, -1.228, -3.660, 0., 1, "O", "OP1"],
461
+ [0.138, -0.241, -5.107, 0., 1, "O", "OP2"],
462
+ [-0.399, 0.736, -2.726, 0., 1, "O", "O5'"],
463
+ [0.557, -0.182, -2.196, 0., 1, "C", "C5'"],
464
+ [1.197, 0.415, -0.942, 0., 1, "C", "C4'"],
465
+ [0.194, 0.645, 0.074, 0., 1, "O", "O4'"],
466
+ [2.181, -0.588, -0.301, 0., 1, "C", "C3'"],
467
+ [3.524, -0.288, -0.686, 0., 1, "O", "O3'"],
468
+ [1.995, -0.383, 1.218, 0., 1, "C", "C2'"],
469
+ [3.219, 0.046, 1.819, 0., 1, "O", "O2'"],
470
+ [0.922, 0.723, 1.319, 0., 1, "C", "C1'"],
471
+ [0.028, 0.464, 2.451, 0., 1, "N", "N1"],
472
+ [-0.690, -0.671, 2.486, 0., 1, "C", "C2"],
473
+ [-0.587, -1.474, 1.580, 0., 1, "O", "O2"],
474
+ [-1.515, -0.936, 3.517, 0., 1, "N", "N3"],
475
+ [-1.641, -0.055, 4.530, 0., 1, "C", "C4"],
476
+ [-2.391, -0.292, 5.460, 0., 1, "O", "O4"],
477
+ [-0.894, 1.146, 4.502, 0., 1, "C", "C5"],
478
+ [-0.070, 1.384, 3.459, 0., 1, "C", "C6"],
479
+ ],
480
+ "DA ": [
481
+ [1.845, -1.282, -5.339, 0., 0, "O", "OP3"],
482
+ [0.934, -0.156, -4.636, 0., 1, "P", "P"],
483
+ [1.781, 0.996, -4.255, 0., 1, "O", "OP1"],
484
+ [-0.204, 0.331, -5.665, 0., 1, "O", "OP2"],
485
+ [0.241, -0.771, -3.320, 0., 1, "O", "O5'"],
486
+ [-0.549, 0.270, -2.744, 0., 1, "C", "C5'"],
487
+ [-1.239, -0.251, -1.482, 0., 1, "C", "C4'"],
488
+ [-0.267, -0.564, -0.458, 0., 1, "O", "O4'"],
489
+ [-2.105, 0.859, -0.835, 0., 1, "C", "C3'"],
490
+ [-3.409, 0.895, -1.418, 0., 1, "O", "O3'"],
491
+ [-2.173, 0.398, 0.640, 0., 1, "C", "C2'"],
492
+ [-0.965, -0.545, 0.797, 0., 1, "C", "C1'"],
493
+ [-0.078, -0.047, 1.852, 0., 1, "N", "N9"],
494
+ [0.962, 0.817, 1.689, 0., 1, "C", "C8"],
495
+ [1.535, 1.044, 2.835, 0., 1, "N", "N7"],
496
+ [0.897, 0.346, 3.805, 0., 1, "C", "C5"],
497
+ [1.069, 0.196, 5.191, 0., 1, "C", "C6"],
498
+ [2.079, 0.869, 5.856, 0., 1, "N", "N6"],
499
+ [0.236, -0.603, 5.850, 0., 1, "N", "N1"],
500
+ [-0.729, -1.249, 5.224, 0., 1, "C", "C2"],
501
+ [-0.925, -1.144, 3.927, 0., 1, "N", "N3"],
502
+ [-0.142, -0.368, 3.184, 0., 1, "C", "C4"],
503
+ ],
504
+ "DG ": [
505
+ [-1.603, -1.547, 5.624, 0., 0, "O", "OP3"],
506
+ [-0.818, -0.321, 4.935, 0., 1, "P", "P"],
507
+ [-1.774, 0.766, 4.630, 0., 1, "O", "OP1"],
508
+ [0.312, 0.224, 5.941, 0., 1, "O", "OP2"],
509
+ [-0.126, -0.826, 3.572, 0., 1, "O", "O5'"],
510
+ [0.550, 0.300, 3.011, 0., 1, "C", "C5'"],
511
+ [1.233, -0.113, 1.706, 0., 1, "C", "C4'"],
512
+ [0.253, -0.471, 0.705, 0., 1, "O", "O4'"],
513
+ [1.976, 1.091, 1.073, 0., 1, "C", "C3'"],
514
+ [3.294, 1.218, 1.612, 0., 1, "O", "O3'"],
515
+ [2.026, 0.692, -0.421, 0., 1, "C", "C2'"],
516
+ [0.897, -0.345, -0.573, 0., 1, "C", "C1'"],
517
+ [-0.068, 0.111, -1.575, 0., 1, "N", "N9"],
518
+ [-1.172, 0.877, -1.341, 0., 1, "C", "C8"],
519
+ [-1.804, 1.094, -2.458, 0., 1, "N", "N7"],
520
+ [-1.145, 0.482, -3.472, 0., 1, "C", "C5"],
521
+ [-1.361, 0.377, -4.866, 0., 1, "C", "C6"],
522
+ [-2.321, 0.914, -5.391, 0., 1, "O", "O6"],
523
+ [-0.473, -0.327, -5.601, 0., 1, "N", "N1"],
524
+ [0.593, -0.928, -5.003, 0., 1, "C", "C2"],
525
+ [1.474, -1.643, -5.774, 0., 1, "N", "N2"],
526
+ [0.804, -0.839, -3.709, 0., 1, "N", "N3"],
527
+ [-0.027, -0.152, -2.917, 0., 1, "C", "C4"],
528
+ ],
529
+ "DC ": [
530
+ [1.941, -1.055, -4.672, 0., 0, "O", "OP3"],
531
+ [0.987, -0.017, -3.894, 0., 1, "P", "P"],
532
+ [1.802, 1.099, -3.365, 0., 1, "O", "OP1"],
533
+ [-0.119, 0.560, -4.910, 0., 1, "O", "OP2"],
534
+ [0.255, -0.772, -2.674, 0., 1, "O", "O5'"],
535
+ [-0.571, 0.196, -2.027, 0., 1, "C", "C5'"],
536
+ [-1.300, -0.459, -0.852, 0., 1, "C", "C4'"],
537
+ [-0.363, -0.863, 0.171, 0., 1, "O", "O4'"],
538
+ [-2.206, 0.569, -0.129, 0., 1, "C", "C3'"],
539
+ [-3.488, 0.649, -0.756, 0., 1, "O", "O3'"],
540
+ [-2.322, -0.040, 1.288, 0., 1, "C", "C2'"],
541
+ [-1.106, -0.981, 1.395, 0., 1, "C", "C1'"],
542
+ [-0.267, -0.584, 2.528, 0., 1, "N", "N1"],
543
+ [0.270, 0.648, 2.563, 0., 1, "C", "C2"],
544
+ [0.052, 1.424, 1.647, 0., 1, "O", "O2"],
545
+ [1.037, 1.035, 3.581, 0., 1, "N", "N3"],
546
+ [1.291, 0.212, 4.589, 0., 1, "C", "C4"],
547
+ [2.085, 0.622, 5.635, 0., 1, "N", "N4"],
548
+ [0.746, -1.088, 4.580, 0., 1, "C", "C5"],
549
+ [-0.035, -1.465, 3.541, 0., 1, "C", "C6"],
550
+ ],
551
+ "DT ": [
552
+ [-3.912, -2.311, 1.636, 0., 0, "O", "OP3"],
553
+ [-3.968, -1.665, 3.118, 0., 1, "P", "P"],
554
+ [-4.406, -2.599, 4.208, 0., 1, "O", "OP1"],
555
+ [-4.901, -0.360, 2.920, 0., 1, "O", "OP2"],
556
+ [-2.493, -1.028, 3.315, 0., 1, "O", "O5'"],
557
+ [-2.005, -0.136, 2.327, 0., 1, "C", "C5'"],
558
+ [-0.611, 0.328, 2.728, 0., 1, "C", "C4'"],
559
+ [0.247, -0.829, 2.764, 0., 1, "O", "O4'"],
560
+ [0.008, 1.286, 1.720, 0., 1, "C", "C3'"],
561
+ [0.965, 2.121, 2.368, 0., 1, "O", "O3'"],
562
+ [0.710, 0.360, 0.754, 0., 1, "C", "C2'"],
563
+ [1.157, -0.778, 1.657, 0., 1, "C", "C1'"],
564
+ [1.164, -2.047, 0.989, 0., 1, "N", "N1"],
565
+ [2.333, -2.544, 0.374, 0., 1, "C", "C2"],
566
+ [3.410, -1.945, 0.363, 0., 1, "O", "O2"],
567
+ [2.194, -3.793, -0.240, 0., 1, "N", "N3"],
568
+ [1.047, -4.570, -0.300, 0., 1, "C", "C4"],
569
+ [0.995, -5.663, -0.857, 0., 1, "O", "O4"],
570
+ [-0.143, -3.980, 0.369, 0., 1, "C", "C5"],
571
+ [-1.420, -4.757, 0.347, 0., 1, "C", "C7"],
572
+ [-0.013, -2.784, 0.958, 0., 1, "C", "C6"],
573
+ ],
574
+ }
575
+
576
# For every known (non-UNK) standard CCD code, the ordered list of atom
# names (the last column of each reference-feature row).
standard_ccd_to_ref_atom_name_chars = {
    code: [row[-1] for row in standard_ccd_to_reference_features_table[code]]
    for code in standard_ccds
    if not is_unk(code)
}
580
+
581
# Identity matrices reused as one-hot lookup tables: row i of eye_N is the
# N-way one-hot encoding of index i.
eye_3 = np.eye(3)
eye_5 = np.eye(5)
eye_7 = np.eye(7)
eye_9 = np.eye(9)
eye_32 = np.eye(32)
eye_64 = np.eye(64)
eye_128 = np.eye(128)
588
+
589
+
590
def _get_ref_feat_from_ccd_data(ccd, ref_feat_table):
    """Build the per-atom reference feature matrix for one CCD code.

    Each atom row concatenates: the row's first 5 numeric features, a
    128-way one-hot of the element id (element symbol lower-cased and
    looked up in ``get_element_id``), and four 64-way one-hots encoding the
    atom name left-justified/space-padded to 4 characters (char code - 32).

    Args:
        ccd: CCD component code to look up in ``ref_feat_table``.
        ref_feat_table: Mapping from CCD code to per-atom feature rows.

    Returns:
        np.ndarray of shape (num_atoms, 5 + 128 + 4 * 64).
    """
    rows = []
    for atom in ref_feat_table[ccd]:
        numeric = np.array(atom[:5])
        element_onehot = eye_128[get_element_id[atom[5].lower()]]
        name_onehots = [eye_64[ord(ch) - 32] for ch in f"{atom[-1]:<4}"]
        rows.append(
            np.concatenate([numeric, element_onehot, *name_onehots], axis=-1)
        )
    return np.stack(rows, axis=0)
599
+
600
+
601
# Precomputed reference feature matrices for every known (non-UNK)
# standard CCD code.
standard_ccd_to_ref_feat = {
    code: _get_ref_feat_from_ccd_data(code, standard_ccd_to_reference_features_table)
    for code in standard_ccds
    if not is_unk(code)
}
PhysDock/data/tools/templates.py ADDED
@@ -0,0 +1,1357 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 AlQuraishi Laboratory
2
+ # Copyright 2021 DeepMind Technologies Limited
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Functions for getting templates and calculating template features."""
17
+ import abc
18
+ import dataclasses
19
+ import datetime
20
+ import functools
21
+ import glob
22
+ import json
23
+ import logging
24
+ import os
25
+ import re
26
+ from typing import Any, Dict, Mapping, Optional, Sequence, Tuple
27
+ import numpy as np
28
+
29
+ from . import parsers, mmcif_parsing
30
+ from . import kalign
31
+ from .utils import to_date
32
+
33
+
34
@dataclass
class residue_constants:
    """Minimal subset of AlphaFold-style residue constants used by this module.

    All attributes are class-level lookup tables; the class is used as a
    namespace and never instantiated.
    """

    # The 20 standard amino acids (one-letter codes) in canonical order.
    restypes = ["A", "R", "N", "D", "C", "Q", "E", "G", "H", "I", "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V"]
    restype_order = {restype: i for i, restype in enumerate(restypes)}
    # Canonical residues plus the unknown residue "X" (index 20).
    restypes_with_x = restypes + ["X"]
    restype_order_with_x = {restype: i for i, restype in enumerate(restypes_with_x)}
    # Canonical residues plus unknown "X" and the gap "-" (22 entries).
    # FIX: this attribute was missing although `empty_template_feats` below
    # dereferences it, which raised AttributeError at call time.
    restypes_with_x_and_gap = restypes_with_x + ["-"]
    # atom37 representation: the fixed list of 37 heavy-atom names.
    atom_type_num = 37
    atom_types = ["N", "CA", "C", "CB", "O", "CG", "CG1", "CG2", "OG", "OG1", "SG", "CD", "CD1", "CD2", "ND1", "ND2",
                  "OD1",
                  "OD2", "SD", "CE", "CE1", "CE2", "CE3", "NE", "NE1", "NE2", "OE1", "OE2", "CH2", "NH1", "NH2", "OH",
                  "CZ",
                  "CZ2", "CZ3", "NZ", "OXT", ]
    atom_order = {atom_type: i for i, atom_type in enumerate(atom_types)}

    # HHblits amino-acid alphabet -> integer id (ambiguity codes collapsed:
    # B->D, Z->E, U->C; J/O/X map to unknown (20); "-" is the gap id 21).
    HHBLITS_AA_TO_ID = {
        "A": 0, "B": 2, "C": 1, "D": 2, "E": 3, "F": 4, "G": 5, "H": 6, "I": 7, "J": 20, "K": 8, "L": 9, "M": 10,
        "N": 11, "O": 20, "P": 12, "Q": 13, "R": 14, "S": 15, "T": 16, "U": 1, "V": 17, "W": 18, "X": 20, "Y": 19,
        "Z": 3, "-": 21,
    }

    @staticmethod
    def sequence_to_onehot(
        sequence: str, mapping: Mapping[str, int], map_unknown_to_x: bool = False
    ) -> np.ndarray:
        """Maps the given sequence into a one-hot encoded matrix.

        Args:
            sequence: An amino acid sequence.
            mapping: A dictionary mapping amino acids to integers.
            map_unknown_to_x: If True, any amino acid that is not in the mapping will be
                mapped to the unknown amino acid 'X'. If the mapping doesn't contain
                amino acid 'X', an error will be thrown. If False, any amino acid not in
                the mapping will throw an error.

        Returns:
            A numpy array of shape (seq_len, num_unique_aas) with one-hot encoding of
            the sequence.

        Raises:
            ValueError: If the mapping doesn't contain values from 0 to
                num_unique_aas - 1 without any gaps, or (when map_unknown_to_x
                is set) if a character is not an upper-case letter.
        """
        num_entries = max(mapping.values()) + 1

        # The mapping's ids must be a dense 0..N-1 range so they can be used
        # directly as column indices.
        if sorted(set(mapping.values())) != list(range(num_entries)):
            raise ValueError(
                "The mapping must have values from 0 to num_unique_aas-1 "
                "without any gaps. Got: %s" % sorted(mapping.values())
            )

        one_hot_arr = np.zeros((len(sequence), num_entries), dtype=np.int32)

        for aa_index, aa_type in enumerate(sequence):
            if map_unknown_to_x:
                if aa_type.isalpha() and aa_type.isupper():
                    aa_id = mapping.get(aa_type, mapping["X"])
                else:
                    raise ValueError(f"Invalid character in the sequence: {aa_type}")
            else:
                aa_id = mapping[aa_type]
            one_hot_arr[aa_index, aa_id] = 1

        return one_hot_arr
97
+
98
+
99
# Exception hierarchy for template processing. All hard errors derive from
# `Error`; recoverable pre-filter rejections derive from `PrefilterError`.
class Error(Exception):
    """Base class for exceptions."""


class NoChainsError(Error):
    """An error indicating that template mmCIF didn't have any chains."""


class SequenceNotInTemplateError(Error):
    """An error indicating that template mmCIF didn't contain the sequence."""


class NoAtomDataInTemplateError(Error):
    """An error indicating that template mmCIF didn't contain atom positions."""


class TemplateAtomMaskAllZerosError(Error):
    """An error indicating that template mmCIF had all atom positions masked."""


class QueryToTemplateAlignError(Error):
    """An error indicating that the query can't be aligned to the template."""


class CaDistanceError(Error):
    """An error indicating that a CA atom distance exceeds a threshold."""


# Prefilter exceptions.
class PrefilterError(Exception):
    """A base class for template prefilter exceptions."""


class DateError(PrefilterError):
    """An error indicating that the hit date was after the max allowed date."""


class AlignRatioError(PrefilterError):
    """An error indicating that the hit align ratio to the query was too small."""


class DuplicateError(PrefilterError):
    """An error indicating that the hit was an exact subsequence of the query."""


class LengthError(PrefilterError):
    """An error indicating that the hit was too short."""
146
+
147
+
148
# Names and storage dtypes of the per-hit template features assembled by
# this module; the `object`-dtype entries hold encoded byte strings.
TEMPLATE_FEATURES = {
    "template_aatype": np.int64,
    "template_all_atom_masks": np.float32,
    "template_all_atom_positions": np.float32,
    "template_domain_names": object,
    "template_sequence": object,
    "template_sum_probs": np.float32,
}
156
+
157
+
158
def empty_template_feats(n_res):
    """Return a template feature dict containing zero templates.

    The array features get a leading template dimension of size 0; the
    object-dtype name/sequence features hold one empty placeholder entry.

    Args:
        n_res: Number of residues in the query sequence.

    Returns:
        Dict of empty numpy template features (keys as in TEMPLATE_FEATURES).
    """
    # FIX: this file's `residue_constants` does not define
    # `restypes_with_x_and_gap`, so the original lookup raised
    # AttributeError. The aatype one-hot width is 22 = 20 aa + "X" + gap.
    n_restype = len(residue_constants.restypes_with_x) + 1
    return {
        "template_aatype": np.zeros(
            (0, n_res, n_restype),
            np.float32
        ),
        "template_all_atom_masks": np.zeros(
            (0, n_res, residue_constants.atom_type_num), np.float32
        ),
        "template_all_atom_positions": np.zeros(
            (0, n_res, residue_constants.atom_type_num, 3), np.float32
        ),
        # FIX: `np.object` was deprecated in NumPy 1.20 and removed in 1.24;
        # the builtin `object` is the supported spelling.
        "template_domain_names": np.array([''.encode()], dtype=object),
        "template_sequence": np.array([''.encode()], dtype=object),
        "template_sum_probs": np.zeros((0, 1), dtype=np.float32),
    }
174
+
175
+
176
def _get_pdb_id_and_chain(hit: parsers.TemplateHit) -> Tuple[str, str]:
    """Returns PDB id and chain id for an HHSearch Hit."""
    # PDB ID: 4 letters. Chain ID: 1+ alphanumeric letters or "." if unknown.
    match = re.match(r"[a-zA-Z\d]{4}_[a-zA-Z0-9.]+", hit.name)
    if match is None:
        raise ValueError(f"hit.name did not start with PDBID_chain: {hit.name}")
    pdb_id, chain_id = match.group(0).split("_")
    return pdb_id.lower(), chain_id
184
+
185
+
186
+ def _is_after_cutoff(
187
+ pdb_id: str,
188
+ release_dates: Mapping[str, datetime.datetime],
189
+ release_date_cutoff: Optional[datetime.datetime],
190
+ ) -> bool:
191
+ """Checks if the template date is after the release date cutoff.
192
+
193
+ Args:
194
+ pdb_id: 4 letter pdb code.
195
+ release_dates: Dictionary mapping PDB ids to their structure release dates.
196
+ release_date_cutoff: Max release date that is valid for this query.
197
+
198
+ Returns:
199
+ True if the template release date is after the cutoff, False otherwise.
200
+ """
201
+ pdb_id_upper = pdb_id.upper()
202
+ if release_date_cutoff is None:
203
+ raise ValueError("The release_date_cutoff must not be None.")
204
+ if pdb_id_upper in release_dates:
205
+ return release_dates[pdb_id_upper] > release_date_cutoff
206
+ else:
207
+ # Since this is just a quick prefilter to reduce the number of mmCIF files
208
+ # we need to parse, we don't have to worry about returning True here.
209
+ logging.info(
210
+ "Template structure not in release dates dict: %s", pdb_id
211
+ )
212
+ return False
213
+
214
+
215
+ def _replace_obsolete_references(obsolete_mapping) -> Mapping[str, str]:
216
+ """Generates a new obsolete by tracing all cross-references and store the latest leaf to all referencing nodes"""
217
+ obsolete_new = {}
218
+ obsolete_keys = obsolete_mapping.keys()
219
+
220
+ def _new_target(k):
221
+ v = obsolete_mapping[k]
222
+ if v in obsolete_keys:
223
+ return _new_target(v)
224
+ return v
225
+
226
+ for k in obsolete_keys:
227
+ obsolete_new[k] = _new_target(k)
228
+
229
+ return obsolete_new
230
+
231
+
232
def _parse_obsolete(obsolete_file_path: str) -> Mapping[str, str]:
    """Parses the data file from PDB that lists which PDB ids are obsolete."""
    mapping = {}
    with open(obsolete_file_path) as f:
        for raw_line in f:
            line = raw_line.strip()
            # Skip obsolete entries that don't contain a mapping to a new
            # entry (those lines are too short to hold a replacement id).
            if not line.startswith("OBSLTE") or len(line) <= 30:
                continue
            # Fixed-column format: Date From To
            # 'OBSLTE    31-JUL-94 116L     216L'
            from_id = line[20:24].lower()
            to_id = line[29:33].lower()
            mapping[from_id] = to_id
    return _replace_obsolete_references(mapping)
246
+
247
+
248
def generate_release_dates_cache(mmcif_dir: str, out_path: str):
    """Scan a directory of mmCIF files and cache their release dates as JSON.

    Args:
        mmcif_dir: Directory containing *.cif files.
        out_path: Path to which the {pdb_id: release_date} JSON mapping is
            written.
    """
    dates = {}
    for f in os.listdir(mmcif_dir):
        if f.endswith(".cif"):
            path = os.path.join(mmcif_dir, f)
            with open(path, "r") as fp:
                mmcif_string = fp.read()

            file_id = os.path.splitext(f)[0]
            mmcif = mmcif_parsing.parse(
                file_id=file_id, mmcif_string=mmcif_string
            )
            if mmcif.mmcif_object is None:
                logging.info(f"Failed to parse {f}. Skipping...")
                continue

            mmcif = mmcif.mmcif_object
            release_date = mmcif.header["release_date"]

            dates[file_id] = release_date

    # FIX: the output file was opened with mode "r" (read-only), so the
    # subsequent write raised io.UnsupportedOperation; it must be "w".
    with open(out_path, "w") as fp:
        fp.write(json.dumps(dates))
272
+
273
def _parse_release_dates(path: str) -> Mapping[str, datetime.datetime]:
    """Parses release dates file, returns a mapping from PDBs to release dates."""
    with open(path, "r") as fp:
        data = json.load(fp)

    result = {}
    # Each value is expected to be a dict carrying a "release_date" field.
    for pdb, entry in data.items():
        for field, value in entry.items():
            if field == "release_date":
                result[pdb.upper()] = to_date(value)
    return result
284
+
285
+
286
def _assess_hhsearch_hit(
    hit: parsers.TemplateHit,
    hit_pdb_code: str,
    query_sequence: str,
    release_dates: Mapping[str, datetime.datetime],
    release_date_cutoff: datetime.datetime,
    max_subsequence_ratio: float = 0.95,
    min_align_ratio: float = 0.1,
) -> bool:
    """Determines if template is valid (without parsing the template mmcif file).

    Args:
      hit: HhrHit for the template.
      hit_pdb_code: The 4 letter pdb code of the template hit. This might be
        different from the value in the actual hit since the original pdb might
        have become obsolete.
      query_sequence: Amino acid sequence of the query.
      release_dates: Dictionary mapping pdb codes to their structure release
        dates.
      release_date_cutoff: Max release date that is valid for this query.
      max_subsequence_ratio: Exclude any exact matches with this much overlap.
      min_align_ratio: Minimum overlap between the template and query.

    Returns:
      True if the hit passed the prefilter. Raises an exception otherwise.

    Raises:
      DateError: If the hit date was after the max allowed date.
      AlignRatioError: If the hit align ratio to the query was too small.
      DuplicateError: If the hit was an exact subsequence of the query.
      LengthError: If the hit was too short.
    """
    aligned_cols = hit.aligned_cols
    align_ratio = aligned_cols / len(query_sequence)

    # Length ratio is computed on the hit sequence with gaps stripped.
    template_sequence = hit.hit_sequence.replace("-", "")
    length_ratio = float(len(template_sequence)) / len(query_sequence)

    # Reject templates released after the cutoff (prevents train/test leakage).
    if _is_after_cutoff(hit_pdb_code, release_dates, release_date_cutoff):
        date = release_dates[hit_pdb_code.upper()]
        raise DateError(
            f"Date ({date}) > max template date "
            f"({release_date_cutoff})."
        )

    if align_ratio <= min_align_ratio:
        raise AlignRatioError(
            "Proportion of residues aligned to query too small. "
            f"Align ratio: {align_ratio}."
        )

    # Check whether the template is a large subsequence or duplicate of original
    # query. This can happen due to duplicate entries in the PDB database.
    duplicate = (
        template_sequence in query_sequence
        and length_ratio > max_subsequence_ratio
    )

    if duplicate:
        raise DuplicateError(
            "Template is an exact subsequence of query with large "
            f"coverage. Length ratio: {length_ratio}."
        )

    if len(template_sequence) < 10:
        raise LengthError(
            f"Template too short. Length: {len(template_sequence)}."
        )

    return True
356
+
357
+
358
def _find_template_in_pdb(
    template_chain_id: str,
    template_sequence: str,
    mmcif_object: mmcif_parsing.MmcifObject,
) -> Tuple[str, str, int]:
    """Tries to find the template chain in the given pdb file.

    This method tries the three following things in order:
      1. Tries if there is an exact match in both the chain ID and the sequence.
         If yes, the chain sequence is returned. Otherwise:
      2. Tries if there is an exact match only in the sequence.
         If yes, the chain sequence is returned. Otherwise:
      3. Tries if there is a fuzzy match (X = wildcard) in the sequence.
         If yes, the chain sequence is returned.
    If none of these succeed, a SequenceNotInTemplateError is thrown.

    Args:
      template_chain_id: The template chain ID.
      template_sequence: The template chain sequence.
      mmcif_object: The PDB object to search for the template in.

    Returns:
      A tuple with:
      * The chain sequence that was found to match the template in the PDB object.
      * The ID of the chain that is being returned.
      * The offset where the template sequence starts in the chain sequence.

    Raises:
      SequenceNotInTemplateError: If no match is found after the steps described
        above.
    """
    # Try if there is an exact match in both the chain ID and the (sub)sequence.
    pdb_id = mmcif_object.file_id
    chain_sequence = mmcif_object.chain_to_seqres.get(template_chain_id)
    if chain_sequence and (template_sequence in chain_sequence):
        logging.info(
            "Found an exact template match %s_%s.", pdb_id, template_chain_id
        )
        mapping_offset = chain_sequence.find(template_sequence)
        return chain_sequence, template_chain_id, mapping_offset

    # Try if there is an exact match in the (sub)sequence only.
    for chain_id, chain_sequence in mmcif_object.chain_to_seqres.items():
        if chain_sequence and (template_sequence in chain_sequence):
            logging.info("Found a sequence-only match %s_%s.", pdb_id, chain_id)
            mapping_offset = chain_sequence.find(template_sequence)
            return chain_sequence, chain_id, mapping_offset

    # Return a chain sequence that fuzzy matches (X = wildcard) the template.
    # Make parentheses unnamed groups (?:_) to avoid the 100 named groups limit.
    regex = ["." if aa == "X" else "(?:%s|X)" % aa for aa in template_sequence]
    regex = re.compile("".join(regex))
    for chain_id, chain_sequence in mmcif_object.chain_to_seqres.items():
        match = re.search(regex, chain_sequence)
        if match:
            logging.info(
                "Found a fuzzy sequence-only match %s_%s.", pdb_id, chain_id
            )
            mapping_offset = match.start()
            return chain_sequence, chain_id, mapping_offset

    # No hits, raise an error.
    raise SequenceNotInTemplateError(
        "Could not find the template sequence in %s_%s. Template sequence: %s, "
        "chain_to_seqres: %s"
        % (
            pdb_id,
            template_chain_id,
            template_sequence,
            mmcif_object.chain_to_seqres,
        )
    )
430
+
431
+
432
+ def _realign_pdb_template_to_query(
433
+ old_template_sequence: str,
434
+ template_chain_id: str,
435
+ mmcif_object: mmcif_parsing.MmcifObject,
436
+ old_mapping: Mapping[int, int],
437
+ kalign_binary_path: str,
438
+ ) -> Tuple[str, Mapping[int, int]]:
439
+ """Aligns template from the mmcif_object to the query.
440
+
441
+ In case PDB70 contains a different version of the template sequence, we need
442
+ to perform a realignment to the actual sequence that is in the mmCIF file.
443
+ This method performs such realignment, but returns the new sequence and
444
+ mapping only if the sequence in the mmCIF file is 90% identical to the old
445
+ sequence.
446
+
447
+ Note that the old_template_sequence comes from the hit, and contains only that
448
+ part of the chain that matches with the query while the new_template_sequence
449
+ is the full chain.
450
+
451
+ Args:
452
+ old_template_sequence: The template sequence that was returned by the PDB
453
+ template search (typically done using HHSearch).
454
+ template_chain_id: The template chain id was returned by the PDB template
455
+ search (typically done using HHSearch). This is used to find the right
456
+ chain in the mmcif_object chain_to_seqres mapping.
457
+ mmcif_object: A mmcif_object which holds the actual template data.
458
+ old_mapping: A mapping from the query sequence to the template sequence.
459
+ This mapping will be used to compute the new mapping from the query
460
+ sequence to the actual mmcif_object template sequence by aligning the
461
+ old_template_sequence and the actual template sequence.
462
+ kalign_binary_path: The path to a kalign executable.
463
+
464
+ Returns:
465
+ A tuple (new_template_sequence, new_query_to_template_mapping) where:
466
+ * new_template_sequence is the actual template sequence that was found in
467
+ the mmcif_object.
468
+ * new_query_to_template_mapping is the new mapping from the query to the
469
+ actual template found in the mmcif_object.
470
+
471
+ Raises:
472
+ QueryToTemplateAlignError:
473
+ * If there was an error thrown by the alignment tool.
474
+ * Or if the actual template sequence differs by more than 10% from the
475
+ old_template_sequence.
476
+ """
477
+ aligner = kalign.Kalign(binary_path=kalign_binary_path)
478
+ new_template_sequence = mmcif_object.chain_to_seqres.get(
479
+ template_chain_id, ""
480
+ )
481
+
482
+ # Sometimes the template chain id is unknown. But if there is only a single
483
+ # sequence within the mmcif_object, it is safe to assume it is that one.
484
+ if not new_template_sequence:
485
+ if len(mmcif_object.chain_to_seqres) == 1:
486
+ logging.info(
487
+ "Could not find %s in %s, but there is only 1 sequence, so "
488
+ "using that one.",
489
+ template_chain_id,
490
+ mmcif_object.file_id,
491
+ )
492
+ new_template_sequence = list(mmcif_object.chain_to_seqres.values())[
493
+ 0
494
+ ]
495
+ else:
496
+ raise QueryToTemplateAlignError(
497
+ f"Could not find chain {template_chain_id} in {mmcif_object.file_id}. "
498
+ "If there are no mmCIF parsing errors, it is possible it was not a "
499
+ "protein chain."
500
+ )
501
+
502
+ try:
503
+ parsed_a3m = parsers.parse_a3m(
504
+ aligner.align([old_template_sequence, new_template_sequence])
505
+ )
506
+ old_aligned_template, new_aligned_template = parsed_a3m.sequences
507
+ except Exception as e:
508
+ raise QueryToTemplateAlignError(
509
+ "Could not align old template %s to template %s (%s_%s). Error: %s"
510
+ % (
511
+ old_template_sequence,
512
+ new_template_sequence,
513
+ mmcif_object.file_id,
514
+ template_chain_id,
515
+ str(e),
516
+ )
517
+ )
518
+
519
+ logging.info(
520
+ "Old aligned template: %s\nNew aligned template: %s",
521
+ old_aligned_template,
522
+ new_aligned_template,
523
+ )
524
+
525
+ old_to_new_template_mapping = {}
526
+ old_template_index = -1
527
+ new_template_index = -1
528
+ num_same = 0
529
+ for old_template_aa, new_template_aa in zip(
530
+ old_aligned_template, new_aligned_template
531
+ ):
532
+ if old_template_aa != "-":
533
+ old_template_index += 1
534
+ if new_template_aa != "-":
535
+ new_template_index += 1
536
+ if old_template_aa != "-" and new_template_aa != "-":
537
+ old_to_new_template_mapping[old_template_index] = new_template_index
538
+ if old_template_aa == new_template_aa:
539
+ num_same += 1
540
+
541
+ # Require at least 90 % sequence identity wrt to the shorter of the sequences.
542
+ if (
543
+ float(num_same)
544
+ / min(len(old_template_sequence), len(new_template_sequence))
545
+ < 0.9
546
+ ):
547
+ raise QueryToTemplateAlignError(
548
+ "Insufficient similarity of the sequence in the database: %s to the "
549
+ "actual sequence in the mmCIF file %s_%s: %s. We require at least "
550
+ "90 %% similarity wrt to the shorter of the sequences. This is not a "
551
+ "problem unless you think this is a template that should be included."
552
+ % (
553
+ old_template_sequence,
554
+ mmcif_object.file_id,
555
+ template_chain_id,
556
+ new_template_sequence,
557
+ )
558
+ )
559
+
560
+ new_query_to_template_mapping = {}
561
+ for query_index, old_template_index in old_mapping.items():
562
+ new_query_to_template_mapping[
563
+ query_index
564
+ ] = old_to_new_template_mapping.get(old_template_index, -1)
565
+
566
+ new_template_sequence = new_template_sequence.replace("-", "")
567
+
568
+ return new_template_sequence, new_query_to_template_mapping
569
+
570
+
571
def _check_residue_distances(
    all_positions: np.ndarray,
    all_positions_mask: np.ndarray,
    max_ca_ca_distance: float,
):
    """Raises CaDistanceError if consecutive unmasked CA atoms are too far apart.

    Args:
        all_positions: Per-residue atom coordinates, indexed by atom type.
        all_positions_mask: Per-residue atom presence mask, same indexing.
        max_ca_ca_distance: Maximum allowed CA-CA distance between a residue
            and the immediately preceding unmasked residue.
    """
    ca_idx = residue_constants.atom_order["CA"]
    last_ca = None  # CA coords of the previous residue, valid only if it was unmasked
    prev_unmasked = False
    for res_index, (coords, mask) in enumerate(
        zip(all_positions, all_positions_mask)
    ):
        unmasked = bool(mask[ca_idx])
        if unmasked:
            ca = coords[ca_idx]
            # Only check the distance when the directly preceding residue also
            # had an unmasked CA; gaps in the structure suppress the check.
            if prev_unmasked:
                dist = np.linalg.norm(ca - last_ca)
                if dist > max_ca_ca_distance:
                    raise CaDistanceError(
                        "The distance between residues %d and %d is %f > limit %f."
                        % (res_index, res_index + 1, dist, max_ca_ca_distance)
                    )
            last_ca = ca
        prev_unmasked = unmasked
593
+
594
+
595
def _get_atom_positions(
    mmcif_object: mmcif_parsing.MmcifObject,
    auth_chain_id: str,
    max_ca_ca_distance: float,
    _zero_center_positions: bool = False,
) -> Tuple[np.ndarray, np.ndarray]:
    """Gets atom positions and mask for one chain of a parsed mmCIF object.

    Args:
        mmcif_object: Parsed mmCIF structure.
        auth_chain_id: Author chain id to extract.
        max_ca_ca_distance: CA-CA distance limit for the sanity check.
        _zero_center_positions: Forwarded to the coordinate extraction.

    Returns:
        A (positions, mask) tuple of numpy arrays.

    Raises:
        CaDistanceError: If consecutive residues are unreasonably far apart.
    """
    positions, mask = mmcif_parsing.get_atom_coords(
        mmcif_object=mmcif_object,
        chain_id=auth_chain_id,
        _zero_center_positions=_zero_center_positions,
    )
    # Reject chains with broken backbone geometry before featurization.
    _check_residue_distances(positions, mask, max_ca_ca_distance)
    return positions, mask
612
+
613
+
614
def _extract_template_features(
    mmcif_object: mmcif_parsing.MmcifObject,
    pdb_id: str,
    mapping: Mapping[int, int],
    template_sequence: str,
    query_sequence: str,
    template_chain_id: str,
    kalign_binary_path: str,
    _zero_center_positions: bool = True,
) -> Tuple[Dict[str, Any], Optional[str]]:
    """Parses atom positions in the target structure and aligns with the query.

    Atoms for each residue in the template structure are indexed to coincide
    with their corresponding residue in the query sequence, according to the
    alignment mapping provided.

    Args:
      mmcif_object: mmcif_parsing.MmcifObject representing the template.
      pdb_id: PDB code for the template.
      mapping: Dictionary mapping indices in the query sequence to indices in
        the template sequence.
      template_sequence: String describing the amino acid sequence for the
        template protein.
      query_sequence: String describing the amino acid sequence for the query
        protein.
      template_chain_id: String ID describing which chain in the structure proto
        should be used.
      kalign_binary_path: The path to a kalign executable used for template
        realignment.
      _zero_center_positions: Whether to zero-center the extracted coordinates.

    Returns:
      A tuple with:
      * A dictionary containing the extra features derived from the template
        protein structure.
      * A warning message if the hit was realigned to the actual mmCIF sequence.
        Otherwise None.

    Raises:
      NoChainsError: If the mmcif object doesn't contain any chains.
      SequenceNotInTemplateError: If the given chain id / sequence can't
        be found in the mmcif object.
      QueryToTemplateAlignError: If the actual template in the mmCIF file
        can't be aligned to the query.
      NoAtomDataInTemplateError: If the mmcif object doesn't contain
        atom positions.
      TemplateAtomMaskAllZerosError: If the mmcif object doesn't have any
        unmasked residues.
    """
    # A None mmcif_object (failed parse) is treated the same as a chain-less one.
    if mmcif_object is None or not mmcif_object.chain_to_seqres:
        raise NoChainsError(
            "No chains in PDB: %s_%s" % (pdb_id, template_chain_id)
        )

    warning = None
    try:
        seqres, chain_id, mapping_offset = _find_template_in_pdb(
            template_chain_id=template_chain_id,
            template_sequence=template_sequence,
            mmcif_object=mmcif_object,
        )
    except SequenceNotInTemplateError:
        # If PDB70 contains a different version of the template, we use the sequence
        # from the mmcif_object.
        chain_id = template_chain_id
        warning = (
            f"The exact sequence {template_sequence} was not found in "
            f"{pdb_id}_{chain_id}. Realigning the template to the actual sequence."
        )
        logging.warning(warning)
        # This throws an exception if it fails to realign the hit.
        seqres, mapping = _realign_pdb_template_to_query(
            old_template_sequence=template_sequence,
            template_chain_id=template_chain_id,
            mmcif_object=mmcif_object,
            old_mapping=mapping,
            kalign_binary_path=kalign_binary_path,
        )
        logging.info(
            "Sequence in %s_%s: %s successfully realigned to %s",
            pdb_id,
            chain_id,
            template_sequence,
            seqres,
        )
        # The template sequence changed.
        template_sequence = seqres
        # No mapping offset, the query is aligned to the actual sequence.
        mapping_offset = 0

    try:
        # Essentially set to infinity - we don't want to reject templates unless
        # they're really really bad.
        all_atom_positions, all_atom_mask = _get_atom_positions(
            mmcif_object,
            chain_id,
            max_ca_ca_distance=150.0,
            _zero_center_positions=_zero_center_positions,
        )
    except (CaDistanceError, KeyError) as ex:
        raise NoAtomDataInTemplateError(
            "Could not get atom data (%s_%s): %s" % (pdb_id, chain_id, str(ex))
        ) from ex

    # Split into per-residue arrays so individual residues can be re-indexed
    # onto the query below.
    all_atom_positions = np.split(
        all_atom_positions, all_atom_positions.shape[0]
    )
    all_atom_masks = np.split(all_atom_mask, all_atom_mask.shape[0])

    output_templates_sequence = []
    templates_all_atom_positions = []
    templates_all_atom_masks = []

    # Initialize every query position as a gap with zeroed coordinates/mask.
    for _ in query_sequence:
        # Residues in the query_sequence that are not in the template_sequence:
        templates_all_atom_positions.append(
            np.zeros((residue_constants.atom_type_num, 3))
        )
        templates_all_atom_masks.append(
            np.zeros(residue_constants.atom_type_num)
        )
        output_templates_sequence.append("-")

    # Copy aligned template residues into their query-indexed slots.
    for k, v in mapping.items():
        template_index = v + mapping_offset
        templates_all_atom_positions[k] = all_atom_positions[template_index][0]
        templates_all_atom_masks[k] = all_atom_masks[template_index][0]
        output_templates_sequence[k] = template_sequence[v]

    # Alanine (AA with the lowest number of atoms) has 5 atoms (C, CA, CB, N, O).
    if np.sum(templates_all_atom_masks) < 5:
        raise TemplateAtomMaskAllZerosError(
            "Template all atom mask was all zeros: %s_%s. Residue range: %d-%d"
            % (
                pdb_id,
                chain_id,
                min(mapping.values()) + mapping_offset,
                max(mapping.values()) + mapping_offset,
            )
        )

    output_templates_sequence = "".join(output_templates_sequence)

    templates_aatype = residue_constants.sequence_to_onehot(
        output_templates_sequence, residue_constants.HHBLITS_AA_TO_ID
    )

    return (
        {
            "template_all_atom_positions": np.array(
                templates_all_atom_positions
            ),
            "template_all_atom_masks": np.array(templates_all_atom_masks),
            "template_sequence": output_templates_sequence.encode(),
            "template_aatype": np.array(templates_aatype),
            "template_domain_names": f"{pdb_id.lower()}_{chain_id}".encode(),
        },
        warning,
    )
772
+
773
+
774
+ def _build_query_to_hit_index_mapping(
775
+ hit_query_sequence: str,
776
+ hit_sequence: str,
777
+ indices_hit: Sequence[int],
778
+ indices_query: Sequence[int],
779
+ original_query_sequence: str,
780
+ ) -> Mapping[int, int]:
781
+ """Gets mapping from indices in original query sequence to indices in the hit.
782
+
783
+ hit_query_sequence and hit_sequence are two aligned sequences containing gap
784
+ characters. hit_query_sequence contains only the part of the original query
785
+ sequence that matched the hit. When interpreting the indices from the .hhr, we
786
+ need to correct for this to recover a mapping from original query sequence to
787
+ the hit sequence.
788
+
789
+ Args:
790
+ hit_query_sequence: The portion of the query sequence that is in the .hhr
791
+ hit
792
+ hit_sequence: The portion of the hit sequence that is in the .hhr
793
+ indices_hit: The indices for each aminoacid relative to the hit sequence
794
+ indices_query: The indices for each aminoacid relative to the original query
795
+ sequence
796
+ original_query_sequence: String describing the original query sequence.
797
+
798
+ Returns:
799
+ Dictionary with indices in the original query sequence as keys and indices
800
+ in the hit sequence as values.
801
+ """
802
+ # If the hit is empty (no aligned residues), return empty mapping
803
+ if not hit_query_sequence:
804
+ return {}
805
+
806
+ # Remove gaps and find the offset of hit.query relative to original query.
807
+ hhsearch_query_sequence = hit_query_sequence.replace("-", "")
808
+ hit_sequence = hit_sequence.replace("-", "")
809
+ hhsearch_query_offset = original_query_sequence.find(
810
+ hhsearch_query_sequence
811
+ )
812
+
813
+ # Index of -1 used for gap characters. Subtract the min index ignoring gaps.
814
+ min_idx = min(x for x in indices_hit if x > -1)
815
+ fixed_indices_hit = [x - min_idx if x > -1 else -1 for x in indices_hit]
816
+
817
+ min_idx = min(x for x in indices_query if x > -1)
818
+ fixed_indices_query = [x - min_idx if x > -1 else -1 for x in indices_query]
819
+
820
+ # Zip the corrected indices, ignore case where both seqs have gap characters.
821
+ mapping = {}
822
+ for q_i, q_t in zip(fixed_indices_query, fixed_indices_hit):
823
+ if q_t != -1 and q_i != -1:
824
+ if q_t >= len(hit_sequence) or q_i + hhsearch_query_offset >= len(
825
+ original_query_sequence
826
+ ):
827
+ continue
828
+ mapping[q_i + hhsearch_query_offset] = q_t
829
+
830
+ return mapping
831
+
832
+
833
@dataclasses.dataclass(frozen=True)
class PrefilterResult:
    """Outcome of the cheap prefilter checks applied to one template hit."""

    # True when the hit passed all prefilter checks and may be featurized.
    valid: bool
    # Hard-error message (only set in strict mode), otherwise None.
    error: Optional[str]
    # Non-fatal warning message, otherwise None.
    warning: Optional[str]
838
+
839
+
840
@dataclasses.dataclass(frozen=True)
class SingleHitResult:
    """Result of featurizing one template hit."""

    # Extracted template features, or None when featurization failed/was skipped.
    features: Optional[Mapping[str, Any]]
    # Hard-error message, otherwise None.
    error: Optional[str]
    # Non-fatal warning message, otherwise None.
    warning: Optional[str]
845
+
846
+
847
def _prefilter_hit(
    query_sequence: str,
    hit: parsers.TemplateHit,
    max_template_date: datetime.datetime,
    release_dates: Mapping[str, datetime.datetime],
    obsolete_pdbs: Mapping[str, str],
    strict_error_check: bool = False,
):
    """Runs the cheap hit-level checks and reports whether the hit is usable.

    Args:
        query_sequence: The query amino-acid sequence.
        hit: The template hit to assess.
        max_template_date: Release-date cutoff for acceptable templates.
        release_dates: PDB id -> release date lookup.
        obsolete_pdbs: Obsolete PDB id -> replacement id lookup.
        strict_error_check: When True, date/duplicate failures become errors.

    Returns:
        A PrefilterResult describing the outcome.
    """
    # Fail hard if we can't get the PDB ID and chain name from the hit.
    hit_pdb_code, hit_chain_id = _get_pdb_id_and_chain(hit)

    # Resolve obsolete entries to their replacements when we have no release
    # date for the original id.
    if hit_pdb_code not in release_dates and hit_pdb_code in obsolete_pdbs:
        hit_pdb_code = obsolete_pdbs[hit_pdb_code]

    # Pass hit_pdb_code since it might have changed due to the pdb being
    # obsolete.
    try:
        _assess_hhsearch_hit(
            hit=hit,
            hit_pdb_code=hit_pdb_code,
            query_sequence=query_sequence,
            release_dates=release_dates,
            release_date_cutoff=max_template_date,
        )
    except PrefilterError as e:
        msg = f"hit {hit_pdb_code}_{hit_chain_id} did not pass prefilter: {str(e)}"
        logging.info(msg)
        # In strict mode we treat some prefilter cases as errors.
        if strict_error_check and isinstance(e, (DateError, DuplicateError)):
            return PrefilterResult(valid=False, error=msg, warning=None)
        return PrefilterResult(valid=False, error=None, warning=None)

    return PrefilterResult(valid=True, error=None, warning=None)
883
+
884
+
885
+ @functools.lru_cache(16, typed=False)
886
+ def _read_file(path):
887
+ with open(path, 'r') as f:
888
+ file_data = f.read()
889
+
890
+ return file_data
891
+
892
+
893
def _process_single_hit(
    query_sequence: str,
    hit: parsers.TemplateHit,
    mmcif_dir: str,
    max_template_date: datetime.datetime,
    release_dates: Mapping[str, datetime.datetime],
    obsolete_pdbs: Mapping[str, str],
    kalign_binary_path: str,
    strict_error_check: bool = False,
    _zero_center_positions: bool = True,
) -> SingleHitResult:
    """Tries to extract template features from a single HHSearch hit.

    Args:
        query_sequence: The query amino-acid sequence.
        hit: The template hit to featurize.
        mmcif_dir: Directory holding `<pdb_id>.cif` template structures.
        max_template_date: Release-date cutoff; newer templates are rejected.
        release_dates: PDB id -> release date lookup.
        obsolete_pdbs: Obsolete PDB id -> replacement id lookup.
        kalign_binary_path: Path to a kalign executable for realignment.
        strict_error_check: When True, date violations and feature-extraction
            failures become errors instead of silent skips/warnings.
        _zero_center_positions: Forwarded to coordinate extraction.

    Returns:
        A SingleHitResult with features on success, or error/warning text.
    """
    # Fail hard if we can't get the PDB ID and chain name from the hit.
    hit_pdb_code, hit_chain_id = _get_pdb_id_and_chain(hit)

    # Resolve obsolete PDB ids to their replacements when possible.
    if hit_pdb_code not in release_dates:
        if hit_pdb_code in obsolete_pdbs:
            hit_pdb_code = obsolete_pdbs[hit_pdb_code]

    mapping = _build_query_to_hit_index_mapping(
        hit.query,
        hit.hit_sequence,
        hit.indices_hit,
        hit.indices_query,
        query_sequence,
    )

    # The mapping is from the query to the actual hit sequence, so we need to
    # remove gaps (which regardless have a missing confidence score).
    template_sequence = hit.hit_sequence.replace("-", "")

    cif_path = os.path.join(mmcif_dir, hit_pdb_code + ".cif")
    logging.info(
        "Reading PDB entry from %s. Query: %s, template: %s",
        cif_path,
        query_sequence,
        template_sequence,
    )

    # Fail if we can't find the mmCIF file.
    cif_string = _read_file(cif_path)

    parsing_result = mmcif_parsing.parse(
        file_id=hit_pdb_code, mmcif_string=cif_string
    )

    # Enforce the template-date cutoff using the release date from the mmCIF
    # header itself (release_dates may be incomplete).
    if parsing_result.mmcif_object is not None:
        hit_release_date = datetime.datetime.strptime(
            parsing_result.mmcif_object.header["release_date"], "%Y-%m-%d"
        )
        if hit_release_date > max_template_date:
            error = "Template %s date (%s) > max template date (%s)." % (
                hit_pdb_code,
                hit_release_date,
                max_template_date,
            )
            if strict_error_check:
                return SingleHitResult(features=None, error=error, warning=None)
            else:
                logging.info(error)
                return SingleHitResult(features=None, error=None, warning=None)

    # NOTE(review): if parsing failed, parsing_result.mmcif_object is None and
    # _extract_template_features raises NoChainsError, handled below.
    try:
        features, realign_warning = _extract_template_features(
            mmcif_object=parsing_result.mmcif_object,
            pdb_id=hit_pdb_code,
            mapping=mapping,
            template_sequence=template_sequence,
            query_sequence=query_sequence,
            template_chain_id=hit_chain_id,
            kalign_binary_path=kalign_binary_path,
            _zero_center_positions=_zero_center_positions,
        )

        # A missing sum_probs is recorded as 0 confidence.
        if hit.sum_probs is None:
            features["template_sum_probs"] = [0]
        else:
            features["template_sum_probs"] = [hit.sum_probs]

        # It is possible there were some errors when parsing the other chains in the
        # mmCIF file, but the template features for the chain we want were still
        # computed. In such case the mmCIF parsing errors are not relevant.
        return SingleHitResult(
            features=features, error=None, warning=realign_warning
        )
    except (
        NoChainsError,
        NoAtomDataInTemplateError,
        TemplateAtomMaskAllZerosError,
    ) as e:
        # These 3 errors indicate missing mmCIF experimental data rather than a
        # problem with the template search, so turn them into warnings.
        warning = (
            "%s_%s (sum_probs: %.2f, rank: %d): feature extracting errors: "
            "%s, mmCIF parsing errors: %s"
            % (
                hit_pdb_code,
                hit_chain_id,
                hit.sum_probs if hit.sum_probs else 0.,
                hit.index,
                str(e),
                parsing_result.errors,
            )
        )
        if strict_error_check:
            return SingleHitResult(features=None, error=warning, warning=None)
        else:
            return SingleHitResult(features=None, error=None, warning=warning)
    except Error as e:
        # Any other pipeline error is always reported as a hard error.
        error = (
            "%s_%s (sum_probs: %.2f, rank: %d): feature extracting errors: "
            "%s, mmCIF parsing errors: %s"
            % (
                hit_pdb_code,
                hit_chain_id,
                hit.sum_probs if hit.sum_probs else 0.,
                hit.index,
                str(e),
                parsing_result.errors,
            )
        )
        return SingleHitResult(features=None, error=error, warning=None)
1015
+
1016
+
1017
def get_custom_template_features(
    mmcif_path: str,
    query_sequence: str,
    pdb_id: str,
    chain_id: str,
    kalign_binary_path: str):
    """Builds template features from a user-supplied mmCIF structure.

    The template is assumed to cover the full query (identity residue mapping)
    and is given a sum_probs confidence of 1.0.

    Args:
        mmcif_path: Path to the template mmCIF file.
        query_sequence: The query amino-acid sequence.
        pdb_id: Identifier to record for the template.
        chain_id: Chain in the mmCIF file to use as the template.
        kalign_binary_path: Path to a kalign executable for realignment.

    Returns:
        A TemplateSearchResult holding the stacked template features.
    """
    # FIX: don't shadow the `mmcif_path` argument with the open file handle.
    with open(mmcif_path, "r") as mmcif_file:
        cif_string = mmcif_file.read()

    mmcif_parse_result = mmcif_parsing.parse(
        file_id=pdb_id, mmcif_string=cif_string
    )
    template_sequence = mmcif_parse_result.mmcif_object.chain_to_seqres[chain_id]

    # Identity mapping: query position i maps to template position i.
    mapping = {x: x for x, _ in enumerate(query_sequence)}

    features, warnings = _extract_template_features(
        mmcif_object=mmcif_parse_result.mmcif_object,
        pdb_id=pdb_id,
        mapping=mapping,
        template_sequence=template_sequence,
        query_sequence=query_sequence,
        template_chain_id=chain_id,
        kalign_binary_path=kalign_binary_path,
        _zero_center_positions=True
    )
    features["template_sum_probs"] = [1.0]

    # Stack each feature into a leading num_templates axis of size 1, cast to
    # its canonical dtype (replaces the previous multi-loop construction).
    template_features = {
        name: np.stack([features[name]], axis=0).astype(dtype)
        for name, dtype in TEMPLATE_FEATURES.items()
    }

    return TemplateSearchResult(
        features=template_features, errors=None, warnings=warnings
    )
1061
+
1062
+
1063
@dataclasses.dataclass(frozen=True)
class TemplateSearchResult:
    """Aggregated outcome of a template search over all hits."""

    # Stacked per-template feature arrays keyed by feature name.
    features: Mapping[str, Any]
    # Hard-error messages accumulated during the search.
    errors: Sequence[str]
    # Non-fatal warning messages accumulated during the search.
    warnings: Sequence[str]
1068
+
1069
+
1070
class TemplateHitFeaturizer(abc.ABC):
    """An abstract base class for turning template hits to features."""

    def __init__(
        self,
        mmcif_dir: str,
        max_template_date: str,
        max_hits: int,
        kalign_binary_path: str,
        release_dates_path: Optional[str] = None,
        obsolete_pdbs_path: Optional[str] = None,
        strict_error_check: bool = False,
        _shuffle_top_k_prefiltered: Optional[int] = None,
        _zero_center_positions: bool = True,
    ):
        """Initializes the Template Search.

        Args:
          mmcif_dir: Path to a directory with mmCIF structures. Once a template ID
            is found by HHSearch, this directory is used to retrieve the template
            data.
          max_template_date: The maximum date permitted for template structures. No
            template with date higher than this date will be returned. In ISO8601
            date format, YYYY-MM-DD.
          max_hits: The maximum number of templates that will be returned.
          kalign_binary_path: The path to a kalign executable used for template
            realignment.
          release_dates_path: An optional path to a file with a mapping from PDB IDs
            to their release dates. Thanks to this we don't have to redundantly
            parse mmCIF files to get that information.
          obsolete_pdbs_path: An optional path to a file containing a mapping from
            obsolete PDB IDs to the PDB IDs of their replacements.
          strict_error_check: If True, then the following will be treated as errors:
            * If any template date is after the max_template_date.
            * If any template has identical PDB ID to the query.
            * If any template is a duplicate of the query.
            * Any feature computation errors.
          _shuffle_top_k_prefiltered: If set, the top-k prefiltered hits are
            shuffled before featurization (training-time augmentation).
          _zero_center_positions: Whether template coordinates are zero-centered.
        """
        self._mmcif_dir = mmcif_dir
        # Fail early if the template structure directory is unusable.
        if not glob.glob(os.path.join(self._mmcif_dir, "*.cif")):
            logging.error("Could not find CIFs in %s", self._mmcif_dir)
            raise ValueError(f"Could not find CIFs in {self._mmcif_dir}")

        try:
            self._max_template_date = datetime.datetime.strptime(
                max_template_date, "%Y-%m-%d"
            )
        except ValueError:
            raise ValueError(
                "max_template_date must be set and have format YYYY-MM-DD."
            )

        self._max_hits = max_hits
        self._kalign_binary_path = kalign_binary_path
        self._strict_error_check = strict_error_check

        # Optional precomputed lookups that avoid re-parsing mmCIF headers.
        if release_dates_path:
            logging.info("Using precomputed release dates %s.", release_dates_path)
            self._release_dates = _parse_release_dates(release_dates_path)
        else:
            self._release_dates = {}

        if obsolete_pdbs_path:
            logging.info("Using precomputed obsolete pdbs %s.", obsolete_pdbs_path)
            self._obsolete_pdbs = _parse_obsolete(obsolete_pdbs_path)
        else:
            self._obsolete_pdbs = {}

        self._shuffle_top_k_prefiltered = _shuffle_top_k_prefiltered
        self._zero_center_positions = _zero_center_positions

    @abc.abstractmethod
    def get_templates(
        self,
        query_sequence: str,
        hits: Sequence[parsers.TemplateHit]
    ) -> TemplateSearchResult:
        """Computes the templates for a given query sequence."""
1151
+
1152
+
1153
class HhsearchHitFeaturizer(TemplateHitFeaturizer):
    """Featurizes template hits produced by HHSearch."""

    def get_templates(
        self,
        query_sequence: str,
        hits: Sequence[parsers.TemplateHit],
    ) -> TemplateSearchResult:
        """Computes the templates for given query sequence (more details above)."""
        logging.info("Searching for template for: %s", query_sequence)

        template_features = {}
        for template_feature_name in TEMPLATE_FEATURES:
            template_features[template_feature_name] = []

        already_seen = set()
        errors = []
        warnings = []

        # Cheap prefilter pass before any expensive mmCIF parsing.
        filtered = []
        for hit in hits:
            prefilter_result = _prefilter_hit(
                query_sequence=query_sequence,
                hit=hit,
                max_template_date=self._max_template_date,
                release_dates=self._release_dates,
                obsolete_pdbs=self._obsolete_pdbs,
                strict_error_check=self._strict_error_check,
            )

            if prefilter_result.error:
                errors.append(prefilter_result.error)

            if prefilter_result.warning:
                warnings.append(prefilter_result.warning)

            if prefilter_result.valid:
                filtered.append(hit)

        # FIX: guard against hits with a missing sum_probs; the bare
        # `x.sum_probs` key raised TypeError on None (HmmsearchHitFeaturizer
        # already guards this way).
        filtered = list(
            sorted(
                filtered,
                key=lambda x: x.sum_probs if x.sum_probs else 0.,
                reverse=True,
            )
        )

        idx = list(range(len(filtered)))
        # Optionally shuffle the top-k hits (training-time augmentation).
        if (self._shuffle_top_k_prefiltered):
            stk = self._shuffle_top_k_prefiltered
            idx[:stk] = np.random.permutation(idx[:stk])

        for i in idx:
            # We got all the templates we wanted, stop processing hits.
            if len(already_seen) >= self._max_hits:
                break
            try:
                hit = filtered[i]

                result = _process_single_hit(
                    query_sequence=query_sequence,
                    hit=hit,
                    mmcif_dir=self._mmcif_dir,
                    max_template_date=self._max_template_date,
                    release_dates=self._release_dates,
                    obsolete_pdbs=self._obsolete_pdbs,
                    strict_error_check=self._strict_error_check,
                    kalign_binary_path=self._kalign_binary_path,
                    _zero_center_positions=self._zero_center_positions,
                )

                if result.error:
                    errors.append(result.error)

                # There could be an error even if there are some results, e.g. thrown by
                # other unparsable chains in the same mmCIF file.
                if result.warning:
                    warnings.append(result.warning)

                if result.features is None:
                    logging.info(
                        "Skipped invalid hit %s, error: %s, warning: %s",
                        hit.name,
                        result.error,
                        result.warning,
                    )
                else:
                    # Deduplicate templates with identical sequences.
                    already_seen_key = result.features["template_sequence"]
                    if (already_seen_key in already_seen):
                        continue
                    already_seen.add(already_seen_key)
                    for k in template_features:
                        template_features[k].append(result.features[k])
            except Exception as e:
                # FIX: route unexpected per-hit failures through logging
                # instead of print(), so they land in the pipeline logs.
                logging.warning("Failed to featurize template hit: %s", e)
                continue

        if already_seen:
            for name in template_features:
                template_features[name] = np.stack(
                    template_features[name], axis=0
                ).astype(TEMPLATE_FEATURES[name])
        else:
            num_res = len(query_sequence)
            # Construct a default template with all zeros.
            template_features = empty_template_feats(num_res)

        return TemplateSearchResult(
            features=template_features, errors=errors, warnings=warnings
        )
1257
+
1258
+
1259
class HmmsearchHitFeaturizer(TemplateHitFeaturizer):
    """Featurizes template hits produced by hmmsearch."""

    def get_templates(
        self,
        query_sequence: str,
        hits: Sequence[parsers.TemplateHit]
    ) -> TemplateSearchResult:
        """Computes the templates for the given query sequence."""
        logging.info("Searching for template for: %s", query_sequence)

        template_features = {}
        for template_feature_name in TEMPLATE_FEATURES:
            template_features[template_feature_name] = []

        already_seen = set()
        errors = []
        warnings = []

        # DISCREPANCY: This filtering scheme that saves time
        filtered = []
        for hit in hits:
            prefilter_result = _prefilter_hit(
                query_sequence=query_sequence,
                hit=hit,
                max_template_date=self._max_template_date,
                release_dates=self._release_dates,
                obsolete_pdbs=self._obsolete_pdbs,
                strict_error_check=self._strict_error_check,
            )

            if prefilter_result.error:
                errors.append(prefilter_result.error)

            if prefilter_result.warning:
                warnings.append(prefilter_result.warning)

            if prefilter_result.valid:
                filtered.append(hit)

        filtered = list(
            sorted(
                filtered, key=lambda x: x.sum_probs if x.sum_probs else 0., reverse=True
            )
        )
        idx = list(range(len(filtered)))
        # Optionally shuffle the top-k hits (training-time augmentation).
        if (self._shuffle_top_k_prefiltered):
            stk = self._shuffle_top_k_prefiltered
            idx[:stk] = np.random.permutation(idx[:stk])

        for i in idx:
            if (len(already_seen) >= self._max_hits):
                break

            hit = filtered[i]

            result = _process_single_hit(
                query_sequence=query_sequence,
                hit=hit,
                mmcif_dir=self._mmcif_dir,
                max_template_date=self._max_template_date,
                release_dates=self._release_dates,
                obsolete_pdbs=self._obsolete_pdbs,
                strict_error_check=self._strict_error_check,
                kalign_binary_path=self._kalign_binary_path,
                # FIX: forward the configured flag; it was previously dropped,
                # so _process_single_hit always used its default (True)
                # regardless of configuration (HhsearchHitFeaturizer passes it).
                _zero_center_positions=self._zero_center_positions,
            )

            if result.error:
                errors.append(result.error)

            if result.warning:
                warnings.append(result.warning)

            if result.features is None:
                logging.debug(
                    "Skipped invalid hit %s, error: %s, warning: %s",
                    hit.name, result.error, result.warning,
                )
            else:
                # Deduplicate templates with identical sequences.
                already_seen_key = result.features["template_sequence"]
                if (already_seen_key in already_seen):
                    continue
                # Increment the hit counter, since we got features out of this hit.
                already_seen.add(already_seen_key)
                for k in template_features:
                    template_features[k].append(result.features[k])

        if already_seen:
            for name in template_features:
                template_features[name] = np.stack(
                    template_features[name], axis=0
                ).astype(TEMPLATE_FEATURES[name])
        else:
            num_res = len(query_sequence)
            # Construct a default template with all zeros.
            template_features = empty_template_feats(num_res)

        return TemplateSearchResult(
            features=template_features,
            errors=errors,
            warnings=warnings,
        )
PhysDock/data/tools/utils.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 AlQuraishi Laboratory
2
+ # Copyright 2021 DeepMind Technologies Limited
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Common utilities for data pipeline tools."""
17
+ import contextlib
18
+ import datetime
19
+ import logging
20
+ import shutil
21
+ import tempfile
22
+ import time
23
+ from typing import Optional
24
+
25
+
26
@contextlib.contextmanager
def tmpdir_manager(base_dir: Optional[str] = None):
    """Context manager yielding a temporary directory deleted on exit.

    Args:
        base_dir: Optional parent directory for the temporary directory.

    Yields:
        The path of the newly created temporary directory.
    """
    path = tempfile.mkdtemp(dir=base_dir)
    try:
        yield path
    finally:
        # Best-effort cleanup; ignore races and permission issues.
        shutil.rmtree(path, ignore_errors=True)
34
+
35
+
36
@contextlib.contextmanager
def timing(msg: str):
    """Context manager logging the wall-clock duration of the enclosed block.

    Note: the "Finished" line is only logged when the block exits normally.
    """
    logging.info("Started %s", msg)
    start = time.perf_counter()
    yield
    logging.info(
        "Finished %s in %.3f seconds", msg, time.perf_counter() - start
    )
43
+
44
+
45
def to_date(s: str):
    """Parses the leading ``YYYY-MM-DD`` portion of *s* into a datetime."""
    year, month, day = int(s[:4]), int(s[5:7]), int(s[8:10])
    return datetime.datetime(year=year, month=month, day=day)
PhysDock/models/__init__.py ADDED
File without changes
PhysDock/models/layers/__init__.py ADDED
File without changes