Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
import json
from pathlib import Path
from typing import Dict, Optional

import numpy as np

from fairchem.data.omol.modules.evaluator import (
    ligand_pocket,
    ligand_strain,
    geom_conformers,
    protonation_energies,
    unoptimized_ie_ea,
    distance_scaling,
    unoptimized_spin_gap,
)

from evaluator_utils import get_order
class SubmissionLoadError(Exception):
    """Raised if unable to load the submission file (unreadable file, bad format, or object-typed arrays)."""
# Maps evaluation-type names to the fairchem OMol evaluator callables used by
# the JSON-based evaluations (dispatched via `omol_evaluations`).
OMOL_EVAL_FUNCTIONS = {
    "Ligand pocket": ligand_pocket,
    "Ligand strain": ligand_strain,
    "Conformers": geom_conformers,
    "Protonation": protonation_energies,
    "IE_EA": unoptimized_ie_ea,
    "Distance scaling": distance_scaling,
    "Spin gap": unoptimized_spin_gap,
}
# Maps OMol subset names (as passed to `omol_s2ef_metrics`) to the `data_ids`
# values stored in the annotation file; used to build per-subset masks.
OMOL_DATA_ID_MAPPING = {
    "metal_complexes": ["metal_complexes"],
    "electrolytes": ["elytes"],
    "biomolecules": ["biomolecules"],
    "neutral_organics": ["ani2x", "orbnet_denali", "geom_orca6", "trans1x", "rgd"],
}
# OC20 subsplit mappings
# NOTE(review): identity mapping; it is not referenced anywhere in this chunk
# (the OC20 metrics functions key arrays by f"{split}_..." directly) — confirm
# whether it is still needed.
OC20_DATA_ID_MAPPING = {
    "id": ["id"],
    "ood_ads": ["ood_ads"],
    "ood_cat": ["ood_cat"],
    "ood_both": ["ood_both"],
}
def omol_s2ef_metrics(
    annotations_path: Path,
    submission_filename: Path,
    subsets: Optional[list] = None,
) -> Dict[str, float]:
    """Compute per-subset energy and force MAEs for an OMol S2EF submission.

    Args:
        annotations_path: ``.npz`` file with target ``ids``, ``energy``,
            ``forces`` (object array) and ``data_ids`` arrays.
        submission_filename: ``.npz`` file with predicted ``ids``, ``energy``,
            concatenated ``forces`` and per-structure ``natoms`` arrays.
        subsets: Subset names to evaluate; defaults to ``["all"]``. Names other
            than ``"all"`` are looked up in ``OMOL_DATA_ID_MAPPING``.

    Returns:
        Mapping of ``"{subset}_energy_mae"`` / ``"{subset}_forces_mae"`` keys
        to their mean absolute errors.

    Raises:
        SubmissionLoadError: If the submission file cannot be read.
        ValueError: If the predicted energies contain infinities.
    """
    # Fix: original used a mutable default argument (subsets=["all"]).
    if subsets is None:
        subsets = ["all"]
    try:
        with np.load(submission_filename) as data:
            submission_ids = data["ids"]
    except Exception as e:
        raise SubmissionLoadError(
            "Error loading submission file. 'ids' must not be object types."
        ) from e
    with np.load(annotations_path, allow_pickle=True) as data:
        annotations_ids = data["ids"]
    # Permutation that aligns submission entries with the annotation order.
    order = get_order(annotations_ids, submission_ids)
    try:
        with np.load(submission_filename) as pred_data:
            forces = pred_data["forces"]
            energy = pred_data["energy"][order]
            # Split the flat force array into per-structure chunks, reorder.
            forces = np.array(
                np.split(forces, np.cumsum(pred_data["natoms"])[:-1]), dtype=object
            )[order]
    except Exception as e:
        raise SubmissionLoadError(
            "Error loading submission data. Make sure you concatenated your forces and there are no object types."
        ) from e
    inf_energy_ids = np.nonzero(np.isinf(energy))[0]
    if inf_energy_ids.size:
        # Fix: raise a specific exception type instead of bare Exception.
        raise ValueError(
            f"Inf values found in `energy` for IDs: ({inf_energy_ids[:3].tolist()}, ...)"
        )
    with np.load(annotations_path, allow_pickle=True) as target_data:
        target_forces = target_data["forces"]
        target_energy = target_data["energy"]
        target_data_ids = target_data["data_ids"]
    metrics = {}
    for subset in subsets:
        if subset == "all":
            subset_mask = np.ones(len(target_data_ids), dtype=bool)
        else:
            # Unknown subset names yield an all-False mask.
            allowed_ids = set(OMOL_DATA_ID_MAPPING.get(subset, []))
            subset_mask = np.array(
                [data_id in allowed_ids for data_id in target_data_ids]
            )
        sub_energy = energy[subset_mask]
        sub_target_energy = target_energy[subset_mask]
        # NaN (instead of a RuntimeWarning-laden empty mean) for empty subsets.
        metrics[f"{subset}_energy_mae"] = (
            np.mean(np.abs(sub_target_energy - sub_energy))
            if sub_energy.size
            else float("nan")
        )
        forces_mae = 0.0
        natoms = 0
        for sub_forces, sub_target_forces in zip(
            forces[subset_mask], target_forces[subset_mask]
        ):
            forces_mae += np.sum(np.abs(sub_target_forces - sub_forces))
            natoms += sub_forces.shape[0]
        # Fix: guard the 3*natoms division — an empty/unknown subset would
        # previously raise ZeroDivisionError.
        metrics[f"{subset}_forces_mae"] = (
            forces_mae / (3 * natoms) if natoms else float("nan")
        )
    return metrics
def omol_evaluations(
    annotations_path: Path,
    submission_filename: Path,
    eval_type: str,
) -> Dict[str, float]:
    """Run one of the JSON-based OMol evaluations on a submission.

    Args:
        annotations_path: JSON file of target annotations keyed by entry id.
        submission_filename: JSON file of predictions keyed by entry id.
        eval_type: Key into ``OMOL_EVAL_FUNCTIONS`` selecting the evaluator.

    Returns:
        Metrics dictionary produced by the selected evaluation function.

    Raises:
        SubmissionLoadError: If the submission JSON cannot be read or parsed.
        ValueError: If the entry ids do not match the annotations, or
            ``eval_type`` is not a known evaluation.
    """
    try:
        with open(submission_filename) as f:
            submission_data = json.load(f)
    except Exception as e:
        raise SubmissionLoadError("Error loading submission file") from e
    with open(annotations_path) as f:
        annotations_data = json.load(f)
    submission_entries = set(submission_data)
    annotation_entries = set(annotations_data)
    if submission_entries != annotation_entries:
        missing = annotation_entries - submission_entries
        unexpected = submission_entries - annotation_entries
        raise ValueError(
            f"Submission and annotations entries do not match.\n"
            f"Missing entries in submission: {missing}\n"
            f"Unexpected entries in submission: {unexpected}"
        )
    # Fix: dropped the original duplicate-entry assert — dict keys are unique
    # by construction, so `len(set(d.keys())) == len(d)` could never fail
    # (json.load silently collapses duplicate JSON keys before we ever see
    # them), and `assert` is stripped under `python -O` anyway.
    eval_fn = OMOL_EVAL_FUNCTIONS.get(eval_type)
    if eval_fn is None:
        # Fix: a clear error instead of `TypeError: 'NoneType' is not callable`.
        raise ValueError(f"Unknown eval_type: {eval_type}")
    return eval_fn(annotations_data, submission_data)
def oc_s2ef_metrics(
    annotations_path: Path,
    submission_filename: Path,
    subsets: Optional[list] = None,
) -> Dict[str, float]:
    """Calculate S2EF energy/force MAEs for the OC20 test subsplits.

    Args:
        annotations_path: ``.npz`` file holding, per split, the target
            ``{split}_ids``, ``{split}_energy`` and ``{split}_forces`` arrays.
        submission_filename: ``.npz`` file holding, per split, the predicted
            ``{split}_ids``, ``{split}_energy``, concatenated
            ``{split}_forces`` and ``{split}_chunk_idx`` arrays.
        subsets: Subsplits to evaluate; defaults to
            ``["id", "ood_ads", "ood_cat", "ood_both"]``.

    Returns:
        Per-split ``{split}_energy_mae`` / ``{split}_forces_mae`` plus their
        means over all requested splits (``avg_energy_mae``/``avg_forces_mae``).

    Raises:
        SubmissionLoadError: If the submission file cannot be read.
        ValueError: If the predicted energies contain infinities.
    """
    # Fix: original used a mutable default argument for `subsets`.
    if subsets is None:
        subsets = ["id", "ood_ads", "ood_cat", "ood_both"]
    metrics = {}
    for split in subsets:
        try:
            with np.load(submission_filename) as data:
                submission_ids = data[f"{split}_ids"]
        except Exception as e:
            raise SubmissionLoadError(
                f"Error loading submission file. '{split}_ids' must not be object types."
            ) from e
        with np.load(annotations_path, allow_pickle=True) as data:
            annotations_ids = data[f"{split}_ids"]
        # Permutation aligning submission entries with the annotation order.
        order = get_order(annotations_ids, submission_ids)
        try:
            with np.load(submission_filename) as pred_data:
                forces = pred_data[f"{split}_forces"]
                energy = pred_data[f"{split}_energy"][order]
                # Split the concatenated forces at the provided chunk indices,
                # then reorder to the annotation order.
                forces = np.array(
                    np.split(forces, pred_data[f"{split}_chunk_idx"]), dtype=object
                )[order]
        except Exception as e:
            raise SubmissionLoadError(
                "Error loading submission data. Make sure you concatenated your forces and there are no object types."
            ) from e
        inf_energy_ids = np.nonzero(np.isinf(energy))[0]
        if inf_energy_ids.size:
            # Fix: raise a specific exception type instead of bare Exception.
            raise ValueError(
                f"Inf values found in `energy` for IDs: ({inf_energy_ids[:3].tolist()}, ...)"
            )
        with np.load(annotations_path, allow_pickle=True) as target_data:
            target_forces = target_data[f"{split}_forces"]
            target_energy = target_data[f"{split}_energy"]
        metrics[f"{split}_energy_mae"] = np.mean(
            np.abs(target_energy - energy.flatten())
        )
        forces_mae = 0.0
        natoms = 0
        for sub_forces, sub_target_forces in zip(forces, target_forces):
            forces_mae += np.sum(np.abs(sub_target_forces - sub_forces))
            natoms += sub_forces.shape[0]
        # Fix: guard the 3*natoms division — an empty split would previously
        # raise ZeroDivisionError.
        metrics[f"{split}_forces_mae"] = (
            forces_mae / (3 * natoms) if natoms else float("nan")
        )
    # Compute average across all subsplits
    metrics["avg_energy_mae"] = np.mean(
        [metrics[f"{s}_energy_mae"] for s in subsets]
    )
    metrics["avg_forces_mae"] = np.mean(
        [metrics[f"{s}_forces_mae"] for s in subsets]
    )
    return metrics
def oc_is2re_metrics(
    annotations_path: Path,
    submission_filename: Path,
    subsets: Optional[list] = None,
) -> Dict[str, float]:
    """Calculate IS2RE energy MAEs for the OC20 test subsplits.

    Args:
        annotations_path: ``.npz`` file holding, per split, the target
            ``{split}_ids`` and ``{split}_energy`` arrays.
        submission_filename: ``.npz`` file holding, per split, the predicted
            ``{split}_ids`` and ``{split}_energy`` arrays.
        subsets: Subsplits to evaluate; defaults to
            ``["id", "ood_ads", "ood_cat", "ood_both"]``.

    Returns:
        Per-split ``{split}_energy_mae`` plus their mean over all requested
        splits (``avg_energy_mae``).

    Raises:
        SubmissionLoadError: If the submission file cannot be read.
        ValueError: If the predicted energies contain infinities.
    """
    # Fix: original used a mutable default argument for `subsets`.
    if subsets is None:
        subsets = ["id", "ood_ads", "ood_cat", "ood_both"]
    metrics = {}
    for split in subsets:
        try:
            with np.load(submission_filename) as data:
                submission_ids = data[f"{split}_ids"]
        except Exception as e:
            raise SubmissionLoadError(
                f"Error loading submission file. '{split}_ids' must not be object types."
            ) from e
        with np.load(annotations_path, allow_pickle=True) as data:
            annotations_ids = data[f"{split}_ids"]
        # Permutation aligning submission entries with the annotation order.
        order = get_order(annotations_ids, submission_ids)
        try:
            with np.load(submission_filename) as pred_data:
                energy = pred_data[f"{split}_energy"][order]
        except Exception as e:
            raise SubmissionLoadError(
                "Error loading submission data."
            ) from e
        inf_energy_ids = np.nonzero(np.isinf(energy))[0]
        if inf_energy_ids.size:
            # Fix: raise a specific exception type instead of bare Exception.
            raise ValueError(
                f"Inf values found in `energy` for IDs: ({inf_energy_ids[:3].tolist()}, ...)"
            )
        with np.load(annotations_path, allow_pickle=True) as target_data:
            target_energy = target_data[f"{split}_energy"]
        metrics[f"{split}_energy_mae"] = np.mean(
            np.abs(target_energy - energy.flatten())
        )
    # Compute average across all subsplits
    metrics["avg_energy_mae"] = np.mean(
        [metrics[f"{s}_energy_mae"] for s in subsets]
    )
    return metrics
def evaluate(
    annotations_path: Path,
    submission_filename: Path,
    eval_type: str,
):
    """Dispatch a submission to the evaluator matching ``eval_type``.

    ``"Validation"``/``"Test"`` run the OMol S2EF metrics over all subsets;
    the two OC20 eval types run the S2EF/IS2RE metrics over the four OC20
    subsplits; any key of ``OMOL_EVAL_FUNCTIONS`` runs the corresponding
    JSON-based OMol evaluation.

    Raises:
        ValueError: If ``eval_type`` matches none of the known evaluations.
    """
    oc20_splits = ["id", "ood_ads", "ood_cat", "ood_both"]
    if eval_type in ("Validation", "Test"):
        return omol_s2ef_metrics(
            annotations_path,
            submission_filename,
            subsets=[
                "all",
                "metal_complexes",
                "electrolytes",
                "biomolecules",
                "neutral_organics",
            ],
        )
    if eval_type == "OC20 S2EF Test":
        return oc_s2ef_metrics(
            annotations_path, submission_filename, subsets=oc20_splits
        )
    if eval_type == "OC20 IS2RE Test":
        return oc_is2re_metrics(
            annotations_path, submission_filename, subsets=oc20_splits
        )
    if eval_type in OMOL_EVAL_FUNCTIONS:
        return omol_evaluations(annotations_path, submission_filename, eval_type)
    raise ValueError(f"Unknown eval_type: {eval_type}")