from pathlib import Path
from typing import Dict, Sequence
import json

import numpy as np

from fairchem.data.omol.modules.evaluator import (
    ligand_pocket,
    ligand_strain,
    geom_conformers,
    protonation_energies,
    unoptimized_ie_ea,
    distance_scaling,
    unoptimized_spin_gap,
)
from evaluator_utils import get_order


class SubmissionLoadError(Exception):
    """Raised if unable to load the submission file."""


# Dispatch table: human-readable eval name -> evaluator callable taking
# (annotations_data, submission_data) dicts loaded from JSON.
OMOL_EVAL_FUNCTIONS = {
    "Ligand pocket": ligand_pocket,
    "Ligand strain": ligand_strain,
    "Conformers": geom_conformers,
    "Protonation": protonation_energies,
    "IE_EA": unoptimized_ie_ea,
    "Distance scaling": distance_scaling,
    "Spin gap": unoptimized_spin_gap,
}

# Maps a user-facing OMol subset name to the `data_ids` values it covers
# in the annotations file.
OMOL_DATA_ID_MAPPING = {
    "metal_complexes": ["metal_complexes"],
    "electrolytes": ["elytes"],
    "biomolecules": ["biomolecules"],
    "neutral_organics": ["ani2x", "orbnet_denali", "geom_orca6", "trans1x", "rgd"],
}

# OC20 subsplit mappings
OC20_DATA_ID_MAPPING = {
    "id": ["id"],
    "ood_ads": ["ood_ads"],
    "ood_cat": ["ood_cat"],
    "ood_both": ["ood_both"],
}


def _raise_if_inf(energy: np.ndarray) -> None:
    """Raise ValueError if `energy` contains any +/-inf entries.

    Lists up to three offending indices in the message so the submitter can
    locate the bad predictions.
    """
    inf_energy_ids = list(set(np.where(np.isinf(energy))[0]))
    if inf_energy_ids:
        # ValueError (a subclass of Exception, which callers previously had
        # to catch) instead of a bare Exception.
        raise ValueError(
            f"Inf values found in `energy` for IDs: ({inf_energy_ids[:3]}, ...)"
        )


def omol_s2ef_metrics(
    annotations_path: Path,
    submission_filename: Path,
    subsets: Sequence[str] = ("all",),
) -> Dict[str, float]:
    """Compute OMol S2EF energy/force MAE metrics per subset.

    Args:
        annotations_path: .npz file with `ids`, `energy`, `forces` (object
            array, one (natoms, 3) array per system) and `data_ids`.
        submission_filename: .npz file with `ids`, `energy`, concatenated
            `forces` and per-system `natoms`.
        subsets: subset names to report; "all" or keys of
            OMOL_DATA_ID_MAPPING. Default is a tuple (not a list) to avoid
            the shared-mutable-default pitfall.

    Returns:
        Dict mapping "{subset}_energy_mae" / "{subset}_forces_mae" to floats.

    Raises:
        SubmissionLoadError: if the submission file cannot be read.
        ValueError: if the submission energies contain inf values.
    """
    try:
        with np.load(submission_filename) as data:
            submission_ids = data["ids"]
    except Exception as e:
        raise SubmissionLoadError(
            "Error loading submission file. 'ids' must not be object types."
        ) from e

    with np.load(annotations_path, allow_pickle=True) as data:
        annotations_ids = data["ids"]

    # Permutation that reorders submission entries into annotation order.
    order = get_order(annotations_ids, submission_ids)

    try:
        with np.load(submission_filename) as pred_data:
            forces = pred_data["forces"]
            energy = pred_data["energy"][order]
            # Split the flat forces array back into one (natoms, 3) chunk per
            # system, then reorder to match the annotations.
            forces = np.array(
                np.split(forces, np.cumsum(pred_data["natoms"])[:-1]), dtype=object
            )[order]
    except Exception as e:
        raise SubmissionLoadError(
            "Error loading submission data. Make sure you concatenated your forces and there are no object types."
        ) from e

    _raise_if_inf(energy)

    with np.load(annotations_path, allow_pickle=True) as target_data:
        target_forces = target_data["forces"]
        target_energy = target_data["energy"]
        target_data_ids = target_data["data_ids"]

    metrics: Dict[str, float] = {}
    for subset in subsets:
        if subset == "all":
            subset_mask = np.ones(len(target_data_ids), dtype=bool)
        else:
            # Build the membership set once; O(1) lookups in the comprehension.
            allowed_ids = set(OMOL_DATA_ID_MAPPING.get(subset, []))
            subset_mask = np.array(
                [data_id in allowed_ids for data_id in target_data_ids]
            )

        sub_energy = energy[subset_mask]
        sub_target_energy = target_energy[subset_mask]
        metrics[f"{subset}_energy_mae"] = np.mean(
            np.abs(sub_target_energy - sub_energy)
        )

        forces_mae = 0.0
        natoms = 0
        for sub_forces, sub_target_forces in zip(
            forces[subset_mask], target_forces[subset_mask]
        ):
            forces_mae += np.sum(np.abs(sub_target_forces - sub_forces))
            natoms += sub_forces.shape[0]
        # Per-component force MAE (each atom contributes 3 components).
        # Guard against an empty subset so we report nan instead of raising
        # ZeroDivisionError (the energy MAE above is nan in that case too).
        metrics[f"{subset}_forces_mae"] = (
            forces_mae / (3 * natoms) if natoms else float("nan")
        )

    return metrics


def omol_evaluations(
    annotations_path: Path,
    submission_filename: Path,
    eval_type: str,
) -> Dict[str, float]:
    """Run one of the JSON-based OMol evaluations (see OMOL_EVAL_FUNCTIONS).

    Args:
        annotations_path: JSON annotations file, keyed by entry id.
        submission_filename: JSON submission file, keyed by entry id; its key
            set must exactly match the annotations.
        eval_type: name of the evaluation, a key of OMOL_EVAL_FUNCTIONS.

    Returns:
        The metrics dict produced by the selected evaluator.

    Raises:
        SubmissionLoadError: if the submission JSON cannot be parsed.
        ValueError: if the entry sets mismatch or `eval_type` is unknown.
    """
    try:
        with open(submission_filename) as f:
            submission_data = json.load(f)
    except Exception as e:
        raise SubmissionLoadError("Error loading submission file") from e

    with open(annotations_path) as f:
        annotations_data = json.load(f)

    submission_entries = set(submission_data.keys())
    annotation_entries = set(annotations_data.keys())
    if submission_entries != annotation_entries:
        missing = annotation_entries - submission_entries
        unexpected = submission_entries - annotation_entries
        raise ValueError(
            f"Submission and annotations entries do not match.\n"
            f"Missing entries in submission: {missing}\n"
            f"Unexpected entries in submission: {unexpected}"
        )
    # NOTE: no duplicate check is needed here — json.load already collapses
    # duplicate keys, so len(submission_entries) == len(submission_data)
    # always holds.

    eval_fn = OMOL_EVAL_FUNCTIONS.get(eval_type)
    if eval_fn is None:
        # Explicit error instead of the opaque TypeError from calling None.
        raise ValueError(f"Unknown eval_type: {eval_type}")
    metrics = eval_fn(annotations_data, submission_data)
    return metrics


def oc_s2ef_metrics(
    annotations_path: Path,
    submission_filename: Path,
    subsets: Sequence[str] = ("id", "ood_ads", "ood_cat", "ood_both"),
) -> Dict[str, float]:
    """Calculate S2EF metrics for OC datasets.

    Args:
        annotations_path: .npz annotations with per-split
            `{split}_ids`, `{split}_energy`, `{split}_forces` arrays.
        submission_filename: .npz submission with per-split `{split}_ids`,
            `{split}_energy`, concatenated `{split}_forces` and
            `{split}_chunk_idx` split points.
        subsets: OC20 subsplits to evaluate. Default is a tuple to avoid the
            shared-mutable-default pitfall.

    Returns:
        "{split}_energy_mae" / "{split}_forces_mae" per split, plus
        "avg_energy_mae" / "avg_forces_mae" averaged over the splits.

    Raises:
        SubmissionLoadError: if the submission file cannot be read.
        ValueError: if the submission energies contain inf values.
    """
    metrics: Dict[str, float] = {}
    for split in subsets:
        try:
            with np.load(submission_filename) as data:
                submission_ids = data[f"{split}_ids"]
        except Exception as e:
            raise SubmissionLoadError(
                f"Error loading submission file. '{split}_ids' must not be object types."
            ) from e

        with np.load(annotations_path, allow_pickle=True) as data:
            annotations_ids = data[f"{split}_ids"]

        # Permutation reordering submission entries into annotation order.
        order = get_order(annotations_ids, submission_ids)

        try:
            with np.load(submission_filename) as pred_data:
                forces = pred_data[f"{split}_forces"]
                energy = pred_data[f"{split}_energy"][order]
                forces = np.array(
                    np.split(forces, pred_data[f"{split}_chunk_idx"]), dtype=object
                )[order]
        except Exception as e:
            raise SubmissionLoadError(
                "Error loading submission data. Make sure you concatenated your forces and there are no object types."
            ) from e

        _raise_if_inf(energy)

        with np.load(annotations_path, allow_pickle=True) as target_data:
            target_forces = target_data[f"{split}_forces"]
            target_energy = target_data[f"{split}_energy"]

        metrics[f"{split}_energy_mae"] = np.mean(
            np.abs(target_energy - energy.flatten())
        )

        forces_mae = 0.0
        natoms = 0
        for sub_forces, sub_target_forces in zip(forces, target_forces):
            forces_mae += np.sum(np.abs(sub_target_forces - sub_forces))
            natoms += sub_forces.shape[0]
        # Per-component force MAE; guard the empty-split edge case.
        metrics[f"{split}_forces_mae"] = (
            forces_mae / (3 * natoms) if natoms else float("nan")
        )

    # Compute average across all subsplits
    energy_maes = [metrics[f"{s}_energy_mae"] for s in subsets]
    forces_maes = [metrics[f"{s}_forces_mae"] for s in subsets]
    metrics["avg_energy_mae"] = np.mean(energy_maes)
    metrics["avg_forces_mae"] = np.mean(forces_maes)

    return metrics


def oc_is2re_metrics(
    annotations_path: Path,
    submission_filename: Path,
    subsets: Sequence[str] = ("id", "ood_ads", "ood_cat", "ood_both"),
) -> Dict[str, float]:
    """Calculate IS2RE metrics for OC dataset.

    Args:
        annotations_path: .npz annotations with `{split}_ids` and
            `{split}_energy` per split.
        submission_filename: .npz submission with the same per-split keys.
        subsets: OC20 subsplits to evaluate. Default is a tuple to avoid the
            shared-mutable-default pitfall.

    Returns:
        "{split}_energy_mae" per split plus "avg_energy_mae".

    Raises:
        SubmissionLoadError: if the submission file cannot be read.
        ValueError: if the submission energies contain inf values.
    """
    metrics: Dict[str, float] = {}
    for split in subsets:
        try:
            with np.load(submission_filename) as data:
                submission_ids = data[f"{split}_ids"]
        except Exception as e:
            raise SubmissionLoadError(
                f"Error loading submission file. '{split}_ids' must not be object types."
            ) from e

        with np.load(annotations_path, allow_pickle=True) as data:
            annotations_ids = data[f"{split}_ids"]

        order = get_order(annotations_ids, submission_ids)

        try:
            with np.load(submission_filename) as pred_data:
                energy = pred_data[f"{split}_energy"][order]
        except Exception as e:
            raise SubmissionLoadError(
                "Error loading submission data."
            ) from e

        _raise_if_inf(energy)

        with np.load(annotations_path, allow_pickle=True) as target_data:
            target_energy = target_data[f"{split}_energy"]

        metrics[f"{split}_energy_mae"] = np.mean(
            np.abs(target_energy - energy.flatten())
        )

    # Compute average across all subsplits
    energy_maes = [metrics[f"{s}_energy_mae"] for s in subsets]
    metrics["avg_energy_mae"] = np.mean(energy_maes)

    return metrics


def evaluate(
    annotations_path: Path,
    submission_filename: Path,
    eval_type: str,
) -> Dict[str, float]:
    """Dispatch a submission to the appropriate evaluation routine.

    Args:
        annotations_path: ground-truth annotations file (.npz or .json,
            depending on `eval_type`).
        submission_filename: the user's submission file.
        eval_type: one of "Validation", "Test", "OC20 S2EF Test",
            "OC20 IS2RE Test", or a key of OMOL_EVAL_FUNCTIONS.

    Returns:
        The metrics dict from the selected evaluator.

    Raises:
        ValueError: if `eval_type` is not recognized.
    """
    if eval_type in ["Validation", "Test"]:
        metrics = omol_s2ef_metrics(
            annotations_path,
            submission_filename,
            subsets=[
                "all",
                "metal_complexes",
                "electrolytes",
                "biomolecules",
                "neutral_organics",
            ],
        )
    elif eval_type == "OC20 S2EF Test":
        metrics = oc_s2ef_metrics(
            annotations_path,
            submission_filename,
            subsets=["id", "ood_ads", "ood_cat", "ood_both"],
        )
    elif eval_type == "OC20 IS2RE Test":
        metrics = oc_is2re_metrics(
            annotations_path,
            submission_filename,
            subsets=["id", "ood_ads", "ood_cat", "ood_both"],
        )
    elif eval_type in OMOL_EVAL_FUNCTIONS:
        metrics = omol_evaluations(
            annotations_path,
            submission_filename,
            eval_type,
        )
    else:
        raise ValueError(f"Unknown eval_type: {eval_type}")
    return metrics