# fairchem_leaderboard / evaluator.py
# (uploaded by mshuaibi, commit 5258492: "add utils")
from pathlib import Path
from typing import Dict
import numpy as np
import json
from fairchem.data.omol.modules.evaluator import (
ligand_pocket,
ligand_strain,
geom_conformers,
protonation_energies,
unoptimized_ie_ea,
distance_scaling,
unoptimized_spin_gap,
)
from evaluator_utils import get_order
class SubmissionLoadError(Exception):
    """Signals that a leaderboard submission file could not be read or parsed."""
# Maps a leaderboard task name to its OMol evaluation callable.
# Each callable is invoked as fn(annotations_data, submission_data) with the
# two parsed JSON dicts and returns a metrics dict (see omol_evaluations).
OMOL_EVAL_FUNCTIONS = {
    "Ligand pocket": ligand_pocket,
    "Ligand strain": ligand_strain,
    "Conformers": geom_conformers,
    "Protonation": protonation_energies,
    "IE_EA": unoptimized_ie_ea,
    "Distance scaling": distance_scaling,
    "Spin gap": unoptimized_spin_gap,
}
# Maps an OMol evaluation subset name to the list of `data_ids` values (as
# stored in the annotations .npz) that belong to it; used to build the
# per-subset masks in omol_s2ef_metrics.
OMOL_DATA_ID_MAPPING = {
    "metal_complexes": ["metal_complexes"],
    "electrolytes": ["elytes"],
    "biomolecules": ["biomolecules"],
    "neutral_organics": ["ani2x", "orbnet_denali", "geom_orca6", "trans1x", "rgd"],
}
# OC20 subsplit mappings: subsplit name -> data ids. Each OC20 subsplit maps
# to itself; kept for symmetry with OMOL_DATA_ID_MAPPING.
# NOTE(review): this constant is not referenced by any function visible in
# this file — presumably consumed elsewhere; confirm before removing.
OC20_DATA_ID_MAPPING = {
    "id": ["id"],
    "ood_ads": ["ood_ads"],
    "ood_cat": ["ood_cat"],
    "ood_both": ["ood_both"],
}
def omol_s2ef_metrics(
    annotations_path: Path,
    submission_filename: Path,
    subsets: list = None,
) -> Dict[str, float]:
    """Compute energy and forces MAE metrics for an OMol S2EF submission.

    Args:
        annotations_path: Path to the annotations ``.npz`` containing
            ``ids``, ``energy``, ``forces``, and ``data_ids`` arrays.
        submission_filename: Path to the submission ``.npz`` containing
            ``ids``, ``energy``, ``natoms``, and ``forces`` (forces
            concatenated across all systems).
        subsets: Subset names to evaluate — ``"all"`` or keys of
            ``OMOL_DATA_ID_MAPPING``. Defaults to ``["all"]``.

    Returns:
        Dict mapping ``"{subset}_energy_mae"`` and ``"{subset}_forces_mae"``
        to their values.

    Raises:
        SubmissionLoadError: If the submission file cannot be read.
        ValueError: If predicted energies contain inf values, or if a
            subset matches no systems.
    """
    # Default handled here to avoid a shared mutable default argument.
    if subsets is None:
        subsets = ["all"]
    try:
        with np.load(submission_filename) as data:
            submission_ids = data["ids"]
    except Exception as e:
        raise SubmissionLoadError(
            "Error loading submission file. 'ids' must not be object types."
        ) from e
    with np.load(annotations_path, allow_pickle=True) as data:
        annotations_ids = data["ids"]
    # Permutation that reorders submission rows to the annotation ordering.
    order = get_order(annotations_ids, submission_ids)
    try:
        with np.load(submission_filename) as pred_data:
            forces = pred_data["forces"]
            energy = pred_data["energy"][order]
            # Split the concatenated forces back into one array per system
            # (boundaries from cumulative natoms), then reorder to match
            # the annotations.
            forces = np.array(
                np.split(forces, np.cumsum(pred_data["natoms"])[:-1]), dtype=object
            )[order]
    except Exception as e:
        raise SubmissionLoadError(
            "Error loading submission data. Make sure you concatenated your forces and there are no object types."
        ) from e
    inf_energy_ids = np.flatnonzero(np.isinf(energy))
    if inf_energy_ids.size:
        raise ValueError(
            f"Inf values found in `energy` for IDs: ({inf_energy_ids[:3].tolist()}, ...)"
        )
    with np.load(annotations_path, allow_pickle=True) as target_data:
        target_forces = target_data["forces"]
        target_energy = target_data["energy"]
        target_data_ids = target_data["data_ids"]
    metrics = {}
    for subset in subsets:
        if subset == "all":
            subset_mask = np.ones(len(target_data_ids), dtype=bool)
        else:
            allowed_ids = set(OMOL_DATA_ID_MAPPING.get(subset, []))
            subset_mask = np.array(
                [data_id in allowed_ids for data_id in target_data_ids]
            )
        sub_energy = energy[subset_mask]
        sub_target_energy = target_energy[subset_mask]
        metrics[f"{subset}_energy_mae"] = np.mean(
            np.abs(sub_target_energy - sub_energy)
        )
        # Forces MAE accumulated over every atom in the subset; each force
        # vector has 3 components, hence the 3 * natoms normalization.
        abs_force_err = 0.0
        natoms = 0
        for sub_forces, sub_target_forces in zip(
            forces[subset_mask], target_forces[subset_mask]
        ):
            abs_force_err += np.sum(np.abs(sub_target_forces - sub_forces))
            natoms += sub_forces.shape[0]
        if natoms == 0:
            # Unknown subset name or empty subset would otherwise divide by 0.
            raise ValueError(f"No systems matched subset '{subset}'.")
        metrics[f"{subset}_forces_mae"] = abs_force_err / (3 * natoms)
    return metrics
def omol_evaluations(
    annotations_path: Path,
    submission_filename: Path,
    eval_type: str,
) -> Dict[str, float]:
    """Run a task-specific OMol evaluation on a JSON submission.

    Args:
        annotations_path: Path to the annotations JSON file.
        submission_filename: Path to the submission JSON file; its top-level
            keys must exactly match the annotation keys.
        eval_type: Task name; must be a key of ``OMOL_EVAL_FUNCTIONS``.

    Returns:
        The metrics dict returned by the task's evaluation function.

    Raises:
        SubmissionLoadError: If the submission file cannot be read/parsed.
        ValueError: If ``eval_type`` is unknown, or if submission and
            annotation entries do not match.
    """
    # Fail fast on an unknown task before doing any file I/O. Previously
    # .get() could return None here and crash later with a TypeError.
    eval_fn = OMOL_EVAL_FUNCTIONS.get(eval_type)
    if eval_fn is None:
        raise ValueError(f"Unknown eval_type: {eval_type}")
    try:
        with open(submission_filename) as f:
            submission_data = json.load(f)
    except Exception as e:
        raise SubmissionLoadError("Error loading submission file") from e
    with open(annotations_path) as f:
        annotations_data = json.load(f)
    submission_entries = set(submission_data.keys())
    annotation_entries = set(annotations_data.keys())
    if submission_entries != annotation_entries:
        missing = annotation_entries - submission_entries
        unexpected = submission_entries - annotation_entries
        raise ValueError(
            f"Submission and annotations entries do not match.\n"
            f"Missing entries in submission: {missing}\n"
            f"Unexpected entries in submission: {unexpected}"
        )
    # NOTE: parsed JSON object keys are unique by construction, so the old
    # duplicate-entry assert (len(set(keys)) == len(dict)) was tautological
    # and has been dropped.
    return eval_fn(annotations_data, submission_data)
def oc_s2ef_metrics(
    annotations_path: Path,
    submission_filename: Path,
    subsets: list = None,
) -> Dict[str, float]:
    """Calculate S2EF energy and forces MAE metrics for OC datasets.

    Args:
        annotations_path: Annotations ``.npz`` with per-subsplit
            ``"{split}_ids"``, ``"{split}_energy"``, ``"{split}_forces"``.
        submission_filename: Submission ``.npz`` with per-subsplit
            ``"{split}_ids"``, ``"{split}_energy"``, ``"{split}_forces"``
            (concatenated), and ``"{split}_chunk_idx"`` split boundaries.
        subsets: Subsplits to evaluate. Defaults to
            ``["id", "ood_ads", "ood_cat", "ood_both"]``.

    Returns:
        Dict with ``"{split}_energy_mae"``/``"{split}_forces_mae"`` per
        subsplit plus ``"avg_energy_mae"``/``"avg_forces_mae"`` averaged
        across the requested subsplits.

    Raises:
        SubmissionLoadError: If the submission file cannot be read.
        ValueError: If predicted energies contain inf values.
    """
    # Default handled here to avoid a shared mutable default argument.
    if subsets is None:
        subsets = ["id", "ood_ads", "ood_cat", "ood_both"]
    metrics = {}
    for split in subsets:
        try:
            with np.load(submission_filename) as data:
                submission_ids = data[f"{split}_ids"]
        except Exception as e:
            raise SubmissionLoadError(
                f"Error loading submission file. '{split}_ids' must not be object types."
            ) from e
        with np.load(annotations_path, allow_pickle=True) as data:
            annotations_ids = data[f"{split}_ids"]
        # Permutation that reorders submission rows to the annotation order.
        order = get_order(annotations_ids, submission_ids)
        try:
            with np.load(submission_filename) as pred_data:
                forces = pred_data[f"{split}_forces"]
                energy = pred_data[f"{split}_energy"][order]
                # Split concatenated forces at the provided chunk indices,
                # then reorder to match the annotations.
                forces = np.array(
                    np.split(forces, pred_data[f"{split}_chunk_idx"]), dtype=object
                )[order]
        except Exception as e:
            raise SubmissionLoadError(
                "Error loading submission data. Make sure you concatenated your forces and there are no object types."
            ) from e
        inf_energy_ids = np.flatnonzero(np.isinf(energy))
        if inf_energy_ids.size:
            raise ValueError(
                f"Inf values found in `energy` for IDs: ({inf_energy_ids[:3].tolist()}, ...)"
            )
        with np.load(annotations_path, allow_pickle=True) as target_data:
            target_forces = target_data[f"{split}_forces"]
            target_energy = target_data[f"{split}_energy"]
        metrics[f"{split}_energy_mae"] = np.mean(
            np.abs(target_energy - energy.flatten())
        )
        # Forces MAE accumulated over every atom in the split; each force
        # vector has 3 components, hence the 3 * natoms normalization.
        abs_force_err = 0.0
        natoms = 0
        for sub_forces, sub_target_forces in zip(forces, target_forces):
            abs_force_err += np.sum(np.abs(sub_target_forces - sub_forces))
            natoms += sub_forces.shape[0]
        metrics[f"{split}_forces_mae"] = abs_force_err / (3 * natoms)
    # Averages across all requested subsplits.
    metrics["avg_energy_mae"] = np.mean(
        [metrics[f"{s}_energy_mae"] for s in subsets]
    )
    metrics["avg_forces_mae"] = np.mean(
        [metrics[f"{s}_forces_mae"] for s in subsets]
    )
    return metrics
def oc_is2re_metrics(
    annotations_path: Path,
    submission_filename: Path,
    subsets: list = None,
) -> Dict[str, float]:
    """Calculate IS2RE energy MAE metrics for the OC dataset.

    Args:
        annotations_path: Annotations ``.npz`` with per-subsplit
            ``"{split}_ids"`` and ``"{split}_energy"`` arrays.
        submission_filename: Submission ``.npz`` with per-subsplit
            ``"{split}_ids"`` and ``"{split}_energy"`` arrays.
        subsets: Subsplits to evaluate. Defaults to
            ``["id", "ood_ads", "ood_cat", "ood_both"]``.

    Returns:
        Dict with ``"{split}_energy_mae"`` per subsplit plus
        ``"avg_energy_mae"`` averaged across the requested subsplits.

    Raises:
        SubmissionLoadError: If the submission file cannot be read.
        ValueError: If predicted energies contain inf values.
    """
    # Default handled here to avoid a shared mutable default argument.
    if subsets is None:
        subsets = ["id", "ood_ads", "ood_cat", "ood_both"]
    metrics = {}
    for split in subsets:
        try:
            with np.load(submission_filename) as data:
                submission_ids = data[f"{split}_ids"]
        except Exception as e:
            raise SubmissionLoadError(
                f"Error loading submission file. '{split}_ids' must not be object types."
            ) from e
        with np.load(annotations_path, allow_pickle=True) as data:
            annotations_ids = data[f"{split}_ids"]
        # Permutation that reorders submission rows to the annotation order.
        order = get_order(annotations_ids, submission_ids)
        try:
            with np.load(submission_filename) as pred_data:
                energy = pred_data[f"{split}_energy"][order]
        except Exception as e:
            raise SubmissionLoadError(
                "Error loading submission data."
            ) from e
        inf_energy_ids = np.flatnonzero(np.isinf(energy))
        if inf_energy_ids.size:
            raise ValueError(
                f"Inf values found in `energy` for IDs: ({inf_energy_ids[:3].tolist()}, ...)"
            )
        with np.load(annotations_path, allow_pickle=True) as target_data:
            target_energy = target_data[f"{split}_energy"]
        metrics[f"{split}_energy_mae"] = np.mean(
            np.abs(target_energy - energy.flatten())
        )
    # Average across all requested subsplits.
    metrics["avg_energy_mae"] = np.mean(
        [metrics[f"{s}_energy_mae"] for s in subsets]
    )
    return metrics
def evaluate(
    annotations_path: Path,
    submission_filename: Path,
    eval_type: str,
):
    """Dispatch a submission to the evaluator matching ``eval_type``.

    ``"Validation"``/``"Test"`` run the OMol S2EF metrics over all subsets;
    the two OC20 eval types run the OC20 S2EF/IS2RE metrics over the four
    standard subsplits; any key of ``OMOL_EVAL_FUNCTIONS`` runs the
    corresponding task-specific OMol evaluation.

    Raises:
        ValueError: If ``eval_type`` matches none of the known evaluations.
    """
    oc20_subsplits = ["id", "ood_ads", "ood_cat", "ood_both"]
    if eval_type in ("Validation", "Test"):
        return omol_s2ef_metrics(
            annotations_path,
            submission_filename,
            subsets=[
                "all",
                "metal_complexes",
                "electrolytes",
                "biomolecules",
                "neutral_organics",
            ],
        )
    if eval_type == "OC20 S2EF Test":
        return oc_s2ef_metrics(
            annotations_path,
            submission_filename,
            subsets=oc20_subsplits,
        )
    if eval_type == "OC20 IS2RE Test":
        return oc_is2re_metrics(
            annotations_path,
            submission_filename,
            subsets=oc20_subsplits,
        )
    if eval_type in OMOL_EVAL_FUNCTIONS:
        return omol_evaluations(
            annotations_path,
            submission_filename,
            eval_type,
        )
    raise ValueError(f"Unknown eval_type: {eval_type}")