import torch
import prettytable
import copy
import sys
from importlib import import_module
from inspect import signature
from pathlib import Path
from typing import Optional, Union
import numpy as np
from scipy.stats import kendalltau, pearsonr, spearmanr
from sklearn.metrics import (
confusion_matrix,
f1_score,
fbeta_score,
get_scorer,
get_scorer_names,
make_scorer,
)
def binary_accuracy(output: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
    """Computes the accuracy (as a percentage) for binary classification."""
    with torch.no_grad():
        batch_size = target.size(0)
        # Threshold the predicted probabilities at 0.5 and flatten to match the targets.
        pred = (output >= 0.5).float().t().view(-1)
        correct = pred.eq(target.view(-1)).float().sum()
        correct.mul_(100.0 / batch_size)
        return correct
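
# Usage sketch for ``binary_accuracy`` (illustrative only; the tensors below are
# hypothetical):
#   probs = torch.sigmoid(torch.randn(16, 1))    # (N, 1) predicted probabilities
#   labels = torch.randint(0, 2, (16,)).float()  # (N,) binary targets in {0, 1}
#   acc = binary_accuracy(probs, labels)         # accuracy as a percentage
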
def accuracy(output, target, topk=(1,)):
r"""
Computes the accuracy over the k top predictions for the specified values of k
Args:
output (tensor): Classification outputs, :math:`(N, C)` where `C = number of classes`
target (tensor): :math:`(N)` where each value is :math:`0 \leq \text{targets}[i] \leq C-1`
topk (sequence[int]): A list of top-N number.
Returns:
Top-N accuracies (N :math:`\in` topK).
"""
with torch.no_grad():
maxk = max(topk)
batch_size = target.size(0)
_, pred = output.topk(maxk, 1, True, True)
pred = pred.t()
correct = pred.eq(target[None])
res = []
for k in topk:
correct_k = correct[:k].flatten().sum(dtype=torch.float32)
res.append(correct_k * (100.0 / batch_size))
return res
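
# Usage sketch for top-k ``accuracy`` (illustrative only; shapes are hypothetical):
#   logits = torch.randn(8, 10)            # (N, C) classification outputs
#   targets = torch.randint(0, 10, (8,))   # (N,) ground-truth class indices
#   top1, top5 = accuracy(logits, targets, topk=(1, 5))
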
class ConfusionMatrix(object):
def __init__(self, num_classes):
self.num_classes = num_classes
self.mat = None
def update(self, target, output):
"""
Update confusion matrix.
Args:
target: ground truth
output: predictions of models
Shape:
- target: :math:`(minibatch, C)` where C means the number of classes.
- output: :math:`(minibatch, C)` where C means the number of classes.
"""
n = self.num_classes
if self.mat is None:
self.mat = torch.zeros((n, n), dtype=torch.int64, device=target.device)
with torch.no_grad():
k = (target >= 0) & (target < n)
inds = n * target[k].to(torch.int64) + output[k]
self.mat += torch.bincount(inds, minlength=n**2).reshape(n, n)
    def reset(self):
        if self.mat is not None:
            self.mat.zero_()
    def compute(self):
        """Compute the global accuracy, per-class accuracy and per-class IoU."""
        h = self.mat.float()
        # Global accuracy: correctly classified samples (diagonal) over all samples.
        acc_global = torch.diag(h).sum() / h.sum()
        # Per-class accuracy: diagonal entry over the corresponding row sum.
        acc = torch.diag(h) / h.sum(1)
        # Per-class IoU: TP / (TP + FP + FN), i.e. the diagonal entry over
        # (row sum + column sum - diagonal).
        iu = torch.diag(h) / (h.sum(1) + h.sum(0) - torch.diag(h))
        return acc_global, acc, iu
# def reduce_from_all_processes(self):
# if not torch.distributed.is_available():
# return
# if not torch.distributed.is_initialized():
# return
# torch.distributed.barrier()
# torch.distributed.all_reduce(self.mat)
def __str__(self):
acc_global, acc, iu = self.compute()
return (
"global correct: {:.1f}\n"
"average row correct: {}\n"
"IoU: {}\n"
"mean IoU: {:.1f}"
).format(
acc_global.item() * 100,
["{:.1f}".format(i) for i in (acc * 100).tolist()],
["{:.1f}".format(i) for i in (iu * 100).tolist()],
iu.mean().item() * 100,
)
    def format(self, classes: list):
        """Return the per-class accuracy and IoU in table format."""
        acc_global, acc, iu = self.compute()
        table = prettytable.PrettyTable(["class", "acc", "iou"])
        for class_name, per_acc, per_iu in zip(
            classes, (acc * 100).tolist(), (iu * 100).tolist()
        ):
            table.add_row([class_name, "{:.1f}".format(per_acc), "{:.1f}".format(per_iu)])
        return (
            "global correct: {:.1f}\nmean correct: {:.1f}\nmean IoU: {:.1f}\n{}".format(
                acc_global.item() * 100,
                acc.mean().item() * 100,
                iu.mean().item() * 100,
                table.get_string(),
            )
        )
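
# Usage sketch for ``ConfusionMatrix`` (illustrative only; ``output`` is assumed to
# hold hard predicted labels, e.g. ``logits.argmax(1)``):
#   cm = ConfusionMatrix(num_classes=3)
#   cm.update(target=torch.tensor([0, 1, 2, 1]), output=torch.tensor([0, 2, 2, 1]))
#   acc_global, per_class_acc, per_class_iou = cm.compute()
#   print(cm.format(["cat", "dog", "bird"]))
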
def kappa(
y_true: np.ndarray,
y_pred: np.ndarray,
weights: Optional[Union[str, np.ndarray]] = None,
allow_off_by_one: bool = False,
) -> float:
"""
Calculate the kappa inter-rater agreement.
The agreement is calculated between the gold standard and the predicted
ratings. Potential values range from -1 (representing complete disagreement)
to 1 (representing complete agreement). A kappa value of 0 is expected if
all agreement is due to chance.
In the course of calculating kappa, all items in ``y_true`` and ``y_pred`` will
first be converted to floats and then rounded to integers.
It is assumed that y_true and y_pred contain the complete range of possible
ratings.
This function contains a combination of code from yorchopolis's kappa-stats
and Ben Hamner's Metrics projects on Github.
Parameters
----------
y_true : numpy.ndarray
The true/actual/gold labels for the data.
y_pred : numpy.ndarray
The predicted/observed labels for the data.
weights : Optional[Union[str, numpy.ndarray]], default=None
Specifies the weight matrix for the calculation.
Possible values are: ``None`` (unweighted-kappa), ``"quadratic"``
(quadratically weighted kappa), ``"linear"`` (linearly weighted kappa),
and a two-dimensional numpy array (a custom matrix of weights). Each
weight in this array corresponds to the :math:`w_{ij}` values in the
Wikipedia description of how to calculate weighted Cohen's kappa.
    allow_off_by_one : bool, default=False
        If ``True``, ratings that are off by one are counted as equal, and
        all other differences are reduced by one when building the weights
        matrix. For example, 1 and 2 will be considered equal, whereas 1 and
        3 will have a difference of 1.
Returns
-------
float
The weighted or unweighted kappa score.
Raises
------
    AssertionError
        If ``y_true`` and ``y_pred`` do not have the same length.
    ValueError
        If the labels cannot be converted to ints.
    ValueError
        If an invalid weight scheme is specified.
"""
# Ensure that the lists are both the same length
assert len(y_true) == len(y_pred)
# This rather crazy looking typecast is intended to work as follows:
# If an input is an int, the operations will have no effect.
# If it is a float, it will be rounded and then converted to an int
# because the ml_metrics package requires ints.
# If it is a str like "1", then it will be converted to a (rounded) int.
# If it is a str that can't be typecast, then the user is
# given a hopefully useful error message.
try:
y_true = np.array([int(np.round(float(y))) for y in y_true])
y_pred = np.array([int(np.round(float(y))) for y in y_pred])
except ValueError:
raise ValueError(
"For kappa, the labels should be integers or strings"
" that can be converted to ints (E.g., '4.0' or "
"'3')."
)
# Figure out normalized expected values
min_rating = min(min(y_true), min(y_pred))
max_rating = max(max(y_true), max(y_pred))
# shift the values so that the lowest value is 0
# (to support scales that include negative values)
y_true = y_true - min_rating
y_pred = y_pred - min_rating
# Build the observed/confusion matrix
num_ratings = max_rating - min_rating + 1
observed = confusion_matrix(y_true, y_pred, labels=list(range(num_ratings)))
num_scored_items = float(len(y_true))
    # Build the weight array if one wasn't passed in
if isinstance(weights, str):
wt_scheme = weights
weights = None
else:
wt_scheme = ""
if weights is None:
kappa_weights = np.empty((num_ratings, num_ratings))
for i in range(num_ratings):
for j in range(num_ratings):
diff = abs(i - j)
if allow_off_by_one and diff:
diff -= 1
if wt_scheme == "linear":
kappa_weights[i, j] = diff
elif wt_scheme == "quadratic":
kappa_weights[i, j] = diff**2
elif not wt_scheme: # unweighted
kappa_weights[i, j] = bool(diff)
else:
raise ValueError(
"Invalid weight scheme specified for " f"kappa: {wt_scheme}"
)
else:
kappa_weights = weights
hist_true: np.ndarray = np.bincount(y_true, minlength=num_ratings)
hist_true = hist_true[:num_ratings] / num_scored_items
hist_pred: np.ndarray = np.bincount(y_pred, minlength=num_ratings)
hist_pred = hist_pred[:num_ratings] / num_scored_items
expected = np.outer(hist_true, hist_pred)
# Normalize observed array
observed = observed / num_scored_items
# If all weights are zero, that means no disagreements matter.
k = 1.0
if np.count_nonzero(kappa_weights):
observed_sum = np.sum(kappa_weights * observed)
expected_sum = np.sum(kappa_weights * expected)
        k -= observed_sum / expected_sum
return k
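
# Usage sketch for ``kappa`` (illustrative ratings only):
#   y_true = np.array([1, 2, 3, 4, 4])
#   y_pred = np.array([1, 2, 4, 4, 3])
#   unweighted = kappa(y_true, y_pred)                      # Cohen's kappa
#   quadratic = kappa(y_true, y_pred, weights="quadratic")  # quadratically weighted
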
def correlation(
y_true: np.ndarray, y_pred: np.ndarray, corr_type: str = "pearson"
) -> float:
"""
Calculate given correlation type between ``y_true`` and ``y_pred``.
    ``y_pred`` can be multi-dimensional. If ``y_pred`` is 1-dimensional, it
    may contain probabilities, most-likely classification labels, or
    regressor predictions. In that case, we simply return the correlation
    between ``y_true`` and ``y_pred``. If ``y_pred`` is multi-dimensional,
    it contains probabilities for multiple classes, in which case we infer
    the most likely labels and then compute the correlation between those
    and ``y_true``.
Parameters
----------
y_true : numpy.ndarray
The true/actual/gold labels for the data.
y_pred : numpy.ndarray
The predicted/observed labels for the data.
corr_type : str, default="pearson"
Which type of correlation to compute. Possible
choices are "pearson", "spearman", and "kendall_tau".
Returns
-------
float
correlation value if well-defined, else 0.0
"""
# get the correlation function to use based on the given type
corr_func = pearsonr
if corr_type == "spearman":
corr_func = spearmanr
elif corr_type == "kendall_tau":
corr_func = kendalltau
# convert to numpy array in case we are passed a list
y_pred = np.array(y_pred)
    # multi-dimensional -> probability array -> get the most likely labels
    if y_pred.ndim > 1:
        labels = np.argmax(y_pred, axis=1)
        ret_score = corr_func(y_true, labels)[0]
    # 1-dimensional -> probabilities/labels -> use as is
    else:
        ret_score = corr_func(y_true, y_pred)[0]
    # Return 0.0 if the correlation is not well-defined (e.g. constant input),
    # as promised in the docstring.
    return float(ret_score) if np.isfinite(ret_score) else 0.0
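
# Usage sketch for ``correlation`` (illustrative only; values are hypothetical):
#   y_true = np.array([1.0, 2.0, 3.0, 4.0])
#   y_pred = np.array([1.1, 1.9, 3.2, 3.8])
#   r = correlation(y_true, y_pred)                          # Pearson's r
#   rho = correlation(y_true, y_pred, corr_type="spearman")  # Spearman's rho
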
def f1_score_least_frequent(y_true: np.ndarray, y_pred: np.ndarray) -> float:
"""
Calculate F1 score of the least frequent label/class.
Parameters
----------
y_true : numpy.ndarray
The true/actual/gold labels for the data.
y_pred : numpy.ndarray
The predicted/observed labels for the data.
Returns
-------
float
F1 score of the least frequent label.
"""
least_frequent = np.bincount(y_true).argmin()
return f1_score(y_true, y_pred, average=None)[least_frequent]
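
# Usage sketch for ``f1_score_least_frequent`` (illustrative labels only):
#   y_true = np.array([0, 0, 0, 1, 1, 2])   # class 2 is the least frequent
#   y_pred = np.array([0, 0, 1, 1, 1, 2])
#   f1_least = f1_score_least_frequent(y_true, y_pred)
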