Source code for kiwi.metrics.functions

#  OpenKiwi: Open-Source Machine Translation Quality Estimation
#  Copyright (C) 2019 Unbabel <openkiwi@unbabel.com>
#
#  This program is free software: you can redistribute it and/or modify
#  it under the terms of the GNU Affero General Public License as published
#  by the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU Affero General Public License for more details.
#
#  You should have received a copy of the GNU Affero General Public License
#  along with this program.  If not, see <https://www.gnu.org/licenses/>.
#

import numpy as np
from more_itertools import collapse

# def calibrate_threshold(scores, labels, MetricClass=LazyF1):
#     """Finds optimal decision threshold according to metric.
#     Args:
#         scores (list[float]): List of model output scores
#         labels (list): List of corresponding target labels
#     Returns:
#     (metric, threshold): The value of the Metric and the Threshold to be used.
#     """
#     metric = MetricClass(scores, labels)
#     scores, labels = metric.sort(scores, labels)
#     init_threshold = scores[0]
#     thresholds = [(metric.compute(), init_threshold)]
#     for score, label in zip(scores, labels):
#         metric.update(score, label)
#         thresholds.append((metric.compute(), score))
#     return metric.choose(thresholds)


def mean_absolute_error(y, y_hat):
    return np.mean(np.absolute(y_hat - y))


def mean_squared_error(y, y_hat):
    return np.square(np.subtract(y, y_hat)).mean()


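# Usage sketch (illustrative only, not part of the original module): both
# metrics expect array-likes of gold scores ``y`` and predictions ``y_hat``,
# e.g. sentence-level quality scores.
#
#     >>> y = np.array([0.1, 0.5, 0.9])
#     >>> y_hat = np.array([0.2, 0.4, 0.7])
#     >>> round(float(mean_absolute_error(y, y_hat)), 3)
#     0.133
#     >>> round(float(mean_squared_error(y, y_hat)), 3)
#     0.02

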
def delta_average(y_true, y_rank):
    """Calculate the DeltaAvg score.

    This is a much faster version than the Perl one provided in the
    WMT QE task 1.

    References: could not find any.

    Author: Fabio Kepler (contributed to MARMOT)

    Args:
        y_true: array of reference score (not rank) of each segment.
        y_rank: array of rank of each segment.

    Returns:
        the absolute delta average score.
    """
    sorted_ranked_indexes = np.argsort(y_rank)
    y_length = len(sorted_ranked_indexes)

    delta_avg = 0
    max_quantiles = y_length // 2
    set_value = (
        np.sum(y_true[sorted_ranked_indexes[np.arange(y_length)]]) / y_length
    )
    quantile_values = {
        head: np.sum(y_true[sorted_ranked_indexes[np.arange(head)]]) / head
        for head in range(2, y_length)
    }
    # Cache values, since there are many that are repeatedly computed
    # between various quantiles.
    for quantiles in range(2, max_quantiles + 1):  # current number of quantiles
        quantile_length = y_length // quantiles
        quantile_sum = 0
        for head in np.arange(
            quantile_length, quantiles * quantile_length, quantile_length
        ):
            quantile_sum += quantile_values[head]
        delta_avg += quantile_sum / (quantiles - 1) - set_value

    if max_quantiles > 1:
        delta_avg /= max_quantiles - 1
    else:
        delta_avg = 0
    return abs(delta_avg)


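# Worked example (illustrative only): with gold scores [1, 2, 3, 4] and a
# perfect ranking (rank 1 assigned to the best segment), the single 2-quantile
# split averages the top half (3.5) against the full set (2.5), giving
# DeltaAvg = 1.0.
#
#     >>> float(delta_average(np.array([1.0, 2.0, 3.0, 4.0]),
#     ...                     np.array([4, 3, 2, 1])))
#     1.0

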
def precision(tp, fp, fn):
    if tp + fp > 0:
        return tp / (tp + fp)
    return 0


def recall(tp, fp, fn):
    if tp + fn > 0:
        return tp / (tp + fn)
    return 0


def fscore(tp, fp, fn):
    p = precision(tp, fp, fn)
    r = recall(tp, fp, fn)
    if p + r > 0:
        return 2 * (p * r) / (p + r)
    return 0


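# Sanity check (illustrative only): with 8 true positives, 2 false positives
# and 4 false negatives, precision is 0.8, recall is 2/3, and their harmonic
# mean gives F1 of about 0.727.
#
#     >>> precision(8, 2, 4)
#     0.8
#     >>> round(recall(8, 2, 4), 3)
#     0.667
#     >>> round(fscore(8, 2, 4), 3)
#     0.727

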
def confusion_matrix(hat_y, y, n_classes=None):
    hat_y = np.array(list(collapse(hat_y)))
    y = np.array(list(collapse(y)))
    if n_classes is None:
        classes = np.unique(np.union1d(hat_y, y))
        n_classes = len(classes)
    cnfm = np.zeros((n_classes, n_classes))
    for j in range(y.shape[0]):
        cnfm[y[j], hat_y[j]] += 1
    return cnfm


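# Usage sketch (illustrative only): labels must be integer class indices and
# may be nested lists (one per sentence), which ``collapse`` flattens. Rows
# of the returned matrix are gold labels, columns are predictions.
#
#     >>> confusion_matrix([0, 1, 1, 0], [0, 1, 0, 0])
#     array([[2., 1.],
#            [0., 1.]])

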
def scores_for_class(class_index, cnfm):
    tp = cnfm[class_index, class_index]
    fp = cnfm[:, class_index].sum() - tp
    fn = cnfm[class_index, :].sum() - tp
    tn = cnfm.sum() - tp - fp - fn
    p = precision(tp, fp, fn)
    r = recall(tp, fp, fn)
    f1 = fscore(tp, fp, fn)
    support = tp + fn  # number of gold instances of this class
    return p, r, f1, support


def precision_recall_fscore_support(hat_y, y, labels=None):
    n_classes = len(labels) if labels else None
    cnfm = confusion_matrix(hat_y, y, n_classes)
    if n_classes is None:
        n_classes = cnfm.shape[0]
    scores = np.zeros((n_classes, 4))
    for class_id in range(n_classes):
        scores[class_id] = scores_for_class(class_id, cnfm)
    return scores.T.tolist()


def f1_product(hat_y, y):
    p, r, f1, s = precision_recall_fscore_support(hat_y, y)
    f1_mult = np.prod(f1)
    return f1_mult


def f1_scores(hat_y, y):
    """Return f1_bad, f1_ok and f1_product."""
    p, r, f1, s = precision_recall_fscore_support(hat_y, y)
    f_mult = np.prod(f1)
    return (*f1, f_mult)
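

# Usage sketch (illustrative only): for word-level QE tags encoded as integer
# classes (the docstring's ordering implies BAD = 0, OK = 1), ``f1_scores``
# returns the per-class F1 values followed by their product (F1-mult).
#
#     >>> f1_bad, f1_ok, f1_mult = f1_scores([[1, 1, 0], [0, 1]],
#     ...                                    [[1, 0, 0], [0, 1]])
#     >>> round(f1_bad, 2), round(f1_ok, 2), round(float(f1_mult), 2)
#     (0.8, 0.8, 0.64)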