29 changes: 29 additions & 0 deletions armory/instrument/config.py
@@ -205,6 +205,8 @@ def _write(self, name, batch, result):
f"neutral: {result['neutral']}/{total}, "
f"entailment: {result['entailment']}/{total}"
)
elif "confusion_matrix" in name:
f_result = f"{result}"
elif any(m in name for m in MEAN_AP_METRICS):
if "input_to" in name:
for m in MEAN_AP_METRICS:
@@ -216,6 +218,8 @@ def _write(self, name, batch, result):
elif any(m in name for m in QUANTITY_METRICS):
# Don't include % symbol
f_result = f"{np.mean(result):.2}"
elif isinstance(result, dict):
f_result = f"{result}"
else:
f_result = f"{np.mean(result):.2%}"
log.success(
@@ -253,6 +257,31 @@ def _task_metric(
elif name == "word_error_rate":
final = metrics.get("total_wer")
final_suffix = "total_word_error_rate"
elif name == "per_class_mean_accuracy":
metric = metrics.get("identity_unzip")
func = metrics.get("per_class_mean_accuracy")

def final(x):
return func(*metrics.task.identity_zip(x))

final_suffix = name
elif name == "confusion_matrix":
metric = metrics.get("identity_unzip")
func = metrics.get("confusion_matrix")

def final(x):
return func(*metrics.task.identity_zip(x))

final_suffix = name
elif name == "precision_and_recall":
metric = metrics.get("identity_unzip")
func = metrics.get("precision_and_recall")

def final(x):
return func(*metrics.task.identity_zip(x))

final_suffix = name

elif use_mean:
final = np.mean
final_suffix = f"mean_{name}"
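
For readers unfamiliar with the accumulate-then-finalize pattern these new branches rely on, here is a rough, self-contained sketch. It is not armory code: the identity_unzip/identity_zip stand-ins below only mirror the assumed behavior (collect per-batch (y, y_pred) pairs, then re-pair them into full-population arrays so a population-wise metric can run once at the end).

# Illustrative sketch only -- NOT armory code; helper behavior is assumed.
import numpy as np


def identity_unzip(*batch):
    # Hypothetical stand-in: pass each batch through unchanged as a tuple.
    return batch


def identity_zip(accumulated):
    # Hypothetical stand-in: merge the accumulated per-batch tuples back into
    # one full-population array per original argument.
    columns = zip(*accumulated)
    return [
        np.concatenate([np.asarray(part).reshape(-1) for part in column])
        for column in columns
    ]


def population_confusion_matrix(y, y_pred):
    # Minimal population-wise metric for the demo.
    n = int(max(y.max(), y_pred.max())) + 1
    C = np.zeros((n, n))
    for true, pred in zip(y, y_pred):
        C[int(true), int(pred)] += 1
    return C


# Accumulate per batch, then make a single "final" call over everything seen.
accumulated = []
for y, y_pred in [([0, 1], [0, 0]), ([1, 1], [1, 0])]:
    accumulated.append(identity_unzip(np.array(y), np.array(y_pred)))

print(population_confusion_matrix(*identity_zip(accumulated)))
# [[1. 0.]
#  [2. 1.]]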
49 changes: 49 additions & 0 deletions armory/metrics/statistical.py
@@ -10,6 +10,7 @@
from sklearn.metrics import silhouette_samples

from armory.metrics.perturbation import MetricNameSpace, set_namespace
from armory.metrics.task import populationwise

registered = MetricNameSpace()

@@ -21,6 +22,54 @@ def register(metric):
return set_namespace(registered, metric)


@populationwise
def precision_and_recall(y, y_pred):
"""
Produce a dictionary whose keys are class labels and whose values are (precision, recall) tuples for that class
"""
# Assumes that every class is represented in y

C = confusion_matrix(y, y_pred, normalize=False)
# breakpoint()
N = C.shape[0]
D = {}
for class_ in range(N):
# precision: true positives / number of items identified as class_
tp = C[class_, class_]
total_selected = C[:, class_].sum()
precision = tp / total_selected

# recall: true positives / number of actual items in class_
[Review thread]

Contributor Author: per-class recall is the exact same as per-class accuracy, which I didn't realize till now. Is it still useful to have two separate per_class_accuracy and per_class_precision_and_recall functions?

Contributor: Sure, let's keep both.

total_class_ = C[class_, :].sum()
recall = tp / total_class_

D[class_] = (precision, recall)

return D
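
The thread above claims that per-class recall coincides with per-class accuracy. A minimal stand-alone check (not part of this PR, and not using armory's per_class_mean_accuracy) shows why: both divide the correct predictions for class i by the number of true examples of class i.

# Stand-alone illustration, not armory code.
import numpy as np

y = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
y_pred = np.array([0, 0, 0, 1, 1, 1, 1, 1, 1, 0])

for c in np.unique(y):
    # Accuracy restricted to the examples whose true label is c ...
    per_class_accuracy = np.mean(y_pred[y == c] == c)
    # ... is the same ratio as recall for class c: true positives / actual positives.
    recall = np.sum((y == c) & (y_pred == c)) / np.sum(y == c)
    assert per_class_accuracy == recall
    print(c, per_class_accuracy, recall)
# 0 0.6 0.6
# 1 0.8 0.8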


@populationwise
def confusion_matrix(y, y_pred, normalize=True):
"""
Produce a matrix C such that C[i,j] is the fraction of class-i examples that were classified as class j
If normalize is False, C[i,j] is the raw count of such examples rather than the fraction
"""
# Assumes that every class is represented in y
y = np.array(y)
y_pred = np.array(y_pred)
if y_pred.ndim == 2:
y_pred = np.argmax(y_pred, axis=1)
N = len(np.unique(y))
[Review thread]

Contributor: If y_pred is 2D, you can use that to derive N. (Or at least check to ensure that they match.)

Contributor Author: I may be misunderstanding, but N is the number of classes, not the total number of items. Hence length of np.unique(y) and not length of y. I don't think we can assume every class will show up in y_pred. For that matter, it seems a little risky to assume they will all be present in y.

Contributor: If y_pred is 2D, then it holds either logits or probability distributions over the set of predicted classes, so you can do N = y_pred.shape[1]. If y_pred is 1D, however, that doesn't work.

Contributor: I think this implicitly assumes that the classes are all integers from 0 to N - 1. However, if y has missing classes, then there will be some misalignment.

Contributor Author: Oh right, of course. Is it true that Armory scenarios will always have a 2D y_pred? Or does it just depend on how the meter and probes are set up? So far the only source of a 1D y_pred I've encountered is my own unit tests, but I can expand those to 2D and then get N the way you described.

Contributor: Right now it's dependent on the underlying model, unfortunately.

Contributor Author: Well, after all, if a class is totally absent from y, its row in the matrix would be all zeros, since it was never classified as anything. So maybe what I need to do is make this a dictionary after all and key it with class labels, so that if one is missing, at least it will be clear which rows correspond to which class. Alternatively, I could add a row of zeros at the index of each missing class label, but this would only be possible for missing labels less than the greatest non-missing label.

Contributor: Let's just have the function assume that y_pred is 2D (and add that to the docstring). Other things can be handled by the user.

C = np.zeros((N, N))
for i in range(N):
for j in range(N):
C[i, j] = np.sum(y_pred[y == i] == j)
if normalize:
sums = np.sum(C, axis=1)
C = C / sums[:, np.newaxis]
return C
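
A minimal sketch of the resolution the reviewers converge on in the thread above: assume y_pred is 2D and take the class count from its width, so classes missing from y keep an all-zero row. This is illustrative only; the function name confusion_matrix_2d is hypothetical and this is not the code in this diff.

# Sketch of the reviewers' suggestion, not the code under review.
import numpy as np


def confusion_matrix_2d(y, y_pred, normalize=True):
    y = np.asarray(y)
    y_pred = np.asarray(y_pred)
    assert y_pred.ndim == 2, "assumes logits/probabilities of shape (n_samples, n_classes)"
    N = y_pred.shape[1]  # number of classes comes from the prediction width
    y_pred = np.argmax(y_pred, axis=1)
    C = np.zeros((N, N))
    for i in range(N):
        for j in range(N):
            C[i, j] = np.sum(y_pred[y == i] == j)
    if normalize:
        sums = np.sum(C, axis=1)
        # Avoid dividing the all-zero row of a class absent from y.
        C = np.divide(
            C, sums[:, np.newaxis], out=np.zeros_like(C), where=sums[:, np.newaxis] > 0
        )
    return C


# Class 2 never appears in y, but its row still exists (all zeros).
y = [0, 0, 1, 1]
scores = np.array([[0.9, 0.1, 0.0], [0.2, 0.7, 0.1], [0.1, 0.8, 0.1], [0.1, 0.1, 0.8]])
print(confusion_matrix_2d(y, scores))
# [[0.5 0.5 0. ]
#  [0.  0.5 0.5]
#  [0.  0.  0. ]]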


@register
def chi2_p_value(contingency_table: np.ndarray) -> List[float]:
"""
5 changes: 4 additions & 1 deletion armory/utils/config_schema.json
@@ -261,7 +261,10 @@
"carla_od_disappearance_rate",
"carla_od_hallucinations_per_image",
"carla_od_misclassification_rate",
"carla_od_true_positive_rate"
"carla_od_true_positive_rate",
"per_class_mean_accuracy",
"confusion_matrix",
"precision_and_recall"
]
},
"sysconfig": {
29 changes: 29 additions & 0 deletions tests/unit/test_statistical_metrics.py
@@ -11,6 +11,35 @@
pytestmark = pytest.mark.unit


def test_confusion_matrix():
y = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
y_pred = np.array([0, 0, 0, 1, 1, 1, 1, 1, 1, 0])
assert statistical.confusion_matrix(y, y) == pytest.approx(
np.array([[1, 0], [0, 1]])
)
assert statistical.confusion_matrix(y, y_pred) == pytest.approx(
np.array([[0.6, 0.4], [0.2, 0.8]])
)
assert statistical.confusion_matrix(y, y_pred, normalize=False) == pytest.approx(
np.array([[3, 2], [1, 4]])
)


def test_precision_and_recall():
y = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
y_pred = np.array([0, 0, 0, 1, 1, 1, 1, 1, 1, 0])
D = statistical.precision_and_recall(y, y_pred)
assert D[0] == pytest.approx((0.75, 0.6))
assert D[1] == pytest.approx((0.66666667, 0.8))

y = np.array([0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2])
y_pred = np.array([0, 0, 0, 0, 1, 1, 2, 1, 2, 2, 0, 1])
D = statistical.precision_and_recall(y, y_pred)
assert D[0] == pytest.approx((0.8, 1))
assert D[1] == pytest.approx((0.75, 0.75))
assert D[2] == pytest.approx((0.666666667, 0.5))


def test_chi2_p_value():

table1 = np.array([[2, 3], [4, 6]])