29 changes: 29 additions & 0 deletions armory/instrument/config.py
@@ -205,6 +205,8 @@ def _write(self, name, batch, result):
f"neutral: {result['neutral']}/{total}, "
f"entailment: {result['entailment']}/{total}"
)
elif "confusion_matrix" in name:
f_result = f"{result}"
elif any(m in name for m in MEAN_AP_METRICS):
if "input_to" in name:
for m in MEAN_AP_METRICS:
@@ -216,6 +218,8 @@ def _write(self, name, batch, result):
elif any(m in name for m in QUANTITY_METRICS):
# Don't include % symbol
f_result = f"{np.mean(result):.2}"
elif isinstance(result, dict):
f_result = f"{result}"
else:
f_result = f"{np.mean(result):.2%}"
log.success(
@@ -253,6 +257,31 @@ def _task_metric(
elif name == "word_error_rate":
final = metrics.get("total_wer")
final_suffix = "total_word_error_rate"
elif name == "per_class_mean_accuracy":
metric = metrics.get("identity_unzip")
func = metrics.get("per_class_mean_accuracy")

def final(x):
return func(*metrics.task.identity_zip(x))

final_suffix = name
elif name == "confusion_matrix":
metric = metrics.get("identity_unzip")
func = metrics.get("confusion_matrix")

def final(x):
return func(*metrics.task.identity_zip(x))

final_suffix = name
elif name == "precision_and_recall":
metric = metrics.get("identity_unzip")
func = metrics.get("precision_and_recall")

def final(x):
return func(*metrics.task.identity_zip(x))

final_suffix = name

elif use_mean:
final = np.mean
final_suffix = f"mean_{name}"
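
For readers unfamiliar with the accumulate-then-finalize pattern these new branches rely on, here is a rough, self-contained sketch. It is not armory code: the identity_unzip/identity_zip stand-ins below only mirror the assumed behavior (collect per-batch (y, y_pred) pairs, then re-pair them into full-population arrays so a population-wise metric can run once at the end).

# Illustrative sketch only -- NOT armory code; helper behavior is assumed.
import numpy as np


def identity_unzip(*batch):
    # Hypothetical stand-in: pass each batch through unchanged as a tuple.
    return batch


def identity_zip(accumulated):
    # Hypothetical stand-in: merge the accumulated per-batch tuples back into
    # one full-population array per original argument.
    columns = zip(*accumulated)
    return [
        np.concatenate([np.asarray(part).reshape(-1) for part in column])
        for column in columns
    ]


def population_confusion_matrix(y, y_pred):
    # Minimal population-wise metric for the demo.
    n = int(max(y.max(), y_pred.max())) + 1
    C = np.zeros((n, n))
    for true, pred in zip(y, y_pred):
        C[int(true), int(pred)] += 1
    return C


# Accumulate per batch, then make a single "final" call over everything seen.
accumulated = []
for y, y_pred in [([0, 1], [0, 0]), ([1, 1], [1, 0])]:
    accumulated.append(identity_unzip(np.array(y), np.array(y_pred)))

print(population_confusion_matrix(*identity_zip(accumulated)))
# [[1. 0.]
#  [2. 1.]]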
49 changes: 49 additions & 0 deletions armory/metrics/statistical.py
@@ -10,6 +10,7 @@
from sklearn.metrics import silhouette_samples

from armory.metrics.perturbation import MetricNameSpace, set_namespace
from armory.metrics.task import populationwise

registered = MetricNameSpace()

@@ -21,6 +22,54 @@ def register(metric):
return set_namespace(registered, metric)


@populationwise
def precision_and_recall(y, y_pred):
"""
Produce a dictionary whose keys are class labels and whose values are (precision, recall) tuples for that class
"""
# Assumes that every class is represented in y

C = confusion_matrix(y, y_pred, normalize=False)
# breakpoint()
N = C.shape[0]
D = {}
for class_ in range(N):
# precision: true positives / number of items identified as class_
tp = C[class_, class_]
total_selected = C[:, class_].sum()
precision = tp / total_selected

# recall: true positives / number of actual items in class_
[Review thread]

Contributor Author: per-class recall is the exact same as per-class accuracy, which I didn't realize till now. Is it still useful to have two separate per_class_accuracy and per_class_precision_and_recall functions?

Contributor: Sure, let's keep both.

total_class_ = C[class_, :].sum()
recall = tp / total_class_

D[class_] = (precision, recall)

return D
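
The thread above claims that per-class recall coincides with per-class accuracy. A minimal stand-alone check (not part of this PR, and not using armory's per_class_mean_accuracy) shows why: both divide the correct predictions for class i by the number of true examples of class i.

# Stand-alone illustration, not armory code.
import numpy as np

y = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
y_pred = np.array([0, 0, 0, 1, 1, 1, 1, 1, 1, 0])

for c in np.unique(y):
    # Accuracy restricted to the examples whose true label is c ...
    per_class_accuracy = np.mean(y_pred[y == c] == c)
    # ... is the same ratio as recall for class c: true positives / actual positives.
    recall = np.sum((y == c) & (y_pred == c)) / np.sum(y == c)
    assert per_class_accuracy == recall
    print(c, per_class_accuracy, recall)
# 0 0.6 0.6
# 1 0.8 0.8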


@populationwise
def confusion_matrix(y, y_pred, normalize=True):
"""
Produce a matrix C such that C[i,j] is the fraction of class-i examples that were classified as class j
If normalize is False, C[i,j] is the raw count of such examples rather than the fraction
"""
# Assumes that every class is represented in y
y = np.array(y)
y_pred = np.array(y_pred)
if y_pred.ndim == 2:
y_pred = np.argmax(y_pred, axis=1)
N = len(np.unique(y))
[Review thread]

Contributor: If y_pred is 2D, you can use that to derive N. (Or at least check to ensure that they match.)

Contributor Author: I may be misunderstanding, but N is the number of classes, not the total number of items. Hence length of np.unique(y) and not length of y. I don't think we can assume every class will show up in y_pred. For that matter, it seems a little risky to assume they will all be present in y.

Contributor: If y_pred is 2D, then it holds either logits or probability distributions over the set of predicted classes, so you can do N = y_pred.shape[1]. If y_pred is 1D, however, that doesn't work.

Contributor: I think this implicitly assumes that the classes are all integers from 0 to N - 1. However, if y has missing classes, then there will be some misalignment.

Contributor Author: Oh right, of course. Is it true that Armory scenarios will always have a 2D y_pred? Or does it just depend on how the meter and probes are set up? So far the only source of a 1D y_pred I've encountered is my own unit tests, but I can expand those to 2D and then get N the way you described.

Contributor: Right now it's dependent on the underlying model, unfortunately.

Contributor Author: Well, after all, if a class is totally absent from y, its row in the matrix would be all zeros, since it was never classified as anything. So maybe what I need to do is make this a dictionary after all and key it with class labels, so that if one is missing, at least it will be clear which rows correspond to which class. Alternatively, I could add a row of zeros at the index of each missing class label, but this would only be possible for missing labels less than the greatest non-missing label.

Contributor: Let's just have the function assume that y_pred is 2D (and add that to the docstring). Other things can be handled by the user.

C = np.zeros((N, N))
for i in range(N):
for j in range(N):
C[i, j] = np.sum(y_pred[y == i] == j)
if normalize:
sums = np.sum(C, axis=1)
C = C / sums[:, np.newaxis]
return C
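
A minimal sketch of the resolution the reviewers converge on in the thread above: assume y_pred is 2D and take the class count from its width, so classes missing from y keep an all-zero row. This is illustrative only; the function name confusion_matrix_2d is hypothetical and this is not the code in this diff.

# Sketch of the reviewers' suggestion, not the code under review.
import numpy as np


def confusion_matrix_2d(y, y_pred, normalize=True):
    y = np.asarray(y)
    y_pred = np.asarray(y_pred)
    assert y_pred.ndim == 2, "assumes logits/probabilities of shape (n_samples, n_classes)"
    N = y_pred.shape[1]  # number of classes comes from the prediction width
    y_pred = np.argmax(y_pred, axis=1)
    C = np.zeros((N, N))
    for i in range(N):
        for j in range(N):
            C[i, j] = np.sum(y_pred[y == i] == j)
    if normalize:
        sums = np.sum(C, axis=1)
        # Avoid dividing the all-zero row of a class absent from y.
        C = np.divide(
            C, sums[:, np.newaxis], out=np.zeros_like(C), where=sums[:, np.newaxis] > 0
        )
    return C


# Class 2 never appears in y, but its row still exists (all zeros).
y = [0, 0, 1, 1]
scores = np.array([[0.9, 0.1, 0.0], [0.2, 0.7, 0.1], [0.1, 0.8, 0.1], [0.1, 0.1, 0.8]])
print(confusion_matrix_2d(y, scores))
# [[0.5 0.5 0. ]
#  [0.  0.5 0.5]
#  [0.  0.  0. ]]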


@register
def chi2_p_value(contingency_table: np.ndarray) -> List[float]:
"""
5 changes: 4 additions & 1 deletion armory/utils/config_schema.json
@@ -261,7 +261,10 @@
"carla_od_disappearance_rate",
"carla_od_hallucinations_per_image",
"carla_od_misclassification_rate",
"carla_od_true_positive_rate"
"carla_od_true_positive_rate",
"per_class_mean_accuracy",
"confusion_matrix",
"precision_and_recall"
]
},
"sysconfig": {
29 changes: 29 additions & 0 deletions tests/unit/test_statistical_metrics.py
@@ -11,6 +11,35 @@
pytestmark = pytest.mark.unit


def test_confusion_matrix():
y = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
y_pred = np.array([0, 0, 0, 1, 1, 1, 1, 1, 1, 0])
assert statistical.confusion_matrix(y, y) == pytest.approx(
np.array([[1, 0], [0, 1]])
)
assert statistical.confusion_matrix(y, y_pred) == pytest.approx(
np.array([[0.6, 0.4], [0.2, 0.8]])
)
assert statistical.confusion_matrix(y, y_pred, normalize=False) == pytest.approx(
np.array([[3, 2], [1, 4]])
)


def test_precision_and_recall():
y = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
y_pred = np.array([0, 0, 0, 1, 1, 1, 1, 1, 1, 0])
D = statistical.precision_and_recall(y, y_pred)
assert D[0] == pytest.approx((0.75, 0.6))
assert D[1] == pytest.approx((0.66666667, 0.8))

y = np.array([0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2])
y_pred = np.array([0, 0, 0, 0, 1, 1, 2, 1, 2, 2, 0, 1])
D = statistical.precision_and_recall(y, y_pred)
assert D[0] == pytest.approx((0.8, 1))
assert D[1] == pytest.approx((0.75, 0.75))
assert D[2] == pytest.approx((0.666666667, 0.5))


def test_chi2_p_value():

table1 = np.array([[2, 3], [4, 6]])