From a9282daa84f6a05dc789c2765afb579f1de20c01 Mon Sep 17 00:00:00 2001 From: krmayankb Date: Tue, 11 Oct 2022 11:45:37 -0700 Subject: [PATCH 001/258] added basic regression ranking --- cleanlab/regression/__init__.py | 1 + cleanlab/regression/rank.py | 24 ++++++++++++++++++++++++ 2 files changed, 25 insertions(+) create mode 100644 cleanlab/regression/__init__.py create mode 100644 cleanlab/regression/rank.py diff --git a/cleanlab/regression/__init__.py b/cleanlab/regression/__init__.py new file mode 100644 index 0000000000..77e9b5a97b --- /dev/null +++ b/cleanlab/regression/__init__.py @@ -0,0 +1 @@ +from . import rank \ No newline at end of file diff --git a/cleanlab/regression/rank.py b/cleanlab/regression/rank.py new file mode 100644 index 0000000000..2cd91694c3 --- /dev/null +++ b/cleanlab/regression/rank.py @@ -0,0 +1,24 @@ +import numpy as np + +def get_label_quality_score( + true_labels: np.ndarray, + pred_labels: np.ndarray +)-> np.ndarray: + """ + Returns label quality score + + Score is continous value in range [0,1] + + 1 - clean label (given label is likely correct). + 0 - dirty label (given label is likely incorrect). + """ + residual = true_labels - pred_labels + quality_scores = np.exp(-abs(residual)) + return quality_scores + + +if __name__ == "__main__": +## WILL BE DELETED LATER + a = np.array([1,2,3,4]) + b = np.array([2,2,5,4.1]) + print(get_label_quality_score(a,b)) From 05035950d782fee4adde8da6c44afb47953fb5db Mon Sep 17 00:00:00 2001 From: krmayankb Date: Fri, 4 Nov 2022 14:17:40 -0700 Subject: [PATCH 002/258] minor fixes, docstring modified --- cleanlab/regression/__init__.py | 2 +- cleanlab/regression/rank.py | 56 +++++++++++++++++++++++---------- 2 files changed, 40 insertions(+), 18 deletions(-) diff --git a/cleanlab/regression/__init__.py b/cleanlab/regression/__init__.py index 77e9b5a97b..aab0b677cf 100644 --- a/cleanlab/regression/__init__.py +++ b/cleanlab/regression/__init__.py @@ -1 +1 @@ -from . 
import rank \ No newline at end of file +from . import rank diff --git a/cleanlab/regression/rank.py b/cleanlab/regression/rank.py index 2cd91694c3..2b4520b1b7 100644 --- a/cleanlab/regression/rank.py +++ b/cleanlab/regression/rank.py @@ -1,24 +1,46 @@ -import numpy as np +import numpy as np -def get_label_quality_score( - true_labels: np.ndarray, - pred_labels: np.ndarray -)-> np.ndarray: + +def get_label_quality_scores(labels: np.ndarray, pred_labels: np.ndarray) -> np.ndarray: """ - Returns label quality score - - Score is continous value in range [0,1] - + Returns label quality score for each example in the regression dataset. + + Each score is continous value in range [0,1] 1 - clean label (given label is likely correct). 0 - dirty label (given label is likely incorrect). + + Parameters + ---------- + labels: + Raw labels from original dataset. + Array of shape ``(N, )`` consisting given labels, where N is number of datapoints in the regression dataset. + + pred_labels: + Predicated labels from regressor fitted on the dataset. + Array of shape ``(N,)`` consisting predicted labels, where N is number of datapoints in the regression dataset. + + Returns + ------- + label_quality_scores: + Array of shape ``(N, )`` of scores between 0 and 1, one per datapoint in the dataset. + + Lower scores indicate datapoint more likely to contain a label issue. + + Examples + -------- + >>> import numpy as np + >>> from cleanlab.regression.rank import get_label_quality_scores + >>> labels = np.array([1,2,3,4]) + >>> pred_labels = np.array([2,2,5,4.1]) + >>> label_quality_scores = get_label_quality_scores(labels, pred_labels) + >>> label_quality_scores + array([0.36787944, 1. , 0.13533528, 0.90483742]) """ - residual = true_labels - pred_labels - quality_scores = np.exp(-abs(residual)) - return quality_scores + assert ( + labels.shape == pred_labels.shape + ), f"shape of label {labels.shape} and predicted labels {pred_labels.shape} are not same." 
-if __name__ == "__main__": -## WILL BE DELETED LATER - a = np.array([1,2,3,4]) - b = np.array([2,2,5,4.1]) - print(get_label_quality_score(a,b)) + residual = pred_labels - labels + quality_scores = np.exp(-abs(residual)) + return quality_scores From 0a0c41e69df2232a5eb615bd47815896acd77461 Mon Sep 17 00:00:00 2001 From: krmayankb Date: Wed, 9 Nov 2022 17:25:35 -0800 Subject: [PATCH 003/258] tutorial added, added to docs index pages --- cleanlab/__init__.py | 1 + cleanlab/regression/rank.py | 2 + docs/source/cleanlab/regression.rst | 8 + docs/source/tutorials/index.rst | 1 + docs/source/tutorials/regression.ipynb | 338 +++++++++++++++++++++++++ 5 files changed, 350 insertions(+) create mode 100644 docs/source/cleanlab/regression.rst create mode 100644 docs/source/tutorials/regression.ipynb diff --git a/cleanlab/__init__.py b/cleanlab/__init__.py index 991eaecbb9..46b87525d1 100644 --- a/cleanlab/__init__.py +++ b/cleanlab/__init__.py @@ -8,3 +8,4 @@ from . import multiannotator from . import outlier from . import token_classification +from . import regression diff --git a/cleanlab/regression/rank.py b/cleanlab/regression/rank.py index 2b4520b1b7..2fdde78299 100644 --- a/cleanlab/regression/rank.py +++ b/cleanlab/regression/rank.py @@ -1,5 +1,7 @@ import numpy as np +""" generate label quality score for regression dataset""" + def get_label_quality_scores(labels: np.ndarray, pred_labels: np.ndarray) -> np.ndarray: """ diff --git a/docs/source/cleanlab/regression.rst b/docs/source/cleanlab/regression.rst new file mode 100644 index 0000000000..1cae31915a --- /dev/null +++ b/docs/source/cleanlab/regression.rst @@ -0,0 +1,8 @@ +regression +============== + +.. 
automodule:: cleanlab.regression + :autosummary: + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/tutorials/index.rst b/docs/source/tutorials/index.rst index 1fc00488be..a45367135f 100644 --- a/docs/source/tutorials/index.rst +++ b/docs/source/tutorials/index.rst @@ -15,3 +15,4 @@ Tutorials token_classification pred_probs_cross_val faq + regression diff --git a/docs/source/tutorials/regression.ipynb b/docs/source/tutorials/regression.ipynb new file mode 100644 index 0000000000..46f3b5ee38 --- /dev/null +++ b/docs/source/tutorials/regression.ipynb @@ -0,0 +1,338 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Label Quality Scores for Regression with Noisy Labels\n", + "In this tutorial, you will learn how to use cleanlab on regression dataset to: \n", + "- find label issue in your regression dataset\n", + "- generate label quality scores for each example in the dataset. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Install dependencies and import them \n", + "You can use pip to install all packages required for this tutorial as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install cleanlab\n", + "!pip install scikit-learn" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np \n", + "import pandas as pd \n", + "from cleanlab.regression.rank import get_label_quality_scores\n", + "from sklearn.linear_model import LinearRegression\n", + "import matplotlib.pyplot as plt " + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "def make_data(feature_size = (20, 3), \n", + " means = [8, 20, -10], \n", + " stds = [2, 5, 3], \n", + " bias = 0.8,\n", + " coeff = [2, 0.1, 0.5],\n", + " error = [-2, 0, 2], \n", + " prob_error = [0.2, 0.6, 0.2], \n", + " seed = 
42\n", + ") -> pd.DataFrame:\n", + " \"\"\"\n", + " feature_size: Tuple of (datapoints, features)\n", + " \"\"\"\n", + " assert (len(means) == feature_size[1]), (f\"length of mean {len(means)} is not same as features requested{feature_size[0]}\")\n", + " assert (len(stds) == feature_size[1]), (f\"length of stds {len(stds)} is not same as features requested{feature_size[0]}\")\n", + " np.random.seed(seed) \n", + "\n", + " features = []\n", + " for i in range(feature_size[1]):\n", + " values = coeff[i] * np.random.normal(loc=means[i], scale=stds[i], size=feature_size[0])\n", + " features.append(values)\n", + " \n", + " true_labels = sum(map(np.array, features))+ bias\n", + " labels = true_labels + np.random.choice(error, feature_size[0], p=prob_error)\n", + " \n", + " data_dict = {\n", + " \"lables\" : labels, # You have these labels, which have some errors.\n", + " \"true_labels\" : true_labels, # You never get to see these perfect labels.\n", + " } \n", + " for idx, feature in enumerate(features): # adding names to each features \n", + " data_dict[\"feature_\"+str(idx+1)] = feature\n", + " data = pd.DataFrame.from_dict(data_dict)\n", + " col = list(data.columns)\n", + " new_col = col[2:] + col[:2]\n", + " data = data.reindex(columns=new_col)\n", + " return data\n", + "\n", + "def plot_data(data, \n", + " circles, \n", + " title, \n", + " alpha=0.6, \n", + " color = '#1f77b4', \n", + " xlabel = \"feature\", \n", + " colorbar = False):\n", + " \n", + " plt.figure(figsize=(14, 5))\n", + " plt.xlabel(xlabel, size=13)\n", + " plt.ylabel('label',size=13)\n", + " data = data.to_numpy()\n", + " plt.scatter(data[:,0], data[:,1], c = color, s=60)\n", + " for i in circles:\n", + " plt.plot(\n", + " data[i][0],\n", + " data[i][1],\n", + " \"o\",\n", + " markerfacecolor=\"none\",\n", + " markeredgecolor=\"red\",\n", + " markersize=14,\n", + " markeredgewidth=2.5,\n", + " alpha=alpha\n", + " )\n", + " plt.title(title, fontsize=20)\n", + " \n", + " if colorbar: 
plt.colorbar(orientation = 'vertical')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "data = make_data(feature_size=(100,3))\n", + "true_errors = np.where(data['true_labels'] != data['lables'])[0]\n", + "plot_data(data[['feature_1','lables']], circles=true_errors, title=\"Messy Regression dataset\", xlabel=\"feature_1\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The figure above represents a toy dataset we'll use to demostrate label scoring for regression dataset. In this example, lables are ploted w.r.t. one of the features of the dataset. \n", + "\n", + "Like many real-world datasets, the given label happen to be incorrect for some of the examples(**circled in red**) in this dataset. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Using cleanlab to generate label quality scores" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
feature_1feature_2feature_3lablestrue_labels
017.9868571.292315-4.46331915.61585215.615852
115.4469431.789677-4.15882311.87779713.877797
218.5907541.828643-3.37542317.84397417.843974
322.0921191.598861-3.41929723.07168421.071684
415.0633871.919357-7.06650412.71624010.716240
\n", + "
" + ], + "text/plain": [ + " feature_1 feature_2 feature_3 lables true_labels\n", + "0 17.986857 1.292315 -4.463319 15.615852 15.615852\n", + "1 15.446943 1.789677 -4.158823 11.877797 13.877797\n", + "2 18.590754 1.828643 -3.375423 17.843974 17.843974\n", + "3 22.092119 1.598861 -3.419297 23.071684 21.071684\n", + "4 15.063387 1.919357 -7.066504 12.716240 10.716240" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# start with checking the dataset generated\n", + "data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# Note that \"true_labels\" will not be available in real-life dataset. We have added here only for comparision.\n", + "X = data[['feature_'+str(i+1) for i in range(3)]]\n", + "y = data['lables']\n", + "\n", + "# initialize your favourite model and generate predictions \n", + "yourFavouriteModel = LinearRegression()\n", + "yourFavouriteModel = yourFavouriteModel.fit(X,y)\n", + "predictions = yourFavouriteModel.predict(X)\n", + "\n", + "# get label quality score for each example in the dataset \n", + "label_quality = get_label_quality_scores(labels=np.array(data['lables']), pred_labels=predictions)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plot_data(\n", + " data[['feature_1','lables']], \n", + " circles=true_errors ,\n", + " color=label_quality, \n", + " title=\"Messy Regression dataset with label quality scores\", \n", + " colorbar=True, \n", + " xlabel = \"feature_1\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the above plot, we have colored each datapoint considering its label quality score. \\\n", + "Datapoints in the plot are same as earlier plot in the notebook. **Red circle** represents that these datapoint were incorrectly marked when compared to true_label. \n", + "\n", + "Low scores for datapoints marked in **Red circle** and High scores for other datapoints justifies that method can identify the errors in the dataset. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.10.8 ('ENV': venv)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.8" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "1ed33b5e6ac3d9870092cd802185bba6fb7a8302b6022e7097221f18c33cb7b2" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From f4a8d17d1499a17ae2f6de2bca43fb40e1180d0a Mon Sep 17 00:00:00 2001 From: krmayankb Date: Thu, 10 Nov 2022 12:44:24 -0800 Subject: [PATCH 004/258] unit tests added --- cleanlab/regression/rank.py | 5 ++++- tests/test_regression.py | 27 +++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) create mode 100644 tests/test_regression.py diff --git a/cleanlab/regression/rank.py b/cleanlab/regression/rank.py index 2fdde78299..54c38577f1 100644 --- a/cleanlab/regression/rank.py +++ 
b/cleanlab/regression/rank.py @@ -1,6 +1,6 @@ import numpy as np -""" generate label quality score for regression dataset""" +""" generate label quality score for regression dataset """ def get_label_quality_scores(labels: np.ndarray, pred_labels: np.ndarray) -> np.ndarray: @@ -39,6 +39,9 @@ def get_label_quality_scores(labels: np.ndarray, pred_labels: np.ndarray) -> np. array([0.36787944, 1. , 0.13533528, 0.90483742]) """ + if not isinstance(labels, np.ndarray) or not isinstance(pred_labels, np.ndarray): + raise TypeError("labels and pred_labels must be of type np.ndarray") + assert ( labels.shape == pred_labels.shape ), f"shape of label {labels.shape} and predicted labels {pred_labels.shape} are not same." diff --git a/tests/test_regression.py b/tests/test_regression.py new file mode 100644 index 0000000000..71de96cf3c --- /dev/null +++ b/tests/test_regression.py @@ -0,0 +1,27 @@ +import numpy as np +import pandas as pd +import pytest + +from cleanlab.regression.rank import get_label_quality_scores + +# To be used for all the tests +labels = np.array([1, 2, 3, 4]) +pred_labels = np.array([1, 3, 4, 5]) + + +def test_output_shape_type(): + scores = get_label_quality_scores(labels=labels, pred_labels=pred_labels) + assert labels.shape == scores.shape + assert isinstance(scores, np.ndarray) + + +@pytest.mark.parametrize("format", [pd.Series, pd.DataFrame, list]) +def test_type_error_for_input_types(format): + with pytest.raises(TypeError) as error: + _ = get_label_quality_scores(labels=format(labels), pred_labels=format(pred_labels)) + + +def test_assertion_error_for_input_shape(): + with pytest.raises(AssertionError) as error: + _ = get_label_quality_scores(labels=labels[:-1], pred_labels=pred_labels) + _ = get_label_quality_scores(labels=labels, pred_labels=pred_labels[:-1]) From 5aee14192d6dee8f5340adadb50331e55079b140 Mon Sep 17 00:00:00 2001 From: krmayankb Date: Thu, 10 Nov 2022 12:47:49 -0800 Subject: [PATCH 005/258] reindexed tutorial, punctuation fix for 
docstring --- cleanlab/regression/rank.py | 2 +- docs/source/tutorials/index.rst | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/cleanlab/regression/rank.py b/cleanlab/regression/rank.py index 54c38577f1..6f8ad48323 100644 --- a/cleanlab/regression/rank.py +++ b/cleanlab/regression/rank.py @@ -1,6 +1,6 @@ import numpy as np -""" generate label quality score for regression dataset """ +""" Generates label quality scores for every sample in regression dataset """ def get_label_quality_scores(labels: np.ndarray, pred_labels: np.ndarray) -> np.ndarray: diff --git a/docs/source/tutorials/index.rst b/docs/source/tutorials/index.rst index a45367135f..817161c40b 100644 --- a/docs/source/tutorials/index.rst +++ b/docs/source/tutorials/index.rst @@ -13,6 +13,7 @@ Tutorials outliers multiannotator token_classification + regression pred_probs_cross_val faq - regression + From 03fbc18f73017d5c7556a6f38186f62ee4f96999 Mon Sep 17 00:00:00 2001 From: krmayankb Date: Tue, 15 Nov 2022 15:51:04 -0800 Subject: [PATCH 006/258] plots changed in tutorial notebook --- docs/source/tutorials/regression.ipynb | 99 ++++++++++++-------------- 1 file changed, 47 insertions(+), 52 deletions(-) diff --git a/docs/source/tutorials/regression.ipynb b/docs/source/tutorials/regression.ipynb index 46f3b5ee38..92bc78ff13 100644 --- a/docs/source/tutorials/regression.ipynb +++ b/docs/source/tutorials/regression.ipynb @@ -30,7 +30,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -43,15 +43,15 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ - "def make_data(feature_size = (20, 3), \n", - " means = [8, 20, -10], \n", - " stds = [2, 5, 3], \n", + "def make_data(feature_size = (20, 2), \n", + " means = [8, -10], \n", + " stds = [2, 5], \n", " bias = 0.8,\n", - " coeff = [2, 0.1, 0.5],\n", + " coeff = [2, 0.1],\n", " error = [-2, 0, 2], \n", " 
prob_error = [0.2, 0.6, 0.2], \n", " seed = 42\n", @@ -88,12 +88,9 @@ " title, \n", " alpha=0.6, \n", " color = '#1f77b4', \n", - " xlabel = \"feature\", \n", " colorbar = False):\n", " \n", " plt.figure(figsize=(14, 5))\n", - " plt.xlabel(xlabel, size=13)\n", - " plt.ylabel('label',size=13)\n", " data = data.to_numpy()\n", " plt.scatter(data[:,0], data[:,1], c = color, s=60)\n", " for i in circles:\n", @@ -114,14 +111,14 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 10, "metadata": {}, "outputs": [ { "data": { - "image/png": "", + "image/png": "", "text/plain": [ - "
" + "
" ] }, "metadata": {}, @@ -129,16 +126,20 @@ } ], "source": [ - "data = make_data(feature_size=(100,3))\n", + "data = make_data(feature_size=(100, 2))\n", "true_errors = np.where(data['true_labels'] != data['lables'])[0]\n", - "plot_data(data[['feature_1','lables']], circles=true_errors, title=\"Messy Regression dataset\", xlabel=\"feature_1\")" + "plot_data(data[['feature_1','feature_2']], \n", + " circles=true_errors, \n", + " color=data['lables'], \n", + " colorbar=True, \n", + " title=\"Messy Regression dataset\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "The figure above represents a toy dataset we'll use to demostrate label scoring for regression dataset. In this example, lables are ploted w.r.t. one of the features of the dataset. \n", + "The figure above represents a toy dataset we'll use to demonstrate label scoring for regression dataset. In this example, datapoints are ploted on 2-D space (in this case feature_1 vs feature_2). Each datapoint is colored based on given label. \n", "\n", "Like many real-world datasets, the given label happen to be incorrect for some of the examples(**circled in red**) in this dataset. 
" ] @@ -152,7 +153,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -178,7 +179,6 @@ " \n", " feature_1\n", " feature_2\n", - " feature_3\n", " lables\n", " true_labels\n", " \n", @@ -187,57 +187,52 @@ " \n", " 0\n", " 17.986857\n", - " 1.292315\n", - " -4.463319\n", - " 15.615852\n", - " 15.615852\n", + " -1.707685\n", + " 19.079171\n", + " 17.079171\n", " \n", " \n", " 1\n", " 15.446943\n", - " 1.789677\n", - " -4.158823\n", - " 11.877797\n", - " 13.877797\n", + " -1.210323\n", + " 15.036620\n", + " 15.036620\n", " \n", " \n", " 2\n", " 18.590754\n", - " 1.828643\n", - " -3.375423\n", - " 17.843974\n", - " 17.843974\n", + " -1.171357\n", + " 18.219397\n", + " 18.219397\n", " \n", " \n", " 3\n", " 22.092119\n", - " 1.598861\n", - " -3.419297\n", - " 23.071684\n", - " 21.071684\n", + " -1.401139\n", + " 21.490981\n", + " 21.490981\n", " \n", " \n", " 4\n", " 15.063387\n", - " 1.919357\n", - " -7.066504\n", - " 12.716240\n", - " 10.716240\n", + " -1.080643\n", + " 14.782744\n", + " 14.782744\n", " \n", " \n", "\n", "" ], "text/plain": [ - " feature_1 feature_2 feature_3 lables true_labels\n", - "0 17.986857 1.292315 -4.463319 15.615852 15.615852\n", - "1 15.446943 1.789677 -4.158823 11.877797 13.877797\n", - "2 18.590754 1.828643 -3.375423 17.843974 17.843974\n", - "3 22.092119 1.598861 -3.419297 23.071684 21.071684\n", - "4 15.063387 1.919357 -7.066504 12.716240 10.716240" + " feature_1 feature_2 lables true_labels\n", + "0 17.986857 -1.707685 19.079171 17.079171\n", + "1 15.446943 -1.210323 15.036620 15.036620\n", + "2 18.590754 -1.171357 18.219397 18.219397\n", + "3 22.092119 -1.401139 21.490981 21.490981\n", + "4 15.063387 -1.080643 14.782744 14.782744" ] }, - "execution_count": 5, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -249,12 +244,12 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "# 
Note that \"true_labels\" will not be available in real-life dataset. We have added here only for comparision.\n", - "X = data[['feature_'+str(i+1) for i in range(3)]]\n", + "X = data[['feature_'+str(i+1) for i in range(2)]]\n", "y = data['lables']\n", "\n", "# initialize your favourite model and generate predictions \n", @@ -268,12 +263,12 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 17, "metadata": {}, "outputs": [ { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] @@ -284,12 +279,11 @@ ], "source": [ "plot_data(\n", - " data[['feature_1','lables']], \n", + " data[['feature_1','feature_2']], \n", " circles=true_errors ,\n", " color=label_quality, \n", " title=\"Messy Regression dataset with label quality scores\", \n", - " colorbar=True, \n", - " xlabel = \"feature_1\")" + " colorbar=True)" ] }, { @@ -297,7 +291,8 @@ "metadata": {}, "source": [ "In the above plot, we have colored each datapoint considering its label quality score. \\\n", - "Datapoints in the plot are same as earlier plot in the notebook. **Red circle** represents that these datapoint were incorrectly marked when compared to true_label. \n", + "Datapoints in the plot are same as earlier plot in the notebook. \\\n", + "**Red circle** represents that these datapoint were incorrectly marked when compared to true_label. \n", "\n", "Low scores for datapoints marked in **Red circle** and High scores for other datapoints justifies that method can identify the errors in the dataset. " ] From 29d60800bf7f73214b477bb300f6f1ba222089cf Mon Sep 17 00:00:00 2001 From: krmayankb Date: Wed, 7 Dec 2022 22:30:18 -0700 Subject: [PATCH 007/258] typo fix --- docs/source/tutorials/regression.ipynb | 132 ++++++++++++++++++++++--- 1 file changed, 118 insertions(+), 14 deletions(-) diff --git a/docs/source/tutorials/regression.ipynb b/docs/source/tutorials/regression.ipynb index 92bc78ff13..b4770e47d1 100644 --- a/docs/source/tutorials/regression.ipynb +++ b/docs/source/tutorials/regression.ipynb @@ -30,7 +30,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -43,7 +43,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ @@ -72,7 +72,7 @@ " labels = true_labels + np.random.choice(error, feature_size[0], p=prob_error)\n", " \n", " data_dict = {\n", - " \"lables\" : labels, # You have these labels, which have some errors.\n", + " \"labels\" 
: labels, # You have these labels, which have some errors.\n", " \"true_labels\" : true_labels, # You never get to see these perfect labels.\n", " } \n", " for idx, feature in enumerate(features): # adding names to each features \n", @@ -111,7 +111,111 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
exam_1exam_2exam_3bonus_or_penaltytrue_labelslabels
0537793076.256.2
18164801085.565.5
2748897087.467.4
3619478077.757.7
4489091077.877.8
\n", + "
" + ], + "text/plain": [ + " exam_1 exam_2 exam_3 bonus_or_penalty true_labels labels\n", + "0 53 77 93 0 76.2 56.2\n", + "1 81 64 80 10 85.5 65.5\n", + "2 74 88 97 0 87.4 67.4\n", + "3 61 94 78 0 77.7 57.7\n", + "4 48 90 91 0 77.8 77.8" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "path = \"/Users/krmayank/Desktop/Work/cleanlab/experiments/student_score_regression.csv\"\n", + "data = pd.read_csv(path, index_col=0)\n", + "data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -127,10 +231,10 @@ ], "source": [ "data = make_data(feature_size=(100, 2))\n", - "true_errors = np.where(data['true_labels'] != data['lables'])[0]\n", + "true_errors = np.where(data['true_labels'] != data['labels'])[0]\n", "plot_data(data[['feature_1','feature_2']], \n", " circles=true_errors, \n", - " color=data['lables'], \n", + " color=data['labels'], \n", " colorbar=True, \n", " title=\"Messy Regression dataset\")" ] @@ -153,7 +257,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 24, "metadata": {}, "outputs": [ { @@ -179,7 +283,7 @@ " \n", " feature_1\n", " feature_2\n", - " lables\n", + " labels\n", " true_labels\n", " \n", " \n", @@ -224,7 +328,7 @@ "" ], "text/plain": [ - " feature_1 feature_2 lables true_labels\n", + " feature_1 feature_2 labels true_labels\n", "0 17.986857 -1.707685 19.079171 17.079171\n", "1 15.446943 -1.210323 15.036620 15.036620\n", "2 18.590754 -1.171357 18.219397 18.219397\n", @@ -232,7 +336,7 @@ "4 15.063387 -1.080643 14.782744 14.782744" ] }, - "execution_count": 11, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -244,13 +348,13 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "# Note that \"true_labels\" will not be available in real-life dataset. 
We have added here only for comparision.\n", "X = data[['feature_'+str(i+1) for i in range(2)]]\n", - "y = data['lables']\n", + "y = data['labels']\n", "\n", "# initialize your favourite model and generate predictions \n", "yourFavouriteModel = LinearRegression()\n", @@ -258,12 +362,12 @@ "predictions = yourFavouriteModel.predict(X)\n", "\n", "# get label quality score for each example in the dataset \n", - "label_quality = get_label_quality_scores(labels=np.array(data['lables']), pred_labels=predictions)" + "label_quality = get_label_quality_scores(labels=np.array(data['labels']), pred_labels=predictions)" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 26, "metadata": {}, "outputs": [ { From bf7860ea9b0e0b60947bed49743f84067cf68ab6 Mon Sep 17 00:00:00 2001 From: krmayankb Date: Thu, 8 Dec 2022 23:03:45 -0700 Subject: [PATCH 008/258] cleanlab outlier based scoring method added --- cleanlab/regression/rank.py | 116 ++++++++++++++++++++++++++++++++++-- 1 file changed, 111 insertions(+), 5 deletions(-) diff --git a/cleanlab/regression/rank.py b/cleanlab/regression/rank.py index 6f8ad48323..dcd9460c64 100644 --- a/cleanlab/regression/rank.py +++ b/cleanlab/regression/rank.py @@ -1,9 +1,16 @@ import numpy as np +from cleanlab.outlier import OutOfDistribution +from sklearn.neighbors import NearestNeighbors """ Generates label quality scores for every sample in regression dataset """ -def get_label_quality_scores(labels: np.ndarray, pred_labels: np.ndarray) -> np.ndarray: +def get_label_quality_scores( + labels: np.ndarray, + pred_labels: np.ndarray, + *, + method: str = "residual", +) -> np.ndarray: """ Returns label quality score for each example in the regression dataset. @@ -13,14 +20,16 @@ def get_label_quality_scores(labels: np.ndarray, pred_labels: np.ndarray) -> np. Parameters ---------- - labels: + labels : np.ndarray Raw labels from original dataset. 
Array of shape ``(N, )`` consisting given labels, where N is number of datapoints in the regression dataset. - pred_labels: + pred_labels : np.ndarray Predicated labels from regressor fitted on the dataset. Array of shape ``(N,)`` consisting predicted labels, where N is number of datapoints in the regression dataset. + method : {"residual", "TO_BE_NAMED"}, default="residual" #TODO - update name once finalised + Returns ------- label_quality_scores: @@ -39,6 +48,7 @@ def get_label_quality_scores(labels: np.ndarray, pred_labels: np.ndarray) -> np. array([0.36787944, 1. , 0.13533528, 0.90483742]) """ + # TODO - add error trigger function in utils. if not isinstance(labels, np.ndarray) or not isinstance(pred_labels, np.ndarray): raise TypeError("labels and pred_labels must be of type np.ndarray") @@ -46,6 +56,102 @@ def get_label_quality_scores(labels: np.ndarray, pred_labels: np.ndarray) -> np. labels.shape == pred_labels.shape ), f"shape of label {labels.shape} and predicted labels {pred_labels.shape} are not same." + scoring_funcs = { + "residual": get_residual_score_for_each_label, + "TO_BE_NAMED": get_score_to_named_for_each_label, # TODO - update name once finalised + } + + # TODO - update name once finalised + try: + scoring_func = scoring_funcs[method] + except KeyError: + raise ValueError( + f""" + {method} is not a valid scoring method. + Please choose a valid scoring technique: residual, TO_BE_NAMED. + """ + ) + + # Calculate scores + label_quality_score = scoring_func(labels, pred_labels) + return label_quality_score + + +def get_residual_score_for_each_label( + labels: np.ndarray, + pred_labels: np.ndarray, +) -> np.ndarray: + """Returns the residual based label-quality scores for each datapoints. + + This is function to compute label-quality scores for regression datasets, + where lower score indicate labels less likely to be correct. + + Residual based scores can work better for datasets where independent variables + are based out of normal distribution. 
+ + Parameters + ---------- + labels: np.ndarray + Labels in the same format expected by the :py:func:`get_label_quality_scores ` function. + + pred_labels: np.ndarray + Predicted labels in the same format expected by the :py:func:`get_label_quality_scores ` function. + + Returns + ------- + label_quality_scores: np.ndarray + Contains one score (between 0 and 1) per example. + Lower scores indicate more likely mislabled examples. + + """ residual = pred_labels - labels - quality_scores = np.exp(-abs(residual)) - return quality_scores + label_quality_scores = np.exp(-abs(residual)) + return label_quality_scores + + +# TODO - change name of the function +def get_score_to_named_for_each_label( + label: np.ndarray, + pred_labels: np.ndarray, + *, + variance: float = 10, +) -> np.ndarray: + """Returns label-quality scores. + + This is function to compute label-quality scores for regression datasets, + where lower score indicate labels less likely to be correct. + + Parameters + ---------- + labels: np.ndarray + Labels in the same format expected by the :py:func:`get_label_quality_scores ` function. + + pred_labels: np.ndarray + Predicted labels in the same format expected by the :py:func:`get_label_quality_scores ` function. + + variance: float, default = 10 + Manipulates variance of the distribution of residual. + + Returns + ------- + label_quality_scores: np.ndarray + Contains one score (between 0 and 1) per example. + Lower scores indicate more likely mislabled examples. 
+ """ + + neighbors = int(np.ceil(0.1 * label.shape[0])) + print(neighbors) + knn = NearestNeighbors(n_neighbors=neighbors, metric="euclidean") + + residual = pred_labels - label + + label = (label - label.mean()) / label.std() + residual = np.sqrt(variance) * ((residual - residual.mean()) / residual.std()) + + # 2D features by combining labels and residual + features = np.array([label, residual]).T + + knn.fit(features) + ood = OutOfDistribution(params={"knn": knn}) + label_quality_scores = ood.score(features=features) + return label_quality_scores From 9bf8a5f5906d800f3c14b886fcd870779ec9c768 Mon Sep 17 00:00:00 2001 From: krmayankb Date: Fri, 9 Dec 2022 11:25:30 -0700 Subject: [PATCH 009/258] regression_utils created --- cleanlab/internal/regression_utils.py | 28 +++++++++++++++++++++++++++ cleanlab/regression/rank.py | 10 +++------- 2 files changed, 31 insertions(+), 7 deletions(-) create mode 100644 cleanlab/internal/regression_utils.py diff --git a/cleanlab/internal/regression_utils.py b/cleanlab/internal/regression_utils.py new file mode 100644 index 0000000000..57c99baedb --- /dev/null +++ b/cleanlab/internal/regression_utils.py @@ -0,0 +1,28 @@ +""" +Helper function internally used in cleanlab.regression +""" + +import numpy as np + + +def assert_valid_inputs( + labels: np.ndarray, + pred_labels: np.ndarray, + method: str, +) -> None: + """Checks that ``labels``, ``pred_labels``, ``method`` are correctly formatted.""" + + # Check if labels and pred_labels are np.ndarray + if not isinstance(labels, np.ndarray) or not isinstance(pred_labels, np.ndarray): + raise TypeError("labels and pred_labels must be of type np.ndarray") + + # Check if labels and pred_labels are of same shape + assert ( + labels.shape == pred_labels.shape + ), f"shape of label {labels.shape} and predicted labels {pred_labels.shape} are not same." + + # Check if method passed is string + if not isinstance(method, str): + raise TypeError( + f"Passed method is not of correct type. 
Expected string, got {type(method)}" + ) diff --git a/cleanlab/regression/rank.py b/cleanlab/regression/rank.py index dcd9460c64..8d53af9adf 100644 --- a/cleanlab/regression/rank.py +++ b/cleanlab/regression/rank.py @@ -1,6 +1,7 @@ import numpy as np from cleanlab.outlier import OutOfDistribution from sklearn.neighbors import NearestNeighbors +from cleanlab.internal.regression_utils import assert_valid_inputs """ Generates label quality scores for every sample in regression dataset """ @@ -48,13 +49,8 @@ def get_label_quality_scores( array([0.36787944, 1. , 0.13533528, 0.90483742]) """ - # TODO - add error trigger function in utils. - if not isinstance(labels, np.ndarray) or not isinstance(pred_labels, np.ndarray): - raise TypeError("labels and pred_labels must be of type np.ndarray") - - assert ( - labels.shape == pred_labels.shape - ), f"shape of label {labels.shape} and predicted labels {pred_labels.shape} are not same." + # Check if inputs are valid + assert_valid_inputs(labels=labels, pred_labels=pred_labels, method=method) scoring_funcs = { "residual": get_residual_score_for_each_label, From c399ffca26e7b094fdfa71974f81317fb73fedbb Mon Sep 17 00:00:00 2001 From: krmayankb Date: Mon, 12 Dec 2022 09:40:36 -0800 Subject: [PATCH 010/258] pred_labels changed to predictions --- cleanlab/internal/regression_utils.py | 8 ++++---- cleanlab/regression/rank.py | 25 ++++++++++++------------- 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/cleanlab/internal/regression_utils.py b/cleanlab/internal/regression_utils.py index 57c99baedb..04576c4012 100644 --- a/cleanlab/internal/regression_utils.py +++ b/cleanlab/internal/regression_utils.py @@ -7,19 +7,19 @@ def assert_valid_inputs( labels: np.ndarray, - pred_labels: np.ndarray, + predictions: np.ndarray, method: str, ) -> None: """Checks that ``labels``, ``pred_labels``, ``method`` are correctly formatted.""" # Check if labels and pred_labels are np.ndarray - if not isinstance(labels, np.ndarray) or not 
isinstance(pred_labels, np.ndarray): + if not isinstance(labels, np.ndarray) or not isinstance(predictions, np.ndarray): raise TypeError("labels and pred_labels must be of type np.ndarray") # Check if labels and pred_labels are of same shape assert ( - labels.shape == pred_labels.shape - ), f"shape of label {labels.shape} and predicted labels {pred_labels.shape} are not same." + labels.shape == predictions.shape + ), f"shape of label {labels.shape} and predicted labels {predictions.shape} are not same." # Check if method passed is string if not isinstance(method, str): diff --git a/cleanlab/regression/rank.py b/cleanlab/regression/rank.py index 8d53af9adf..3784d290b0 100644 --- a/cleanlab/regression/rank.py +++ b/cleanlab/regression/rank.py @@ -8,7 +8,7 @@ def get_label_quality_scores( labels: np.ndarray, - pred_labels: np.ndarray, + predictions: np.ndarray, *, method: str = "residual", ) -> np.ndarray: @@ -25,7 +25,7 @@ def get_label_quality_scores( Raw labels from original dataset. Array of shape ``(N, )`` consisting given labels, where N is number of datapoints in the regression dataset. - pred_labels : np.ndarray + predictions : np.ndarray Predicated labels from regressor fitted on the dataset. Array of shape ``(N,)`` consisting predicted labels, where N is number of datapoints in the regression dataset. @@ -43,14 +43,14 @@ def get_label_quality_scores( >>> import numpy as np >>> from cleanlab.regression.rank import get_label_quality_scores >>> labels = np.array([1,2,3,4]) - >>> pred_labels = np.array([2,2,5,4.1]) - >>> label_quality_scores = get_label_quality_scores(labels, pred_labels) + >>> predictions = np.array([2,2,5,4.1]) + >>> label_quality_scores = get_label_quality_scores(labels, predictions) >>> label_quality_scores array([0.36787944, 1. 
, 0.13533528, 0.90483742]) """ # Check if inputs are valid - assert_valid_inputs(labels=labels, pred_labels=pred_labels, method=method) + assert_valid_inputs(labels=labels, predictions=predictions, method=method) scoring_funcs = { "residual": get_residual_score_for_each_label, @@ -69,13 +69,13 @@ def get_label_quality_scores( ) # Calculate scores - label_quality_score = scoring_func(labels, pred_labels) + label_quality_score = scoring_func(labels, predictions) return label_quality_score def get_residual_score_for_each_label( labels: np.ndarray, - pred_labels: np.ndarray, + predictions: np.ndarray, ) -> np.ndarray: """Returns the residual based label-quality scores for each datapoints. @@ -90,7 +90,7 @@ def get_residual_score_for_each_label( labels: np.ndarray Labels in the same format expected by the :py:func:`get_label_quality_scores ` function. - pred_labels: np.ndarray + predictions: np.ndarray Predicted labels in the same format expected by the :py:func:`get_label_quality_scores ` function. Returns @@ -100,7 +100,7 @@ def get_residual_score_for_each_label( Lower scores indicate more likely mislabled examples. """ - residual = pred_labels - labels + residual = predictions - labels label_quality_scores = np.exp(-abs(residual)) return label_quality_scores @@ -108,7 +108,7 @@ def get_residual_score_for_each_label( # TODO - change name of the function def get_score_to_named_for_each_label( label: np.ndarray, - pred_labels: np.ndarray, + predictions: np.ndarray, *, variance: float = 10, ) -> np.ndarray: @@ -122,7 +122,7 @@ def get_score_to_named_for_each_label( labels: np.ndarray Labels in the same format expected by the :py:func:`get_label_quality_scores ` function. - pred_labels: np.ndarray + predictions: np.ndarray Predicted labels in the same format expected by the :py:func:`get_label_quality_scores ` function. 
variance: float, default = 10 @@ -136,10 +136,9 @@ def get_score_to_named_for_each_label( """ neighbors = int(np.ceil(0.1 * label.shape[0])) - print(neighbors) knn = NearestNeighbors(n_neighbors=neighbors, metric="euclidean") - residual = pred_labels - label + residual = predictions - label label = (label - label.mean()) / label.std() residual = np.sqrt(variance) * ((residual - residual.mean()) / residual.std()) From 25195500126306fc5569b0ae4a6e98e940d30d0f Mon Sep 17 00:00:00 2001 From: krmayankb Date: Thu, 22 Dec 2022 00:23:12 -0800 Subject: [PATCH 011/258] unit tests for new scoring method --- cleanlab/regression/rank.py | 19 ++++++++-------- tests/test_regression.py | 43 +++++++++++++++++++++++++++++++------ 2 files changed, 46 insertions(+), 16 deletions(-) diff --git a/cleanlab/regression/rank.py b/cleanlab/regression/rank.py index 3784d290b0..f53a0fb47e 100644 --- a/cleanlab/regression/rank.py +++ b/cleanlab/regression/rank.py @@ -10,7 +10,7 @@ def get_label_quality_scores( labels: np.ndarray, predictions: np.ndarray, *, - method: str = "residual", + method: str = "TO_BE_NAMED", # TODO update name once finalised ) -> np.ndarray: """ Returns label quality score for each example in the regression dataset. @@ -64,13 +64,13 @@ def get_label_quality_scores( raise ValueError( f""" {method} is not a valid scoring method. - Please choose a valid scoring technique: residual, TO_BE_NAMED. + Please choose a valid scoring technique: {scoring_funcs.keys()}. 
""" ) # Calculate scores - label_quality_score = scoring_func(labels, predictions) - return label_quality_score + label_quality_scores = scoring_func(labels, predictions) + return label_quality_scores def get_residual_score_for_each_label( @@ -106,8 +106,9 @@ def get_residual_score_for_each_label( # TODO - change name of the function +# TODO - change name of function in test def get_score_to_named_for_each_label( - label: np.ndarray, + labels: np.ndarray, predictions: np.ndarray, *, variance: float = 10, @@ -135,16 +136,16 @@ def get_score_to_named_for_each_label( Lower scores indicate more likely mislabled examples. """ - neighbors = int(np.ceil(0.1 * label.shape[0])) + neighbors = int(np.ceil(0.1 * labels.shape[0])) knn = NearestNeighbors(n_neighbors=neighbors, metric="euclidean") - residual = predictions - label + residual = predictions - labels - label = (label - label.mean()) / label.std() + labels = (labels - labels.mean()) / labels.std() residual = np.sqrt(variance) * ((residual - residual.mean()) / residual.std()) # 2D features by combining labels and residual - features = np.array([label, residual]).T + features = np.array([labels, residual]).T knn.fit(features) ood = OutOfDistribution(params={"knn": knn}) diff --git a/tests/test_regression.py b/tests/test_regression.py index 71de96cf3c..8a8154047c 100644 --- a/tests/test_regression.py +++ b/tests/test_regression.py @@ -2,26 +2,55 @@ import pandas as pd import pytest -from cleanlab.regression.rank import get_label_quality_scores +from cleanlab.regression import rank # To be used for all the tests labels = np.array([1, 2, 3, 4]) -pred_labels = np.array([1, 3, 4, 5]) - +predictions = np.array([1, 3, 4, 5]) +# test with deafault parameters def test_output_shape_type(): - scores = get_label_quality_scores(labels=labels, pred_labels=pred_labels) + scores = rank.get_label_quality_scores(labels=labels, predictions=predictions) assert labels.shape == scores.shape assert isinstance(scores, np.ndarray) +# test for 
acceptable datatypes @pytest.mark.parametrize("format", [pd.Series, pd.DataFrame, list]) def test_type_error_for_input_types(format): with pytest.raises(TypeError) as error: - _ = get_label_quality_scores(labels=format(labels), pred_labels=format(pred_labels)) + _ = rank.get_label_quality_scores(labels=format(labels), predictions=format(predictions)) +# test for input shapes def test_assertion_error_for_input_shape(): with pytest.raises(AssertionError) as error: - _ = get_label_quality_scores(labels=labels[:-1], pred_labels=pred_labels) - _ = get_label_quality_scores(labels=labels, pred_labels=pred_labels[:-1]) + _ = rank.get_label_quality_scores(labels=labels[:-1], predictions=predictions) + _ = rank.get_label_quality_scores(labels=labels, predictions=predictions[:-1]) + + +# TODO - change name once finalised +# test individual scoring functions +@pytest.mark.parametrize( + "scoring_funcs", + [rank.get_residual_score_for_each_label, rank.get_score_to_named_for_each_label], +) +def test_individual_scoring_functions(scoring_funcs): + scores = scoring_funcs(labels=labels, predictions=predictions) + assert labels.shape == scores.shape + assert isinstance(scores, np.ndarray) + + +# TODO - change name once finalised +# test for method argument +@pytest.mark.parametrize( + "method", + [ + "residual", + "TO_BE_NAMED", + ], +) +def test_method_pass_get_label_quality_scores(method): + scores = rank.get_label_quality_scores(labels=labels, predictions=predictions, method=method) + assert labels.shape == scores.shape + assert isinstance(scores, np.ndarray) From 9d002535bcff4bc6bfddf25cf1480faa428ef56f Mon Sep 17 00:00:00 2001 From: krmayankb Date: Fri, 23 Dec 2022 12:46:08 -0800 Subject: [PATCH 012/258] init merge conflict resolved --- cleanlab/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cleanlab/__init__.py b/cleanlab/__init__.py index 46b87525d1..663977fc04 100644 --- a/cleanlab/__init__.py +++ b/cleanlab/__init__.py @@ -8,4 +8,5 @@ from . 
import multiannotator from . import outlier from . import token_classification + from . import regression From 1a9409f4f634f385881191916f3bbc3516d3979a Mon Sep 17 00:00:00 2001 From: krmayankb Date: Thu, 22 Dec 2022 01:15:43 -0800 Subject: [PATCH 013/258] tutorial draft1 --- docs/source/tutorials/regression.ipynb | 476 ++++++++++++++----------- 1 file changed, 268 insertions(+), 208 deletions(-) diff --git a/docs/source/tutorials/regression.ipynb b/docs/source/tutorials/regression.ipynb index b4770e47d1..682a81ffb2 100644 --- a/docs/source/tutorials/regression.ipynb +++ b/docs/source/tutorials/regression.ipynb @@ -1,117 +1,215 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "# Label Quality Scores for Regression with Noisy Labels\n", - "In this tutorial, you will learn how to use cleanlab on regression dataset to: \n", - "- find label issue in your regression dataset\n", - "- generate label quality scores for each example in the dataset. " + "# Label Quality Scores for Regression with Noisy Labels " ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "## Install dependencies and import them \n", - "You can use pip to install all packages required for this tutorial as follows:" + "This quickstart tutorial shows how to use cleanlab for finding label errors in regression data. Using the approach mentioned here, you can find label error in any regression dataset irrespective of modality i.e., tabular, text, image etc. \n", + "\n", + "**This example will take you through following:**\n", + "- Generate label quality scores for each datapoint in the dataset. \n", + "- Find label issue for regression dataset. 
" ] }, { - "cell_type": "code", - "execution_count": null, + "attachments": {}, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "!pip install cleanlab\n", - "!pip install scikit-learn" + "Quickstart \n", + "\n", + "Cleanlab uses two inputs to generates scores for labels in the dataset:\n", + "- `labels`: NumPy array of given labels in the dataset. labels[i] should contain label for `i`-th datapoint. \n", + "- `predictions`: NumPy array of predictions generated through your favourite regressor. predictions[i] should contain predicted value for `i`-th datapoint. \n", + "\n", + "If you already have predictions from your regressor, you can generate label quality scores for each datapoint using the code below: \n", + "\n", + "
\n", + "\n", + "```python \n", + "\n", + "from cleanlab.regression.rank import get_label_quality_scores\n", + "label_quality_scores = get_label_quality_scores(labels, predictions)\n", + "\n", + "```\n", + "
\n", + "" ] }, { - "cell_type": "code", - "execution_count": 1, + "attachments": {}, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "import numpy as np \n", - "import pandas as pd \n", - "from cleanlab.regression.rank import get_label_quality_scores\n", - "from sklearn.linear_model import LinearRegression\n", - "import matplotlib.pyplot as plt " + "# 0. Visualization (can skip these details)" ] }, { - "cell_type": "code", - "execution_count": 21, + "attachments": {}, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "def make_data(feature_size = (20, 2), \n", - " means = [8, -10], \n", - " stds = [2, 5], \n", - " bias = 0.8,\n", - " coeff = [2, 0.1],\n", - " error = [-2, 0, 2], \n", - " prob_error = [0.2, 0.6, 0.2], \n", - " seed = 42\n", - ") -> pd.DataFrame:\n", - " \"\"\"\n", - " feature_size: Tuple of (datapoints, features)\n", - " \"\"\"\n", - " assert (len(means) == feature_size[1]), (f\"length of mean {len(means)} is not same as features requested{feature_size[0]}\")\n", - " assert (len(stds) == feature_size[1]), (f\"length of stds {len(stds)} is not same as features requested{feature_size[0]}\")\n", - " np.random.seed(seed) \n", + "This is added just for reference. We will use this function to plot dataset, highlight points using label quality scores and true_errors.\n", + "You can skip this part and move to next section. " + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
See the code for visualization **(click to expand)** \n", "\n", - " features = []\n", - " for i in range(feature_size[1]):\n", - " values = coeff[i] * np.random.normal(loc=means[i], scale=stds[i], size=feature_size[0])\n", - " features.append(values)\n", - " \n", - " true_labels = sum(map(np.array, features))+ bias\n", - " labels = true_labels + np.random.choice(error, feature_size[0], p=prob_error)\n", - " \n", - " data_dict = {\n", - " \"labels\" : labels, # You have these labels, which have some errors.\n", - " \"true_labels\" : true_labels, # You never get to see these perfect labels.\n", - " } \n", - " for idx, feature in enumerate(features): # adding names to each features \n", - " data_dict[\"feature_\"+str(idx+1)] = feature\n", - " data = pd.DataFrame.from_dict(data_dict)\n", - " col = list(data.columns)\n", - " new_col = col[2:] + col[:2]\n", - " data = data.reindex(columns=new_col)\n", - " return data\n", + "```python \n", + "# Note: this pulldown is for docs.cleanlab.ai, if running on local Jupyter or colab, please ignore it. 
\n", "\n", - "def plot_data(data, \n", - " circles, \n", - " title, \n", - " alpha=0.6, \n", - " color = '#1f77b4', \n", - " colorbar = False):\n", - " \n", + "def plot_data(\n", + " data_x, data_y, circles, title, alpha=0.6, color=\"#1f77b4\", colorbar=False, xlabel=\"\", ylabel=\"\"\n", + "):\n", " plt.figure(figsize=(14, 5))\n", - " data = data.to_numpy()\n", - " plt.scatter(data[:,0], data[:,1], c = color, s=60)\n", + " data_x = data_x.to_numpy()\n", + " data_y = data_y.to_numpy()\n", + " plt.scatter(data_x, data_y, c=color, s=30)\n", " for i in circles:\n", " plt.plot(\n", - " data[i][0],\n", - " data[i][1],\n", + " data_x[i],\n", + " data_y[i],\n", " \"o\",\n", " markerfacecolor=\"none\",\n", " markeredgecolor=\"red\",\n", - " markersize=14,\n", + " markersize=10,\n", " markeredgewidth=2.5,\n", - " alpha=alpha\n", + " alpha=alpha,\n", " )\n", " plt.title(title, fontsize=20)\n", - " \n", - " if colorbar: plt.colorbar(orientation = 'vertical')\n" + " plt.xlabel(xlabel)\n", + " plt.ylabel(ylabel)\n", + "\n", + " if colorbar:\n", + " plt.colorbar(orientation=\"vertical\")\n", + "\n", + "```\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def plot_data(\n", + " data_x, data_y, circles, title, alpha=0.6, color=\"#1f77b4\", colorbar=False, xlabel=\"\", ylabel=\"\"\n", + "):\n", + " plt.figure(figsize=(14, 5))\n", + " data_x = data_x.to_numpy()\n", + " data_y = data_y.to_numpy()\n", + " plt.scatter(data_x, data_y, c=color, s=30)\n", + " for i in circles:\n", + " plt.plot(\n", + " data_x[i],\n", + " data_y[i],\n", + " \"o\",\n", + " markerfacecolor=\"none\",\n", + " markeredgecolor=\"red\",\n", + " markersize=10,\n", + " markeredgewidth=2.5,\n", + " alpha=alpha,\n", + " )\n", + " plt.title(title, fontsize=20)\n", + " plt.xlabel(xlabel)\n", + " plt.ylabel(ylabel)\n", + "\n", + " if colorbar:\n", + " plt.colorbar(orientation=\"vertical\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Install dependencies and import them \n", + "You can use `pip` to install all packages required for this tutorial as follows:\n", + "\n", + "`!pip install cleanlab xgboost`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install cleanlab xgboost" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "metadata": { + "nbsphinx": "hidden" + }, + "outputs": [], + "source": [ + "# Package installation (hidden on docs website).\n", + "# Package versions we used: xgboost==1.7.2\n", + "\n", + "dependencies = [\"cleanlab\", \"xgboost\"]\n", + "\n", + "if \"google.colab\" in str(get_ipython()): # Check if it's running in Google Colab\n", + " %pip install cleanlab # for colab\n", + " cmd = \" \".join([dep for dep in dependencies if dep != \"cleanlab\"])\n", + " %pip install $cmd\n", + "else:\n", + " missing_dependencies = []\n", + " for dependency in dependencies:\n", + " try:\n", + " __import__(dependency)\n", + " except ImportError:\n", + " missing_dependencies.append(dependency)\n", 
+ "\n", + " if len(missing_dependencies) > 0:\n", + " print(\"Missing required dependencies:\")\n", + " print(*missing_dependencies, sep=\", \")\n", + " print(\"\\nPlease install them before running the rest of this notebook.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from cleanlab.regression.rank import get_label_quality_scores\n", + "import xgboost as xgb\n", + "import matplotlib.pyplot as plt\n", + "\n", + "np.set_printoptions(suppress=True)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 2. Import dataset and Generate predictions" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 107, "metadata": {}, "outputs": [ { @@ -138,7 +236,7 @@ " exam_1\n", " exam_2\n", " exam_3\n", - " bonus_or_penalty\n", + " notes\n", " true_labels\n", " labels\n", " \n", @@ -149,43 +247,43 @@ " 53\n", " 77\n", " 93\n", - " 0\n", + " NaN\n", + " 76.2\n", " 76.2\n", - " 56.2\n", " \n", " \n", " 1\n", " 81\n", " 64\n", " 80\n", - " 10\n", + " great participation +10\n", + " 85.5\n", " 85.5\n", - " 65.5\n", " \n", " \n", " 2\n", " 74\n", " 88\n", " 97\n", - " 0\n", + " NaN\n", + " 87.4\n", " 87.4\n", - " 67.4\n", " \n", " \n", " 3\n", " 61\n", " 94\n", " 78\n", - " 0\n", + " NaN\n", + " 77.7\n", " 77.7\n", - " 57.7\n", " \n", " \n", " 4\n", " 48\n", " 90\n", " 91\n", - " 0\n", + " NaN\n", " 77.8\n", " 77.8\n", " \n", @@ -194,15 +292,15 @@ "" ], "text/plain": [ - " exam_1 exam_2 exam_3 bonus_or_penalty true_labels labels\n", - "0 53 77 93 0 76.2 56.2\n", - "1 81 64 80 10 85.5 65.5\n", - "2 74 88 97 0 87.4 67.4\n", - "3 61 94 78 0 77.7 57.7\n", - "4 48 90 91 0 77.8 77.8" + " exam_1 exam_2 exam_3 notes true_labels labels\n", + "0 53 77 93 NaN 76.2 76.2\n", + "1 81 64 80 great participation +10 85.5 85.5\n", + "2 74 88 97 NaN 87.4 87.4\n", + "3 61 94 78 NaN 77.7 77.7\n", + "4 48 90 91 NaN 77.8 
77.8" ] }, - "execution_count": 22, + "execution_count": 107, "metadata": {}, "output_type": "execute_result" } @@ -215,14 +313,14 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 108, "metadata": {}, "outputs": [ { "data": { - "image/png": "", + "image/png": "", "text/plain": [ - "
" + "
" ] }, "metadata": {}, @@ -230,149 +328,78 @@ } ], "source": [ - "data = make_data(feature_size=(100, 2))\n", - "true_errors = np.where(data['true_labels'] != data['labels'])[0]\n", - "plot_data(data[['feature_1','feature_2']], \n", - " circles=true_errors, \n", - " color=data['labels'], \n", - " colorbar=True, \n", - " title=\"Messy Regression dataset\")" + "# Generate true errors\n", + "true_errors = np.where(data.labels != data.true_labels)[0]\n", + "plot_data(\n", + " data_x=data[\"exam_3\"], \n", + " data_y=data[\"labels\"],\n", + " circles=true_errors,\n", + " title=\"Messy Regression dataset\",\n", + " xlabel=\"exam_3 feature\",\n", + " ylabel=\"label (Y value)\",\n", + ")" ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "The figure above represents a toy dataset we'll use to demonstrate label scoring for regression dataset. In this example, datapoints are ploted on 2-D space (in this case feature_1 vs feature_2). Each datapoint is colored based on given label. \n", + "In the dataframe displayed above, `labels` represents the noisy labels and `true_labels` represents the ground truth. Please note that, ground truth are usually not available in real dataset, we have added it here for comparision and to demonstrate our method. `notes` also has text information, we will model this a categorical variable. \n", "\n", - "Like many real-world datasets, the given label happen to be incorrect for some of the examples(**circled in red**) in this dataset. " + "We will use `xgboost` as regressor for this tutorial. xgboost provides easy to use interface to process categorical variable. 
This is demonstrated in the code below:" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 109, "metadata": {}, + "outputs": [], "source": [ - "## Using cleanlab to generate label quality scores" + "# XGBOOST automatically factors categorical variable, you just need to mark the columns as category\n", + "data.notes = data.notes.astype(\"category\")\n", + "\n", + "# XGBOOST takes data and label seperately, so you will need to divide data accordingly.\n", + "X = data.drop([\"labels\", \"true_labels\"], axis=1)\n", + "y = data[\"labels\"]\n", + "\n", + "# convert data to format \"DMatrix\" to make it compatible with XGBOOST.\n", + "xgboost_data = xgb.DMatrix(data=X, label=y, enable_categorical=True)\n", + "\n", + "# declare parameters and train the model.\n", + "params = {\"booster\": \"gblinear\", \"objective\": \"reg:squarederror\"}\n", + "boost = xgb.train(params=params, dtrain=xgboost_data, num_boost_round=50)" ] }, { - "cell_type": "code", - "execution_count": 24, + "attachments": {}, + "cell_type": "markdown", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
feature_1feature_2labelstrue_labels
017.986857-1.70768519.07917117.079171
115.446943-1.21032315.03662015.036620
218.590754-1.17135718.21939718.219397
322.092119-1.40113921.49098121.490981
415.063387-1.08064314.78274414.782744
\n", - "
" - ], - "text/plain": [ - " feature_1 feature_2 labels true_labels\n", - "0 17.986857 -1.707685 19.079171 17.079171\n", - "1 15.446943 -1.210323 15.036620 15.036620\n", - "2 18.590754 -1.171357 18.219397 18.219397\n", - "3 22.092119 -1.401139 21.490981 21.490981\n", - "4 15.063387 -1.080643 14.782744 14.782744" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ - "# start with checking the dataset generated\n", - "data.head()" + "## 3. Using cleanlab to generate label quality scores" ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 110, "metadata": {}, "outputs": [], "source": [ - "# Note that \"true_labels\" will not be available in real-life dataset. We have added here only for comparision.\n", - "X = data[['feature_'+str(i+1) for i in range(2)]]\n", - "y = data['labels']\n", - "\n", - "# initialize your favourite model and generate predictions \n", - "yourFavouriteModel = LinearRegression()\n", - "yourFavouriteModel = yourFavouriteModel.fit(X,y)\n", - "predictions = yourFavouriteModel.predict(X)\n", + "# using trained xgboost model to get predictions\n", + "predictions = boost.predict(xgboost_data)\n", "\n", - "# get label quality score for each example in the dataset \n", - "label_quality = get_label_quality_scores(labels=np.array(data['labels']), pred_labels=predictions)" + "# get label quality score for each example in the dataset using cleanlab\n", + "label_quality_scores = get_label_quality_scores(labels=np.array(y), predictions=predictions)" ] }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 111, "metadata": {}, "outputs": [ { "data": { - "image/png": "", + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAABBkAAAHeCAYAAADeqtNZAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjYuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8o6BhiAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOydd3wU1dqAn9mS3nsCgdB7R5AOgjRFUUBBRWxY0Wv7vOq9tqv3WvCqV7CigBWxgQIKSpfepUpNIJBKetsku3O+PyY72U12NwkJ/Ty/3xJ2TpkzM2dm533PWxQhhEAikUgkEolEIpFIJBKJpJ4YzvcAJBKJRCKRSCQSiUQikVwaSCWDRCKRSCQSiUQikUgkkgZBKhkkEolEIpFIJBKJRCKRNAhSySCRSCQSiUQikUgkEomkQZBKBolEIpFIJBKJRCKRSCQNglQySCQSiUQikUgkEolEImkQpJJBIpFIJBKJRCKRSCQSSYMglQwSiUQikUgkEolEIpFIGgSpZJBIJBKJRCKRSCQSiUTSIEglg0QikZxn7rjjDhRFISEh4XwP5Zzx4osvoigKiqKc76FIzjH26/7iiy/Wq59zOYcSEhJQFIU77rjjrPS/evVq/VhWr159VvZRW+S9WZ2antFne35IJBLJxYZUMkgaHMeXJUVRCAwMpLi4uMZ2JSUlBAcHO7U93y9bFyOO58/x4+XlRXR0NIMGDeLf//43GRkZ53uoEolEIpFIJBKJ5BJDKhkkZ53CwkIWLlxYY72ffvqJ/Pz8sz+gy5Ty8nIyMjJYu3Yt//znP2nXrh2//fbb+R6WRHLWuJxWFy+V1ecLaUVfImkIkpKS9Dk9d+7c8z0ciUQiOSeYzvcAJJc2Pj4+WCwWvvjiC2655RaPdb/44gunNpL60bNnT+bMmaN/Lygo4MiRI7z//vts2rSJ7OxsbrzxRvbs2UOzZs3O40glc+fOlS+fkssGIcT5HoJE0qAkJSWd7yFIJBLJBYW0ZJCcVa677joAfv/9d9LS0tzWy8jI0FfVr7/++nMytksdf39/OnbsqH/69OnD5MmT2bBhA+PHjwegqKiI//73v+d5pBKJRCKRSCQSieRSQSoZJGeV4cOHExMTg81mY968eW7rzZs3D6vVSkxMDFdfffU5HOHlh6IovPbaa/r35cuXn8fRSCQSiUQikUgkkksJqWSQnFWMRiOTJk0CKt0hXPH5558DcMstt2A0Gmvd/6pVq5gyZQrNmzfHz8+PoKAgOnXqxP/93/+RkpLisW1KSgpPP/003bt3Jzg4GLPZTHR0NJ06dWLSpEnMnTvXbYyIBQsWMHbsWBo3boy3tzeBgYE0b96cAQMG8Nxzz7Flyxa9bnl5OTExMSiKwsiRI2s8pr179+r+m2+88Uatz0VdaN68Of7+/gAkJyd7rHvkyBEee+wxOnXqRHBwML6+vjRv3pw77riDbdu21bgvq9XKu+++S69evQgKCiIkJISePXvy9ttvU1ZWVqO/atWo3qmpqfz973+nQ4cOBAYGuvTdttlsfPbZZ1x77bXExcXh7e1NeHg4/fv356233qKkpMTjmLdv387dd99N69at8ff3x8fHh/j4eHr06MFDDz3Ezz//7NLk22Kx8O677zJ48GAiIyMxm82EhYXRpk0bRo0axVtvveXSrLa22SX27NnDvffeS6tWrfDz8yMwMJAOHTrw2GOPeTTXdXWOf//9d8aMGUNMTAze3t40a9aMBx54gJMnT3ocQ205efIkDz30EM2bN8fHx4e4uDiuu+66Wiu1ioqKmD9/Pvfccw9du3bV79HIyEgGDRrEm2++SWFhocu2gwcPRlEUjh8/DsBnn31WLRDq4MGDndrk5OQwZ84cbrvtNtq3b09AQABeXl7ExMQwYsQIPv74Y8rKyjyO2WazMXfuXEaMGEFMTAxeXl4EBwfTqlUrhg4dyn/+8x/279/vsY+FCxcyYcIEmjRpgo+Pj36/vPTSS+Tk5FSrP3f
uXBRF4aWXXtK3uQr8Wltz7szMTL3Nhx9+6LKOfb4qisKjjz7qss5rr72GoiiYzeZq18lVdgn7HB0yZIi+bciQIdWOw5NbkcViYfr06XTv3p3AwEACAwPp1asXM2fOxGq11ur4z5TU1FTef/99xo8fT6tWrfD398fb25tGjRpx/fXXM3/+fFRVrXV/qqoya9Ys+vbtS1hYGP7+/nTp0oVXX3211u6EdZ1LZ4Nff/2V0aNHExkZiZ+fH61bt+bxxx/n1KlTgOe4KbWNM1JTHA9VVVm5ciVPPvkk/fr1IyIiArPZTEhICF27duXJJ5/kxIkT9TpOd8ehKIqTO+Kdd95ZbU7b74Pu3bujKArt2rWrcX9ZWVl4e3ujKAoPPvhgncfbEM+qX375hdtuu01/n/Dx8aFZs2aMGzeOuXPnug34raoqX375JaNHj9b3HRkZyZAhQ3j//fc9Pmerzom8vDxefvllunXrRkhIiNtnRH3uhUOHDvHwww/TsWNHAgMD8fLyIi4ujq5du3LXXXcxf/58SktLPfYhkVyWCImkgVm1apUABCDmzJkjduzYoX/fu3dvtfr79u3Ty3fu3CnmzJmjf1+1apXLfZSUlIiJEyfq9Vx9/P39xc8//+yy/dq1a0VQUJDH9oBYtGiRUzur1SomTJhQY7sePXo4tfu///s/AQiDwSBOnjzp8fw99thjAhAmk0mkpqZ6rOsK+xgGDRrksV5wcLAARHBwsNs606dPF2az2e1xKooinnvuObft8/LyxJVXXum2fa9evcTOnTud5ktVpkyZIgDRtGlTsXHjRhEREVGtH8d5cvz4cdGlSxeP16dly5bi4MGDLsf81ltvCYPBUOM1LigocGqXkpIi2rdvX2O7J554wuMxuuM///mPx3F5e3uLzz77zGXbxMREp3P89NNPu+0nMjJS7N+/3+04akNN99eLL74oXnjhBf27KwYNGlTjuWzWrJk4cODAGbWten80bdq0xjbdunVze08WFBSIAQMG1NjHuHHjXLbPzs4WV111lce2UVFRYuPGjU7tHJ+Xnj6JiYk1X7gK7PP45ptvdlmekJCg99ulSxeXdUaOHCkA0bt372pl9rYvvPCCvs1xjnr6OD4jHOdQWlqa6Nq1q9t2Y8aMETabrdbnoCr2+TFlypRqZVartVbPjKuvvrrac8OO4+/msmXL9PPn6tO+fXuPvw1nOpfs1HRv1hb7b5m758zWrVs9ntfajsPx3Ll6Z3Dsx93Hz89P/Pjjj273UdMz2t1x1GZO2++D9957T9/m7trY+d///qfX3bp1q8e6Vanvs+r06dNi6NChdbpX7WRlZYl+/fp5bNeuXTuRlJTkct+O1/LQoUNOzyJX+63vvfDtt98KLy+vGo91z549dboGEsnlgAz8KDnrdOvWjQ4dOrBv3z6++OILJ1N9qLRw6NixI127dmXXrl0e+xNCMH78eJYsWQLAmDFjuOmmm2jevDkGg4EtW7bw3//+lxMnTjB+/HjWr19Pz5499falpaVMnDiR/Px8AgMDeeCBBxgyZAhRUVGUlZWRmJjIhg0bWLBgQbV9f/DBB3z33XcA9O/fn3vuuYcWLVrg7+9PVlYWu3fvZunSpeTl5Tm1u+eee5g+fTqqqvL555/zzDPPuDy28vJyvvzySwBGjRpFTEyMx3NxpqSmpupjdLd6Pn36dJ566ikAOnfuzAMPPECrVq0ICQnh4MGDzJw5k40bN/Lyyy8TERHBI488Uq2PiRMnsmnTJgD69evHww8/TMuWLcnMzOTLL7/kq6++4v7776/VmAsLCxk3bhwWi4V//OMfXH311fj5+bFnzx5iY2MBbXWnf//+JCcn4+3tzdSpUxk0aBAJCQkUFhby22+/8b///Y8jR44watQoduzYQXBwsL6P3bt38+STT6KqKs2aNWPatGl07dqVsLAwCgoKOHjwIKtWreKnn36qNr6HH35YX/m57bbbuPHGG4mLi8NoNJKamsq2bdt
ctqsN77//Ps8++ywAkZGR/P3vf6dfv37YbDaWL1/O9OnTKSoq4o477iAiIoLRo0e77WvWrFls2LCBQYMGcd9999G6dWtyc3P5/PPP+fzzz8nMzOSuu+5i48aNZzTWEydOcO2115Kfn4/BYODee+9l/PjxBAcHs3v3bl577TVefPFFp3vSFVarlU6dOnHdddfRs2dP4uLiEEJw/PhxFixYwLfffktiYiJjx45l165d+Pj46G3nzJlDUVERI0aMICUlheuvv55XXnnFqX+7JY8dm81G7969ufbaa+nWrRvR0dH68+DLL79k6dKl7Ny5k4kTJ7pcLX3xxRf5448/ALj22mu59dZb9VWzjIwMdu7cyeLFi12uypaWljJs2DB27NiB0WjklltuYfTo0TRr1ozy8nLWrl3LW2+9RUZGBqNHj2bnzp00bdoUgLFjx9KzZ0/ef/99PvjgA0CzeKlKo0aNPJ5vRwYPHsz+/ftZs2ZNtbLjx487WUXs3r2b7OxswsLC9G1Wq5X169cDMGjQoFrts1GjRuzZs4etW7dy1113ATB79myuuOIKp3qNGzd22f7GG29k//79PPLII4wZM4awsDAOHjzIyy+/zIEDB1i0aBGzZs3ivvvuq9V46oKosGq66qqrGDVqFJ06dSIyMpKCggKOHTvGrFmz2LhxI7///jsPPfQQn332mcf+/vnPf7J161aGDx/OAw88QHx8PMnJybz//vv8/vvv7N+/nzFjxrBp06Zqln/1mUsNyTvvvMPbb78NQFxcHM888wy9evXCYrGwZMkS3nnnHSZMmFCr9Nb1xWq1Ehsbyw033ECfPn1066rk5GQ2bNjA+++/T2FhIbfccgs7duyolSVBbdmzZw8pKSmMGDECgFdeeaVa3KmoqCgAbr31Vp588klKSkqYM2cOV155pdt+7UGdO3fuXOOztCr1eVYVFxczZMgQ/RnTo0cP7r33Xjp27Ii3tzfJycmsXbuW+fPnV2trs9m49tpr9d+WQYMGMW3aNJo1a0ZKSgqzZ89m4cKFHDhwgKFDh7Jr1y4CAgLcHsf48eM5deoUDz/8MNdddx2hoaEcPnxYn8/1vRfS09O58847KSsrIyoqimnTpnHllVcSERFBSUkJR44cYc2aNbXKniaRXJacZyWH5BKkqiWDEEK8/vrrAhDx8fFCVVW9rqqqIj4+XgDijTfeEEKIGi0ZPv74YwEIs9ksfv31V5djyM7OFh06dBCA6Nevn1PZihUr9P6rWio4Ul5eLvLy8py22bX/vXv3FuXl5W7bZmVlVdtmb9u6dWu37X788Ud9bAsWLHBbzxP29p4sGR5++GG93ssvv1ytfN++fboFwwsvvOB0zezYbDZx2223CUAEBASI7Oxsp/KFCxfq+7jxxhtdriK++eabNa582FeQ7PvZtWuX2+O65ZZb9NWmY8eOuayzY8cO4e/vLwDx7LPPOpU999xzAjQrmLS0NLf7yc3NdTqekpIS/Xy5slRwxNXc8LRKlpGRIfz8/AQg4uLixIkTJzweU6NGjURZWZlTedVV4qlTp7q8pvfcc49eZ8eOHR6Pwx3jx4/X+/j666+rlefn51ezNHHFoUOHPO7n999/11ePP/nkE5d1PK2S1nV/s2fP1se7fPnyauX259j48eM99uPq+j/77LMCECEhIWLbtm0u2yUlJYnY2FgBiFtuuaVaeUOtPgshxPz58/W+qlqKfPbZZwIQHTp0EM2aNXP5rNq8ebPe/pdffqnWv73M0ZLBTk2r0o44HrPZbHZZPysrS0RHRwtAdO7cuaZDd4unuaSqqjh8+LDH9s8//7wAzfrL1VxzPG5A3HvvvS77ufvuu/U67733XrXyC2Eupaen68+spk2burS6WLFihTCZTPp+zqYlQ2JiYrVnoiPJycmiUaNGAhC33Xabyzpnaslg37+n3zhHJk+eLECzMCwuLnZZx9E69O233/bYnyvq86xytE556KGHXP6OCCFEaWlptd/
QmTNn6m1vv/12l23t8xcQTz31VLVyxzlhMBjEsmXL3I6/vvfCp59+WitLheLiYrfXSiK5nJFKBkmD40rJcPLkSV0gWLlypV535cqV+o+F3Y3Ak5JBVVXRokWLWglzv/zyi96P40vdV199pW+vqkSoiVatWglAPPbYY3VqJ0Tlyzkg1q1b57LOmDFjBGjme56UGJ5wp2TIz88X27dvF3fccYdQFEUAolWrVtWUA0IIcddddwlA9OzZ0+1LhBBC5OTkCG9vbwGIjz/+2KnMbu7r6+srMjIyXLZXVVV079691kqGf/3rX27HkpiYKIxGY43KIyGEeOqpp3Sh3ZGpU6cK0Mzi68KpU6f0Mf700091aiuE5xdYu4IOEN98843bPl555RW93rfffutU5viSGxsbKywWi8s+/vrrL73e//73vzofR2pqqn4Nrr32Wrf1HIXQ+gjFY8eO9bivuigZaoPdHH/atGnVyuxKprqet4KCAt11acaMGR7rvv/++7pAXVhY6FTWkEqGtLQ0va8PPvjAqezOO+/UBQz7///2t7851bHPWaPRKPLz86v1fzaUDI8//rjbenb3IEVRRG5ursc+3VHfuWS1WnVXrzfffLNaueNxR0dHi6KiIpf9FBQUiMjISF3RU7XsQphLb7zxht7++++/d1vvgQceOCdKhtrwzjvvCEAEBQW5/M07V0qGNWvW6HW//PJLl3XsiwReXl4iMzOzpkOrxpk+q3JycnTlUY8ePYTVaq1T+3bt2gnQXGVcPReE0BZ32rZtKwARGhpa7bfKcU7cddddbvfVEPfCv//9b30cEomk7sjAj5JzQqNGjfSAXo4BIO3/v+qqq2plzrt//36OHj0KoKdhdMfAgQP1/zuafttN66HS5LC22NsuWrSI06dP16nthAkTdNN8V/tNT0/n119/BWDy5MmYTPXzZlqzZo1TcKmgoCB69OihB0W6/vrrWb16NaGhodXaLlq0CIBx48Z5DLoVEhJCp06dAOdzbLVadVPrkSNHEhkZ6bK9oihMnjy51sd06623ui1bsmQJNpsNPz8/Ro0a5bEf+9xISUlxCvhlv7779+93Ct5ZE+Hh4Xh5eQHanG7IIHP2QIkhISHceOONbuvdc8891dq4Yvz48Xh7e7ssa9OmjW6eeuzYsTqPddWqVdhsNkALcOaOXr160aFDhzr1nZmZyeHDh9m7d6/+sc+rP//8s85j9YQQgrS0NA4dOuS0P/szytX+7HNn/vz5dTIBX7Nmje66VNtnWnl5Odu3b6/1PupKdHQ0bdu2BajmGmK/rwcPHqwHz3RXxx6A8Vzg6dnQo0cPQLuuiYmJZ30sqqqSkpLCwYMH9blz4MAB3dWjpvl600034efn57IsICCAm266CYB9+/Y5pYa+UOaS/fkTGhrqMSW13S3mXJOfn09iYiL79u3Tr4/9fNvLzhcDBw6kdevWgOv3hLKyMr7++mtAcxWNiIio8z7O9Fm1cuVKvf4jjzxSpyDdKSkpHDhwANDmt7vngslk0n87cnJy2LFjh9s+Pd3zDXEv2M9TTk7OGbs6SiSXM1LJIDln3H777QD88MMPlJSUUFJSwvfff+9UVhOO2Qz69OnjMoq6/ePoy+f4Ita/f3+aN28OwKOPPkqvXr149dVXWb9+fY3R46dMmQJoGRdatmzJXXfdxbx582oVkd/X15dbbrkFgG+//bbaj7ujcHq2X77i4uJ49NFHiYuLq1Z2/PhxMjMzAXjmmWc8nmNFUfRr4niOjx49qmdwsL/gu6O2/qQBAQH6dXOFfRzFxcWYTCaPY7722mv1do7jnjRpEmazmdLSUvr168eYMWP48MMP2bt3r8tsEna8vb25+eabAfj+++9p2bIlTz31FL/88gu5ubm1Oj537N27F9AENrPZ7LZedHS0Hl/D3sYVduHRHXalU0FBQR1H6hwLoKovfVV69epVY3/r16/n5ptvJjw8nKioKFq3bk2nTp30z6xZswDqrPB
zx5IlS7j22msJDg4mNjaWNm3aOO3PHgfG1f7sz4YNGzbo8TwWLFig30vucHymxcbGepy3HTt21Os6ztuzgV2B4BiXITk5mWPHjqEoCoMGDdLr2OMygOZ3vW7dOqD28RgaAk/z2jFexJnM69oghODLL79kyJAhBAQE0KhRI9q2bes0f+zxhmqar3W5dxzvuQtlLtnH1K1bN4/K8q5du+rK2bPN8ePHefjhh0lISCA4OJjmzZvTsWNH/drce++9et2Gep6cKXfffTegCfX2DDl2fv75Z7KysoAzf08402fVzp079f8PGDCgTvt0/E3q3bu3x7qO5Z5+yzp37uy2rCHuheuuu46QkBAAbrjhBq666irefvtttm/frivTJRKJe6SSQXLOuPHGG/Hz8yM/P5+ffvqJhQsXUlBQgL+/v8cVWkcyMjLOaN+OAr3ZbGbRokV6cKetW7fy7LPP0r9/f0JCQhg5ciRff/21yx+Ru+66i2effRaTyUReXh5z5szhlltuIT4+npYtW/LEE094XAG2rzYXFBToChY79lWL3r170759+zM6Tkd69uzJnj172LNnD7t37+a3337jueeeIzg4mFOnTjFy5Eg9+JMjDXGOHVNCubNiqG25HfuPvTsaYtxt27Zl3rx5hIaGYrVaWbx4MQ888ACdOnUiKiqKyZMnuzxnADNnzmTMmDGA9kI7ffp0rrnmGsLDw7niiiuYPn16tYCgtcEuvNmDg3nCHijU3sYV7lZI7RgM2s/CmbxEOe63pvFGR0d7LH/xxRfp378/3377rcfjAWpMSVoTQgjuuecerr32WpYsWVKjIOpqf8899xx33XUXiqKQkZHBe++9x4033khUVBQdO3bkhRdeID09vVq7hpi3ZwO7giAtLY2//voLqFQ4tG/fnsjISJo0aUKzZs0QQrB27VpAE0TsqX+rpgk9m3ia1/Y5DWc2r2vCYrFwzTXXMHnyZFavXl3jfKypvC73juO9caHMpdo+s0wmk5MC6Gzx66+/0r59e2bOnFlNaHdFfZ8n9WXKlCmYzWaEENWChM6ePRvQrEPtwSTrypk+qxyVL44WobWhLr8NjgGvPT37XVlh2mmIeyE8PJyff/6ZRo0aIYRg1apVPP744/Ts2ZOwsDBuvPFGFi9efEb7kUguB2R2Cck5IyAggBtuuIGvvvqKL774Ql8ZvuGGG6pFeneH4wviokWL3GZGqErVH7X27duzZ88eFi1axKJFi1i7di1HjhyhpKSEZcuWsWzZMt566y1++eWXam3//e9/c++99/LVV1+xYsUKNm3aRHFxMUePHuWtt95ixowZvPvuuy6zJnTv3p1u3bqxc+dO5syZo1twbN68Wc9M0FBWDP7+/k4a+k6dOnH11Vdz00030bdvXwoKCrj11lvZu3cvQUFBej3Hc/z8888zYcKEWu/vbFKTaaZ93BEREaxatarW/TrmMAfNRWTYsGHMnz+fZcuW8ccff5CZmcnp06f58ssv+fLLL5kyZQqzZ892El6CgoL4+eef2bJlC99++y2rV69m165d2Gw2tm3bxrZt23jzzTdZuHAhffr0qcORa9SUK/5Coz7jXbFiBS+99BIAzZs358knn6R///40adIEf39/fXX0+eef5+WXX673WGfPns2nn34KaKurjz76KL1796ZRo0b4+fnpc+/22293enY5Yjab+fTTT3niiSeYN28eK1euZNu2bZSVlbFv3z727dvHW2+9xZdffulkQu54v+3YscOjtYoj7rIsNBSOCoLVq1fTtm1bJ1cJO4MGDSIxMZHVq1czduxYvY7RaKR///5ndYwXCv/+9791V7dBgwbx0EMP0b17d2JiYvD19dWfEwMHDuSPP/7waBUFZ37vXGhz6UJ4Zp0+fZpbbrmF4uJiAgICePLJJxkxYgQtWrQgODhYt6RYuXIlQ4cOBajx+pxtoqOjufbaa1mwYAFz587lueeeQ1EUUlJS+O233wDtWVQXdwVHzvR
Z1VA01LzwdPwNdS8MGDCAI0eO8MMPP/DLL7+wdu1aTp48SX5+PgsWLGDBggWMGDGCH3/8sUYFvkRyuSGVDJJzyu23385XX32l/1Dat9WW8PBw/f8hISFOQnRdMRqNjB07lrFjxwJaWselS5fy3nvvsX37drZv3859993nMpVl06ZNefbZZ3n22WcpLy9n69atfPvtt3z00UdYLBYefPBBevfuTbdu3aq1veeee3jooYdYs2YNiYmJNGvWTLdi8PPzY+LEiWd8TLWhY8eO/Oc//+Hhhx8mOTmZ6dOnOwlqjufYbDaf0Tl2XGGoyQSzpvLaYh93QUEB7dq1O+MXMIDg4GDuvfde3YT2wIED/PTTT8yYMYOUlBQ+++wzunXrxt/+9rdqbXv16qWbMxcUFLB69Wrmzp3Ljz/+SEZGBuPGjePo0aP4+vrWaixhYWGkpqa6XFmqit3U81ysDrrC8bqnp6cTHx/vtq6n47G7QYSGhrJp0ya31i41WTjUFvv+WrZsyYYNG9xem9rsr3379rz88su8/PLLWCwW1q1bx9dff83nn39OYWEhkyZN4ujRo/pKoOP9FhkZedaVB7UlJiaG1q1bc+jQIVavXs3999+vx15wVDIMHjyYuXPn6mX2v127dnVKD3upIoTgk08+ATSBZOXKlU7KR0dqO19rutcdyx3v9QtlLoWGhpKWllbjcVitVo/nxPE8qqrq9rwWFRW57eP777/XXdYWLFjAsGHDXNZrqGdJQ3HPPfewYMECEhMTWbNmDYMHD+bzzz+vVcyb2lLXZ5Vj/IfU1NRqCnpPOM7TmuaFo8vCmf6WNeS94OPjw6233qrHgEhMTGTJkiXMmDGDQ4cOsWzZMv7xj3/oKVslEomGdJeQnFOGDh1KbGwsVqsVq9VKXFycvnpQGxyFdnse9oYiNjaWO++8k40bN9K9e3cAFi9eXKPppNlspm/fvrzzzjt6QCYhRDV3CDu33norvr6+CCGYO3cuJSUlfPPNN4C2iu5oVXC2uO+++/QXhLffftvJDLJ58+a6cHCm57hFixb4+PgA1BhUzNF3sj7Y50ZpaWmD9WmnXbt2PP3002zatEm32Pj2229rbBcYGMiYMWP44YcfeOSRRwDt5czus14b7EqeHTt2eAwomZGRoZsC10f5Vh/sQUBBc0PyhKfyffv2ATBkyBCP7jQ1XefarpjZ93fddde5VTAIITwGIXOFj48Pw4YNY/bs2UyfPh3QTLEdTWwb6pl2NlaNHeMynDp1iiNHjujxGKrW2b17N6dPn653PIYLYfW7LmRnZ+tC0YQJE9wKwoWFhRw8eLBWfdbl3nG818/m72NdsD8Hdu3a5fGZ9eeff3qMg+QYHNDRBa8qhw4dcltmv7fDwsLcKhig4X6HXHEmc3rkyJG6YGxfhLD/HTBgAK1atWq4AVK7Z5X9vQjQ3aNqi+M83bx5s8e6jkGXz/S37GzeC/Y4Flu3btWvUW3eBySSyw2pZJCcU4xGI5MnT8bb2xtvb28mT57s9qXMFd27d9cf6h9//DEWi6XBx2g2m/UXZKvVWqfAfY4KE3fBo4KDg/Vox5999hnff/+97qt/rqJtm81mnn76aUBbBXLUwBuNRkaPHg3Ab7/9pkeErgsmk0mP2Lx06VK31gpCCKdsI/VhzJgx+svcO++80yB9ViU+Pl6P/F3X4GC1mRuusL8Y5+bm8uOPP7qt9+mnn+pmvp5eps8mQ4YM0S1IqvoSO7J161aPAb3sgomnFcqdO3fW+LJqV3SVlpZ6rFeb/f3000+kpqZ67McT7q7/sGHDdDPbd99994xNte3HCjUfb21xjMvw0UcfAZXxGOw0bdqUhIQEhBC8++67+vPyTOMxnI3jOJs4CtGe5s8nn3xS66wz3333nVvldlFRkS7QtG/f3sk3vqHmUn2xP3+ys7P1TEWusMcXcIfjSrknJYBdSe8K+zm3WCyoquqyTnFxcYP9DrniTOa0wWDQrRW+//57li5
dqitTzvZ7grtn1ZAhQ3Ql+4wZM+oU3yQuLk6Pg/Xtt99SWFjosp7NZtMzYIWGhjopNurCubgXgoKC9CCt5ztYqERyISKVDJJzzuuvv47FYsFisfDaa6/Vqa3BYODZZ58FtBR7t99+u8cf7fz8fGbOnOm07Y8//uDIkSNu25SVlel+xQEBAU4v1F9++aXHF0VHNxBPpoT2AJDHjx/nqaeeArTV/3MZjf2OO+7QU/K99957TkEJn3nmGYxGI6qqMn78eI/ZM2w2G1999VW1Ovfddx+grYbcf//9Ll/w3nrrrTqvDrujTZs2evyIb775hrfeestj/cTERObNm+e0beHChR6VSsnJyXoQPMfre+zYMaco/K6o7dyoyp133qm/LD3xxBOcOnWqWp0///yT//znP4AWEMzuAnSuiY2N1X14f/75Z5erO4WFhfrccId9lW7dunUu79XMzMxapT61C2D2tLc17W/RokUuzaaPHj3KQw895La9XZjy9CLr7vqHhIQwbdo0QIv2/thjj7kVhkAzNbab5zviKGzWdLy1xVFR8O6771bbZsf+3LLXMRgMdY4+b+dsHMfZJDIyUg9KO2/ePJe/R1u3buW5556rdZ9paWk88cQTLssef/xxPajdAw884FTWUHOpvkyZMkW3CHr88cddmsevWbOGjz/+2GM/ffv21eOvvP322y7vr+nTp3tMN2y/t4uLi10+j2w2G/fccw8pKSkex1IfHFMc12VO24MzFhcX6wqHwMDAWsdJckV9n1X2Z/f27dt59NFH3fZTXl5eLfii/RmamZmpW/ZV5aWXXtLjU02dOtVtuuWaaIh7YdmyZR4Vy3l5efrcq8tvukRy2SAkkgZm1apVAhCAmDNnTp3bz5kzR2+/atWqauWqqoobbrhBr9OiRQvxxhtviNWrV4udO3eKNWvWiI8++khMmjRJ+Pv7i/DwcKf2L7zwgjAYDGLQoEHijTfeEEuXLhXbt28X69atE7Nnzxa9evXS+/7b3/7m1BYQ0dHR4oEHHhBffPGF2LBhg9ixY4f49ddfxeOPPy58fX0FIAICAsSJEyc8Hmfr1q31/QDi5ZdfrvO5coW9v0GDBtVY9+2339br/+tf/3JbFhwcLP7v//5P/Prrr2LHjh1iw4YN4uuvvxYPP/ywiI2NFYDYs2dPtf6HDx+u99GvXz8xf/58sX37drF06VJx2223CcDpfM+dO7daH1OmTBGAaNq0aY3Hk5WVJZo3b673N3DgQPHJJ5+IjRs3ih07dojff/9dvPnmm2LYsGHCYDCIcePGObUfNGiQ8PPzExMmTBAffPCBPqdWrlwp3njjDREfH6/3vWDBAr2dfc63b99e/OMf/xALFiwQW7ZsEVu2bBE//PCDuOmmm/R2Xbt2Faqq1ukY33vvPb19dHS0ePvtt8XmzZvF+vXrxUsvvSQCAgIEIBRFEUuWLKnWPjExsdb3ZNOmTQUgpkyZUuP5dkViYqIIDAwUgDAajeLBBx8UK1euFNu2bROzZ8/W533Pnj31MVXlu+++08vi4uLEu+++K9avXy/Wr18vpk+fLmJjY4WiKKJPnz5u+xBCiH/84x96+auvvip27dolDh8+LA4fPixOnjyp15s+fbper3Xr1uLTTz8VmzdvFmvWrBEvvPCCCA4OFj4+PqJ79+4ur5P9/CYkJIjHH39czJ8/X2zatEls27ZNLFq0SNx7773CYDAIQDRq1EgUFBQ4tbdYLKJ37976GLp06SJmzpwp1q1bp8+/GTNmiOuvv154eXmJHj16VDvWw4cP6+2HDx8u1qxZIw4dOqQfb3l5+RlcTSFatmzp9Jz67rvvqtVxfGYDolu3bh77tNd74YUXXJY3btxYAKJZs2bip59+En/99Zd+HPn5+Xq9F154weP1t+P4m+TqN6U2eLovHnroIb3/nj17iq+//lps3bpVLF++XDz++OPCx8dHRERE6HPf1bPZcYz2e2PkyJFi4cKFYvv27WLhwoV
ixIgRTufY1TVtiLlU2/PqiTfffFPvo1GjRmLmzJliy5YtYu3ateLpp58W3t7eomnTpiIyMtLj82bSpEl6P9dee63+G7Rw4UIxbtw4AYi+ffu6vb7JycnC29tbAMLHx0f8/e9/F8uXLxdbt24Vc+fOFT169NB/nzzNkZqe0TU9N+39h4eHi6+//lrs379fn9NZWVluz+PVV1/tdG/dfffdbuvWhvo+q4qKikSnTp308fTo0UN8/PHHYuPGjWL79u3ip59+Ek8++aRo1KhRtd8aq9Xq9My+6qqrxPfffy+2b98uFi9eLG688Uan97qq+xaibnOzvvfClClThNlsFqNHjxbvvPOOWL58udixY4dYs2aNeO+990S7du30vt9+++06XwuJ5FJHKhkkDc7ZVjIIIURZWZl44IEHhKIoTj/Arj7NmjVzauv4I+Xpc/3114vi4mKntrVpFxwcLH799dcaj/P111/X2xgMBpGcnFznc+UKe5+1UTIUFRWJiIgI/eWn6o/6xx9/LPz8/Go8Zi8vL3H48OFq/efk5DgpEap+unXrJrZt26Z//+abb6r1URclgxBCpKamigEDBtTqWt15551ObQcNGlRjG4PBUE0h5DjnPX3atm0rjh07dkbH+O9//1t/+XP18fb2Fp999pnLtudSySCEdj7sigZXn+eff77Gl8U777zTbXuj0SjeeeedGvs4efKkCAsLc9mH4/1RVlbmpBCr+vH19RXffvut2+vkeH49fWJjY8W2bdtcjjU/P9/pJdvTZ8iQIS77cFRmVf0kJibWeN1ccc899+h9KIoiMjIyqtWpevyPPvqoxz7t9dwpGd5//323x+E4fy8UJUNubq7o2rWr2zGHhYWJNWvW6M+XmpQMy5Yt8zgf27ZtK06dOuV2rPWdSw2hZBBCiEceecTtfiMiIsSWLVtqfN6kpaWJVq1aue1n4sSJYvny5R6v7+zZsz0+O2+++eYa+6ivkmHx4sVu31fc3QdCCDF//nynuuvXr3dbtzY0xLMqMzNTDBw4sMY+XP3WZGVlOSl0XH3atWsnkpKSXO67rnOzPveC/ZrX9Ln//vuFzWar1XgkkssJqWSQNDjnQslgZ/fu3eLhhx8WnTp1EsHBwcJoNIrg4GDRtWtXcffdd4vvv/9eWCwWpzYFBQXihx9+EA888IC48sorRZMmTYSPj4/w8fERCQkJ4qabbhKLFy92ub+9e/eK119/XYwZM0a0b99ehIeHC6PRKEJCQsSVV14pXnjhBZGWllar40xJSdGPc8SIEbVqUxvsfdZGySCEJrza27zxxhvVytPS0sRLL70k+vXrJyIiIoTJZBL+/v6idevWYty4ceLDDz8UmZmZbvsvKysTb7/9tujRo4cICAgQgYGBomvXruLVV18VJSUlYufOnfr+XSln6qpksLN48WJx6623iubNmws/Pz9hNptFZGSk6Nu3r3jiiSfEmjVrqrVJSUkRH3/8sbjllltE165dRUxMjDCZTCIgIEB06NBBPPDAA+LPP/+s1s5qtYrVq1eLZ555RgwZMkS0bNlSBAYGCrPZLKKjo8Xw4cPFhx9+WG0u1vUY//zzTzF16lTRokUL4evrK/z9/UW7du3E3/72N49C5LlWMgghxIkTJ8QDDzwgmjZtKry8vER0dLS45pprxNKlS4UQtXtZ/OKLL8SAAQNEYGCgvvI5efJksXnz5lr3ceTIEXH33XeLli1bCh8fH7f3R3l5uXj33XdFz549hZ+fn/D19RUtW7YU999/vzhw4IAQwv11UlVVbNmyRbz44oti+PDhok2bNiIkJESYTCYREREhBg4cKKZPny7y8vJqPG9//PGHuOeee0SbNm1EYGCgMJlMIiwsTFxxxRXioYceEr/88ouwWq0u25aVlYk33nhD9OrVSwQHBzsJVmeqZPjiiy/0Pjp06OC2nn3eAGLhwoUe+6yNcPXDDz+I4cOHi6ioKGEymVzO3wtFySCEprB9+eWXRad
OnYSPj48ICAgQ7dq1E08++aSuQK6tkmHVqlXCarWK999/X1x55ZUiJCRE+Pn5iU6dOolXXnmlmvLbHWc6lxpKySCEEEuWLBEjRowQYWFhwsfHR7Rs2VI88sgj+jmpzfMmOztb/P3vfxetWrUS3t7eIiwsTAwcOFB8+eWXQojaXd/169eLsWPHisjISGE2m0VsbKwYOXKkmD9/fq36qK+SQQghVq5cKa6//noRFxcnzGZzre6D0tJS3VKtbdu2buvVloZ8Vv34449i/PjxonHjxsLb21v4+PiI5s2biwkTJoivvvrK7e+dzWYTn3/+uRg5cqSIjo4WZrNZhIeHi8GDB4uZM2eK0tJSt/s807l5JvdCdna2+PLLL8Vdd90levbsKRo1aiS8vLyEr6+vaN26tZgyZYr4448/6jQOieRyQhHiPCcElkguU37//XeGDx8OwPz587npppvO84jOD19++aXuX3/kyBFatGhxnkckkUgkknNBQkICx48fZ8qUKXrAP0klhw8f1oMNv/7663oMJ4lEIrnQkYEfJZLzhD2ydnh4uB4s73LEHnwxMjKS5s2bn+fRSCQSiURyYWB/TzCZTNx+++3neTQSiURSe6SSQSI5Dxw9epTvv/8e0LIHnGkE5QudU6dOuU3FBlpat19++QWA22+//YzyiUskEolEcqmRm5urZ+AYO3YsMTEx53lEEolEUntM53sAEsnlwqlTpyguLubYsWP8/e9/x2q14uPjw2OPPXa+h3bW+P3333nqqaeYOHEigwcPpmnTpqiqytGjR5k/fz4LFy4EIDo6mmeeeeb8DlYikUgkkvNIRkYG+fn5pKSk8OKLL5KdnY2iKPL3USKRXHRIJYNEco649dZbWbNmjdO2l19+mbi4uPM0onNDZmYmM2bMYMaMGS7LY2NjWbJkCeHh4ed4ZBKJRCKRXDg89dRTfPbZZ07bHnzwQbp3736eRiSRSCRnhlQySCTnGD8/P1q3bs2jjz7KlClTzvdwzirXXnstH3zwAcuWLWP//v1kZmZSUFBASEgI7dq1Y8yYMdx///0EBgae76FKJBKJRHJB4OXlRYsWLZg6dSoPP/zw+R6ORCKR1BmZXUIikUgkEolEIpFIJBJJgyAtGQBVVUlJSSEwMFAGnpNIJBKJRCKRSCSSc4gQgoKCAuLi4jAYLq3cBBaLhbKysgbpy8vLCx8fnwbp62wilQxASkoK8fHx53sYEolEIpFIJBKJRHLZkpycTOPGjc/3MBoMi8VCs6YBpGXYGqS/mJgYEhMTL3hFg1QygO4PnpycTFBQ0HkejUQikUgkEolEIpFcPuTn5xMfH3/JxekqKysjLcNG4vamBAXWz0Ijv0ClWY/jlJWVSSXDxYDdRSIoKEgqGSQSiUQikUgkEonkPHCpuq4HBRrqrWS4mJBKBolEIpFIJBKJRCKRSM4SNqFiq2e6BZtQG2Yw5wCpZJBIJBKJRCKRSCQSieQsoSJQqZ+Wob7tzyWXj82GRCKRSCQSiUQikUgkkrOKtGSQSCQSiUQikUgkEonkLKGiUl9nh/r3cO6QSgaJRCKRSCQSiUQikUjOEjYhsIn6uTvUt/25RCoZJBKJRCKRSCQSieRsoaqQkwOlpeDtDaGhYJBe65JLF6lkkEgkEolEIpFILlWkgHv+SE+HNWtg40YoLq7c7ucHffrAoEEQHX3+xic5Z1xugR+lkkEikUgkEolEIrnUkALu+cNqhXnzYN061+XFxbBihfbp3x8mTQKTFMsuZVQENqlkkEgkEolEIpFIJBcdUsA9v1itMHMmHDhQu/rr1kFWFkybJq+D5JJB2kpJJBKJRCKRSCSXAnYB152CoSrr1mn1rdazO67LiXnzqikYcvxC2RTYmo2NriB/+DWQkODc5sABrZ3kksXuLlHfz8XCeVUyrF27ljFjxhAXF4eiKCxcuNCp/Mcff2T48OGEh4ejKAq7du2q1ofFYuGhhx4iPDycgIAAxo0bR3p6+rk
5AIlEIpFIJBKJpCFQVW1FOyVF+6ueQbq6KgKuqgqOl5hZY2zKxvheWK67QQq4Z5P0dCcFTykG3tnnxU3/O8pzb23i+ReXMWHiV8xMjcL2wIPg41PZdt06rb3kksSeXaK+n4uF82qTU1RURJcuXbjrrru48cYbXZb379+fm266ialTp7rs47HHHmPJkiV89913BAcHM23aNG688UbWr19/tocvkUgkEolEIpHUj4aKnVBFwD2dXcJra0r4M8uAwZiFavsTb18vHnz7DkaPGQOzZoHFolVetw6GD5cxGurLmjX6f4UQ/HdTOWsOZDtVUW0qP7+3DIPBwINTp8KMGZWFa9fChAnnarQSyVnjvCoZRo0axahRo9yWT548GYCkpCSX5Xl5eXz66ad8/fXXXHXVVQDMmTOHdu3asWnTJq688soGH7NEIpFIJBKJRFJvGjp2goOAW5RXzAtLcjlc7g8IVJtmFVFaUsbb939MyI9P0lcKuA2LqmqKogpSrD6s2pvqsqoQgp/fX8akZ28kNCEB7LLOhg0wbpzM/nEJolZ86tvHxcJFPYO3b99OeXk5w4YN07e1bduWJk2asNHhJq9KaWkp+fn5Th+JRCKRSCQSieSc0NCxE6oIuLtPlHDEGoBQq5tXKwaFL1/5ATp2dHad2LDhzFw0JBo5OU6WKFvzfDCajG6r26w2dq7YA717V24sLobc3LM4SMn5wlaRXaK+n4uFi1rJkJaWhpeXFyEhIU7bo6OjSUtLc9vu1VdfJTg4WP/Ex8ef5ZFKJBKJRCKRSCQVVIudoJJYaOD71AC+yo5md1wXbPFNnNt4ip1QRcBdmWjVrReqIlTB4R2J5GcXSgG3ISktdfpaYvIBxXMTW7kNQkOdN9pdWCSSi5jLMk/KM888w+OPP65/z8/Pl4oGiUQikUgkEknNqKom1JeWgre3JiTWxby9SuyE4jLBv1eXsPWkFaNJs661LTxE49axTP/fbYQv+r7m2AlVBNwixavGYQhVlQJuQ+Lt7fS1fZtIbOUe0lgq0LF/W0ja57zdMRik5JLBJrRPffu4WLiolQwxMTGUlZWRm5vrZM2Qnp5OTEyM23be3t54V3kQSCQSiUQikUgkbmmoAI1VggO+saGMHWma1YHNatPLUo+l88+nF/P+nHtQZs6sbO8qdkKV99pu3Ruz4/dUbNbq1gyKohDfNo6g8EDYneNcKAXcMyc0VJsLFXOjo1ce0U0jyTyVhVrlOhiMBvpefwWxzaNh/uzKAj8/qGKhLbk0kDEZLiJ69OiB2WxmxYoV+raDBw9y4sQJ+vTpcx5HJpFIJBKJRCK5JLBa4Ysv4PnntSCMjgoGqAzQ+PzzWj13cROgWuyEVNWXDccsLpUBNqvK0T+PszfPVHPsBLuAW8HQlmZAQXFhri+E4JZnbkBRFNi8ubJACrj1w2DQlE0VGJOTeXPmBKLiI7TvZqMeo6HTwHY8OftB2Lu3MugjQN++Muij5JLgvFoyFBYWcuTIEf17YmIiu3btIiwsjCZNmpCdnc2JEydISUkBNAUCaBYMMTExBAcHc/fdd/P4448TFhZGUFAQDz/8MH369JGZJSQSiUQikUgk9cMeoPGAB7N3R9atg6wsmDbNdSaIKrET9hOG0ZTpUskA2or3/o2H6NS7d6Uwao+dEBbmULFCwK1YeAsryeWNt8byz38soaTAgtFsRLWpKIrCnS/fzNBbB0gB1xX1dYUZNEi/BgAxvy1kzvdT2ZhYyr51BzB5mbjy2h506NcWZd8+LY2oIwMHNtCBSC40VBRsNQXpqEUfFwvnVcmwbds2hgwZon+3x0mYMmUKc+fO5eeff+bOO+/UyydOnAjACy+8wIsvvgjA22+/jcFgYNy4cZSWljJixAjef//9c3cQEolEIpFIJJJLkyoBGgHUJk047B1Dvs1IbIgXjbKSUI4fr6xgD9BYkYrdiSqxE9TAYIQHG2ghBCazqXaxE6oIuJ0Prefb3x5h9b58Th1
JIzQqiME39yMsJkRTMEgBt5KGcoWJjtbSjdpjblgsmD78gAEJCQy4trd2HXNS4bWFzgoe0NrVZh+SixJVaJ/69nGxoAghLqLhnh3y8/MJDg4mLy+PoKCg8z0ciUQikUgkEsn5Jj1dc4Gw4+PDjuZ9eP2/a8nNyNM3t+jSlH/+4yriVixyFv7/9a/qQmNWFjz7bOUu+g5l8j0/ehzG3L/eIe74fpg/v3Ljq686WzLY+eKL6mkxExK0LBKhodoq/ebNrgVcV0qRSx2rVVMI1SaVaP/+MGmSawuVqn3WxfoFoF0799YvlwmXqjxmP65t+6IJCKyfpVBhgUrPDukXxTm6zG2iJBKJRCKRSCQSFzgEaATY12Egzz69mNzMPKftiXuT+dsD35F3w83O7deurd5nldgJ0SlHGXxzXxRDdTNoxaBw9eSBxLWIqX3shEmTNIHVkaQkTUHx4Yfa36oKhnbttHaXG3ZlQG0UDKDVmznTc8wN0BQF06ZpSona0L//Za9guBywVbhL1PdzsSCVDBKJRCKRSCQSiSNVAjSSkMBHX/wJClDFBli1qRTmFrNwTUrNARqrBAckKYn/e3wAI+8cgsFY+VpuMBm4ZuowHv3w3rrFTpACbu1x4QpDQgJMnIi4/36YONH5ekKlK0xNmEyaZci//gXDhjkplgDt+7BhWvnkyZfn+b/MuNyUDHJGSyQSiUQikUganvoG0TufVAnQWNiuMwdf+tptddWmsm7BZqa8NcpzgEaoFjvB/NlcHps2lSkv3sSePw6gKAqdBrQlNDrkzGIn2AXc4cM1a4oNG6rHGejbV+vnco0BkJ7ubMHg48Pp0Tfy5YKDrHx1NpYiC03aNebGv13DyGuuwfDpp5WuMOvWaee2tjEaJkyAceO0uWCxaGlCQ0IunntBIjkDpJJBIpFIJBKJRNJwNFQQvfNJlQCN5X4BNTexlNcuQKOL4IDMmEFYQgKD7LET9u2A2W5iJ9T23EkB1z1VXGHShl3Hg7fMobigRM/0ceLASd6+7yP23zmEJx69B2XmzMoGa9dq57a2GAyuY2hILhtUoaCKemaXqGf7c4lUMkgkEolEIpFI6k9NQfSKi7UV/BUrah9E73zh7e30NdhoJTwulKyUHJfVjSYDnQe01SwgHPHxcd3/pElaEEhHc/2kpOpKBUfONHaCFHCdceEK8/aM9RTll6DaKt1b7KHxl81ZxZCJ/eiRkFB5fTZs0JQ3l7uyRlJrGsLd4WJyl5B3hkQikUgkEomkfpytIHrniyoBGg1btjDh8WvdVldVwdiHRtY+QKOMnXD+qOIKk9uiHTuW73FSMDhiMBlYOnullqHDjt0VRiKRuEQ+qSQSiUQikUgk9cNNEL3T8a0pNHgT6WfA/8Cfziv19iB6F2LqRHuARnvshKQkrn/oGk7ddzWLPvodo0lbp1NVgcFg4MlZ99HSVFj7AI0gYyecL6q4wuSWe15zVa0q6cdP184VRiJxgw0Dtnqu79saaCznAqlkkEgkEolEIpGcOS6C6B2+4mre+WgrR/9cDIDRZGTQuF48dMdQ/L/96syC6J1rqgRoNHz6KdOmTmX0PVexct468k4X0rhVDFdPHkjY6ZN1D9BoR8ZOOLdUcYUJNasus4bYMZoMxDSLrL0rjETiAtEAMRmEjMkgkUgkEolEIrksqBJE70iv4Tz68AJUh/SNNquN1d9v5ujuE7w7cwren3xU2aCuQfTOFW4CNDZPSKD5sIoAjTk5MOeD+gVotCNjJ5wb7K4wFVYjwUcP0HN4V3Ys3+3SZcJmVRl191DYvLhyoydXGIlEImMySCQSiUQikUjOEBdB9N6buwuhqgjVeWlYtakcP3CK5XvyICGhsmDDBq2fC5FJk7SAi44kJcH8+fDhh9rfqgqGMw3QKDk32F1h7CQl8ejDfQkM9cdgrBSNFIO2anztfVfTNZK6ucJIJFWwB36s7+diQd4dEolEIpFIJJIzo0oQvbxWHTiw+Qiq6tr
2XFEUVs3fePEE0ZMBGi9NBg1y+hq9bCEff30X1z04gsCwAExmIy26JPB/cx7ikQf7oHzyiXP72rrCXCioqpbNJCVF+3uhKvUuYWzC0CCfiwX5BJRIJBKJRCKRnBlVguiVmH09VhdCUJBXfHEF0ZMBGi89XLjChH33BQ8lJPDQt5MrXWE2b4QNSc5tz8QV5nyRnq65M23cWH3O9umjKVsulmORXFRIJYNEIpFIJBKJ5MyoEkQvzEvF28+L0uIyl9WNJgOtujS9OIPoyQCNlxaTJmmr+o5ZUZKSqru/OHKxuMJYrVrmFncpZYuLtaCmK1ZoSpNJk6T1zVlGRUGtpxOB6i466QWIfCJKJBKJRCKRSM4MexC9Crx27WD0HYMxGFz7DtusKmOmDoXNmys3XmxB9OwBGuPitL9SwXBxcqm6wlitMHOmewVDVdat0+pbrWd3XJc5l1tMhgv8LpFIJBKJRCKRXLDYg+jZUz0mJXHHPSM4vCuJvRsOYTAoqKrAYDSg2lTue3USbXyKZRA9yYXBpegKM2+es3UGkBMbxZqESLJ9vWhuM9I3MR2vE8mVFQ4c0NpNnnyOByu5VJFKBolEIpFIJBLJmTNoUKWSAfD+8jNef+0u1h8byor5G8nPKqBZx3iuuWsILQz5MGuWc/uLLYie5NLjUnGFSU93smAQ3t58Mqgjb6gZGJUCFBSsBpWA9t7M7jeWbguWVsZDWbdOU7ZcLMqUi4yGCNxoExePu4RUMkgkEolEIpFIzhwXQfSMH7zPwIQEBt7XuzKI3pKvqvu7X0xB9CSXPnZXmIuVNWucvi4cfiVvlBwD7AKqJqQWWcuYnLWL5bfeRMynn1c2WLtWU7ZIGhwtJkP93B3q2/5cIpUMEolEIpFIJJL6caEG0VNVTcFRWqoFqQwNvbhWpiXnlot5vqiqlkWiAlvTprwh0l1WFQisQmWudwlPJyRU3qcbNmjWHBfLMUsuWKSSQSKRSCQSiURSP+xB9DxFtHfkbEe0l6n76s7FLGDXl0thvuTkOI09o0t7ThfvdVvdJgQb0hOhd+9KJUNxseYucjFbc1ygqBiwnafsEu+99x7Tp08nLS2NLl26MGPGDHr16uWy7uDBg1lTxSIGYPTo0SxZsqTW+5RKBolEIpFIJBJJ7XEnjF4IQfRk6r66cykI2GfKpTRfSkudvorQUCh2U7cCo0HR7l9H7DEaJA3K+YrJMH/+fB5//HE+/PBDevfuzTvvvMOIESM4ePAgUVFR1er/+OOPlJVVpiDOysqiS5cuTKijG80FepdIJBKJRCKRSC4oaiuMnq8gevbUfVUi67tl3TrNxeNiSEt4NriUBOwz4VKbL97eTl9jLTaa+IdyoijHZXUDClfFtYa0KuU+PmdrhJLzwFtvvcXUqVO58847Afjwww9ZsmQJs2fP5umnn65WP6yKFcs333yDn5+fVDJIJBKJRCKRSBqQMxVGz3UQvSqp+7LT8/jzRDGbc3xRg4Pp278ZfcPL8EpLqWxzuabuu9QE7DOhynwpV23sDTGxqJEPGb5G2gtfrjtVSuOM/Mo2F/J8CQ3VFH4VCkBl82YeueVantzyU7WqRkUhwOTNpObd4acZlQV+fpoyUNLgqBhQG8hdIj8/32m7t7c33lWUTABlZWVs376dZ555Rt9mMBgYNmwYGx3id3ji008/ZeLEifj7+9dprJeJo5VEIpFIJBKJpM7YhdHaxFkArd7MmVq7c0mV1H1HDqbz9G/FTD8WzqpMH9YcKeW1zw9y748F5E6a4rxau26d1v5yooqADVDepAlbBvdl8ajB/Dl0IGrTps5t7AL2pUCV+VJiNvJon3DuHBTJjy0D+aORH5/EG7mmtw8/Xt/n4pgvBoNmUWQnKYnrCxSe7zYCH6OmGLLnJoj3D+XLwZOJOJLkHJy1b9/LJw7HOcYmlAb5AMTHxxMcHKx/Xn31VZf7PH36NDabjegq7k7R0dGkpaXVOOYtW7awd+9e7rnnnjo
fr5xFEolEIpFIJBLXOAijNptK0qE0FqxK5vkVxfxrjx9/BLfD2jjeuc35EEYdApXlZxfyxhaV44ZQbFZV3y5UQfqJLP730VaYOtW5/dq152qk558qAjY+Pqy+4Rp6tPDjpvJUpuUf53pLMv1aB7P3lgkXh4BdV6oEtvtgYHM2RvtQmeQRbEKbOy+pRzly83XO7S/U+TJokPP3WbOYbPFl45jH+G/vsTzfbSRfDLqN30Y+QNuTmTBrlnP9gQPP3VglZ0xycjJ5eXn6x9FSoSH59NNP6dSpk9sgkZ6QSgaJRCKRSCQSSXUchFGbTWXjuqO8fiyMWXRkqxLLxtNmXl2Yyt/3hlJ27/3nTxitkrpvX2oZJ73CXVe1qWxcvJOsiMaQkFBZsGGD1s/lQBUBe8f1o7kzYz8FZc6BA9OLCxl/chcnJ93k3P5CFbBrS5X5UhIfxxf++bpSoSpGxcCX/gUXx3yJjtZcluxYLDBjBgFvvsV1RzK4rcDIlXuPobz2GsyY4RzksX//SzfA5wWArSK7RH0/AEFBQU4fV64SABERERiNRtKrPIvT09OJiYnxON6ioiK++eYb7r777jM6XqlkkEgkEolEIpFUx0EYPX44na9KEkjyikQPcC5ACMGBXSf4cUvO+bMOqJK6b7c1xMmCoSpCCFKPZWip++zYU/dd6lQRsElI4NWyTBRFqZYcT0VQrqp8pBRdHAJ2bakyX9K6tEP1ELXfJlSOFV5E82XSJGjXznlbUhLMnw8ffqj9dXSRAK3+pEnnaoSXJaowNMinLnh5edGjRw9WrFhROQ5VZcWKFfRxdK1xwXfffUdpaSm33XbbGR2vVDJIJBKJRCKRSJypIozuOFlGkleEy6pCFSz5eiN07Hh+hNEqqfuMUREYjJ5fcYPCAi7P1H1VBGxLzx5szTjpVsi2CZXfThy6eATs2lBlvvhFeF69N6AQ6uV/8cwXk0kL0Olo0eCJ/v0vrYCeEicef/xxZs2axWeffcaBAwd44IEHKCoq0rNN3H777S7dLT799FPGjh1LeLhrq7CakLNJIpFIJBKJROJMFWF0R3kYuLbIBeB0Wh5CCJTevStXSe3C6NnOMFHFVLhfv+Z8tzLFZVXFoNCsQzzxbWJh1V/OhZdD6r4qArYtJARSPTexCvXiEbBrQ5X5Em1R6RjSiP25KXr0fkdUBGMad4UDVdx/LuT5YjJpGTCGD9csijZsqJ52tm9fLQaDdJE4Jzi6O5x5H+4tbtxx8803k5mZyfPPP09aWhpdu3Zl6dKlejDIEydOYKgS7PPgwYOsW7eO33777YzHKpUMEolEIpFIJBJnqgijSngYFLqvHhTqj6Io50cYrZK6r40tk14ju7D1t90ItfKl3GA0YDAoPPjmbdpYN2+u7ONySd1XRcD2LyyiTUgEh3JPuxRfjIqBfjEJmtLJkQtZwK6JKvOFTZv4+/23cffGT7GqqpOiwYBCr4jmDIlpB3N/ruzjYpkv0dEwYQKMG6cp/CwW7dqFhMgsEucYFfTsEPXp40yYNm0a06ZNc1m2evXqatvatGmD8OBCVBvk7JJIJBKJRCKROFNFGB3cvwkGg+sXZINBYeRNFdHHz4cwWiV1n+H4cZ7/x1Xc8tR1BIRW5nbv1K8Nby57lk792sDevZdn6j67gG1n0yYe7NjH7fqoEIK7219xaSlkXKR67Hwqj7l9p9Irorm+OdDkw10tBzDjitsw7tt/cc8Xg0GzKIqL0/5eTGOXXJRISwaJRCKRSCQSiTNVVnsHhZfyQ8toko+kozpZByjENYlg/NSK1HnnSxgdNAgcgpuZ587h9qlTueXv15GVmouvvw9B4QFa4d69l2/qPruAbT9XSUlcXyw42W0gb+5ci6IoGBUFq6piNhh5s981dEnLurgFbFdUmS/MmkWHqVP56Mo7yCsrpshaRqRPAGaD6fKeL5IGQ8WAWs/1/fq2P5dIJYNEIpFIJBKJxJkqwqhX6kneen4q89ZksHT
+ZgrzS/AL9GHkhCuY+OBQAoP9zq91gD11X0XKTXvqPlNCAtG9e2tKkz9zNCVI1cj6l1vqPhcC9kNTp3LDuAdZeGwfmZYimgaEMLZ5B0IOH700BWw384WEBIJ79yY4NFSzypHzRdJA2IQBWx2zQ7jq42JBKhkkEolEIpFIJNWpIoz6ff0Fd0+dyl3/N4pSSznePmYttgFcGKu9kyZBVhYcOFC5LSmpupDoyOWYus+NgB2XkMCDvXtDaDik5sDCdy5tAVvOF4nkrCGVDBKJRCKRSCSS6rgRRpWEBHzs1gEX0mqvPXXfvHmVY/ZE//6awHg5pu6TAracL5JzioqCSn0DP9av/blE3iUSiUQikUgkEtdcbMKoTN1XO6SArSHni+Qccbm5SyiivvkpLgHy8/MJDg4mLy+PoKCg8z0ciUQikUgkkgsHq/XiFUZVVabuq4n0dClg25Hz5bxxqcpj9uN6e1tffAPq91wsKbTyWM8NF8U5ukB+ASQSiUQikUgkFyTnYLX3QE46y08dpky10j2iMQNjmmNsCOHOnrqvBkpsJWzL3kRWWSZB5mCuCO1DoPnCfIkvtZUz58jvHMg7hrfBi7FNBtI3qu2ZdxgdDRMmkHfdMLYfW0lRYTahQbH0aD4EX7O/yya/pmxiQfJvlItSmvg35f/aTibA7FutnhCCA/lJbM/5C1UIuoa2onNwy8pYHg6oQmV37h6OFB3FrJjpHtqNeL/GLvd/ujSbDw7PJr00DV+jHxObjKdHWGeXdbNLSlj8119kFBURFxTEta1bE+QmtWpySS6/Ze2jyFpKKxHN0OAgvNxE9P9o2xoWHv0TmxD0i2nGM/1H4+VCuWaxlvLin5+QWJiIgoH+UT2Z1u4ml32WqeVsz9nBqeJT+Jp86RV2BZHeEa7HmpvHW2vWcyo3n9igQB4f1JemYaEu6y48vIVZB5ZTKsqJ8grhjf630Tgw3GVdydnBhgFbPbND1Lf9ueS8WjKsXbuW6dOns337dlJTU1mwYAFjx47Vy4UQvPDCC8yaNYvc3Fz69evHBx98QKtWrfQ62dnZPPzwwyxatAiDwcC4ceP43//+R0BAQK3HcalqziQSiUQikUganAZc7bVYy3ls488sO3kQo6KgoGAVKgmBocwZNJGmga6FpoZke84WPkv6iDK1DCNGVFQUDEyIv4UhUcPP+v7rwtrUvbx9+GNMpnJUAQqgKOAnGvNh7//D1+TlVL/QWszazC2kWjIINgUyIPIKon2qC61L035j/onvEAgMigGbsOFl8OL+FlPpEdpdr1dcbuGerf/A25yNECAAgwJlNiPXxUzk5oRhet2C8mJe2vcpe/KOYlS0+WETKq0C4vlXp6mEeVW+c6dZ0vnvwbfJKM3EiAEBqKhcEdqTe1vcg5fBrNf95OgXbMn5DaMiEKB7qZuURrzT5T8YjUa97te7d/OvlSuxqipGgwGbquJlMvHa8OFc17ZSMaMKlTf2LuXrxM0oKBgUbR6Gefkzo/ctdA6tVHYk52ZzzU8fkV9qQjsDAAreJisfD7mBQc0q+118Yh3fnJxDgHcZ9syvBgWyigJ4s9vzNA2M1eseLjjCO4dnUGgtxKgYUYWKQDAyZjgT429yUsw89+ty5m/fg0A4eOkrjO3SjjfGjNS3WK1WRi75N4bgHG2DAMUANqvCFd49ebX/rdXmwvniUpXH7Mf1xtYBDWLJ8NQVf1wU5+i8qkOKioro0qUL7733nsvyN954g3fffZcPP/yQzZs34+/vz4gRI7BYLHqdW2+9lX379vH777+zePFi1q5dy7333nuuDkEikUgkEonk8sJuHRAXp/2th8XB89uW8fvJQwDYhMAqVACSC3OZvOprymy2BhmyO44XHeOTY+9RppZpY8CGQKBiY37yF+zK3X5W918XMi15vH3kQ4zGckATVu1yZxEneWKb8/v0xqyd3LP1aWYnfseytD+Yn7yYB3c8z7wTi3BcY9yavY15J+ajogm1NqG
d8zK1jJmHP+B40XG97kPbX8bLlA1o+zZU7N9ssLE4fR5HCk7qdV898Bn78hIBTblgq7i2R4tO8fyeWfoYytRyXv/rTTIsWeRYfDlRGMDJIn8Ky73Ymr2dr45/rff5R+ZGtuUuw6gIff9KxcfGKV7Y95ped3ViIs8tX065qiIAa8XfUquVx3/5he0pKXrd2UfW8VXi5grlRuU8zC0r5t6Nn3PaUqjXvf7nj8kvtSsyFOxqjlKrkXtXLaCkXJtL+WVFzD85Gz+vMv162c9XmF8hT+x4Re8zuyyH6QffoshaVHG+tHkImgJoadpvet3Pt+5k/vbdgEDR968AgoV/HuDjjVv1ujf+8ibGkBz9HFXoejAYBdvKtrHw8CYkkrPBeVUyjBo1ildeeYUbbrihWpkQgnfeeYd//vOfXH/99XTu3JnPP/+clJQUFi5cCMCBAwdYunQpn3zyCb1796Z///7MmDGDb775hhSHB4dEIpFIJBKJ5MIivbiAH5P2oFLdqNYmBCeL8lh28q+zOobf039xG69dQWFp6s9ndf914YO/FmE02HDhaYCiQLr1MGnFuQAkFiXz1sFPKBdWXXFgP8/fn/yVVRmVwuXPKYsrhFXXLE37HYCUotPYlFS3+zcqKm/99bm2/8IUtuccREWtVlcVKocLk9mddwSALdlbOVWcz4GccE4VBZFT6ku2xY+k/DCO5YewKmM9+eX5AHx9/FvdeqMqQkBa6SHKbJpQ/+GWLRhcVQQMisKsrZowXmazMufIepf1VAQl1jK+P74NgF8O7ibbYgSX50uh1GriX2sXAfDPnR/g712uKxaqnq9w/3wWn9DinKzOWEOZWoZAaBYiFR87S1J/xapaAZjxxwZ9f1X3D4IP128GoKjMQpFvBq5s1jWFg+B/e351edyShketcJeoz0e9iNwlLtiRJiYmkpaWxrBhlWZXwcHB9O7dm40bNwKwceNGQkJC6Nmzp15n2LBhGAwGNm/e7Lbv0tJS8vPznT4SiUQikUgkknPHttPJqB68dk2KgY3px92WNwQH8ve5FIQBBIKk4mOUV1g51BWramVz1jbeO/Ixbx+ayU+nFpNXnnfGYz1YeMRjucEgWJG6E4AlKatwLQhr/HhqKUIISmwlnChO1lfNq6KisjdvHwALT63BaPDsZZ1Vpi3y/Zl72KPiwqgY2JVzGIDduftJyg+riJzvuDIPxVYvThYGcKhAO/YyNcelggE0wdnLaGN79p/YVJVtp065nV82IVh/4gQARwszyS+3uKwHmqJhY+YxAL49uM1tPQ3Bhgyt7smSE7qLhLvx/nhyJQB78/ahCoFa4YIiUBAo2ncBBdYC0ixpAOQVl2I/P2bvcvyCSjB7l9t7pdBShqqqfH9wIyYv1eP5Ej7FrgslDY4qDA3yuVi4YAM/pqVpN1J0lQBC0dHRellaWhpRUVFO5SaTibCwML2OK1599VVeeumlBh6xRCKRSCQSiaS2eBJC9TruJKSGGkOt+q/7GAqthbx24C2OF5/AgAEVlZ05f/LTqSU81mYanYI71H2stRiHfeV+d55rKwI7qZZM8q2F+Bi83Nap2qc9rkJtUGqoKxz6PZJXiE1UKhaq9ERemQ+5ZSUV7Wo+ByaDsVbX1b5/Qx3Oq8VQWENNsBk0pVRtwt6V28r1/1fWdh6PqPhHP6cCvPzKiG+TTlB4pZIgP9uPkwejKC3SrmmBxVrj/j1MEYmkXlw86pAG5JlnniEvL0//JCcnn+8hSSQSiUQikVxWXBndFJMHYdQqVAbFNj+rY+gU1BWDm9dhBYVWAW0wOwQdtAkbG7O28fpf7/Lsnn/z/pHZHCo4Wq3t7GOfk1ysxSewC/sCgVVYeefQe+SXF5zBWD1nkLCpCiPjegC1UwiYFCPeRm9aBrRwq8AwYKBrSFcAJsQPxap6FshjfZoC0CO0jVvrCNBcJq4IawfA6dKaxqpQYtUEZz9juEvzf9BW/EttJrqHdMGgKPRr2hSjG2W
DUVEY0lybWy0CIwnzcp1FQ9u7wqDo1gAMTIgBD8cFCq0jtLEGmkJcuko4jndATCcAQr3CcLTgcOwPwKCYiPWJAcBsLqdNz+MEhjpbIQSGFNO65wm8vMsRQjCyaTfKS4wez1dZjp+HY5E0JDaUBvlcLFywSoaYGO1GSk9Pd9qenp6ul8XExJCRkeFUbrVayc7O1uu4wtvbm6CgIKePRCKRSCQSiaRhKFdL2Zr9G7OOPsv/Dk7jmxPTSSzc61QnzNuP21p1d/nabFQUWgZFcFVcKxelDcewmFEoisGlkC0QjIq9Xv9eppbz2oH/8e7hj9mdu4/EouOsP72FF/a9zvfJlbEbskqz2Zaz06UlgUBQrpazNnNdncd6X5trsdnMLoVGISDBuyOhPoEA9ArrUoPyJAF/kyZgXhc3xqVCQMuyYGBEjOa6HOYThC8JWpkq8M+1EJJZjH+uBWyCctXIE21vB6CxXxQDIrq4tBIwYKBzcEvaBGoKiRJbzcvpGRWBF+9sdjuqUNzGGWju11nPLvFgr16oorrtgxbTQWFqhbu1yWDkvjaDXO7XqCiEePlyQ5NuAAxu0pHwkAJcKxoEvj6lTGinKQ6mtb+RAou3S5cJISCtIJCbE7RMEPnlniwkFGzCRnaZliEipkkORpNKVT2SYgCjUSWmaTYGg4EWkeEUJ4e6jl+hgmpV6BfSzcN+JQ3J5eYuccGOtFmzZsTExLBixQp9W35+Pps3b6ZPnz4A9OnTh9zcXLZvr4z8u3LlSlRVpXfv3ud8zBKJRCKRSCSXOxZbEbOOPsvPpz7gRPFfnC47xYG8TcxOfI5V6fOd6j7TbSiTWnbT13DtQmmnsFi+vOoWTPXIXFEbGvnGM63lE/ibtNTnSsWrsbfBhzsT7qd9UCe97o8nF7Mv/yCAHkTRrkj44dRi9uTtB+B48QmPq/gASQ4ZG2pLiLc/L3R4FNXmC+AUIDDa2IY3etyn170mdgheBjMGwMdQRpCpBH9jKYoWWpCb4q/R63YJ6cTUZnfhbfAGKq9BoCmAJ9s8RpxvnF73g/h7uPL3FG5+ZwvjP9jB9Z/sYvwHO5j0vy08vjOOeAdZ+cm2t9IvogugKSzsipxuoa15oeNduktDuFcAnl1SBLG+wQD0COvM1VETKFeNTudAFRBkbMU/Ojyut+odH8//rrkGPy8zRr9yzEFlGHzLCfLx5qPrr6eDg8v1pIReTGt7lW5ZYz8HjfxCmd33ToK9NIVM28C29G9ZQlhwIfboCXaFg7+fhR6tkhkQ1R+AnmFdCDSHkF/iU+16ZRQEMCS2A8FmbaGzxFri4fg1iqya5UJYfEE1BYMdxQChjQpQFAVFUfjnlePJPhyKUJ2DSVpLjeTtbcTTQy6sFK2SS4fzGpOhsLCQI0cqg9gkJiaya9cuwsLCaNKkCY8++iivvPIKrVq1olmzZjz33HPExcUxduxYANq1a8fIkSOZOnUqH374IeXl5UybNo2JEycSFxfnZq8SiUQikUgkkrPFb2lfkGZJqvjmLIyvzPiGZgGdSPBvD4DZYOSVK0Zxe+t2rEj9nXK1nC5h3RkY3fusx2Ow0y6oI691epc9ebvIKssk2BxC5+BueBt99DpW1crv6avdKg8MGPgtbRWdgtvrwro7FBS8ahELwRVdw5vzff83+enEZnZkH8TH5MVNCYNpFeT83hvlE87DLcfxw6n3MCqVfv82YaBH6DV0D3WOCdE/sh89w3qwM2cXeeV5RPlE0Tm4EyZDhahgtcK8eWSt+IbuRYcQAmyKoSK2gsC/vAyvNT9g25mCccAgmDQJH5MX/+xwByeLM9iecxAhVLqEtKJZgPNYe0e0ZGt2ItpcqX7NFaBtcCP9+7j4MTQPCOSLpO/JtVowG4yMirmSUXE3V2sbFKIQ3qqYrLIifVuEj5GAIOf9KIrCfa0HMTK2I7MOryevrIQe4U2Y3KIXRoNRr2dQDPyt9YOUijfIKsokMz8AIRTCA4oJDSjh/hb
3E1ChsLLYSvA1n8ZsKiej0A9LhctHkLeFxsEFFKsnEEKgKArlpf6oArfuFUJViPAKA8BkEh5DKRiMqt7vtZ3aUmgZy+vLV2IIKcRgVikvMhOtxPD1LdcRHiDdJc4VNqi3u8PZTejbsJxXJcO2bdsYMmSI/v3xxzXt45QpU5g7dy5PPfUURUVF3HvvveTm5tK/f3+WLl2Kj0/lQ/+rr75i2rRpDB06FIPBwLhx43j33XfP+bFIJBKJRCKRXO6U2krYmbMSd2KQAQNbsn7VlQyqsLEsdRbbsn+tEOAV1pxeSmJxe8Y3+TsBptBzMm6TwUS30J5uy/PKCyi2uV9tVlFJKtJiMLQJbIW/yZ8ia5Hbur3C3e+rJgwGAzck9OGGhD5u6+SWZfJr2keYFJuTWsSoqOzKXUT30C60Duzq1MbH6EOfiCurd2a1wsyZqPv3capYywihKGBSnK+xxVZMTlk6EevWQVYWTJsGJhON/aJo7BdVvd8Krmvcg48PL6esmg+EwAB0CYmneYAWCF4IweKU99me8xshXgaCvbQx7M77mZMlW7mr+ev4mzTrgN05yTyy9WtU4TzOdEsB92/6nG8G3k/LwCi931kHN/Du/jWUqZoouOzkIeYf28XMPhNoERSht28e0JyXO77M8vTl7MjZgVWU0z6oC8Ojh9PUv6leb0v2H9iEFnwxKqAYcI6hkGo5yfHioyT4t+REShAmN57eQkB6ZjAZhWU0C/WniV8jkopP4i42RGPfWCcF3cQrunBdl/asPZxIXomFlpHhdG8Sd86UeBKNhnB3uJjcJc6rkmHw4MEeI68qisK//vUv/vWvf7mtExYWxtdff302hieRSCQSieRiRVUhJwdKS8HbG0JD4Syb3ksgr/w0VlHutlxFJd1yQv++PG0uW7N/caihvReeLP6LLxOf596W72BQjJxvfI2erRMAAipiHJgMJm6OH8fsxM+r1TGg0DKgBZ2DOzpt/yNzCSvSf6DYpgWE9DcGMSx6PP0iR53ReNef/oVytdSlskdBYXn6t9WUDG6ZNw8OHKDImq9f29w4P050CqU4yAv/gjLid+cQklJEVmkKEd5xcOCA1m7y5Bq7D/HyZUB0AWvS/RBCoKKgVMwDX1M5I+IqA28eKtjG9pzfAJyOTSDILktjedpnXN/4YQBmHV5TUeKMQGATKnOPrOOVbjdqh3hsO2/uXelQRyOxIIvb1nzO0hEPEOzlq5dHekcyqckkJjWZ5Pa40i0pGDBg87D+nGY5RYJ/S0qK/TidFEWzhAwHgw6BEArFJV4kHY/G0kdTWIyOG8r7R+a67XN07FXVtvl5mRnZobXbNhJJQ3PBprCUSCQSiUQiqTPp6bBmDWzcCMUOK4d+ftCnDwwaBFXSY0saDl9jgMdyBUVfaS625rMla7HLeioqGaXHOVywjTZB5z/Olp/Jjy7BHdmTt99lQEcFhQGRlVYAQ6IGoqDwXfKP5Fs1xYEBA30jruT2hFswODjV/3hyFpuyfnPqr8iWz08pszldmsr1je+q83j35W9xm8JSIDhRfIgSWxG+xsqsCnnlp9mTu4ZCay6hXjF0DhmE7+kiWKcFqSxTS7B6GVhzfSvWRbUkrSQIgYJfaCnN2mfTM+U4/X9yCNi+bh0MH17j/XYwfzMB5kQGRXuRVBROdqkfRkUl1jefeP8cDhUmYrFNxMcYwLbspSgYXCpPBCq789YwMvYezAZv/sg4jOpmMdMmVFam/QWAVVX53741AAR6WWgSmIPZYCO31JfkglCySov4IXEXd7VxbzniCn9TQI2xOfwr7pdusbHsLNmMr7GUctWIKgwoisCoqHj7lhPsCwkhIQAMjOjN4YJj/J6+Vk+Pav87JLIvQ6L61WmcknODTRiw1dMSob7tzyVSySCRSCQSieTip8Jn3C4QVaO4GFas0D79+8OkSWCSr0ENTaA5lGb+HUkq2u9GEBR0DRkMwPGivageVnkNGDlSuP2CUDIATGxyA/v
3HgQh9MCPoCkPon0iGRI1wKn+4KgB9I/ow9GiRMrUMpr4xeuB/uzklmVVUzA4sj7rV4ZG30iAOaROY7Wb6de2zh+Z37My/UvsIRpVVH5Pm8PtWzvRpKKOQTGw6rpWLAzqhlqiYM/bUGz1Yl9uDPkR3oSOD6f7EoedrF0LEyZ4HMfRwh0YMBJgLqNjSIpDiVIxTpXjRftoE9Sb7LIUt644Wt1yCq25BJuj3CoYKutqc+9wfga5ZUX0iD5By5CsimwQmjVF16iTrDvVnIUn9tRZydA1pDfL0n5yUyowKibaVgQW7dfSTHpGNgAmg/O1U1UY2DkHX7Nm0aEoCnc3m0Sf8B78nvYHaSWZRPlGMCy6P52C20o3iAsUgYJaz5gM1XOlXLhcPOoQiUQikUgkEldU+Iy7VTBUZd06rb61ZkFMUndGxN6BUTHpmRrsKBho5NuKziEDAdyutDtS1Z++LuSXF7Hg5Er++9cXvH/4W/bkHvboplsTCf7xvNjhKVoHtsCo2PBSrJgU6BN+BS92eApfh0CRdkwGE20CW9EpuEM1BQPAsrR5Ne53Wdr8atusajmZllPklp122aa5fwe3KSxBIcwrGn+jNp49uWtZkf5FRd4JtULxI7DZykhZ8TlF1jwAzM3bsDy8HTahVBF2tNwgycVhnG7VDRISKos2bNCkZA9o11jFgIpJsWE2aB+jYkWpmCN2xUKAKdRlutHKkSj4mQIxGYx0CmnsMoUmaKkpe4Rp48y0FNIuPI0WwVlaas68MoIzi/HLK8MLKwMbHyWvvPp5FkJz0ThdekpXWDiSW56BAZXqsRNExUjLKbLmA3CsbLvb4zIYoNSUTH55nr7tdHExn29K4sNfC/lupTcf/VrE5xuTyChyHQdEIjnXSBW+RCKRSCSSi5sKn3FHbE2aktqyHdagYBqbVUxbt0JSUmWFOviMS+pGI98WTG3xH5alfs6xot0AmBVveoQNY1j0rZgM2opsvF+7ivVi14K/io0E/04uy2pie/YB/r3/E8pUq7Y2r8CS1HX0DG3PPzrcjZfBXHMnLvA3qTT3K0ARmQAYFRNNfIswn+ECY44bJYEj2WUZ+v+tajkrMr5jw+mllKqaO1CcTzNGxt7qFGOhf8S17Mr9w02PgsGRY/UV7z8yv0dTFDhfB5/8cswWlXT1OM0DOpPfvRd55TvcjlNBkFISCb2bV95rxcWQmwthYW7bNfXvyN683/Q4DJX9aYEqVWGgsV9bALqFDuN48T43+zfQOvAK3WXnrpb9eWzbNy7r2oRgSkvNrcDXqNBDPUHr3zKI/zMbs6VSYVDuY+R45zAK2ziPf3fuH6xMn0dWWSoA/qYQBkSMpU/EGN0V5kjhXrwMCuVCrTBzVyqPy2DDqAiOFh2gh9cAskpPe3StEAjyynMJMgeTXVLCuHnzSCsowFahNLOqKj//9Rcbk5P56dZbifD3d9uX5Pwg3SUkEolEIpFILhbS050sGIS3N791Hchbe0+TlZgIQLC/D5NHDuX2a3wxfPoJWCxa5Vr6jEvqTpxvC+5s/hJF1jwstiICzeF4VUntGGQOp1PIYPbkrqlmAq9gINAcRrugvk7bi6157Mn9jRNFuzEoBpoHXEH74KvwNlam4jtdmsPL+2ZhFVYEFaH/KuS37TkHmHPsZ+5rOa7Ox5RhOc7so3/HKsr0bTZh5c/clZwqOcTdzafjZfT10EN1wrwiOVbD4nOYV2UGhK9PvMWB/G1OAmmqJYk5if/m9oS/0y5Iy1rRyK85E5s8yrfJM1CFrcKqRLNTGBx5A1eEDQOg1FZMRulxl/s1lakIBAXlWQBk+3uB+wQbCBRKbEYtyKoj9vvNDeHesRiU6gK2omiZFbyNRvwqrC46hQzkz9xV7Mv7i1MlQRRbvTAbVKJ9ConzEQyPuUNvPzS2Pfe26sXHhzdXiPcCUeEO8nj7QfSOaA5WKzE/fMPIFa4VF2aLjZZbMmm+aR3kfQGTJrE1fyU/n/rAqV6
RNZelaXPJLc/kmrh77EdQkYVDYLJZ8c0vw1SmYvUyYAn2QiiVtgvh3pGcKjnh1rpHQSGkwmXmk23bnBQMdmxCkFlUxEdbt/KPwYPdn3DJeUEVCqqon7tDfdufS6SSQSKRSCQSycXLmjVOXxe0uZL/bDjhtC2vyMLMH9aRNawbT0ydCjNmVBbWwmf8ouUCyLDhbwrG3xTstvyauAex2Ao5VLC1wrxfQcVGsDmSWxNe1K0eAFJLDvHt8WcpU4v1dJdHC7ew8fQ3TEp4g1CvOACWpm7EJmwu14UFgqVpG7gtYTT+propBFakfY5VlFVTiAhUMkuT2Zm7nN7hY/TtpTYLW7LXszN3C2VqGc38WzIgYihRPpW5CofHTGRbzmqP+x0RczMARwv3sD9/q8tjAoWfT31Km8Du+kp6l5B+tAzoxI6cNWSVpeFvDKJb6EAivGP1torifj5YvQz2SgBEFdtwY9EPgFExEOcbBhk5zgU+1d1IHDmYv85tMEdFAasoJbl4H039O2NUTPgbh7Elq6TCBkazwMiwBGIObUaguTJVZk5ZGhbxBWOaWEkqCKXYZsbfVEazgBxyyo9RUNKJwI++JmzPPtI8DVCAt9EE69ZhPZ3Gsqu2uXU435S1hF7ho4j0bkTrwM7sOriAFtsyabr7tAsLiUha3hIJodAvfDC7cqtfW9AUDB2DuhJo1u6j7/furaZgsGMTgu/37ZNKBsl5RyoZJBKJRCKRXJyoqpZFooKS2Ea8sTvDbfVvlu9k4tC7aZSQUGnOvWEDjBt3aaW3vIgybJgN3kxs+hwpJUc4lL8Fqygn3q8NrQKvcEpdaVXL+OHEC5SpJQ6r+NrfImsOC5Nf5o7m76MoCgfyE50CM1alTC3neFEq7YOb13qcFlsRhwu3Ud2/3o7gz5yVupIhrzyHtw/9m4zSNN0lJKnoKKsyfuPuZtPoFnoFACFe4fQJH8nGrKUuex0Qca0e9HFX7jo9i4Cr/eeUZ3Ky5AhN/CpTFfqbghgQOcZFfQ0vgw9N/NqRXHywmpBvCTJj9TERKTTBPerPA/S4qhk7s5Ncnl+bULkhvicsckjd6edHSaCRkwWbEEKlsV8H/KoonYptedREiU2LXbA/7zgzDy+qOGK7xkP7uzMnic8Tf+eeFlraz/WZ8ylTS/AzqbQPdVYjFNusnPjkWTocMGJWfLGpBgyKyunYAI52jKQoyJuAAgst92YSnlKETdUsZfJ3r6dtSTJ/jmmCKxQU9uT+wVXh42m7cA83LEmixFZUzRXCy2Kj759Ggo6+Df3703z8OPIKYggK0MZpj92oCii3GmhqHqy3zSst9Xiu8ktLEUI4BYC0qipbTp4kr8RCy/BwWkWEe+xD0vDYMGCrZzjE+rY/l0glg0QikUguLS6A1VvJOSInx0mI3hHWGOvJXLfVFYPC8m2HmNK7d518xi8aLuIMG3G+LYnzbem2/FDBBoptuS7LNEuCJE6V7KOxX0e8DCaPsR4AzAbn4xZCJbl4H7nl6QSYQknw7+qk5Ci1FeNewaBRYivQ///F8VmcLs2oGJ/Wzq4cmJ00k5f93yHES3MruKHx3UR7N+L39O8oqhCmA0zBXB19M30irtb7LLYV1hgss8RW98B/g6Im8kXSC9ULDAZOdY2gy4Gm2vekJG4qCGeXYgOhBXuk4ghBoWWQgVbHM/V7SwjB3vYl/HrkdmyiXOsSI11ChjMs5n7dSiXcq3GNYwzzagTAgpPrMSqatYtJ2AgqtGAut2ExmskL8GXByfVMaXY1JsXIvrzVbjNR+J0uQV23BkKuotBaSobwZ+/NjTjZ0v4c0Cwk9vdsRJMjp+n2reYiYxXlNN2ZzZG+URSFV7fQEAgspXkwcyaGAwdoF9SNQwW7KbDmVQR21GZDhHcszfzbaI3WrePAgd1sbBZGXKSJhMjTBPhYsNqMnMoJ4VhGFKdSdjOqSXcURaFpSAjHsrNdzkYFaBwc7KRgWPzXQf61chVZDs/KHo3
i+O+oUcSHuLcykjQs0l1CIpFIJJKLkYto9VbSQFRZ0csz+2BQFLep6wyKQmFJKcTUzWf8QiK3LIsDBbtRhY0W/m2J8a0Q0OwZNqoEwHTLunWQlQXTptVJ0VCu2liXfpTU4jxifIPoH9MSL4Ox5ob1JMNyFANGDykvFTIsx2js15G+EV3Yku3axx4g3CuY5gGVgu2p4r/4+dR0cstT9W3+xlBGx/2NloG9AAgwheBt8KVUdR2UQMFAlLcmjJ8uzWB//m63+1eFyoas1YyOvUHf1jdyJH0jR7ptAxDt07giHoN7RUOkd5zHPlzRIqAr4xo/wZKUD7GolUqKAFMYvW54HN9DXwGa0sA8+0MGj27Grkbx5JX6oAoFH1M5Eb7FtDiRxunVh4ghBICTJftZ2cYbm6iMxaFiY2fuUkrUQm5o/AwAnUOuZl3m126EZgMxPi2I8mkGwIG8E4Tm5NFtz0na7UvFu9Sq634sPib2tG1EVtPDRDZthbVCseGKJtuy9BSeRsXAyuvaUtrCq4o3iPbtRMsISsYmMOKPTLwU7VgStmexb3gjl33H/7QTDmiD8jJ4E2JqQmJ4JkmdfSgO8CakMJzrT0VhSC/U2+Ts2sn1GRH82Lc7ydnVFZ4HcjJJLsijSVAIk7t25aWVK90e2+1du+r/X37kKH9bvKRanV0pqdz8zXx+veN2gmtwZ5FIzgSpZJBIJBLJxc1FvHorqSfezoEEE3zcKxgArDaVlo0jICfFueAsvGQXl5fxW+IR0osLaRQQxLCElvjUY95ZVSvfn5zDpqxVTiv0bQM7c3vCw/jPW+CkYBBCkBYawZ9xTVGDg+kZ4k/Ugb31yrCxLv0of9+2kKzSIj0XQaiXH6/2uI7Bsa1ral4vvAy+Hi0TQGA2aDEWBkZ257vk5aSWnHa58j+l2RiMFbEIsktPMe/4M9UE0iJbLt8n/4vbEl6nsV8HjAYz3cNGsun0Ty6FfIHKFeHXAJBScrLG40kpSXb6viV7Db+mfktuRZDFUHMEo2Mn0jOsv17nirChrM5Y6LI/AwZaBnQmzOvMFKmdQgbSNuhKDhdso9CaS6hXDC0CumjWHP2Pw7p15FsLUEuKGfXDPq6IPa65FQR6419QSou9mUSkFnLaO4KY4BDKVAu7OhRSHFE9bScI/sr/g0zLJCJ9Egg0RzA67lEWp7xV4Q6iKZIUDPgaAxnT6P+0ZlYr/ZftIH5nFUVahWbAx2Kl567jhLwyHeOgIYT1jCZbzaCaBYoqaPxnLj4VWSh8WjSjtNMxFBfBJyuqY+4WBMn+2A5qiqgmu7LZPywOYXBWS/hnWQje/BdUWCnsLTnO0rEqaS3jsFfNFBbeECeZlHc9VyzYDBYLZTYbPY8ksrZDK04HB7ocR2G5Zk0xqXNnNpw4wW9HjuhKVfvfwc2aMblCySCE4L/r1rnIG6LFbsgoLOTbPXuZekVPl/uTNCxaktb6WVXWt/25RL5lSSQSieTi5Ryt3kouUEJDNUuVCsuVtqmJNI6MJiUrH1V1fq02KApB/j4M6dYS3lxQWeDnByEhDTqsn4/8xTNrfqOovAyjomATgmAvb94aOpqhTVtUb1ALF58fTs5l6clNHM5oQlp+MAII8ysiMyoJY8bzTP2jUDeRLjYYeSm8GYutJgxpJYi0YhAwpls3/jVqFOY5c5wybNiuvoq/fE+QbjmBj9GPjsF9CTI7+2zvz03l/g3zsAlNwLaf3dyyYh7aNJ+vB91Jl7Cazd7PlDZB/VmX+YXbcoNiomVAbwC8jV681uUR3vrrC3bmHtTrBJj8uLPZdQyN7qVv25K9AKsod6E40FwA1mXOY2LTVwAYHHULJ4sPkly8X3fHsFtX9IsYR8vA7gD4OmS6cIWC4lRn0amvWJm5yKlOTvlpvjoxk4zSU4yO1QI/hnlFM77xg3x/8j0UFAcFikKwOYJx8Q+43WeZasGEFwYPrmPlqoUiax5F1jy8Db5YhRUvxagpZrOyUP+sjH8
SkVpIRGphtT7s8yOtmZl9o2OqlTtyIP8PIn0SAOgUMpQIr3hWZ3xGquUwRsVEu6AB9IuchL8pRH/Wt92dRJF2aaqjaJtNBgOsW8c1JxS+vNaGMDkfs29+OSaLjSg/bd853VuhKEfdjtOggNlYBL0HoR7UlNlmiw2fgnJKgr2c6jbblg1oSsuC8mKWXidIbxmMoy5C+7/KFwFL6XH3kxjee49wXz9ySkvodSiRX67oXG0MPkYTTYNCoOL4Zl57LUsOHeKb3bs5mZ9PXGAgN3fqxJi2bbXjB07l53PodJbb4xLArwcPSSXDOcImFGz1dHeob/tziXzDkkgkEsnFy7x51RUMCQmUX3EFluBA/AuKMGzeXK/VW8kFjMGgucKsWKF9PX6cd28awJ2L9pBfpAU/A03B4ONl4u2Hr8fr4F/O86Fv31rH7LDYSvkjcwfJxekEmv0YGNmDaB9nYXzjqRP8bfliXQi3R4HPLyvl3qUL+enG2+gYWbHa7ODiYysqRBUqJsWI4u/v5OKTV57NT0lb2JjUEoSiB7zLKgpgQ2IgXQ6vJN8aTLA5FFUIng2KZ6VqBoSTZcfiXX/hZTLykkOGjUJrHsvn3cb2YSEYMCJQWZo6hwGRNzIs+lZdcfHRwT8qMjY4v+QKNPP/D/9aywd9b6nVeTwTwr2b0Cl4OLtzfyOzNIDTZf4oQJR3AWFexfSJnIivqXLVPMwriFc6P8SpkgwSC0/ha/Shc0hLzA7ZKgAO5W9w634gUEks2oFNlGNUzJgN3tye8DL78taxK3c5RdY8Ir3j6RE2imYBlYJh84BWBJmCybe6DmiootIzrA8ABeV51RQMjvyevoBBkdfgb9JW3XuEDSbON4GNWUs5XnQQb6MvnUP60TN0CD5VlBsWazFfHn+T48V/AipCQKRPMybE/40436ZOdbdkLWFZ2mxswqZbE/yS+jHj4p+gdWBPmDYNny/8UBbt1u+rqihAkDkQ+vdn76AMRGGKy3p20iyVgn12aSrvHHqX3bkKBdYEFARR3n+SVJTL3c2fxDjvGzhwAIutzB4ugbSYIPa3i6Uw0JugQgvtDqQRnZZPqa0Mk8mXkKO5tP8lhX3XNUYVmnCvCjCWqRWpMf0B8Ar3rAxRUAgyB0FoKH7GyhgGplJn1x1FFcT/mYWfqR0AO4JySW8ZhOJCLjQo4GMu4dewbK5JSKB1aQlHck/T4+hxfu3REeHwTDIoCpPadMHfXKnQMBoMXNe2Lde1bet23GU2d65FlVis1hrrSCRnglQySCQSieTiJD3d2UXCx4dTk27ijfJsliTtxJqiEu7jx5RrBvKAGI159myn1VuGD5cxGi4FBg3SlQwATX7+ngW3TmZBlmD9nkRUVaVX+6bcOLATESeTYNYs5/YDB9ZqNztyDvD6gdkU2yyYFCOqUPk8aRHjG1/N7QljdGH8vZ2bMVRYLzhi//bRri3MGDJKd/EpshaRWHSc06WnEQi8DF40LmlM/PIiDBUuPvuHxbE9uQnCKdieFl1fUVUithaRHlZGsDmU4wHB/K74aNJUFVQh+HHbPh66ug9RCQmUHTvIwYJtRO1QUK4KRjVUCiVrM38gwBRMnwgtM8Gq1IPVFAyO41ibfqRaRPuGplvYFD5JzCbdUoqCCigcLoyiuX8gD7S6wWWbRr5RNPKNclkG6H75nlCFirHisIwGM51Dh9A5dIjb+kbFyLjGtzIn6f1qZQoK7YM60zqgPQBLUr+pcf+/ps1nfOO79e+xvgnc2Ph+j21KrSW88dd9lKmatYGiaJ9MSyIfHHmc+1u8QSM/zarmQP4mfkn9uPJ4K9wVStUSvjn+H+5t+RYxPgn43nkvpzoUkbtiMS13p+NlqTx3ZT4mjnWOpctt70J8G8ypH9Z4XGaD5u5kVct5+s9X2J/vj/1OEShklAbyRVIWPpn/4fZ1msLCKlTKvEwsGt2JpGYRTv3t6NqE5omZXLnFiL+A1JKjeG2wcqR9GAGNyvExllN
i8+JQUSStS1MJKUmkRWBXwoqNxMc05WTJCZcuOQJBr7B+cDwHf1MgQnijUIrV2zkWiXdeOcYSldAw7XflYLsQBBZPWT85UZQMvXsTkpRE75gmbE47QWhJKdn+vroLRK/oxvy9Z+2eU440Dg4m2MebPIvrbBRGReGKxq7jSkgaHhn4USKRSCSSi4E1a5y+Jk+cwDWJWymylukCXpalmHd2rWdbXAKz774b03vvVTZYuxYmTDiXI5acDaKjtVgbdoWTxULQp7OYkpCgZZEIDdVcET6Y4WzBAFq7WiiaUkoyeWXfx1iFJnzZ/wJ8f/J3IrxDuCZuIKoQrD953G3kAJsQrE08Avs0F58CayE7c3ahClUXbsrUMo4VHiO/PJ+OQe1R1q1D3V+OtVVLdEnXgeDiEnzLyskoVmgdBBvCYjEVG7CqrlfnVSHYfDSZMb17k3lgBaqwYbbg0vR7TeaP9AofjVExYhM28OAP7CkWRk2oQmXT6SOsTd9PmWqjc2gTRsR2wdfk5VTnqV1zOV2qxU4QDmNJKiriX/vm8Wa3u6v1XRPx/h05UrDFjTWDQpR3gi4M14UrwvpiUswsTJlPZqmWltBL8WJA5FCui7tJV8ZklVVkoKg4ffazqCdnVNDb14WFKbMoUwurraIripZJY97xt3my3UwA1mZ86yYbhwAEm07/xNjGfwNgXI8H+G9AKd8MOkhkahHeRWVY/M3kxoXwULtphIZosQhifFrVOMZGPlrdVelLKxQMjkdemaLy+NLNlKtRmA3e2FTFQcHgfHCqUDjcNJr01sMI+3wBKSXFWIUP8dtzWR3QpvIceAsKzD6cKsmjRSCwaRPjp03mf4dfBVSn86Cg0MK/NV1DesLm6RSWl1JiVTEFGrEEOlvFmMpsqEIhvaSAOL8QREgE4Dk+R5hXqPaMAloEhxHp6096s3ZsMwlCvH24rnl7hsW3wOjC2up4QQ4/HN3DqaJ8Yv2CGNeiI82CKoNGehmN3NG9O+9u2FjtytrjNEzu1tXj+CQNhxAGVFG/mAqinu3PJVLJIJFIJJKLD1XVskjYSUjg+eJUJwWDXhXB2pRElrToyPUJCZWC5oYNMG6cTG95KVDhM+7kOpOUVF2p4Ei7dlq7WrAkZS02obpc5QT4Lvl3Rsb2rynDIQCjtmyHCjPtwwVHUIVKZqw/B1rEUuDrR4itkNYH0iH1NFll2UR4hxN69DTXZRWxoF/3av15ldsAAUITeEoCAp2zq7hACAGhoeSXnda3VTX9Biiy5nK69CTRPk2J8i0krTjQpTWDgiDCt7p/fm0oKC/hka1z2ZuXrAdjXHRqOzMPLmPmFXfSNlhbad2afZhjRa6FbRWVTVkHOVaYRvMAz6bvVekddiOHCza5KRVcGTG+Tv05EmAyE2kSlFpLEIC/USHEbMKgVD5zgk2hCGGfOpUh+oT9/wJCq8THqA378za4LVMUyC0/RYm1EINiJNXiPh6Bisrhwh36d5/TeTyzJ4HTq/eSlZuOTdgIMAUQExaJT/+DMCgGoqNpE9yXpakzsFLmbhR0CNGsQRanbK5QcbiYWyq03ptBiq9KU/+mpEUHu1QwaGh9RPboi1i7g8IUTTnUcV8Kawa01oM0CoPCvvZxBOxK0polJZGQaMGLXmSU7SDIS1v5t6kKWZYgboq7EeM+zc0qteQ0fuZyjnaNrBb00eZtxGRQSStJJc4vhKE+XTmquFYyCAHlNhPXN7kaEtfr24O8vHms77AaU+p+tG8zr+1YhcFBizRz7wae6DKAhzv307c9eGVvjmXnsOivvzRFhdCeYkZF4a1rRtMyvO5zSyKpDfLNSiKRSCQXHzk5ToJUfveurD51rJqCwY4Bhe+O7IbevSs3FhdDbu5ZHqjknGAyacE8+/evuS5o9eoQ/HNn7l8usxTYySrLJbM0G6PBQK/Yxk4v/o5EFRQyOiUTAIvNwmmlkIVDu/Nm7I18f+oqlh25km+PX8UHHUfy242dOCm0oG3
BXv5ccTiJiLyCan2WmY2AQqi3Fo+ge7CvbsWgCJXgkiIiC/IILilCESqKAj2bN9buIYdhVjX9rkSr1COypFLwrYJAoXtE4Rm5Sry0+wf252uCmE2oeuDAgvISHt46hxKrJqTuzDmmKyHcjXJXzrE67z/evyOjYh/BgBEFg/4XoH/krXQIdu8W4YkDeVv4LPFl0kqTMCoCkyIoVYtYmf4t3554W49rMCpmQpUzWtUlBkZG193iyiZKXcYC0PeiQI6DkskTCooWePGLL+D55zGsXEWU6k+7oLZ0DO5Agn9TfEptmtvS88/DF1/gpZoZHvug2z6vir4bf5O2gp9dVuRWPxdYaMGntJxCq5Za868O0bhWMGgjNShwIP8YolcvBAoC8C4txye/FKvNgE1VUAXs6hKPzWFVeO+bL3Jy0272Z8eyPaMxuzLj2J4RT2J+KB8umEnhB1oME6vQlGlJPaoL55YgM2U+RlS0sbbcn4I/PYBKSxWo9GTqHjIeH6M3bN5cWViLQLQrTh7h1R2rEGjWUfYPwH///INfj/+l1zUZDLw+8mpGdI4kMKQY36BioiJL+efw3oxuc3YzwkicsaE0yOdiQVoySCQSieTio9TZxzTXz8fjIrKKIL24CJqEOhfYYzRILn5MJi2Y5/DhmivMhg3OK/p+flqQx4ED6xyLw1iLNRlDRZ0HuvVm8y8/VCtXgF6HjtIuPBKAMrWcXwZ0ZWViDye5XagGTh8NY1VYV+KvS6frb6XE+gXibTTR62Aiv/Ryjjyf5+eLxcuL5hVm0l1STtDV25/Y3TvofDIJn/LKleRSsxdK377ElZbA5s0EmSMotOZR7lPd9Bsg0BRKhLdmSXBd/GAyLEvZmh5fkXdBWzVWgB5RJ7m+yeAaz1FVThVnsyZjv8syFUFueTG/pe7m+vieGGp4uRbgZCFgJ788mwzLCbwMvjT2a6mlZKxC19CRtAzoxZ685eSWpRNgCqVjyFBCvWLrfEwAqrCxKGUWAuFgpVChPlAEe/M2cLz4GhL823G6LANPQjPA6fIMQrwrhdrUkkx+SV3Pwfzj+Bi96B/ZlUGRPfA2Oru7CIFbRYMQ4GP0w9voS5xvK1JLjri01DFgpLVP1zPK4tNl2jT8TSGsy5xHqkXL8hHpnUC/iIm0C66MMRBsNrt01gAwl2kWNn4mLWNDYWDN6WbNRhPG8AhMioEyVYvd4VWmBS0VAhAGTgcHkNS1JWSCVbVx/PRJbl50gpToYPa2aUR+gA9BhRY6HjxFo/Q8koJi6RjSGC+DieOdwygKrz4OYVA40TmMNlsqFJJJSTynPMSHXrH8VbgSH7P2PCorD2do9HiubzIM9u6tcyDaj/dt1rPWVMWgKHy8fwujmmoBIbMthVz91Qxy8r0Bb0ChoFDw/O8bWHFiH3NG31Pj+ZQ0DKqof0wFF6F2LlikkkEikUgkFx/ezj7S4SVlmAzu/dCNikKzoArffEd8an5hlVxkREdrsTbGjdMsVSwW7TqHhJyxa0yv8E6cKE5FdSEGKUCcbxSR3poCa3CTZrw6cDjPr1uBVbVhNBiwqSo+BgOPWQ1E+mlzzty8JRuzfSsk0CovnkKhOMuPY37tISEHQ1ISfWKakHfkBEt7dkStOA4FEAYDzUdcg+/BRFBVjMt/55PgYNak55FfXoZB0VZzhRA0C/DhyqLTmhXH6dNENY8n3XKcY13Dqpl+AwyKGo+xQii/MmIMfxVsopH/PpIKQigu98LXXEZCYC5NAxrRL8J14EVP7MtN9lhuVAzsyT3B9fE9uTKiLZ8nrfRYv3d45cpssbWAn059yL68Tbr4GmQKY1TcHXQOqW7xEmAOo0/ETTWO2Sas/Jm7ke3Zqyi05hPl04g+4SNoXhHIESC5+BB55Vmowr7yaD+3AkUIzBjYlbOGBP92JBUfrsjm4PrZZcBAYuEhWgZoGQvWZ+7i9b/mIoTmyqCgsD3nAAtOruK1zg8T7BVYsScDBsVN1oyKaWw
yaGLAoMibmXfiFRc1FRRFYfBKnBQM5aqN/SFe/BLvT5afmU7Cj9Eni4lMc0iXWJHFp+XkybQM7EWprRiBio8xoNpehkS1YlfuIZdjLfcyYFJUoiqULPGlBo7gLoelpvbqFdoedq8hzCuYNEsuAGVmuxJNa6cKAwUTxsOqdAr+3KZb0MSl5xGXnlelVzhdqlkR5bVsw+6R7uMsHL8inDZbivTvhk8/5cGpU1G7TyajNBtvg5dudcTevWcUiHbH6RS3VnuqEOw6naIHYZ208GNy8u3KJ8Xp75qDeXzW5A+mdBxQ4z4lkroilQwSiUQiufgIDdVWpitWqv237+C6AZ34KXG/y5cvmxDc0rorfD6/cmMtzFIlFzEGQ41+zbVldOwAFqeswWIrraZoEMAtTUY7uQpMat+ZUc1bsejoQTKKCmkUGMQ1IREEbj+o18nr0JOSP9z7woMg63gg9G4NSUn4m7wIstoILLaQF+Cn79ukGPAZOggOzIJ9+yAnB9/TpxnRpi0ZJjOnC4pRFIW40EBC/HwhOxv++gusVsylpbRp051NvU1AcYWwKzBgYFDUBHqFjdJH42Xw4Y5m/2bj6Z/Ylr2UAmsGAaZQeoTeQN+IsXhXSZ9YG8yGml9DzQZNydEpuCldQpqxJ/d4NYFcQWF4TFdifbXrbVXL+fTY82RYkp3Wx/Ot2cw/8RYKBjqF9K3zeMvVUj499h+OFe3XAyWmW5L5M3cDV0XdyMhYLcZHsbWgQsFQVamlKXzKERRbNaHVpJjdxvoALcqAPe3m6dIcXv9rri4Q28sBThZn8O7hb3iuw9SKEl+EqBR27dPT/ngsFwqgnds2QVcwJu4hfk2dhVWU6UoPX6M/N3ndSdDm7/R+SsxGnuwayh9R3hX7h+WKjf9Gm3nLdyjDFq93yuKT1K8l8yyLSbWcACDSO5bxjW+nXXBHvc8rwoYSYNpDodWnosdKpUxBgA+KnwE/oyaY35kexqoOVKlXeX57h7fCaDTC5s0UWrVxWLzNFARUVygnWnJg2jTKZn8IC/YDApNBxWhQK86VglXVgvUZFQP070/KgHYkprxPU1NONSsRVcDJgDAyenSExHxto8UCM2ZgSEggxjEQbdXUylDrQLRmg4Fy1X16Svs9k1VcwLEMG/br7IpZOzdKJcM5Qm2AwI/1bX8ukUoGiUQikVx8GAzQp09l6sKkJJ67eijbA06RXJinR7q3pwCb3KYbg7KL6myWKpEAhHsH8+/Oj/Da/k9JL83SBUwvg5m7mo1lYFSPam1CfHyZ3KFr5YaUFKfyIt+ahHIFa5kRQkMRQrAuNYly1YZ3uXPKRZtQuXvvOnYi8LJb6litKPv2EhATS1qX7pRFRWPCCmtWw4kTlY1zcvD37sz9/d7jcMEO0kuT8TH40T74SgJMIdVG5GXwYVDUzQyKurlW6SpLbRb+KthJsa2QaO/GNPNv69TmivAWeBtMlKqu00jahMrgaM1CQFEUXutyBy/t/ZpNWQedokMMj+nKU+0qAzTuzdtAmuW41k4V+OSXYSpTsXoZsAR5sTT1MzoEX+nSvcITv6d9R2KRtqJvF+7tCo+VGT/SPKA9rQO7EOYd48F3WlM0KIr2Ct4xuAc/p3ztdp8CQcdgbX4tS92ox3KoiorK5qy9ZFpy/p+98w6Pozrf9n1mZrt6t2TJsuXejbGNCzbFYHqvAdIIqYT0QnrPF1J+hJIQQiAEAoHQezHdBRv33iWrd2klbZ+Z8/0xq5VWWskFUzM3ly9JO2fOnDlb2POc931e8t3ZFLjG0RTeFE/RSK5eYf2ejk/riyqYnnUS3XqYNW3PEjJ6SHfksiT/Yka93JB0nT8vHM3b6V0UerrI8QRRhEkw5qQ5mMY3Qzt44apLKf7HfQA0hOp56eHvseuUsYC1m+7XW7l57+84r/gSzi4+H4BnG1fjceioSpigrhEzVQTgUnU8rhjbpxSxqCpIhsNHaUuQb8eO44+O3QmLzF7DzIkZJfx
2xhdh2zbMysqEyLB1cknKSJ193Q2gaeRf92UezW1k0taNTNlejyMSF3GERPcKNk0YyZzzvwjHn8WCSICfbi9BIClL6+x7VgW0hn2sby3lumuvg38/9Z4Z0S4aUc5LNXuHPL5gRBlCCN6o3YVhDC0wgKAjMLTXjM2xxURgvktPhXd7/vuJLTLY2NjY2Hw0WbKkT2QAsu/7N898+pPcq0Z4ZP82/NEw4zJz+fSk2ZzZFUPcdVfy+YcRlmpj00uFbySXlZ7DAwdfoSHcjld1cWbRCSzOP/7wOhiQ4lMkDTRVQTeGSPFRBBOK86Gjg9ZwEH/UWjBFHMlf3STgbWun2t/O2Pguqa4obPJmcqAzBG9YzvVVCCoKc5k5ZQrK7t2WkV92NhgGSnMrEwrnMIE5hz0fhxIYVre9zNP19xM1+3xPClwlfLL8GxS5SwFIc7j51Jgl3LnvlUHnKwimZZUxJ7ci8Vi6w8MfZl3LgZ5GNnUcQBEK83LHJyIYetnmX01aW4TydS2UbWnHEerb9Y15VKqn59DsWUdR+dy+x02dte3baQq3k+PM4ITcabj7eRzoZoy32162nChMiaefcBHKcCIUlVWtLzA+fQbdMT9D+ywAkkTEVaG7mNnZC9nQsWpQRINAMDdnMXkua3e7MlAfj6SRqMJEUyzPh5ipxk0OJQeDDeS7szml8BzuOrALjxpF7devKSFiOjmp4KxEKkzECPO3A7+kOrgv0S5gNPPQwdvxvdTMRHU8Qgh6RhbxSEYn43OacShmYiffoYTJcoep787kQV+Ab5WXY1QeYG/PbkZtdrDqpLH0bcAKwqaDJ+seZXH+KaQ70tnm3wsInKqBUx28Q799VjFNuxrIcFhVWc5+bgenf/bT3OLaxZ7uWjIdPm4YdwkjfQWJFAQRnz+JZOO00pRPgxYXmRWhkFeex8q8saw+qQJfTwRnVCfq1AikuZCK4LKxVgnMHJePq0Yv5N4DK9njL6DA04NA0hbxEYh5OCG/gul5o62UpAcf7CutOxyLFlkCw2Ea0WqayeBipyQe0zTrZ65ncGpKMhJN+Qgl+dscNbfffju///3vaWxsZMaMGdx6663MnTt3yPadnZ388Ic/5LHHHqO9vZ1Ro0Zx8803c9ZZZx32NW2RwcbGxsbmo0lhofXlrPdLXDhM+h13cn15Odf3D0t94PGjDku1sQHLz+D3ux7m+Ya1KAhMJFFT58GDr/Ja8yZum/1VsuO58AAhI8KTta/zQuMqOqJd5LmyOKtwARd43Kgha9Ht27SBs487jqfX7UxE3vTHNCWXLZgO996VEBhCTgdd3sFh3/P3VNEZi8KUKch9+3inM0jtgOAAiWRfUyuCPGZNnGiV/KyosKJ53nwTLr0UU1o5/kdTJaI/GztW8mjtXYMeb4k08Jd9P+c7E/5IuiMTgM+NPQUhBP/c/wYRMwZYy6aTCifz42kXp4w2GJNWNHSpSl2n9JE1jH87taGkI2RQsaYF384/wMnnwpVXsrF7P/9v1710xQKJ59ejurh+7GWcUmiJSD16F1pLO9PXtTBqSxuOcD/hwq1ycHoubSd4YDREzUjKa/diyQR9z/knyr6AW/Gwuu3VRGSEgsqCvFO5sOSaRDuv5kYFvI4wTsWMx0SALhXChoOwoeFVrdfH1MzjWJx/Fq+3PI8GCGEZH+pSocI3kTNG9HlovNL8ODXB5MocEom3K0qnv4bWtAzyXSOomTaB0owVSQID9KViFKf72d19AObNo2rryxhS4AwbeLujBDL7i2yCiHTw74P38MWxNxzy9ebP8dI6dxLjdsaNXMNhHH+5g2+Vl1sVg7KzYc22pBQEIQSaIlk3aSQd2b6Uz4JHs94k/lgP+wNVVsSHEPRkJL/HFBReaV7LtCxL8PrapNPRFJV/HVjJwR5nvI3gzJJp/Gja+db9HIURrZSSZ+s28NDB1RwINOFTXZxZPIurR59IvjsjceqqlkpUp4ERU5OLvQhQHQZvt1hzsKR
0El7PEwRDToYSvaYUZQ079zbHDkMKjHdp/Hg05z/00EN885vf5I477mDevHncfPPNLFu2jN27d1NQUDCofTQa5bTTTqOgoIBHHnmEkpISDh48SNYRppfaIoONjY2NzUeXK6+0FkvvUViqjQ3AqtbtPN+wFiDJk8FE0hju4G/7nuH7k3vz8cN8b/Of2etvpNPvQY+l0eyMUB94llCxnyv3Oaz87qoqvn/aMnbXt7CrriWR2qMqAtOU/ODiUxjf3gRVVYkc6/VjRyEHpPgI02Tm3oM407NBUaiePYfvRdOYXbOfWbVVuPW+6hJhzcm/vXmM+t1PyHnoP1BVhZSS2uX/4c7xa2mKNeNUnMzLWcBZI84l15V3xHMlpeTFxodTH8MkZPSwpn05SwsvtsYvBJ8bewpXli9kXdsBYqbOlKyRjPBkp+xjWHQdbruNMZv9NA5ZrwCEUPCoabBiBe0NB/jZnA5i8ajy3uc3ZET4/e77yHamMyu9Au+Dj3P609tT9ucIG4xd28yMDSE4eB+5FywZtrIDgCn7FrKa4uCysms5c8Ql7A/sQiCo8E0kzZGRdM783Gms63gdrxpD6ZcC4cTAqRh41EwmZJTH71Fw0cirmZw5g7eaX6YxXEeGI4v5eScxO3tBwvTRlCar215GpjCe1KJWcHZTuJZ81whimS4yXEMLKFKCorZA9gl06J198xNN5R8gORCoBGBW1iTqQs2AxK3oOBUdiSBsOIjFQyDKPvtNuPvRw/6sN6VJQ3k6y0+exGD/BivuQyjWGBvDbYnnXcHEoRgIYS3oYqaKiUl1sDFxtioUbph4Gp+uWMS6tkp002RGdhmFnuTnCzhsI1opJb/e9hhP1a1PRGB0mkEerl7Fiw2b+ccJX6TEa0XsRE0DRQHhNOLVMgQIiRDWay5qGol0JmeWSjBEyjlAlUwqHp1y/myOPcfSk6GrqyvpcZfLhWtAtFwvf/rTn7juuuv4zGc+A8Add9zBs88+y9133833v//9Qe3vvvtu2tvbWbVqFY64YWp5efkRj9UWGWxsbGxsPrpo2nsalmpjA/B0/duJHe6BmNLklaaN3DD+Qryam0drX2F9tZ/6huKkdo0N2TxUZLJwywFG+6yykL777uX+T3+G52MOnt2wi+5QhIkl+VyxcCYTOpoSzvPFvnQ0obB2wuAFQWYwhDsaZVS6tShfkT0Cf1DhpYkzeXnCdDLCIZy6TlTT6HJ7kELhpK4I58ybh6ysZFf3TloiTXS3OiHTSdSMsrL1TTZ0vMP3J/2EQvcQEQND0B5tpjXaOORxiWSbf11CZOjFp7lYUjjpiK41iAcfhJ07KXCNpDFUBUBHiZeaaTmEMxy4u2KUbe1gXFs2WtxMsXb9Gyz2O1m+bMKg7hQEDx14nllvgXPnTrIcefhjbSnFC4Egz1kMK1YQq9lG+FQNt0sfJDT0lrSsCnQN6iPdkcnMrHlD3l6P3oZXjfXVCOjXt4aJUHuImTFU1VpsxGSU/T172dOzh4ARoCXaTr67hPHpU8h2WtUawkaQkBEgFbpTQQIhw9qBT+/pglRBAf3Gk+NSoLUDpZ/ZYMyZ2hdAU6wogMtKz+LlxlfJdvagxFNAADIcYUK6Rro2hpKM0iP6rDcXLuDF0SrZIkRnxE3M7PvMdyoGme5wIlUgXfMBEo8aw6UaiesLAaaMEdJdZDoGpx5kODycUjR50OMpOYQR7dute3mqbj2QHOViSIk/FuT/dj3LH46zolqmZxezsa0WqZhoqhGfM0FMVxFSYUp2EUII6gKddCtRlDwFs9MJep+hJm4DJSvCc/W7+DFnHN492HxoKC1NTgH66U9/ys9+9rNB7aLRKOvXr+fGG29MPKYoCkuXLmX16tUp+37qqaeYP38+X/nKV3jyySfJz8/nE5/4BN/73vcsU9XDxP6WZWNjY2Pz0eYowlJtbI6EumBrSoGhF10atEe78WpuHti1lvqGwYsJKSVbe0p5prSer7bHHwyHcd7
xV84vL+f8/ik+/7oraYfWoagULzuL1kznoH6dMYNxmXlkOq2d8aDXR3zrEikU/J7Bq0LdMCA7m/ZoOy2RJusaUZ1ecz4Tk5AR4qHqf3PD+G8dzhQlMBna9b4XQ6Y2enxXNDUlFp8u1cOo3Nk8eHoHTWOt3WUJKAJa58/mZOMa+Me/IBymOdLBpK066+aU0pmTbMZpIsl97DmMllGoQqHUO5Yufwf1+RmsLhlDu8dLQbSbBbUHKG01KHBbwlJk+wYmhnX2XFCYtGjtHUdLJJ1ufWghZihWtr2VMHIciBCgyhg7unZyXPZMYmaMP+3+NZWBSiRWJQlDmrzZ8gYbOtfyw0m/IceZi0t1owkNPcVzEspwEnOreGPW6yJnw244c1CzvnuTkOPywpo1lHlHsdW/jahbI5g++HULgpPzlgJWieE8dw+mlNZYRa+ppsCl6mQ5Oq1TjuCzXissRFu9GcOMUuTrxqXoqJgYUiFiOjCkQppqvTeKPXkUe3wE9LbEXPaNErxahBNypgx948eAJ2vfQRVKUuWQXgxp8lbzLtojPeS40rh2wgl8be2DuJzJz5nToROJanxuwnwAdne2gADhNlEKwxATYApwSIRqRTb4o6H39L5s+jARmO8yXaLX+LGmpoaMjL7ImaGiGFpbWzEMg8IB330KCwvZtWtXynMOHDjAq6++ylVXXcVzzz3Hvn37+PKXv0wsFuOnP/3pYY/VFhlsbGxsbD4eHGZYqo3NkZLnyqA+NLTQoCDIdPjQTYODjQ6GKq8HknsmTuOrHZlHnOKz6Prrubl6N3/a+BbVPX4Aclwevjh7JsfvaEk0nZHuQm9OvTPdO4rjyktg8wYaww2JcQ3cbTYx2da1BX+sk0xHVtKx9kiQllAPeW4fue5kESPHWUialkmP7k95fQWFcWnThr7XwyBmGtQE2lCEoMyXa/k2vPFG39il5MezStmVUUZJwE+aFsGQCk2hdFrDLsaPg4uuuw5uvTXxjE7dXMeKk8clXSerPciUrQ1QNAoAry+PNxd9hduietwPwVoU/yd3AX8ZUcaM5b3lGw0mbGlk87yRdOV78KpRBJKIqdGjuzERpDliR3zfQ81pL0JAQ7gemMlrTS9xIFAFcQvEeAskku5YiPur/sEN47+LKjQmZxzPFv/bgJlkEmkogoPT8zh5q7WL76iupWx/N9UVuaTK8RcCcnbvgyqDXGcODkVj+/T8FJUdJC7FwalFpwLwWO391nsr0Sx5ld8ca6Y+WEOxN757e5if9acXLeL15ufIcERItj0J06W7Ob/Eqm4RM2OYMjCkeAPQrbcOPngMqQ91pBQYepFIWiNd5LjSSHequF2phTq3SyfdZb2Xy9NzEmk7QgBOs1+lEQESfI5UApDNe4E8BtUlZPz8jIyMJJHhWGKaJgUFBdx5552oqsrs2bOpq6vj97//vS0y2NjY2Nj8D3OIsFQbmyPlrOJ5bOrcn/KYIhQW5k4h3eFFNwyCQRdDVxYQ+EPeo07xuWDMFM4fPZmq7g4M02RURjYOBDz2QmJH97imWoqzimjq6sYwk0URRQhOnjSGstwsWLOGiBkGJFG3RijlbjP4Y/6EyFAX6ORXm17mlbo9mPGvuyePGMePZp1OWZqVrqEKlZPyz+WZhvtT3r8iFBbknX7o+06BKU3ur1zJvyrfoiNqCSkj3Fl8vuIkzusX+rvOC1sqnCChsmeAr4SA/9v1NBed/TMoLyevYzeN4TYmb29g5ZKxSQvi6ZvqyHSkWR4awOMnLeS2lhqg98u+iI8LvtxQw4uXXMi4+x8k3ZGOIuqZtKmRt0+pIGj032W0/ABGuLOO+P4ztHT80bZh2xS7rR3L55qe7rvhgROAZHvXNmJmFIfixDRNVIykUpcAKpLK43M4eZvl69Gt+zn1qT28dN5k6iqy+/VtnTT6QAvjn9oL6UsQQjA9czr/nUlSGyuxRHDD2OsTpp7b/JuGGGvfeN9qXc7lZZ9JPnSIz/oZWSP
Z0GF5SAwUEDIdYSZljACgNtRAuF8VlFRs79rNJf3+jppRVre9w8aOLZiYTEwfx+L8haRpw+STDEORO4s9XQ1DCpkCyHFZ5rL3Hngr4eEyEAXBP/e/yYkFExiTkUuey0d7tBuHw0joL1KCYQp0XeX8Ue9O8LP5cJOXl4eqqjQ1NSU93tTURFFR6lS4ESNG4HA4klIjJk2aRGNjI9FoFKfz8IQpe2vHxsbGxsbGxmYYTimYyZycCYgBiyBVKKRpbr4w9hzAynUd2GYgmqL2hX3/4hewdKkV5t0fr9d6/Be/sNr18xARQjA6I4exWXmWIaSiwPz5ff1XV3PvgokUZabHr6ckSvXNHl3Cby5dZpX5q6rCrXoAwf6Uu83WcjDLYYkHzaFuLnnlHl6t35NYCEngjcZ9XPLKPdQH+3bZF+efzcJcK89bQUmUSnQqTj5T/h3yXEfm89DLzbte4M+7X0gIDAAN4U5uXv0Ae5v6qiM8PkIlxfrLuicBuiPMivq9MG8eo9OsFAdXWMfX02dqKEzJxB0NVKRZ/hnmqFHcFOwYcmwCuFMPQXk5+c5CnIpk4tYGhGlFPPQusgWSDEeYs4rOP+L7X5i3cFgzSQWVSRmWR0BQHy4M3orDaA23YkidnT1rEv327nr3/gvmuXlnkjUvXbEOXDGdsx/Zynn3bmLSujrKdrcxeV09F/1rA8se2YEatSI0pJS8Nj5IOM+FJgwUJAoSTRi4hMkD1fdixnfuo/JQUR0Cf2z4KI5ULG98esj3o4LCq03PxX8/9O6y0m/i2yLtfG/LT7nzwD9Z17GR9R2beKD6Eb6x6Ub291Qe8TgBzht5/JACgyoU5ueNJy8uMmzoqEopMICV4rO5oxoZP/7lyfNxOo1B1UBUReJxmtww5cSjGq/NkWNKcUz+HQlOp5PZs2fzSr9y36Zp8sorrzC/3/83+rNw4UL27duHafZF1uzZs4cRI0YctsAAdiSDjY2NjY2Njc2wqIrKr6d/loeqX+fx2hXWzqDQWFp0HJ8sP40ij7WbqgjBgpGlrKytJmU4ObC0fGzfA8cqxWfJEuj3JbL4kYd59rOf5XXVy4aqejRV4aRJYzhuVDFi+/aEoWSRewStkWb2zBrsVaKgMC1zBhnxCgd/3/027ZEgxoDFjSEl/miIv+1cxc9nnxmfB4ULR36GRfnL2NixkpARoMBdwqysRXFh48ipCbTxQNXKlMccMYPtnbWM8uXjVFRavOqwi3GAmp4OyM4my5HOzKzxbPHvwxnV0YTlXZDZE2GWs5Qit2WQ2DlzJo01qcti9s7D2w21MG8eoqqKsWnj2duzi6JwJ53pPiSgCROnojPGN5bjcoY2eByKRXlLeK7hKbr17pTHTys8Y8D8DjcJgrARIRQLgrSWt6nmTEp4Z1kuF745CdZuA0ARkuImP8VNQy/82yryeX5JCClMVEAVfa8biaQ12sJW/2ZmZM0ix5lHfbhmmLHC5IwZwx4fiCENauLmn6kwMTkQ2APASG8xmY4M/LHBZpxgiW2zsqYn/v7z3jtojbQn7qX3Z8gI8/vdt3DLrN/hVI4sDWFh/gSWjZjBiw2bkx5XhYJPc/HNSeckPRYbxvukf9nXV5q2o4jBlVaEsKq9vNywg4tHzT6isdocHceyusSR8M1vfpNPfepTHH/88cydO5ebb76ZQCCQqDbxyU9+kpKSEn77298C8KUvfYnbbruNr33ta3z1q19l7969/OY3v+GGG244ouvaIoONjY2NjY2NzSFwKBpXly/lqlGnEjajOBVHIoy+Pzccv4DVdbUpdxpVofDl404Y3Pm7TfEpLLTSKnrTL8JhHH/5C6eVl3PavHmQnQEHdsOD/0ryfsh2ZKMsWkx3bixRNg8sgSFNS+fysqsSbZ+o2jJIYOjFkJInDm5NiAy95LuKOb3o0qO/r3683Lg1aYz9iTlUDEwaQh2M8uVREhZsG6aEpJQwPbcEancDMMKTR54rG3XiJdR6YmQ7Mzg
pNoJM3x8T56g52TD8OhinqlrmnUCRuxiHcLCKdKKaFYrvEA7m5y7lspGfPIoZAJfq4rsTf8Ste/9Ec6Sp33wITso/hQtH9gX0a8KFLsMMJTRIoNhbTE+sc1hBRggwVAWuv57IX6vhjSeHHaNAwKJFbD4lF9lQB0PtzqNSGdjPjKxZnFF0PndX3T5sn7OPUJQRCBQUzBSlORNjiEfYqELlopKzuafqwUFtrPeCj5MKFgJQGTjI/kDqaAWJpFvvYU3bOk7MX3Bk4xWCn02/lBnZo3ioahUHg614VCdnFs/kU2OWJJV0XVIwkeWN21N6OKhCYXHBBIQQtIS72dBePfQ1ETxdu9kWGT7mXH755bS0tPCTn/yExsZGZs6cyQsvvJAwg6yurkbpJ2qXlpby4osv8o1vfIPp06dTUlLC1772Nb73ve8d0XVtkcHGxsbGxsbG5jARQuBRUzt5A8wrLuXW087h+6+/SHc0msidzna7+b+lZzMlr+C9GdiVV0Jb2xEZSorJk1n4la+g+NfwavNL1Ifq8ahuTshdyGmFZ5LlzEq07dGjw14+qEeRUiIOFUJwlHTHwkPmoXeleYi4HMRMa3f3Ey0qLw0jMKSbWUzJLYE1/0o87khLZ+nE0/siSNqSvQ8yQ2Fm5BextaUpZVi7IgRnjRlvVQeJk+vK53uzfouZlYWJiaa8+6/dhe4ifjH1/7GzazsHg1U4FSezsmaT60r2nliUdzKvt7wwRC+SYnc5LtWFPyYS5oDDomn0XHkFL03Zy+j1bZRvbsUR7ttNj7lVqmbkUjt7Mieccg1a86spBaG+EUgcwtrtn5U9l4KGR2iONKVse1rh2XhUb8pjQ6EIhamZx7HNvyGl0KCgMDNrbr9rnETACPJY7TPo0kgIOIXufL45/ksJr4XKwMFhr6sKlapgNSdyZCKDda7CJWUncEnZCZjSTIpI6M+nKxbzSuP2QaJbb2rIZyuWANZ7Zjgk0q4u8T5yNOkOqfo4Gq6//nquv/76lMdef/31QY/Nnz+ft99++6iu1YstMtjY2NjY2NjYHEPOHjuBU8vHsLxqP42BHkamZXBKeYW10/1eoWlHZSgpNI0FeYtYkLdo2OYV6bns9jenXDYKoDw99z0TGADGpBWgD+G+LxXBloklLK23ynhO8Ic4q76Q54r9iQV0rzYhDJU/zb0m4UuRYMGC5BSV7GzLG6O3ROLbb/Otq6/kU889Erci7EMVgnSni2smz4Rbbu074PVCVhaKoqAcQxs0RShMyZzGlMyhTfuuKL2cLZ2baY810FftRCKlQBNuvj7+mwBkObMxpEBFDpkuoUtrXselT+DfmT56TnezdWkJ7u4YjohJzKUQTnegKgblvsUATM+ayQPV/xpSaDAxmZVt7aCrQuVb43/CXZW3sbenTyRTUFhaeBbnFV925JMELBtxAdu7NiLk4MW4KjROLTy77zEhuLDkbJYWLGF9x2aCRogy70imZExIel27lKEFRrC8KA7V5nAYSmAAmJAxglvnfIofb36E1khf6ky208vPZ1zM5CzLR6TIk4lL0YiYqStRqEJhQsbR+aPYHDnmMagu8W7Pfz+xRQYbGxsbGxsbm2OMW3NwztiJ7+9Few0lTz8d3nwTVq3qWySDtehdsAAWL7ZSLI6AT42by43rnhnm+JyjHfVhsXTEVP6481l69MighauCoHHuTPKfr0s89uONjUzLmcNfHHV0yy4UqTItrYIfzDqHUdUNCV+KBIsXJ//da6jZ63VRVcVifw9/Oe08frRiOW2hvnkdl53HrUvPpqCyanjh4n1EURR+M+03PFT7MCtb3yRqhlBwMDVrJp8qv4Z0h2Ui6FQceNRJRM0dYEi83VHUqInhVAimO0ERjEs7GYBcVyaF7kX4o2+gKgbhTI0IIDBxCoO6UBY3jLdSZrKdOZyUfyqvtSxHmBJvVwQtZqI7FEIZbo7PO4FiT0livOmOTL4x/ofUBw5SU7cRRxTG588kLb8
MhllwD0eZdzRfqvgu9x+8k85YX2RKrrOAT47+EoXu4kHnpDvSEqkRqZiZNRVNaOgy9cLdxGROznFHNd4jYV5eBc+d/G3WtO2nMeSn0J3BvLyxlhlsHK/m5MKy43j44DspI4AMaXLF6LmDHrexORYIKYfy3/3foauri8zMTPx+/3tWc9TGxsbGxsbG5n3FNN+doWT/rqTk++88zWNVW1CFwJAy8fOc0in8cd75qO/xgnp9WyVfXXcvMVNPpCwoCNIdbu6cdx1jn3hpcBRHeTnMm2dFJnR0wJo1g1NIFi2yxJmBNDXBT37S97fbDdddR2zSJFbX19ARDjE6K5tpeYV9hprhfiHqv/jFEYs5HwSVBzbwwoPfYurOWpz9UiCibpVtkyu44ppbyS4dB0BntJsbt/yRsFFFliOEIkzChovWqI/rxnyGpUV9niNmYwPrHr+JnrdeRgtb6TYCQX52OeNO+zTqyaf0zU9TE7zxBqxePVgYmz/fMjc9yrk0pcn+nl10xjrIceYxxjf+XUXdPF73DI/UDvamEAjmZB/H18Z/8aj7PtYE9AjXrbqXrZ11KAhMJKpQMKTJNyadxmfHDR/B9H7ycV2P9d7X2S9+DofvyAxBBxILRHl22V0fiTmyRQY+vi9qGxsbGxsbG5tjhZSSNxr389CBjdQGOin2ZnLZmJmcPGJcUom/95KGUCePVK9hTet+VCFYUjiJC0fOIdvlA12H225L9qU4FJMmWWkm2hDBvffdd2yFiw8Tup5Ir+mIdrHdv4+o2YUiJIZUSNNymJo5Fp/mSaTXoGmEjQivNb/Dmy3rCephxqeP4uzixZT7igf1CxAzdbriJSgzHBk4FEffGHrL6K1efejx9hvDB4mUkucaX+KJuucIGpYg4hAapxaexJWlFx8T741jSdTQeaF+G0/XbKYzGmR8RiGXj57L9OyRH/TQkvi4rsd67+vMF647JiLD82f8/SMxR7bIwMf3RW1jY2NjY2Nj8z/FgAXusBzOovW9EC4+DKS4Lykl3XqQqBnDo7oscaE/h3NfRzJfpgnbt1u/T5lyeFE2H6K5jZox9vdUYkqD0b5ReLUjM6e0Sebjuh77XxUZPvh3qI2NjY2NjY2Njc2x4Fj7UhyloeZ7vgg2TSuKIhIBl8uKqjiSdJUHHxwkBIjRo2HGTIJuD95YBDZsSI7Q2LnTOm+4CI0B/RrSpDrfxbrxmYQzvEw2s5mxqx2tugb27eurxrF/P3rFaCrzBTsnZ2FkZTDNLGbMtmaUg/0qOhzOGN4nnIqDSRnjP+hh2HxE+CCrS3wQfOhFhu7ubn784x/z+OOP09zczKxZs/jzn//MnDmWwZCUkp/+9Kf8/e9/p7Ozk4ULF/LXv/6VcePGfcAjt7GxsbGxsbGx+UAoLIRLL4WLL373vhTvoaHmEXMsvAuampIFE7ebmosv45cHm1nx9i4koCkKZ02dxY2nLiXr3/f3eU2sWGHNQ6prDOg3qMHvFrlYV6KgEAACmDSRV5LFHzrPIP87qxJtI4113Hp5Fptn5qDQBXTxGNVUjBnNd/XP4b3nMMdgY/MhxRYZPmR87nOfY9u2bdx3330UFxdz//33s3TpUnbs2EFJSQk33XQTt9xyC/feey+jR4/mxz/+McuWLWPHjh243e4Pevg2NjY2NjY2NjYfFIoCOTnHpq9jKVwcKYdKAwkGrUoYr7xy6GiKN95I+rPu0su46I1NhKKxRN0O3TR5dtsuNtZk8Og1nyT973f2nfDmm9Y8DNOvlJJblqSzocjAqrnQV360PdrFoxse5PMTJ6Bs246JZGuxgavND+Qkta0MHOSu7He44brr4NZ+5UGHGoONjc2Hgg+mrs5hEgqFePTRR7nppptYvHgxY8eO5Wc/+xljx47lr3/9K1JKbr75Zn70ox9x/vnnM336dP71r39RX1/PE0888UEP38bGxsbGxsbG5uNGr3BRXGz9fD8EhttuO7x0DbDa3Xabdd5ATDPZZLG
8nP9r8BOKxjAG2LQZpqSus4t/B3TL7LKXVausfobpt2VEJm8VxZIEg16kaVC0aS+tXgHp6XS7BV3pGuO2NiHM5DGYmKzr2EjruBGHHoONzYcYCZiId/Xvo2Sk+KEWGXRdxzCMQREJHo+HFStWUFlZSWNjI0uXLk0cy8zMZN68eawexqU2EonQ1dWV9M/GxsbGxsbGxsbmQ0cK/4TOwkLWLFzIO+eeS/SSS5IX4NDnXTCQjo6kNIvYnDm8sH3PIIGhF1NKnti4w6qm0UswaEVyDNPvnikjEKQO7U7rDuMO63TGuqGggI5sJwKBM6zj7Y4Oai+RVAaqDj0GG5sPMb3pEu/230eFD7XIkJ6ezvz58/nlL39JfX09hmFw//33s3r1ahoaGmhsbASgcEBOVmFhYeJYKn7729+SmZmZ+FdaWvqe3oeNjY2NjY2NjY3NETPA5yCsavxk0iRmZ2bxiZZWrti9hzm7dnPPsmXI66+30jd6WbHCOr8/kUjSn+G0dAxz+P3RrnDEMpZMOjE8bL8yKxuG2Hd1xAwAFBRwuZCuvpKWjmiK6AvAIRyHHoONjc2Hhg+1yABw3333IaWkpKQEl8vFLbfcwpVXXonyLkLTbrzxRvx+f+JfTU3NMRyxjY2Njc0HimlCWxvU11s/7ZBaGxubjyr9fA4MKfnmyBL+M0AU6IlG+dVrr3O/bsB11yWf/+abyX+7XEl/+gIBcnwDSlX2QxGCcQW5fVUgehnoezag3+kyF4aIZIg5VCSSQncuRCLkmD5kXJCIOQf7SLgUFxMzJhx6DDY2H2LsSIYPGRUVFbzxxhv09PRQU1PD2rVricVijBkzhqKiIgCaBqi0TU1NiWOpcLlcZGRkJP2zsbGxsfmI09QEDz8M3/oW/OAH8POfWz+/9S3r8YE7ejY2Nh9/Psqi4wCfg71pabzocg+Z2nDzqlVEJ00a3rsgO9uqRBFHWbuGq+fNQojUixdTSq6aNxPWrOl70Ou1zC77M6DfzA3bOWvEwpR9BtI95GUVk+7wQnMzGR0hMh0ZxNwOgunOQe0vKDkHt+o69BhsbD7E/K+JDB/66hK9+Hw+fD4fHR0dvPjii9x0002MHj2aoqIiXnnlFWbOnAlAV1cXa9as4Utf+tIHO2AbGxsbm/eHY+m6bmNj8/HgWJR6/KAZ4HPwSn4+qhBDigyd4TAb6+uZN28eVFVZD/Z6F/RW2FAU6/5fecX6u6qKz53pZkNFGSv2HUQRAlNKVEVgmJKr583kNDPc1x9Y5ToHRhSn6PcL4bNwjXTydN2bxKSVBqEKhdNL5jPrrBPg0cehuxsBTDcmsXxRbnz707o/t+LmgpJzOGvEMti27dBjsLGx+dDwof+W9eKLLyKlZMKECezbt4/vfOc7TJw4kc985jMIIfj617/Or371K8aNG5coYVlcXMwFF1zwQQ/dxsbGxua9ptd1fYAp2pCsWGHtZl5/vS002Nh8HPk4iY4DfA78Hg+mMXwkRsQwDu1dsGRJnxgAOO65m79+9lpenDWFxzdup7mrh/K8bC4/fjoLQl2Iu+5KPn/x4tQXH9CvetfdXHvddVw+/3S2+/djSsnkjNFkOtOh7nXYtatvDHv2cuZXb2DuzOlUBqpwKA4mpo/HpbosgeHvfz+8MdjYfEg5FpEIdiTDMcTv93PjjTdSW1tLTk4OF198Mb/+9a9xOCyTmO9+97sEAgE+//nP09nZyaJFi3jhhRcGVaSwsbGxsfkYksJ1vS07n7fcBbQrTkZ7FBYEm3DV1/Y16HVdv+aa93mwNjY27xZT6tQF3qQ5tB6BwgjvAoq88xBC+fiJjgN8Dqa5XBAKwxCRDKoQTCkogAMHkh6POjTaQ34yHG68msuK4Fi0qE+ICYfR/nI7Z5eXc/a8eZA92oqi+O8DydEDYJ1XWEgosonu0NNIM4jLOZUM74UoKfrl1ltJKy+3oiuys6HjHSvtoaoK8vKg16g9Px8efJDc1avJTbRd2dd2wBi
MfBcd3f8gEtuHqmaT5b0At2P8kFMZ0mN0RkJkuzy4NceQ7ba3NfFs1UbCehdjMkdz/pjppDtdQ7bXjTYkOppSMGTKyZFiSJ2A7sepeHCr3kOfYPORQEqBfJciwbs9//1ESDnEJ9X/EF1dXWRmZuL3+21/Bhsbm/8pWrsD9ISjFGWl43Z8CL9kD0dTE/zkJ4k/pcvF3YXT+NueblRFIITAMEycTo2bl41j9lsvJO/o/eIXhx0uHdF1ntu5hzXVtaiKYPGYck4dV4Fmh+vafEiJmhE2d66gKrALVahMzDieiemzUIT6/g3CNK3FaiRiLZizs99ViHtPrJbX664noNch0LCKGxpkuyayuPgW3A88kxTBoJvd+Av20T0zhJmVRlZkMVlbMlEOHkzueNGiD6foaJqWp0w8ZSJYXMK87GzCZgx8UYTLAFNgBjREROOiyVP43RnL4Le/haoqYqbBxnADX73sOEIIFODkwpF8Y8rFlLkyj0yQAZg0CfMr19Hg/zo9oWfp26vUUUQmJfn34lWPO/x+TRO2b7d+nzLl8F4bkybRde14qv03IGWEPns5gxzfpynO/oUlOMVpDHRzx5aHMI1nyHIGaI9k4HZewJdmXEyuu28Br5smv1xzJ1nO/zIlux4hIBBzsqJxKhdU/IxFJeOShtEdepnmzt/TE9uJBLxqMfmZXyXLd9UgsUE3wxzoXk5TaAtCqJT5FjLSN3/QezFmRnmz5SHeaXuesBkAYFzabE4tvIYiz+hDz81HnI/reqz3vhY+eT2ab2jB6nDQAxFWnn/bR2KObJGBj++L2sbGxmYoNh2s50/PvsWGqnoAvE4Hl8ybxleXLcDjHHqX50PFww8nheY+Nf1kfrW2flAzAaiqwqOfnMOIB//Vd2DpUrj00kNe5kBbO5988FGaenpQFQWBRDclY3NzuPfKiylISzsWd2Njc8xoCFXxj8pf0qP7rTKBCEwMit2juXbMj/Fpyd91umMtbO54kdZIFS7Vx6SMkxjlm3n0O7PvgR+CKQ2er76UQKweiZF0TKBS3DOeRX/tq5LQI/bQcsl2YhP6xFMhIEoR49v+guPuB49adHxfGfA59/TZJ/Pd2G76b2gKAemml+fP+jK5+yvh1luJmQavNW7m5cllvLRgMhk9IZwxA90hMLK8PLDkG5S5M5NSSwyjlZi+HynDKEoamjYWVcm0LhJPLWnu/iUdPXcxuDylgiK8jB6xGk1mDp+y0p/5862fq1djyggxfT+G0YoQCppaiqaW9okGixYRvuR49raeAQNeA72MyPopeelWhY2mQDd3br6Os0euwjAFMj5XCpIna5fyzeNvJ8tlvWb+svlflPtuQhEmqtJ3b6aEnR0juWzCfylOywKgM/AIu1u/S5vhJRYXWlQMspUQFZmfpSj7B4nz2yP7eKH2G4SNjrjMI5AY5LrGs2zk/+FWrfk1pMH9VT+jKrA1UWkDQKCgCo3PjvktxZ6xh57PjzAf1/VY733Nf/Krx0RkWH3+rR+JObJFBj6+L2obGxubVGyorOOzdz6CaUrMfv8LUIRgVnkx//j8JWjqh3yHfsAOn1FWxlkNOXR0h1I2VxXBJafO5Fs1b/eF3nq98Mc/Drt7ppsmS/92D41d3YPM1lQhmF5cxMPXXHFMbsnG5lBEjDAbO1ex3b8BQxpUpE1kXu7JpPUTDWJmlJt2fZkevQtJcv6+gkJF2nSuHfOjxGM7/W/wbN3vkfH/FBRMDMakzeGCkT9CUwa7/Q/JofwQ+nOEfgj1gbd4q+GbQx4vfa6d2Vvm4FIziRnt1F7xOrEJDgbqJFKCoc1kUttNcOutfQeWLiVy4QIieh0ONR+PI/XOsZSS7vBbtAceJWa249bGkJf2CTzOCYd1H8NhyjDRqLWz73ROQRHupIitiKHzWNNO/nPqLPaMKkg6V0Hw+WgWX1u1H8JhdvsPsKepkfVTyqioacMdiSXaRlwa+twKPv25/4PCQmRjAz3PfxFWLUf
0+wiVHoF24pV4lv0KCgsxzC72181AkuwV0X8UeZnfIzfjq9afTU1WCc1VqwaLTQsWWL4KcWEnXP0c/uc/i/udHkS497NWInx5eE/5OepJZ0FhITVt36Iz+NAQ15eoIpNJJVsRQuVvm37FnJw7h5zvLV3f5dNTryesx7hvx1JK01pQReplUWX3l/j89BswzRBra46nyXBgCS29LzDr90wRYkHpizi1cnQzwn8rLyVkdDBYlBGUeOewbOSfANjmf4tHav6Q8toChTLvJD4z5jdD3svHgY/reqz3vuY9ccMxERnWXHDLR2KOPmKxsTY2NjY275b/99TrgwQGsEqVra+s45Xt+1g2fejc1g8FA1zXWyZOo2PPviGbG6Zk/c4aOHUY1/UUvLbvAHX+rtR9SsnGuga2NTYxtehDuAP6EUFKyc7GFloDQUqzMxmdm33ok/4HaY+2cPu+X9AebUEgkEh2dW/i5abH+WLFDyn3WeHc2/xv0613puzDxGRvzyaaw3UUuEtoi9TyTN1NSWKEGd8hruxZx4qW+zip8NrDG+C79EOIGjqbO2qJmjqTM0eQ7fIlNW8JbUKgIdFT3Jgkd2OAsGzFpWbSWbCL2AQHOoKg4SAqVRQkHkXHI3QUfRORiXm4ysuhqgrd7MG//CfsPj4MirVoTHPOZEzOL0hzTU1cRsoYla1foTP0AlaYvkk3K2jpuYeSrJ9QmPG5w7v3AUhp0tl9C53df0HKbgAUkUlW+lfILPgKIu5zUNnTihqJ8onn1lBXkMWW8SV0+TxkBEJM31NHcYufcOk03EIlumsPBaZgyr5GkBJnzEAxTUxFASlxv7WLWNUPcSxeQs+5Efynr0VZmo7qBxGRmC4wMwWG8gz5WZ/CTSGR6PZhBAYAk1Bkbd+fhYVWtNjFF1ufteEwuN1W6cl+4q5h+mlQv448WyVwZhqKXyIiEukSmJkGXu/rFOV9BoCu0LPDXF9gyC5iRh0OtZQ85+OYMvGUJmGYAod8CLiene1bKU9vHrJXwxRI+TJwA/7gizQbvWkO/Tu2fvdLD41d/6Qs52dU9bxGyGgfoldJXXAt/mg1mc4yNnW8ikAZJAxaLU0OBrfjj7WS6cgb5v6HGL/U2dixjrXtq+jRuynxlLE4/xRKvaOOuC8bm8PFFhlsbGxs/oeobutkR93QX6YUIXhq/Y4Pv8gwwHVdGeimngKHQx3suh4MWlubQ+SMb6pvQFMUdDO1o7sANtY12CLDUbKuuo6fPrucfS19X8Rnl5Xw63NPs8WGAdxbeTOd0TaARDi1RBI1I/z9wO/46ZS/4FScHAzuRkFNiAWpqA7uocBdwsaOp4dsI5FsbH+GhflX4VAOw0w7hQmrHFWGOTsDMxOU7izUdbXJJn47dyIfeIB/L5zAbbteoytmpS9oQuGCspncOO0MPJoVSaEIy4MhFU6/gRYywGO9d3tmBghJjU6zd9dQAJKIqdGDQZ4aojP0CoXz5mFU7qQ9+DKmjKB1ZaFnWQvInugWtjVdzvSiJ/A6LQGnsesvdIZejPfZ+5lgzXNd5y/wOaeT5p6bNDZThghHViNlGKdjGg6tdND42zp/TlfgTgT99sWln/au32CYfnKv/D60tdH2xv7EOSXNnZQ0dyZfC+gIBxhxoAYRidFckMvIxg5yOwOo/apSGKpCW5aPWE472ltvYVY+i/opUDUFNVuJzxbomAigq/s23K5FCDH8skEiECRHvuzqbOK3m55iX3cNEkm5bwTfmX4us/L65qEn+AhS9lhXVQRmdv/Fu0kw/DIxvRqHVoaUgWHHABDVG0AUMjqtJTEyDYmIz5GBQFVgTFpdfK5bhuoKAFWReFU/AC3h9UiGi/STNId3UgZU96xCSgZF0/SnIbiJTGcZAd2fUmDoT1DvOmKRIWKEuW3fH9jbszshTlYFDvBW66tcOvJqTi1cdkT92Rw9/2vGj7bIYGNjY/M/hD8YHva4KSWdgeHbfCgY4LqeT4xRRdlUN3a
kXIYoQnDS7LFWBARAKAT19fDrX1upF70MyBl3qRrDZRVKwKW9j0Z6HyO21Tfx6fsewTCT53dTTT1X3vMQT33hagrSbb8LgOrgfqpD+1Mek0iCRg+bOlczN2cJmjh0eoNDsXxX6oO7hl3YxGSYjmg9Be4xgOWLcLDnVfb6nyQQq8fnKGJsxnmUB6ej9E+RcLuJXTOe8Mi7kWbfAk6dNRdP7RdR7nk64Yew57nHuEMfS6c3A6GrSCnQNYP/7t9EQ8jPnfOvRghBsW8xOzruJj4QnH4DJWpiOhWUsAEIvFqRNe4MB53mwN1m66eOgt90kSN8kJ1NMLobU0YAa/e8DxNTRqnx38KE/FuRUqe5606GEjoAmrruTIgMUkq6eu6iveum+ALaGoPXfQb5OX9CVbKs8ei1CYGh/2h7A/H9PX8hM/1zaNdfT7X/AOqqVcMNAW9VNThdCJdkyoEGJIKgx0lbpo+YQ8URM8j1Byhq8+PasBVGjEDraSftCTfGpWlIKRFCIKXEJVR0aRKNrLGeVucMQqYPjzLEQl9KouqCxJ/PHNzEjzY9gi4VzPjCfEtXI59aeSffmXQG14xfCEAkuhkrMmQoYUwSiW7BoZXFW/VPU0huJwBkFIfiwJAKLqHjEwZqv+a6hIDUMKS1DCpN89ESEChDpEoYpkBRrLZSpMOQ14+PQlg+D92xxiHb9NIZrQYg31VKU7hqSHFQQSXLWZDy2HA8Wf8I+3r2WuOKv3DM+Hv+v7X3MyZtLKN9FUfcr82R879WwvJDnnRrY2NjY3MsKcvNGrYigqoIxo848nDM953sbEsQiCPWrOGLlyxM+d1bVQSZaW4uWDLNMqLbuxfeecfKFzYGfKELBi2TtZ/8BO67j1NHlw3yYuiPIgRLxnz8Xb/fC259Y3XKtB1DSrpCYf61ZuMHNLIPH3WhqmGPq0KlLmi1mZwxZ9goBlVojEubAXBYEQqasAQ9Uxq81fgjVjT+mKbQBnr0eppDm1jZ9DN2PPEFpOwTK2KfnEqo+KYkgQHAiK4nUPRD5LWXW+1Mg63tdRy3oY5IyEko5iKsOwlHXIS7Xayu38f6NqsSRI5rMiN7plP6XAfH/baaWX+qZsZttUz/Yy2T/9JIXn0Gjt4Ap664WWHKhaAgJDXSXSdARwdhvYreVbt0DWxv0BZ8ASl1YkYLRjyVASQqJhomCmbi/J7I24kzu3r+QZv/J/0EBuu8YPglGlquQEor7aM7+PgggaH/7wJJT+BJ0DSyrv0Ct1xxMqtnjCHsSjboDbsc7J8wirRRoyEaZWRXF4aisH9UPrvGFNGSm05nhpeW3HR2jykkMN6FomnQ2IRjj4FzTQTRYiTMPnt/akJBjaeotEYiPNkwKcWcWvTobp5ssISeqKHzqy0PMcLVzbfL1/D0zId5dtZD/GD0Kka7O7l513N0RcPxa7kTd3wgmMtzzZNZ3jKBzlifkacSX7g7ldzEXCZjLfxVJG7nVIRQCBkTSRfGoMWOCqQLHZPZAHgdhXSEPRhm6gWcqkhGp1n/X8z1nsJwAgNAjudkADxa3rBRDFJCuqMYgONzz8DEQEpoDqSxp62Ays4cYoaCQGFq5ol41CMTXaNmhBWtrw0pJCoovNG8/Ij6tLE5XOxIBhsbG5v/ITK9bs6YMYHnN+8atIMMlnfBpfOmfQAjO0IUxYo46HVdr6riVE+UH117Ov/3wOsEQtFE09HFufzmK+eQfWAPPPtsXzRDUdHwcawrVjClrY3Tx5SzvPLgoMWwAK4+bgaF9m77ERPRdd7YV8lQ+o0hJc9u3823l574/g7sQ4rrEGKAlBKXarUZ7ZtEhW8qBwI7Ui4uFuefh1dLB6AibS41wS1D9Uqalku201oE7fU/QXXPa/FjZryFCaZEXbOOTjmFbFcFctQoIqUPk8o6AQyk2Uh0zDZc5eU0b98IhsnMPTU8PWtGYuvLlAogCfZ4ePzgJo7PGol48EHmvRmjJRiwShf2W5kLMsjocEP
LOzBiBGlZYZg73EJQEDFqca9Zgxlf7JsegZ6RSoA1MGUUU1qfKRomTkyEIBEKb0qIoGDKYHz8YTq6fj/EtQ2isc0Ew8vxec4gFH55yCVr7+PB8EtkZXyBpcUTuaOsjOVZPl46YRIZgTDOmE7UodHlc3NHnRP18eXQ0YEmNCpLcvGneagryGTrhJH4fW4yesJM313L+J42yJkC27ai+iVmlYH2doTYud6kMUgp0eJPzIHuVuakt+ATCgGZ/NpyAKOdMZ5o2w2czT93P89Ybwd/nvgSmjDR4lECZ+fv4+y8fXxn76ncvv1hbpz1SXyes6jufITf7j2HLd3FJAQDYXJx0WY+XbYNt9uKkMhJ/yIt/l8QQwySGTRMPI4paKqVajXG5yMWHfwxL+L5IOPSrfeB1zGW0elptIYjZKkBvEoMBUlUqgSkk86om2l5F1vnykl0hHxke3oYLDZIYoaKP2xFaBR7jqe6500rDSaFAamJoMAzGYAy7yTGey/i95v24o94+82BwYkju/j2xM+meokMS1uklagZHfK4iUl1XJy0ee+x0yVsbGxsbD7WfP+8Jeyqb2Z/U29+t7Xbb5iS75yzmMkjPyL+AkuWJJV24+9/57zrruP0W77Amm0H6Q5EKC/OYcqYIsT27fC97/UJDGCJDOXlyHnzMLOyUP1+ePvtQTnj/5eZxU+nTuaxbTsSQoNTVfnk7Jl8+6RF78+9HiEHOzt5+cA+wrrO9MIiFpWNQkkhqGyoquMfb6xj9T5rt3jh+HKuXXw8M0cVv6fji+rGkAJDL6FYylXqEWHGF0KK+GgHbk5Kn4kmHOgylvK4icmMzHmAtQP9yfLv8Vjd39jSuTIRIq0JB0vyL+DUwr6yrYJIPEvb+quP+DlE6V3s7PY/Qq+3QX+cfh01ZNKp7LdEhrljMfU7hrwXiUQPPYtr3tfo3LiCqKHhNmJkBMP40/ovcK2Q/YPNO+G2KuSOHfjDbyPl4Aoykm4C2SppLR5oaCDz1U68S1wEJw+dOuLY2QBVVShCw5BRumd7UjgEShThRhEeFCQaBi4hE6/d3reUANyYxOIRJOHIWkyZ2jDWQiUQeh6f5wyk2ZMIvlewdtnBShzojZHoNYPURIybxz/KL/dVsLqzDH+6NV9eNcpXi1cz/5kANFr+Jo1aGo1ZmTx82mwqR+chFImUAsNQWDmhgq2N5fxjXzqyqhI6QWkxUd8MI8/2IRTFep4wEUKgxEeVqXYyMasOIRTSpCCCNRcOIXAIgS4FizM3AFDT9Q6/GfsaDmEmVWzQhCV9/Xrs69x0MBP4JA7HIn646yIOBtPpm1EwpMLDDTPxOifwg5FWJENOxhfpCb2IiK5BmibCL1EiElwCmeVlZN6/rWdOmuix1UPqyEJANLI8/rugPPPzeOV3rSKT8XNcGLiVEBkeL/m+8wB4s+Ygf1t/MjfMexGPFku0ldISx+7ZdDKLR9ZzXNFYxmWezprWO6zXrJRJbSUK2c7x5LsnAtARCfJ/mzrojvron45hSJXXa7J5vvggl4yZPsxrajBu1TPscYHAq3mHbWNz7JDHIF3CFhlsbGxsbD60ZPk8PPDVK3lq/Q6e3bALfyjM5JICrlwwkxmjRnzQwzt8CgutMni9ueDhMNx6K+7ycpbMm2elVBzcCQ//0zKkq67uO7e0lH3XXsvNHd28sm4rumkytaiA6y64iLOMGPz974mccefbqzn709eyp6CArfVNCAHHlxazdFwF6jCpJx8EMcPgx68t5+Ed21CEQGBFBYzOyuau8y5kdFafmeLzW3bznf88hyJEIqrljV0HeH3nAf74ibM5feq492ycaS4nxZnp1Pu7Ux63yoMevdi1pXM/Dx5czoaO3UgkUzPHcGXZUubkDh3q/WHGo/lYVnQxzzb8Z9AxgWBm1nxKvOWJx1yqhyvLvs5ZRddQHdyDKjTGpE3BrSYvKGoCa3CLKBGpYZLsLeIUOlGzjc5oLdmuMrqjNaQyA1Cj1mM
xM4iUEpl1qIUNGGY3ZGcjzb4KMa4UopIQcMIra6C7kKjZRsRoAiA80kH3LDexTBWH3yB9Yxgl1I63pRAFDVeXg4r/18z+H+YTnJDs3yKlIHd/Lp6nXo4/EAMEXXNTR4soMoQkBqg444vqVDvjUoJTxCM85HAVGAAkUsZNLrVyovpO3EKgxr0QAJxCoEtJWEocquWJEQ09Q5a6n5snHKAmnMaeQB4uxWB2Rh0efxS5rxNJGQJBQ4aH/5w6h70jCyGFNvVm0Ria5p5Ewf690Gk95qgxUbsUyFYTvgwhM4apWK+bMd5A78ciihB4rDCSBJqQTEyzntPpaXvIdab291EEeBSduRmWV8Dymuc4EMxM2RYED1S7+Mq0IJkuL1Ka5HZkoL8eRqyLQlj2CTReiXLK43DyBVCQy7DmFYCkb3yRyMuowqoYkrhy/N4ctBGObcHnmkfE0GnoyeI3b53HgtK9TCuoQVVM9rSNYEX1eDrCWcwdYb2WXWoay0p+zQt1P8CQMZC9EUACr5rHaSU/T1zr4f2baY+E4pKfxKkY6Gafl8Wft77JRaOnpRSLhyLbmcMY3zgqA/sSUmLy/Uvm5iw87P5sbI4EW2SwsbGx+R/E63RwxfwZXDF/xgc9lHfHlVdaZfD6u9pXVSVHI4Bl8thLdjabvv5Nrly7GSllwnNhR1MLX3vyOfYvms9Xr7sObr0VgL0tbTx8y+1snzYLsBYTaw/WctV9/+WWi87h9Ilj38MbPDJ+v+ot/rtjG0BSeke1v5OrHnuYV675LB6Hg0Akyo8feQkkSZ4Thml9Yf/RIy+yaHw5Xqdj4CWOCUIIPjt/Nr964fWUxw0p+fQJxx1V3280b+LXO/6FQGDGv1hv91fyg6138vXxl3J28YJD9PDh5NSC83EqLl5sfJSgYeX5O4STRXmnc3bxFSnPyXTmMs05f8g+dRlDCEmaiMRny1rQCEz0uB+/EY+ecKoZhFOU4zOc1qJHERpCCITfiT5CQROp88BNCSGZRkZHB75+pqkRx+CvpHmd3Ry3uxaKC4no9ZgulYYr0gcJB53zvXh3R8i6yYO7PoYQHpRaGHlHB6FxTrpnutEzFZROk4xNYYzqfERWJH6vksAcJ7F8lb4dZOunEvdeQJpIjGGzq6xj1vy5nNPoLXHZPzrBqmwAIHE5rdd3RtrXkZEXEr4Bot9FVMAtBFkZ37TmKPgEanxVX+ruodTdz+8hLFFaY0gZRggP9QVZ7C0bXqjrHpdPQflo2C2s67eYyHCyiuJRHMSEJcCoam7qjuIYUpDrstLHTsyNYUiRFMXQH10qzMu2DCRfr3sNlSwMFIRpktkvDcTvcxNVVNbWP8xppVdj3Ps9tNce7beA6fekBAMYL9yK8toGWHgCLJagHXpRrhvN9ISXA9JK/+lXQtPIFKBodPb8G59rHjMLLSG+O+rhxf3TWVk9HlWY+KOe+FjMRBuAkb45XD76PnZ0PkV9cCOqcFKetpAJmWfiUtMT7ZbX7UVgcEJhJXMLqkh3RjBMwc6OIt5sGEddEA50tTE288g8ky4quZw/7flVXGTomwuBpNCVywm5tsjwfmFFJb37Pj4q2CKDjY2Njc1HF02D66+3yuf1d7fvj5SWySNAURHmKafwtapaTJlsOtj7+y0rVnPeFz7DqPJywnv3sbG2nlkOBy9Onp4ob2lIazH+g2dfYsnYclzaB/+/065ImH9t2ZTyS4ghJY09PTy9ZxeXTZnGS9v2DpmOIIFAJMYr2/dx7qz3buf/qjkz2dXUyiMbtyXSdRJpO0tPZMGYI6/hHjai/Gn3f5Dx/3rpFRtu2/sYJ+bPIMPhO2b38X4hhGBx/pnMz1nKpvbt6EaM6bmT8b2Leyn2TMMf2RY3LwQh4ktgCRoxDJFBltMqNViRcRY7Oh4Y5PMQzdQwPCpZRhkA+upXqRvroEyLpMxDF8CBaDcj1qyhJK0QQRdBp5Mu7+BIgjm7qyhLz4yfa1J3eQbhCalTIAL
jXfh/fjnu7z5LV1M97VEf1Elcbp38WitiJqg72ddTQLeuU5FpogqF6Ph0/OdLnOgYKJjxCgUqpiUyKG4UxY0qD22SqSjWolFTi/C4zsSIPIvSLzpBFQJNSmLCSbrvMmvOjR2oQ6gXQgg0wNR3gHM8GAeR8UKRg9qGQeiAEgPhgVI1Xsh0cLSVQFLs6ibHmQ+jRvTJKrqEfhU2eqMZtHjZSIHbWkYPUZZRFRItXuYyyz0ePbRuyLlShUm+x6pqYJghcjs1Zu+oZubuGjzRvtCLkNPBpgmlqAV+ePI2zI2PDdkngKkfRDrnworVZBwI0/Vp95BCg4ibScb0etQWHd/bUTwboiihfp8dHkHoOCf64t2QCxNz8zmhuBRdf4tLx7/NuCyrJHRDIJMn9h/P5tY5nDEmOQos3VHE3LxrCekNCKHhVguTxCQAXRpcXLGB8Zl9JaZVRTIpp5GxWS3cu2s+uhy6EsxQ+ORKlmVuYU3PGNoNSwASmIx2tnKCb4NlzqqMPOJ+bY4cE5HyvXukfXxU+OC/FdnY2NjY2LwbNA2uuQZOPx3efBNWrbKqRPSiKJb/QlEReL3sHjeB+qq6IbtTheDRrdv55rx5HFy7DhPwRKNkhEP4vX0LOgl0hSO8vq+SZRPfu9SCw2VjQwPRgdUy+qEIwcqaai6bMo2Gzm40RUE3U39pVRWFhs7UqQzHCkUIfnXOUi6eOYUnNu+gpSdAWXYWlx43lbH5w++YDsXbbdsJGkOHqhvS5LXmjZxf8uHz0uiINhMxgmQ7C3ENkUv9/N49/N/q1exrt/xUCnwb+MLxc/j0zFmDFi2HQ65zBGqKiIPe8P9MLQ1NsRb1k7Ouoqp7OUG9BdmveoVQNPzHjWDclvEAyKoDtO8w8UzVKFB1a0EabyuB3TEPcncrVFURMx1IU7C+ohzZ3zdDgjBNjttXTTDLilpoyckhPKmD3soOvTvkhhTWDrgQNIyaSuFZksZHH8DZEiC3M0AMtbewIT41gleN0h416IiEyFu6jPYT3wAlgAIJsSVpLmQQU4ZRlQI0tQjdGLosYZr79L7zjJ19ng39nxsBDqJIMwwKREJPDl8RUUI09Bhu7wUI4Rl6J9MtrG/10prHef7a3poZgzqXCD5Tsg2Egqw62HdUEzCgwoY19vjzbTbiFBqR1K6eWHUQrDl0pn8dPfxAyvSS+DTgSv8W6DoXv7mJ49/QU06CJxpjwdb9LFi7GbQRyOIOEBAyNXbmFrBtUgkiC04wa6jY3oJSYyBlCCHSce7T8T0ZJnBx6veThgK6juPBNyh4OfXnnRKS+FZGcayphqX3wZVX8v8WS9q6n0p6Lgq9fr40/RU0RzkutW95JaXJAf/97O+4B7O9CSUq8frGUDH6G4xIX5pod9KIEOmuZgaiColQdM4s28WY9CP7XJTS4GDHnynUujg/exNdhpuIqZGuhnEr1nuzqvMOJuT96oj6tbE5HGyRwcbGxsbm40FhIVx6KVx8MXR2Wp4KbrclOPzyl4lmTY5DpwA0dfdAcQGhWIzeLF2XnjpnvKm7Z9DjH3aKMtMwhhAYAAzTpDDz6Ktm7O9u4onadVQHWsl2pnF2ySyOzxk9aCEshOC40mKOKz02RpOtET9KvzSJgahCoTXiH3zANC1T0EgEXC7Lz2MIv42uWBctkSZ8WhqFrqKjWtz3p7JnO883/JO60H4ANOFkds4pLCu6JklseHj7Nr7/8ktJy7DmQIBfvvE69d3d/HDxkiO+dm1wNb3pAcGok/ZgGi4tRp6vGyEgoNcT1Fvxanm4tWzOLP0HG1r/QlX3S5jEEKiUpy9l+kWXom2z0osEKmUPd1IjsmiY6CVfjaEhCUqVJsOBa3eU8ofawAm13fsAwdqxowethTN7QngiUeoDXYzPqWBlVhlp/iZGZrUnzYGmWOU1t9aWYUSamLlgAc1rX2VCeQ2iRYIGwuw1spSML2hib1k+DZd9i7xJs1DqbkI1ZIqin1a6RLygI0I
o5KR/nebO7w8xmwo5GV8AIBbbg2FUpo44iFdF8Pt/Rm7u34ilqPIzkFj8vao5ZxHVd6Vu5BaYeSpaqyUKFTX3cJvyHH92z+eCwr2M8vjpiHl4onkck9NauHzELpx7NyOq+rxqZL4KnqF9ZoRaiCoUXGjEpJH0PnMIFQ0Hima9lxWtEIdShm5Wp+zLoeSgiLFw220ct7OGJqWAiNknCPXNl2Ss6MDVHAJFoHZHqJqQxS9PXkbl6DwkVmLKP5nD4vEH+W73Gzged0BUoiFxvxMltNiJmZ/sOaIA7lgIbrsNbedONLUQ3WgmdUC6xKlVwIoVmK2N+C/4J0JNlkR6PUP12N1E9M/g0qzInl07f0TglfsZuzGImoiQaCbkuYLWJZ8l74xvQWEhYzL30RRK4T0a77ssvQVdduDk8NMlgrEqBF2JgWaoYfpbrygCWgLLbZHhfcKuLmFjY2NjY/NRRlEgJ6fv7wFJkCOM1A79ieZASUYGdHTgczoTaRSRFCkRUkJxRsa7HvKxYNaIYlyqSkQ3UIOghoSVYuwEwycxNcmJZVYKwulTx/Hrp14jnMpsD3A7HSydcnReE/+uXMmfdj2HKhQMaYWkP123gWUjpvOL6ZegKeqhOzlKCt3ZQwoMAIY0KHT3mV/S1ARvvAGrVydHv3i9VonUJUss8QpLXHiw+j42dLyTSMUY6SnjirKrGJ8+8ajGe6BnG/cc+BmYJp6uCFrURHdGeMd4kfpQJddV/BJVaIRiMX75ulU+MtXd/WPDej4xbTqjs7NTHB2aoN5CKKrx4q7pbG8cGS8dCXm+LpZN3MKYvBZCejtezVrYeLRcFhb9mLkF3yZidOBSs3DETQF7TVhVxYMahPJ/dRAqcdA50xM3aIxRtqkLd10Ml1oATgjrAd4ZO4HW9Axrs9zouztn1IiH5luv0Wi6lwxnOGU5QAFkukL4YwIysxnta0FVJNFRGo3XZ2B6BEoETBcY6YIpopGxJVMA8LkW0hV6BiFlvJqDiBsJWmUq3Y6pKCLuASG746JjCgNMTGS8hGUo9NQhg5pjsfUAbO46junet4ZuKGBz11xOyQen92piwQcJmQr1hpNuU0UVkK/EKMiMQHkeIi4yqO0KJz+3kyVX12IUO1GENb3nF+4DwNxpIp74LzS39l1rlAaZA0QGSaJYpNDGgGMGamwrmlBBkoiWEEIgMVE8l1j963U4ZBOKlXjTr4aJQENFI4B53+9QdlbjUKKclF3L6x0j2Z+bx+bx8XKbgTAXVG9nxp5GwBLc9E6F5a6x7B+dT6ICQ3yob3aVMSp/Jtdd92m4/T8oCJwI0tfE8J/T97njQsEjFJxPBRN+Ph7nHLpCL7I3J4fl5RNo9aYxLtbC0oM7KW7NxaFa4kls22tkikbaLxoqTUmhPfAEI3xfJnTfzXhf+hup6jeoIUnoxX9hrmxHOXEJ5oK2lAJDf0J6R+K9eDj4Y22HbBMxD2VSanOsMKVAvEuR4N1Wp3g/sUUGGxsbG5uPN9nZ1qIxvogcv28fo3JHUNPpT/Jk6EVKycXTp8BfbqcsO4uNdQ0ENI0ud3LIrQCyvB6WjC1/H27i0GS4XFw2cRoPv7rZys3G+kKvRCVajyCt2M054ycAkOZ28bMLl3Ljwy9Y1SV6c8YVgSklP79wKT7X0OX/hmJ9WyV/2vUcYKUm9P/5UsMWxqeP4NMVi9/trQ7JvNwppGteevRgysW4Q9E4uWAW6PrwPh7BoFUe9ZVXYNEiwpddwO/3/JqWSHOS10NdqIY/7bmJ70y4kYq0I0+ZeXXLrUxZWUPpljYcob699JhHpXp6DbvOO4EpE87n9apKArGhxTFVCJ7avYuvnTC0yWMqDNPBv95ZREsgAyn7FpdtgXQeWL+Aq45fha+iIPG4aZrs7ryTyq5HiZrdOEQa5RkXMin7CyhxE1Z150482khCeg2euhieusHjTndac2VOGsWzZdP6Hen7Ah3VLOPJDKd1/vz8ACFfkFQIAeU
FLWSUFMDODnxqE6ZpdWd6BEaWmhSp4MAgGnsDj+M0VJGW6EORDHLh14TlsyClQVfPnahCokiSEiussocK/u67cOfejuAQ7x0JvVvK/zhQyi8nuMjQIoOzBSR0xtz8q7KIUypAdU6jQzubrT0r+sqOSkm76aBGuDlx2WVQ9zo0NKAEHcjmAOrdXSilGuYsF1qmAn4TsSGCVp+JCDYhe6xSm7JARS72pt5K76fqKK5zkLGtWNEdvUO2CqEKUQiOeJnFntsBUIWKiprwpbC6E9CiI964H1yLAUHYqfLqlePZWjySYMyJIiQBl4uX3FM46w97UNssUaPNnUlxcxfZHUE6sgcu9AVPtk/is4sXo5avRu6wxuddr6OcrSIVERepBKLFQF0bgXgXpiefX86/nmezXAkTVAWTP5ecyi2+CSx7fiWEw5gyjO+dCF0nutHzUwmmJnqkGe65jfCmx0lV9jXRUsYI6014V6xg7L49tF8ukJqCQCbcQawc/Hi6jyM/ZT9D4VRL6TGc+JRoypQVU0KnWXJEfdrYHC4frtpbNjY2NjY2xxpFsXal44iDB7l1bDlOVU0yXOv9/cZTl1BSUw1VVbg0lTmlJWwqK0fp54SvCoGqKPzhvDNwqO/dzvyR0ljlR9EFvf8Bid/DjVG6Q9FE23NnTeIfn7uEeRWl8fsRzB9bxj+vu5SzZx7dzvyDB1fFy8ANRgIPVK1MiA7vBU5F43uTrkIRStI4FBQEgm9NuAIfDrjttqEFhoGsWEH1Td+hJdCAOSBnXyKRUvJo7cNJjxvSYFPHW9y5/2f8ftf1/OPAL9nmfxuz9951ne67b2X6n15izJrmJIEBwBEyGLumBcfPfwX33Ye/Z/iUHCHAH05dLnA41tf4aO7JTBIYrPsSSATLd09Bj5eZNE2dl2suYGfH3wgbrZgyQsRsY3fnXbxYfR6mYlomrIsWke2agYoHpESNGKhBAzVigJT4tFF41BFW5MNXb+ak8t2JnfL+dPtcqD6DNFcRACP3HMQwrXFKqwgApuwLVJISstI6Yc0akHqfwJCR+vUY1a2KMz3hl5ASYlKgo2DE/+lYglsgugYpdQyzGcNsTsy32u+f9dFhEIm+A4DH+4lhXeCFAJfrNABaQj1cvfZy/LHeaAkSa9LOmJur11xBU8jyCwjpjWztWd0nMPT7GZQK26bvh4oKS1iVEcTeGHQaiBod9akA6n3dqE8FELUxaG9H9lblyVRhtAN5Qgpzy36LUylNCN5rPSQlhjQxpIkprfgPIVsg8qrVWN8/4J5F4h8Aq0NgWq/riOni/526mKox+aS5oxSk95CXFsDjMAi0K2wlD3NCOQBNmT4kMHNLbcq57TIcNEf8iHlzEtMpQhLRJa2yvvHrq6vDSTd3x6JJPJ/tjp9jvWYs60+4IbCb6isvBkARVlxC2toICX8QzH6vYRPfY3tg505MM0rvExobqdBxrpuWq320nOsjONIRb219Jqfva6PimRZcIopXieJWdDxKDI+IoqKjYmCYoZT3PBS5riJq9clDCgy6VMn2XXxEfdocPVIem38fFexIBhsbGxubjz9Llli70nEmPfUEz116GX/pCvDS7n1EDYPjSor53LzZnBjsgb//PdF2dG42n/78FwnvO8jbB2tQhMIp48Zw7QmzmVxUkOpqHwh17X5W7Dk45HHDlDyyditfOvWExGPzKkqZV1Ga2GV8t/4C2zprhxUR2qI9dEaDiTJ37wXzcifzg/Gf46+7n6HFbAQk6SKPayvO4JTCWXDffUklT+tCjazP6GTHZB/BDDejo7mcX51DTkNnok1g6zvMlR7ePqti0PUkJvt69tAV85PhyMSQOvdV3cSu7g3xrH6T9mgze3s2MzPrRC4b8SWU2/+C2Pr2sPchgZgZgxUrmL9/H2p6GsYQgpZumpRlHfmcrq/LGGafVdDYnc2Bziam5xezoeWXBPSalC1DRgPvNP+YeUW/g2uuQU4fB7etJHNVG2q/agW
GS2AuzkJ85QaYOpVxkT1cOX41uqnwWu2kRKqCRDAmq5UxpzdRtKkcAEdNF749UXomOAYssiVCgorEsbMFqqpQFC+mGSUw25V6Zx7wua1FaNToRB8iucFAQUgD04wgcKVskzRj8WoFqpqBiYaawiBRSitWwumyhM+J6V08XZvB0jc/z7Ki3ZxZuBuJ4NnGiSxvsiI+FhRaIsNB/71x083U461L28jUhdfhBOSOTYhmELtikKYj81RwCohKlFYDeiRoUSjMR45sQs7zQP5Qy4L4c6hvB7MeXRoYAwQ3IQUOnBB+DuE+HZR+JrlSJlJMFARCglgXBmFVDtmTn8mB0fmD/BgA1JhJu+6mO0sjKz0dw2GVUZ26o47XTxyPTPH8OulB5OSDcIGMWEJDpO91Lk0TdX0UoVjvmXDpSP7m9OMVIc4t3M7SvD341Cj7gnk83jiNTV2j+Kc7yE/Ky5F7KwFJ5vowjrOduFTr+jEp6TElkRZwvV0JrmwkUaRL0nqFj+AEV2KxLyUE5jthF+Q9nQYGCGlSvMFP24k+ovl9vkECcCsGpoSo2Y2PvvKYh0IIwYz8b7O26UamumvjkoklcoVMF5vC0/h86UWH3Z/Nu8P2ZLCxsbGxsfm4UViYyBkHIBym9L5/8dvycn47b56189fRAQ89CFVVyecuWsSs2bP42+xZ783YjsB0cDh21A92Jh/Iyt1VSSJDL+9WXOjFqznhECm+LuW9/erxTn0tX3jqJWKGiiGtPGpVCG7Y+Tr3zImxqF8Ew5bwAR4600ddxZjEAqBa6rwxtYXvxM5mysNvQjiMLg3Gbmpm+7xiunNTO9WHjTAZjkxWtj7Hru6NAIlyj70/N3W+xdxnGxizswNX3MvAMBT2a0Ws8E2iQ0tjhKOdRcGdlAVaUeKDyq3eyRUYPDB9fmKXtQ+JppiMGzF01YOhiBm+lAu7pN5Ny+ehNvDCsO3qA68m0lC6l9+MGeuga7obJSqtpHkVTKcA/yY8//cT3CddSPDcdDRF8unJKzh3zEY2t5ahmypjs5oYk9GC3qzBJiuc36mVkvfgdowr0wlO6L/gt0L1nbt1Mp/ZAwi8jul0Rd6kZ25qYcChFOJ1Tgb6+ysMnAdL7ugtAamqObicc4hE1wOphDSFNO/5AIQjazFkLP5o8vtLAjoKofByfN6zuboiytM1Vt7Bi40TeLFpQl/D+M9rKqzd7s7Qm8M8Axbt5xdT1GkgwxqMUKFZR7QYiKp+gocGcoQKpVPAm4ks3YO88DBEKjOAkUJgsIYpiRHBYXRaDzhPRUbeQscY5GGhdJg4QibCYb0/d07O7024GNRvzGEJawE9SFZBCUU9ddTjwx2JkdYTpjuj7/0okIz3dJDnxPpM7def7PdSEJ0mhGRvIQ7qp08knfX8ecpjFLm6E74fOc4A87MP8s+aOWzrKIF582DvchyAKyyhS0KO1YmCxKUK9DUhK+IDMIxOWj+RLDAQ7xsJ5kRBS1oJvvsjCKEigIJ13dSemZPUVkpLK/Nphy8w9DIzeyFh88e8WH83uWoTDqHTbboR6hSuHvMt0rTMI+7TxuZwsEUGGxub/0n8kTDPV+6hNRSgLCOL00eNw53C2M/mY0Q8Z7z/LjZVVYNFhf5MmmSd915wBKaDh0N9e9ch27T3HFm47ZFyxojp/H3faymN8RQEc3MrSHOkCMk+RuimyVdeeIaoYST5bRhSIqTk8Tv/yjw0HIqCP9bNQ2f4qKvIHrQAkFLyR+dy7r7uK3DrraRp6YSMIOM3NrF+afmg67oUN9lOa2GwqvV5hooNSG+LEHjjecg8AV1GCCsat+efzY6MssQO1XZRxvL0WZySsZHP790OQMTs4Rz/Ll4LTqLBm91X80BYGdunz9hOWKYf8XzNLCynqrMqYfg4EIciqci2FoKmjBIzVFZUTeDtg+Pxh71kuIKcMGovi0bvwiVjmLfeCju3EoxVWXMgBKZr8OI9ENuHe8UKfA0aXCQRmiDPHeD
UkTuTWhoFTiInjIVNACYiAkX/7CYyMkT3LBdGhoLaZZK+MYKrVgdfGBQPLkcpcvYc9Pz+IftW+QqBRkXBfYlHzfiYUiPiHg1WOktO5vdpaLmMwXn2KqqSQ4bvk9aV4gaQRvyfInvtEfvOMuNtpuXmcW7xWzxdP3lwtxIuLNnGpByr1KEqD/3+VVQ/XH895l9/jbI2jBjlQJaq0APETHAoyDRAEShuF5y4GPOUt6zylUMhLHNbqZajpxRYEsPFFBoqIDynE+v+ccrPAjNqYiJR1XLrgcxUdTgsetLdRNwaDqGDy8WIegN3ukFEqjij/Uqpxq9zXeEBUItgzQMgrRQi6RGYmX2TK3qzxkwrQsSZm8f16W9S6OpOCnyJBynw6dJ3+Ef9AtBmoghPosqniPY9XUIIpGGSuTFK72ZzdxEEJ7iQwoqK6ZOzJKqQqEhaR0coLy9HbrPmNW9DD7XLspMicHo/nyJGB071yN/nJ+SexnHZi9ndtZGgEaDQPZJR3vHHTFy2OTzsSAYbGxubjzkP7NzMz1e/QtQwUOIO+JlOF38+5VxOKh39QQ/P5r1C06yc8eEM//qzaJElMBxr8ekITQcPdwwjcw+9I5WfMZQj+rHh0lEn8FjNO7RHA0lpEwKBIgRfGr90mLPfPa8frKQlGEh90DSZvG8/1fmFVGTnsC6jk7qKwpT5ykKAJMzKohALy8tJ33GQlghUbGlhwymjBoRoSwpdOTgUB4Y06Iy1DDm+8nUthA1rfKY0uH/MEnbI0uQvjvHfX+2axcRTNaauBEVouBwxbtBe5L9j5lLVXIAhBSNz2plaVkO2L4xLOfLn9nOzTuKxXf9KeUwguWzyVNKclolhzNC44+1Tqe7IT6QrtIfSeH7XLLY2lPH/mv+Dsn8XMTOUiNwIlTjwz+qtLmGQuTGEpy5GzPQD4N0XoehpncYLU5eVleh4rvkeRFYS3vRg4nFXrW6JCgOIGHV4lbEwaRIFX7gdJfRfGrtuJWY0IoSDDPdiRmb/CqfWX7w71Jd2gYxXuPC4FlCUey+tnd9HN/o8AVzO2RTk/BlVtZz/nY4p9FcMUi3LXQ7L8NLlOYsfTPol6VqE/9ZMJyat97pT0bli1Ca+VLEGp/uPAORpHhpj8fGaEoffQIlKTKcglqngUCTZrumgaRiXjMKc2oD2YDfK6hiE+0IjhFvBmO9GXv99lOkzka3PQWwjQgKdJkSllVqRpSAFCPc51plm/SHmCgyzAwcgZWjoSi/O3rmxoj1m6538ewibOKlA1dQ8zjrgB38ENejhRLOGDY4ios6+9KF8R4ivjdjCnMJzENv3YlbuSTwDxvHO5EX7AF/OEaFOThxVOWR1BynhgqINsHcu0mjue8UMENAUv4SQRCgdADRNzsYkgIlidRL/sJESdCFQMWnxp1kREtvuBkALmTi7DKJZgz/zDXn0lSCciotpWYOj2GzeP+zqEjY2NjYfY5Yf3McPVryU+Lt3IdQVjfC5lx7j2Qs/yYScI3Nwtnl/iOkGXeEIGW4XDu0ozRY1Da65Bk4/Hd58E1atGhxFsGABLF58RFEEh42uW6aDO3ceui1YQkRbmyWOHEJomFZaNEx+vcVJkwZ7ChxLsp0+7j7hC/xq2+OsaduXGM/otAJunHIeU7NK39PrV3V2oAiRsmpIZiiELxajJ2p9Ud8xOeuQ/e3rqWLhvHl0b3kCRUicYR1Pd4RgppvenXEFSUeshogRxqm4cAgXsRSLAWFKRm1pQxXWgjpWMoZV4TyGWuQKYbI8PJIvl3vIqNRRUBmzrYU5Z+1nztgDSW0lMClzyaA+pJREzC4UNJzqYBFiYm4e184Ocvd6D0JITKkknPXLszv46tzxibbvVC+KCwz9x2vtH4dqHZirfTASFMWJ6RLUXp5Fz4TkqJX2+T68uyKMe9QZv0eV4q1jaVu0n9ggPwBBrvc00n3Hw/Wz8P/1EbTVKacqQczogJP7hLm89CvJSx8+EkkTWegyObS
+D4nAgab1ha97PadS6l5DJLoew2zHoY3G6RifdJamleB1n0Ew/BJgDOhTIHCS5rsCAFUt4c3mE/nauNe5evQ7bOq0jC5nZTWQ64jyatNSrhhpfRZ5aSOnNYhrTZScjUHUUJ9wYHgUtOMFsTMrcZTORH0yB2XlbuvoDCdEAUMiVZBOAWE3yu1/hUWLiE5rwbUmgFgfRoT63jvSI5Cz3URPakLLAjmEJ0fSjBlW2o4ZeZ0hKytkKeBRMfV6VCWH0TtdHD+hifXBggGvL+s9NuXUHrTqUmiuRQg3nqp0Fkyu5/ZZK6mJ+UhXY0zxtqK6FqJULYW7/g444meDeYIbpX+/WSp4BMTv1Vz9Msqkoe9JCMhVtsCaNUjilWs8Apk54DUTTf6lx1OEgwN9nfTrUMp4dIMxGbKzEaiJ8SqRVLKUwKcVDz1IG5sPGbbIYGNj8z/FrRtXp1yESKwv5HdtXcfvl5z5wQzOJiUdgRC3vbSKJ9ftIBzTcWoq58+ezFdOn09e+lHuzBcWwqWXwsUXQ2cnhMPgdkNW1lH5IRw2Dz6YJDAEozG2uNxsGzUaT34+p+bnULRje3IKx86d1nnXXDNs1wUZaZwxYwIvbNmTVC6uF6/TwYVzphyrOxmSbJeXWbm51EV2EzYjKCjMzsuh2JvxrvqVUrK35wBr2tYTNiOUe0tZmDcPr9aXk53j8aYUGACcuhXo7YqLNbHMNHp3Uoci15EN2dmEjSCKMBEIPLEYYZyIuBeCKqyaE116B/muEaihCUScWwa9jDxdUbSQgTCt3e6dYzJhx9C7UlIqdPhVmDcPtaqKEd6J1AW34+6KEc5K3oqdmXU2ua6ypLna0/UMW9rvpztWB0CBeyqzcq+lxDcn0a4huJJxBU/x9UUZrKkZS0NXFh5nlJkjDjK1sIHd/gCFvpsBWF09DknqcP25+ypp6raeX1W4eeecSXjHdcaXTckEJ7rYct55FD1r7fb6HGOZuGsWu4veJhpfoCrCw4j0qyjP/qYV0q1pdF02lcD8GjLWhklfH0bptxg2PYLu2W4ip5xJ5sTh3ycDKcj4AvX+3w1xVJAd91lIelQouF1zUrTvIzfnj8RaLiMW24blzGBiJQ05Kci7JxH1sLm5gd9tGkfxie+Q5Q4yLdcyb9UR7Ayk8f82jmZ6WTOTs3Jw/reKGWu7CZgaYanSK4woSNIjEVyrTNQ1vwf9CaSyGzOey28CVlVNkfAD0M0gmqnD3Xeh+NeyImsMk3PbyHNbz7EEGtp87PtvFnPfehyWLYIL8uLOCalft1YZy96olF6DyhTvR0VgznajrrEW02pjBd/2v8F9edN5uWsUUWmJyHlamE/k7OLETBNRMQdWWp+dwkyH9jLK/+lm1EyQOSNQQhchNsQQB/8Wv4RACCf6HJADS00qAmO2E21lvFpJ5XYcu2LEJg4RUSPBuasTqqpQlVwMKokdn8JUNP62VBTrvZBn+OkcYrYszUFQnNsNuzsSGx4SMF0Dqr1I6zkM6M1kqX3Rlv5YE9s7X6ZbbyFNy2NK5mlkOYtS3oPNB8+xqA5xtOfffvvt/P73v6exsZEZM2Zw6623Mnfu3JRt//nPf/KZz3wm6TGXy0X4CCsY2SKDjY3N/wzBWJTNLUOboxlS8lrNgSGP27z/+INhrr79P9S2+zFM6/+uUd3gsXe2sXLPQf7z1SvJSfMedf9SCGpQiGouyjIycL6XAkNTU1KKxO6uHr5dMJK9hSNQEMjmTn7e1MGXFy7ha+ecg7jrLkv8AOu8008/ZHTFTy88lfoOP5urGxOGYQAep4O/fuYCsrzvnR8CQMSI8q2Nt7K/pzaxMWxi8lrTBta27eAvs79NkSf3iPuNmTH+vPdO1ndsTpSmfFWaPFjzGN+ZcD2TMqyd5NPHjEWJlx4c+M0+GhcXyjKyACgNa2wVqUUG60u9YHx6MexuxKm4CJshdFOwv7WIho5snJrOyMJWstMDCAS+eK708nXpHD/HgdMRSxI
alLCJrqu0tWvMygdnQTrsGDr8WQgTrzdqGYEC+a5RqGhkm14asHLJPWoGc3IvYl7upUnnrm+7ky3t9yU91hzewYt13+TkEb9gdPrJABzoegqBQmF6F+dN3jBoDA3BVYT0VjxaHi2B5JJ/iXGaJsdVHSSsm0gpCZaU8OdIDt+Vz4Bpoir9xAApqPLn8XD3KE4vz0yIaVkbDOZ+4jUCxj6kjOJ1jEUdkP6R7jqO2szXCZ+p0XamF63LREQk0iWIpKl06x7mjFw45HwORVHmVwhG1tMZXj7omNcxhbKcPx5xnwCqkk1xwXMEQ88TCD2LlCFczlmk+a5CU/uq0qyoPciP5zxFmrNXwOmb40xXkB/OeZYVlacxeeU7ONeFAEmaouNDx4iHTmuib46VylZo6UbxtaJPgG7dwSZfKevLy9CyJaf69lKxpwm12oBtW6Czi64eF+7uMMvDo0lzRHGrOgHdSUjXEEBlV4QpK1agNOfCZRKpMWjpLHs9LxxTrXE455A6UcTCmO9EeyceNShcKA/08JlPbOLy8bupjabjEAajXD2YmET2zcZVX99n0AuI4gpkgwPqAyjCiSL2D7qGMv0KYhc8jS5NovEKFwoClxAw341r3QnxGXfheyBE4BOkFBqcu2P4HoiCBppWgR5dR3Te4HYySwGPQDGtqiC52xrpXOQc1K4/mlYJa0IIoSARmB5BJF1NaDNW6hYYiEQaEsC6tsd4vfnORIliiWR16/2cmP9Z5uVdPuw1bT4YLJHh3XoyHPk5Dz30EN/85je54447mDdvHjfffDPLli1j9+7dFBSkrpCVkZHB7t27E38fjX+HLTLY2NjY2HxouffN9dS0+QftThumpMnfzT9eX8d3zll8VH2/tH0vNy9fSWWr9aU1w+Pik/OP4wuL56Kp74HY8MYbiV+bewJ8M7+EvQVFIGXS/f1l5RrKz1vGBdddB7fe2nf+m29a0RfDIDSJZ5YfV6Yfo8UFpkBJj+EpMVGzht+1Pxy6IxEq2ztwaxrj8nIHffF4unZVksDQNzBJjx7itt1P8KuZ1yYellKyu3s/b7a8jT/WRYE7j1MKFlHqTQ4L/vfBR9jQsQUgyeshbET43a5buXnmr8lyZlDd1okzrBN2q5AIPgaQ+D0eYqpKVyiM1+GgaPMuYhPKcaiyf7p04kucikFLpJHxa9ZQ4C6mrjlEbTCX9S2jkYqCEFBZX0hpQSvXzM3Fq6WhmyZNfnht5XSmTqyipKgNRQHTFFS1FVDf1GpVEwGmOcspKHybluZMZArjRSkV5k5RE4sqgBxXCVdP+jUdaWFMaZDtKk6kX/Tij9YMEhgsrOuuavoDZWmLUIWDoN6YtHAZjCRstOHR8khzBukMexj45GaGQrijURyqgRCCvePHUdNVz+/WnsNF49YxObcOISCia6yoG8+T+2aDGbby0HsjdoJBhL+btJyhY9Ylp/GTVVXcOO8Z8j09hDOsr7CaIqnryuFP68/k1StPHOZehmZMwd10hd6iwf8HYkYjmpJNYcaXyfadd1T99SKEA5/3PHzeofvJ0HYwNit1ZRhVkUzMbqD7vvvgoEQoGUjTKuEYLnZSPT4PJVtSYnTg3BhF26OjNBsgQOkStOx1c91pn2JvSVHch1NyS/BUzlm4md/Ip3BvDiJR6Im5yI32UDUpj8gpCmauSrBZI7DKRXpDmKruXKaMAGVPG9pTmegXWSKXHBClIDFxeK8GQHFMRWgzMPXNICWxeFuHVbsBUVSBWHwxrFyFrlcho1GMu6O4S3uomNWNmSWIdhooGyNoNS9iei9CmTIF9u/HlDphdQux4AF6X9eqUoDbORtNjYuYixahXnEFUX+E7vBz/XLZJSFMMkuvRl0yHVasQNNKEEFJ2j1B9FKV6CwHZqaC4jdxboyh1hioSiFoEIwdpO14L958FaRMfAZKKRGKoHOWC/fblWRrRbhqQ/h2S4IThy5/6tzZDFVhvFoBEaODxlkZGEpfcocprZQ
kl5JNpnMUAAd61vJ6898Sz0H/Z+GtlrvJcZUyLn3BkNe0+d/iT3/6E9ddd10iOuGOO+7g2Wef5e677+b73/9+ynOEEBQVvbuoGFtksLGx+Z/B63AyITuP3R2tQ7ZZUFw25DGb95/H3tk+ZPi7YUqeWLf9qESGJzft4PuPvpi0XOoKRbj91dVUt3Xyu0vOOMoRD4FpWlUk4qw0BQeKilNuSwjgrtXruODzn4Ty8r6F2KpVVnrHMNEW/2/7E2zz16DmStTcRIIwUQRfX3cvT530XTKdRx75EY7p3PTGWzy0eSsR3coxH5WVxXdPWsSyCeMS7R6pfjNpaZ90XwLe6dxG2IjiVp2Y0uTO/ffzWssqFBRMTBQUnmt4lU+UXcj5JcsACOpBXml+a9CCBqwv2FEzyustK7ig5Czeqaph5swDBA0HlXUFdAete/W5I4wqbqb7ZBct6wIUZaSTVtvO5CoXu0YXAQI1vhusSwWXEiNNiyG2b7NCpM1M2jozWD+hHKlYode9T11Ncy7tTTOgAjRFwaWpBENu1m6ciKbpuJwxIhEHRkxhoXqATGHNn/OdzZx/GtzzmIlh0E9osDouHdXM5+d8Dm75b98Ne72I7BxyhnkN7O96KZ7fPdALwCJi+qkPvENp2gJ82gg6IrsS10yFR7V2m08YtYcXd08fVGbQqesIJAVpfqSUpBUWQlcdtd253LJhGT5HGI8WxR/xEjM1BCbZTiMRoZHgEKG4z1e20BzK4acrL+aKyasZn92IlILNLWX8d9ccQoab12rqOHfsxGH7GYoMz4lo2mhCej0uNZc053vrX9LL9LwGjJhIivjoj2gymbBzI3hnomljCcp13D1rCXfFFhNtd0A7lKR18MXzX+fMZ7fg3Z0GhknY8LI1VkRbWnpf9ZD4Jd7eOpqqncVMnzQZc+tWdKEgJpksPHMvHWd5UYXElAL1MslzT09DPtL33CgbK4gtXE8kz1pgi7iLhwOIOE7B65xnXUpK2nU/wojRI82ElKUAPqEi9W6Kr7wC2jswN62iN7VC1uhQ04MSb9uLabSgaGWY555B3UlPob5di3u9RMQDQKTRTDcv4TvpJzhPuQYKC+kJPEJ3+DnruiJ5fv3B+/Fc8E+8bZMQO3ciRRrIHrQaA62m773Te5bDYaWadY4O03SeF48pyVFMeuUDHUGnIQjN8zBiZVv8URclD/mpvyKTwIRkoUFK8O2OkPdYNYipZDrL6Yzso3lOpmUUOYApOVejCGvZ9k7bf+NuMIMFQoHCO20P2yLDh5BjWV2iqyu5mpTL5cLlGixmRaNR1q9fz4033ph4TFEUli5dyurVQ5vc9PT0MGrUKEzT5LjjjuM3v/kNU6YcWbrlUYkMlZWVvPXWWxw8eJBgMEh+fj6zZs1i/vz5uN3vbSimjY3Nx4PuWJjNHZY798zske9pWbv+ZLhT7yj05plm2p9hHyo6g8OXbPMHw9YO0hGE8kV1nd8+Z0UVDPxaL4GnNu/kk/NnMaWkLzWhyd/Dw6u2sGZvNYoiWDJ5DBfPm0qWz8Nh0dGRZDD5YlYu9C+F/AABAABJREFUxhDiiQT2tLQRisXwDNjtpbMTcnJSntca7uLlhq2pS8YhCRlRnqnbwFWjFx3emHvPlZIvPPokq6trkgSf6s5OvvLEM/z5vLM4e9IE6zaj3YjhvlkISVuom5K0XF5sfIPXWlbFx2cm/Xyg+nFG+8qYnjWJg8FadDm4kkAvEisaAiCgNJGeFSKdEIW5XURjliDg0Awr8mB+Lgs3WDvBXtXLgicPoFwEdWOyMKSCEBJNs+5xxL5Oxr+yCshkb3u7JZJUpFp4Ch7auodvzDsZTVHQVROpW58nuq6h6/EJUWBj6SgWV8dTsqqq+KrzS0QvuofX17qoPliIlApeb4SJk6v5+innMaIynOTPETj+eP6zdgPL9+xHN01OKC/lylnTKc7s87sIG0OZGPYRMqzoiEznWGoDrw7ZThNpuOOmhyeO3sG
WhlIaurKR/RZBMYeCxxmlOMOa19HSoMDVTXM4AwQEYm4Csb7PVYnC/BFtSREagOWJMgxtoSCjM9v40uwXcGmxxB3OHbmfifn13Lz2TNpCwWH7GIpA7CBbW3/G9ua9tAbTyXAFmVpQyLT8H5PlmjaovZQmnZGtRM0OfI5y0hzlR3VdgCJfBo3+oY+nrQ3j1az/b2naaH5VPoqnItOSyo7W9WTxk9fPZ/YBk6xJI2DbNppUB10+L3P3HeC542Yk9Tlv3wH2+b1MHZ+NyMgglBeirLgVuV7QfZYbRJ/odta5W3lUPwusaqpUBRuJrYwRO1dNCH+9z8Wb/nquzmony5lLJLqeoL4LY8Br0QS6pY5i1BHS38B7/fUYd70IKyoPMVMKLFpE17kxAt0HMM9xw1lOlC5QwhLTLZAZgjTPGkoLvg3AwdabcQkwhSAirRKSCuAUJkLCgY4/M/X6pzD//U8iL/0HDYEm+uTM3pFHpYIwmnCcdDltJ0fArCMkBXWGkihM2VsKVeZBYI4LdgJCgQiMvLeTUImD7lluYpkKWqdJxqYwrjodxTEC3OBQPBSc9hXMgjfA7EKgYX1ym0zKuoLJWZ9IzER9aOeQEUgSk/rQriP+/6LNe0//Erbvpg+A0tJkE+Wf/vSn/OxnPxvUvrW1FcMwKByQallYWMiuXbtSXmPChAncfffdTJ8+Hb/fzx/+8AcWLFjA9u3bGTly5GGP9YhEhn//+9/8+c9/Zt26dRQWFlJcXIzH46G9vZ39+/fjdru56qqr+N73vseoUaOOpGsbG5v/EXTT4Oadr/DvA2uJmtbCwaVoXD1mHjdMOgVNOcqqAYdB1DDY0FKPFDLpk743t1Qqkjfrq96z6/8vUNnczoOrNrN2Xw0OVeHUqWO5dP50co/SN2FkTiYHWzqG/B/ziKz0I/4i9faBGvyhoXdNVUXwzJZdCZFh/YFavnjn40R1I7HI3lRVz72vr+eeL19KRdFheAxEknPvQ2lpwzYXgKooR7Tbu6urfuiScViv822dh3aHH8hbB6pYebB60OO9V/rNq29yxoRxqIqCHnGgqrGUZSEBTEOgSA0pJc80vDzkNa2IhleYnjUJbVjVwhqJKa2dx/yCKLK1L/XB6Ujezfdne+hcOAYOQJ6riNbuZk78z17ai30cnJpLMMOJtyvKqG1t5DUEycpZDAJaAgHWjh5Na3rqGvUd4TD13d2UZGQQVQ0URSDN3gWYSCzG1lRUsLBmX999/uMevnPddVz6KZ1VTY8QiIYoyxjDkoI/4N5VCX//e6JtVzjClZX17DvYnJj7bQ1N/HPtBu687ALml1tRWOmOkiGjGHrJcJQAUNMz2IegP7rsIay349ZycGs6X17wIq/vn8rbB8fRE/Xgc4aZPaaSCbvq0KLWoie84iW6s/ISJnj9MlasP3Xo7tFh75q+C3m9lunqMIzKTOPzs17CpepYVS36Ok53hrlu1iuUpX9q2D5SEdabeXTXF3m9sphxeRnk+7rpDHv558Y0Zo34Dp+cfgsZzr7qES3BlWxr+wUhvS7xWLbr/7N33vFxFOf/f8/uXm/q3Wru3cbGDdOJ6b33kASSEEghPd+EhJQfaaQACSkEQgk9QOjNFGPccO/dkqze7yRd353fH3s66axiyxgC5N5+3eusvdnZ2b3duZlnnufzHMX03F/gtIw8e4pQfANW2ZMYEvfaaDLnYpUvm2e1okFmKgJfqIet1QpjJmWieDzURgwkgln7qnh5xlRkwvtFGAZH7asmHLcQjvdgnzqVvO5dACghiRKQGBl9D3DcEExasBd6TiG2dyeqsQvvujC1Z2b086oyG7TQvY1VrU+yqOhLhMLvDnMXmt4PPeE3cToWoVz7VQLz1iJWxFBWx5AJUU8NieYQxI52Yj/nj1A4hs7GkzB6v3tFwchIVX7oibyHrrehKD4cyh6CUiUmFXrDp3QkMalhRUeVm0HTCF8+h/opbtyrIrjXRLCEzCMYQMyh0DP
LhrEwlzFTrybbH6Pe/3bfVzSIQS968bHw5ESU1YtNIUcE9roYjrpY8moZiXs42VdOnIj32pu4SPkW1d1v0Rndi1VxU+Y5BY8lNXxMQRn2CVf48MZRaT4e7N+/H6+3z7g8mBfD4TJ//nzmz5+f/HvBggVMnDiRv/71r/zsZz875HoO2cgwc+ZMrFYrn/3sZ/n3v/89wIISiURYvnw5jz32GLNnz+bPf/4zFx8kdjRNmjT/e/x044s8Xb02ZYwUMeLct/s9uuMRbp1+1od27LAeJy4Ncymjn0lZkhCJE+CPjEw9N00fb2/dw9cfeAGJTIo07mho5V/vreefX76Y0fkjF/y7bP50fvXc24N+JgRcvmD6oJ8NR1f44LnGA4ky4Vicr97/XIqBAUxX10AozNf++RzPf/ezBzd0HDAAODEnk1W6GNSbQRGCYypKsarqiFZ7rcrwP+lCCGzq4Orpw/Hi9p2oYvC2AjR1d7OhvpGjSorIio0iIHYMWk5KCLV7yXG6CelhWiPtQx7TwGB3dxUAObZMUvUVDkRgSai857m8iKGjoRBCELr8dHi6msytEqfqIqT3kFVvvvpT5ChPGjjqS0p4fubw95pd09AUhSKPh3q6EDEQcWFm+hAgNUlrrpv9U6f2zYrCYbjrLsrLyymfe5JpVKrvgAfuTPFgkFLyd9XOXtWSkjlElxJD1/nKU8/z7levx2W1MsZ7Gmta/4rBQO8PgYLHUkS+wzyX7tjBjU5NwdWUeRchAbsW57Tx6zlt/HoMqaCIRArgWQ7sy804/cie/ZTnCXYUFSAtpkhg74xNxAQiDmJLCPobdBcsOGhWl8l5G9nf1RsClJpGEyEpdAcYl70fGFmYw8q6B9nVZuP8Ke+jG2bYQo4rwPjcBrY1F/Hanr9z0cTfANAWep/3m77MgbP8zsgGljdczcLip7Gpg3saDUVM7yQmFTSMAcY5pdNABsGwmf3R61m5ZraIQR5Fq64TisVoCwbJzctDb+wGwBGN4Q2F8btMQ68vGMIRNSe7qqIRnj4Rx4q+0ColnLpGrimSEm8NzJ1Lz84VeNQQMiRQAxI9o7eUGepgAHr0ReBLBCKrDnruXeE15AI2+2cI5pXTckYjnCZRAxIlYmDYBNKrUO67GiVzDADh+P4UDZUDMRDEjTYUfISkSp/PS+p7FBVhiMQWG/Fclc4znXSe7kANGCgRMGygexVQFDx20+DrtI4Z9vhCgMMxDm66CeNv78DStxPDjd6BR9/1AolFyYCFfWlXVTQqvcOH6+VYvDREmxm8T5TkWL1pL4aPIUcyXMLr9aYYGYYiJycHVVVpampK2d7U1HTImgsWi4WZM2eye/fugxfuxyErW/3yl79k5cqV3HjjjQMMDGBaUE444QT+8pe/sH37diorK0fUkDRp0nz6qelpH2Bg6EUCT1Stpi7YMcinRwa3xUqOPbGiLiAZ9KmYf6tCMC4z50M7/qcZfzDMNx96Cd0wkgYGMF3tA6Ew33r4xUHTKh6MS+dP49gJZsqu3mxhvWOnuWNKuXrhUSOuc3Tu8MYOQ8KYPLPM6xt2EghGBtWF0A1JdUsn7++pPfhBMzPN1doE5/X4sWoaygEDwYSti68ca6qes/LQV3unZ5bh0YY2QujS4MT8SQdv6wF0R6NDGhh66Yqak5QvTDyFQIuZFaB3l960XdGghdNzj8OiqmiKNmQavF5sirl62xRuQhNDiRNKBBKkOWmaljFj2MG1EJIZuUfDTTehHHssk30z8VpSvUUEgmJHKaXOxDhm4UL0G29EaoMbcQQwJS+P/IR3yjXTZ5qK8FYwnBLDJTGcEmkFKaD0i1+EiQcIHFZVweOPw1/+Yr73T2EK1BUVc2/FmEG/BynN7+iFLaZxx6FlcmzBDzDTJfataApUNMXOCYU/SV4j4yAeDwC6NL9bVTiS9yeQNDAIoG2uC4GKEAKLYuPSlSsZX9+IElVQgwpqj4IaUlDigvENDZy3dHXqQY47uK5
KU+gNSGlBf0xxvH3dLx20ngN5u3oVc0rNEJZeXYTe9/G59Wxs3oWRCNfZ2dErxHqg4KFORG+nJvDEiI8vhJVOw0ZEqikSLVJCLKIQRaV3uN5kZUiNmqiaSM2qG2CzUWTps6TbYn2ir9Z4HJBk26PYVDuaaks5G8M+8PrGdAtkZiJkT9KDRBlgqzW/mwxh9ocxefDV9Fji/owZ7bTE28w6FAU9QyWWb0HP0DAUlaZIX9pfQ8aHnOD3IjHQFJWu+HCZHSRdutlf2i1jsaqJuY0iEsdX0TPUxA+PJMNhprYOx+vRpRhU5V9K0KUgboRB03B+7v+o/no2rce40e29965Zn25XCBzjxvrzu8z0xEP0L4PhFNW9/pcDzgnATdUh15XmI0QeodcIsFqtzJo1i8WLFye3GYbB4sWLU7wVhkPXdTZt2kRhYeGIjn3Id/Spp556yJVmZ2eTnT3yFas0adJ8unmzYXuK6/BABG827ODq0fM+lOMrQvDZSbO4Y+27g7ZAl5LPThr5pDUNvLB2G1E9Pvh1NSS7GtvYWNPI9LKR/UhZVJU7rz2H59du48mVG6nvCFDg83DR3KmcO3sSFnXkbqETCnOZVlLAlvqmFIMImMM/i6Jw3gxzMr6rsQ1NVYjrg09yhYBdDa3MGXMQN2lFgfnzIfFDn9HUxBMnnMRN2/dR3dGZLJbjdvGLMz/DUSVFsHlz6mTzIKu9NtXCF8eewm+3vTDgM1UIxnuLOCZ3/PDtHITxuTm8sWvPkJMbAYzNMX/zF40dw3s1J/Kf3cvx5XVjscfQ4yqBVhcV2hi+cYo5mbQqFmZmTGF95xaMQWKLFRSOyTkaALtqR01MaOMyYRFMHltiUwycmmnAybRmcULuybzVMjAMQCCYnTmHIkcipvTqq7EuWsSUJUvoXvIK3YEmFBQyrdlY3Bnm9T7uOMjP5/JwiAe3bKI1GEyZ6Pe25FsL+nQuPjvjKJbX1vBOdRWKMNNp9r6fNXY8F02fCVOmwaOPpqQ0HZKFC1k2YQr6628PWURTFHa19LlwjPYuwmctZUvHEzQE16IIC2We45iccTFuS9/q1VDicf1xaeYzK6VMxrQr/SZ5UkI4x0LrLDflu8CmxclSurjmvaXUZmaxobQUv9OBLxhiek0NJR3tFGd3pZzfwVKzAsSNDob3aIFIfPAsDcOR427BMATKIMKLAhiX20AkFkJVDToiA9N89mHQ0PMKYzO/NKLjG8IFKHRJOz3SwJoQBo1KFcUiyaUHEhlEioxqhKgcdDXU73AQtlpx2QzoilARtbLRpRMzVCKWPg+mmGb2IdPyzPvAum4jXaEs3PZ2cAoMb2rduiHoDJ4A/g6sCvT6PBiDemdLbInrqFkmokeH1vsAgaKZ/VFr95MMne5S0hNdSzC6A6d1vGnkGMaTAHo9PeLYNDO0ZqjjOzTTeCSEQnHG99jX9pVByqnYtBKyXBeYfwk3cRTAQJUyJSuNgSAmLWiqaXD02mbhKz2O1txltJzmQg2AEjHABsIrqci6GS3hoTESLPRQYgnSEMsg3m8qp2FQaOnEImJpTYY0SW655RauvfZaZs+ezZw5c/jDH/5AT09PMtvENddcQ3FxMbfffjsAP/3pT5k3bx5jxoyhs7OT3/zmN1RXV/OFL3xhRMc97OwSe/bs4f7772fPnj388Y9/JC8vj5dffpnS0tIRq0+mSZPmf4OwHjN/9IaYrChCENY/eJq94fji1Dmsbqrl7bp9yYF/ryv4Zycexell4w5eSZoB7G1uRxWKGY4yTJmRGhkANFXh/KMnc/7RR+635bcXn85V9z5BS3dPX8pCxVRK/92lZ5KZEHT02G0Yg/kmJ5ASPI5DjIU8/vikkQFg/H+e4bUvfIE1GVnUdvrJdbuYVzbK1GLYvDklHh84pNXeS8rmI4G/7Hqdnri51CgQHJc3iR9OveCwNE8umTaFe5avSk4y+6MKwYljKinymloFQghuO/lkTtxbyb82bGBPYzu
5Lidfnj6F8yZNxNZvte7iUWex0b/VdCHtV7OCgktzcnrhSQCUOkvJsWXTFm1DlYYZAy1Nr4Teye6crDnJ/S8tvRKbauONpteIJzwcFFSOzT2eS0b1iacB5uT24otxX3gh7s5OM3zBbjc9RvoZdDLsDp64+DJuefVl1jTUJ7fnulzcdsJJHFdWntxmVVX+fvb5vLBzO49v2URDdxejvD4unzKd08aMNb1XNM1cvVy0yExNumxZijAoTmeKkcO9dfAQlF4kEs8BITk59gkcX3jrsPtZFBdRYxjVQUBJhOGYOg9mLH1vhFl/c8v+s73MemkCytZ1lGa2sq2phJKOdko62vuVkli1OIXehAfFxImmm/gh4LKU0hUZ3msowzbyzBK5rsCgBgYwJ7I+e4i4YSCUg4fS6XJ4odrBiBl++sINFML9BB0Nn0R3KBi6ObVf2LSJX+dXJp6XA7ygVGie7sYSCkFzM9ZglEVFuSyOdxNw9Hk4WbNh/phcCm1F0N4O1dV4xs5AyrfoOcqWYkHSDUEw4mD26G/BPY+jCtNLyXAIM4xgEBwJo1SB9wZ2d98zaBiIlKAjKPN9FYBwbPeA8zmQcHyPaWQQPoQc3JjUO9FXFCfmFMcGDB0eZ1H73M0zXWdjEKWu4+fEjbbkdo9tPmXZv0NVTENmlvN4FGEnLiPEkSiyV/ix10vBINd1BmD2h5Pz7mZX22009fwHMs3fR0U4KPXdQKnvxmHPeSh8tgnIyFYqrS2EpJWYVLEIHYeIIoTAYx2bNjB8HDkC4RIcxv6XXnopLS0t3HrrrTQ2NjJjxgxeeeWVpBhkTU0NSr/fu46ODq6//noaGxvJzMxk1qxZLFu2jEmTRuYJeVhGhnfeeYfTTz+dY445hiVLlvCLX/yCvLw8NmzYwD/+8Q+eeuqpw6k2TZo0n3KmZBan5Lg/EF0aTM0s/lDbYFVV/vGZC3mlaidP7NpEU7CbSl8mV46fwTFFZekf5sPE57QP46GSKOP4+GTuGJWVwXM3X8O/12zm9a27CcfiHF1ezOVzZ1CR0+c+f+qMcdz58ntD1mPVVE6YfIjhgfn55qpt7+p1OIy4+25ml5cze+5ciGfCvj1miMQB7vKHutorhOCy8gWcN+po1rXvI2LEmeAtosCRcWhtHIRCr4ffn306X3/OdEXXE6t3UkJ5Zia/OO2UAW04cXQlJ44e/rpUusv40aRv8Pe9/6I21JDcPs5TyZdGX02m1QeAIhQuG3UZf9rzJ4QAtXeGi2mQqHBVMDNzZnJ/RShcUHIJpxWcxe7unRgYjHaNxWMZXLTR3EkZMmtHL6N8Pp685DJ2tbWxt6OdDLud2UXFplHoADRF4bwJkzhvwkEGZQkjBxdeaGYOGcLIcdzoChwWjVBs8CwbuiE5c9LIvVS81jJawxuHLePU8gDItE+mvV/ZA32AfM6JiJtvRnv0UfJfXYmknn1tuUTifavoPkeQ0TlNZNimpcShHwqjveexvmXolGsgqPScfkh19Uc3HMDQxu24oeCyuhDCjVXJJtpvApp6dJVM24wRH19TfEN/qAgCM+1krzXd/rObe/j65Jf4Q+sZSCRSKgmzjyDHHaDikkbUP2RDl+kp4qtq4oLPfpbRFy9kT/s+cl3ZLBh1Eqr33/Dkk7B9O8TjZEXLaLUcS82k/WTRF7JY317BuKK7KKgNQlUVmpoJCHpmpRoj+l8Fn+NkAOxaLhbn59GD9yYn4r0YAPbzk2lCVcXHwYwMmpIBgNN+AsHgk9DPiwD61i504cOqFiKEINt1Hm09T8GgYUEKOe6LUrZkuy4ky3kO3ZFV6EYXdst47JaKA9rhoTTja1R1/BrT6JaqD5LrOgeXtc/YpSpOJuT+iorMb9IV3YjAQoZ9NqriGvZ8h6PSdzVrm7+LEOAU0QM+lYz2XX3Ydaf58OgNHfygdRwON910EzfddNOgn7399tspf//+97/n97///eEdqB+HZWT43ve+x89//nNuueUWPP2
Ulk866STuvvvuD9yoNGnSfDpZkFtJmSuL2mDHgNhiVQhKXdnMzakYYu8jh6YonFU5gbMqDy+fepqBnDlzAn9bPLTQl8du45jxZR9hiw6Oz2Hncwtn87mFs4csU5qTwRULZ/DI0vWDfn7TaQvwjsR4cvnl0NYG2/pijKmqGmhU6M8IVnt7sasW5uceOa+c0yeMY1J+Ho+s28iGhgYcFgtnTBjH2RMnYLcctlMkE7xj+O30W6kO1tEZ85Nvy6HQMdCYMjtrNjeJm3i85nFaoi0AqEJlQfYCrii9AlUM9NBwak6mZcw47LYNxdjsbMYe6ZDQgxg53DYr3zxhIT8fImTikhlTGHMQrZHBGO27aFgjQ5ZtMi6LuTI91nc1K8PfGqKkZGzG1UkPDftxxcSfv55pa2oI+y3EDQWHJYbmMQjNGY3z3HuhcGT9QbH7FLa0ZRHV2wddGfdYx5FpnzaiOgFy7GcSNB4bNMODbgikfkzSm6PCdw07OgYffEsMyr1Xjvj4ec7T2N1++5AZQTrnerBvNEN83JZSFr2zlNyzO3lZmUlNew5WLc70kmpmjKqiqMqFtbvfzvE4bNjA9GiU6XPnmtowe96BDRtMb6letm8n54orWHDcd9nZsAx/sI4873hOnTkjxatKETbsWiUNc4YytFjJcV+V/Htc9k/YqxXQ4v8TNgIIIIKLTPfnGJt1S7Jctus8mrv/OeQ10pQcPDbTWynf+3m2Bp/BRjwlXMEAomgUer+IkggvKfDdTGfwFXTZTaqhQUVTssj3Xj/wHIQFj/2YIdsCUOK9HlU4qem8i1jC6KQIJ8XeayjL+Pqg+9i0PGzaKYN+NlKKXKfh921lj/8BBCoSPfle7r2cEvc5R+Q4adJ8EIQ8DCUut9vNpk2bqKiowOPxsGHDBiorK6mqqmLChAmEh0mx9XEkEAjg8/nw+/2HpNSZJk2aw6e6u43r3vsnTeEuFHrFxyQFDi/3LbiWMndaz+WTys+eXszjy1MnLL0u1T+7ZNERDXf4KDEMyf1vr+afb62mM2j+vuX73Hxp0Twumjd15BXG4yOKxx/Jau+nHSkltaFawnqYQkchbm34VKCfNp7euIU7lyynPpBYqbbbuG7OLL644OhBPSoOhiHjvFX3JVrDm0iNiRcoaJxY8ldy7H33+Nb2v7C946+JCY2R0HTQGZdxHZOzbk7xBGsLr2NLyx10t6xHjUgMm5X8orOZkvstrOrhjbUC0V0sq/88MaMzRaffoZVwTNF9OLRDU0vvT1QP8OLe8xGiNSn4CAkDg3SwqOxJXFZTc0VKnY0tP6Su5/kDroHBlOxbKfUeXla1fZ1/Yl/nnQdsNXvPyoxvUv68A5YuRUpJc2gFwXg9PcVWOmY4iXpVrAGDzPVBKtpn4VLzYcsWMzNNYSGMHTv4QXftgoaEB1FmJkyeDJWV0GuM6OgY1KvKWHA0u097le7ISuiXMUFgpTL3b/gcJw44lCFjBKLbkdLAax2PqqQaZqWU7Gu7hbbgs6T6yJj30+icu8lynpnc2tr9JPvav5fwIzCQie8g03EWo3P+iOiX9jYc28P+jh/TFX6XXk0Pn+MzlGT+BJtWMvi1OUQMGaMnuh0pY7isE5IhFR8VHeFN1HQ9Qyhej0MrYJTnfLLsI8+49HHh0zof6z2v8vt+iOL8YB6dRjBM1ed+/om4RodlZCgpKeGJJ55gwYIFKUaGZ555hm9961vs2bPnw2jrh8an9aZOk+bjSige5aW6zSxvMRW9F+RWcnrxFBzacErQaT7uGIbkgSVr+Oc7a2jrNuPLK/OyuPm0BXxm6hAD3U8QsbjOvpYOVCEoz8s8rEldCk1NhxSPnyZNfwwp2dvaTlwaVGZlYv2ABqi4EWJT2z3sCTyb1BTItR/F9Jyvkm0faBj0R3ZT3fUswXgjDi2fMs85ZNiGDtUIxuqJGQGcWlFKDPzht7eH2u4XaQutRgiFPMdCCl2fQVUOP098JN7Gyobb6YwuRgg
dKQVObT7zi36Ay1KaUlZKSWdkPXXdzxPVO3FZyhjluQCn5SDir8MgpaS++0mqOu8hopuaH3atmHLfjRR5LjINk3ffDdu2IaVBZ3QbgegejITmiFXNJMs2GYeW6C8Mw+xTnM6hxWINwzRG2GwwZsxBU4gCplfVTTchVYVA+B06gy+iyx6clslkuy/FouZ+gGug0xC4h6aufyREPsFhmUhJxrfJcJw0oHwkvp+W7icIx/agqT6ynefjth09ZMhjTG8iprdgUQuwqOksUh9HPq3zsaSR4R8/OjJGhs//7BNxjQ7LyPCtb32LlStX8uSTTzJu3DjWrl1LU1MT11xzDddccw0//vGPP4y2fmh8Wm/qNGnSpPlvENcN6jsCaKpCYYYnrXNxMAxj2Hj8NGk+CuJGmFC8CYvixq79b3qUxYxuIvEWrGoWVnUYrYQPCSkNwvE6wDQyCNGvHzjAA8qQOroRRAgNTXGkVtTrAdXWNrwhc8ECePPNj5VXlSFjROO1CGFLaiuk+d/g0zofSxsZRkA0GuUrX/kK//znP9F1HU3T0HWdK664gn/+85+oh5FS7L/Jp/WmTpMmTZo0adKkSfMp4nA8oA5myEx7VaX5GPBpnY/1nlfZvUfGyFD9hU+xkaGXmpoaNm/eTHd3NzNnzmTsUHFfh4mu6/zkJz/h4YcfprGxkaKiIj772c/ywx/+MGnZlFLy4x//mL///e90dnZyzDHHcM8994yoLZ/WmzpNmjRp0qRJkybNp5APwwMq7VWV5r/Ip3U+ljQy/P0IGRmu/2QYGT6Qz1NpaSmlpaUHL3iY/OpXv+Kee+7hgQceYPLkyaxevZrrrrsOn8/HV79q5tb99a9/zZ133skDDzxARUUFP/rRjzj11FPZunUrdvvHJ11amjRp0qRJkyZNmjRHhENIu/qxqDNNmjT/kxyWkeFzn/vcsJ/fd999h9WYA1m2bBnnnnsuZ55pKsqWl5fz6KOPsmqVmSZNSskf/vAHfvjDH3LuuecC8OCDD5Kfn8+zzz7LZZdddkTakSZNmjRp0qRJkyZNmjRp0hwOUgqk/GAaIx90/4+Sw/KB6ujoSHk1Nzfz5ptv8vTTT9PZ2XnEGrdgwQIWL17Mzp07AdiwYQNLly7l9NNPB2Dfvn00NjZyyil9eWd9Ph9z585l+fLlQ9YbiUQIBAIprzRp0qRJkyZNmjRp0qRJk+ZDQX7A1yeIw/JkeOaZZwZsMwyDL3/5y4wePfoDN6qX733vewQCASZMmICqqui6zi9+8QuuvPJKABobGwHIP0CMJj8/P/nZYNx+++3cdtttR6ydadKkSZMmTZo0adKkSZMmTZrD9GQYtCJF4ZZbbuH3v//9kaqSJ554gn/961888sgjrF27lgceeIDf/va3PPDAAx+o3u9///v4/f7ka//+/UeoxWnSpEmTJk2aDw3DMNPy1deb74bx325RmjRp0qRJc1B6wyU+6OuTwhFNdrtnzx7i8fgRq+/b3/423/ve95LaClOnTqW6uprbb7+da6+9loKCAgCampooLCxM7tfU1MSMGTOGrNdms2Gz2Y5YO9Ok+URiGNDRAZEI2GyQmZlWkU6T5n+RT0Jf0NQE77wDy5cPTLE3fz4cf3w6xV6aNP9rfBL6rjRp/kc5LCPDLbfckvK3lJKGhgZefPFFrr322iPSMIBgMIhyQGehqipGYuWioqKCgoICFi9enDQqBAIBVq5cyZe//OUj1o40aT5VpAfradKkgU9GXxCPw6OPwtKlg38eDMLixeZr4UK4/HLQjuj6SZo0n3w+bZPxT0LflSbNgRwJXYVPkC7DYf0Sr1u3LuVvRVHIzc3ljjvuOGjmiZFw9tln84tf/ILS0lImT57MunXr+N3vfpc8hhCCr3/96/z85z9n7NixyRSWRUVFnHfeeUesHWnSfCpID9bTpEkDn5y+IB6Hu++GbdsOrfzSpWYIxU03pfuuNGng0zcZ/6T0XWnSDIpIvD5oHZ8MDuv
Je+utt450Owblrrvu4kc/+hE33ngjzc3NFBUV8cUvfpFbb701WeY73/kOPT093HDDDXR2drJw4UJeeeUV7Hb7R9LGNGk+EaQH62nSpIFPVl/w6KMD2inLyqgdO5Gg00WJInGtXwtVVX0Ftm0z97v66o+2rR8Aw4hS6/8DbT3/wZBhbFoZozK+i88x97/dtKE5xJVxKSXB2B7ihh+HpRyrmj1odTE9wO6O39AWfBuDOB7rRMZmfR+3deyHfSafTg5jMi5Vlf3BjWzxv04w3oHPWsi0jNPJs39wQfeIHmSrfzkSgwneuTg179BNN7oIxXahCBtOywSEUPvO6ZPSd31E1AV3sKHzdQKxFjyWHKZnnEKJc+J/u1lp0gAgpJSfIMeLD4dAIIDP58Pv9+P1Dt3xpUnzieWhh1IGG13xDmpyg1RPdaNlFTDJmEjxhjZEdXXqfgsXfqIG658k6vwBHlm/kbW19dg0jVPHj+HcSRNxWi3/7aal+YhpDDfSGmnFZ/FR4ihBiA9xpeKAviAci7NRtfGWN5cel5t5WW5O9LfgqK9L3e+j7guamqDfggJ2OyuOO5WfbW2kts0PgKYqnH30JL47LgfHA/+EcLiv/E9/+qGs0HZHtlLf9Rg9sZ1YlCzy3eeS4zwZIQ5vEhONt7Ox/gR02TXgszz31VRk//SDNnkEbWkgFNuNqnhwWachxCBGg8ZGQm/cS2Tp0xg9najChcNSidVTgViwIGVlvCO0gt3tPycY25nYWyHXeSpjsn+MVc1K1tkV2c7qhouQxEgZkQoYm/kDSn2HH4bbGG5gSctiqnv24VAdzM6ax6zMuViUw+9nDalT17OUfV0vEdbb8VkrGOM9n2z7x2RyN9LJOGBMGM+rF0TZEnwLgYpET74vzP0s83IuP6ymGIbB4zX/j13d79Pfz7vUOY2ry3+CqvQ9N4aMsL/jVzR1P4KUEQCsagElGd8m133BgL4LwF9QQPWECThy8xijCMTKlalGR/hUjmOklLzZdD8r2p5GQcVAT77PzjqbRQU3fLi/Ix8Sn9b5WO95jbrnJyiOD7YIboTC7P/yTz4R1+iQjQwzZ8485Bt27dq1H6hRHzWf1ps6TRogZbAupWS/UcULZ8ZoHZuJgY5AQWIw2buQC/2LUO697yMZrP8v8/aefXz5mecwDIkuJQJz+FWa4eORKy6hwOP+bzcxzUdAY7iR+/bdx67uXcltJY4Sri2/ljHuMQPKG9Jgb89OeuJd5NkKKXSUjOyAB0zc23WDm635bMzIwUgMBYQQeB02/rVwMhXP/vu/1xc88YS5yppg+aJz+PK75mS1/6BFEYKpZQXcd+IktD//qe+DU06Biy8+ok3a77+PvR2/TE6+zARdBhn2+UzN+xuK0icoLaWBP/Q6rd2PEdPrsWllZLsvx2s/IWUstbH+NEKxHUMec1ze/WQ6Tjii53EgMb2Vqvbv0RlaTO/VtarFlGb+mEznIrNQPI585BECi/9IKLYLkr2W+e6wVOK1zTHPbeFCOs8fw8bWzyExSP3GVByWMmYVPo2qOAF4p3oOccPPgUNMKUEImF/yNg6tkJGyom0pD1T9DQEYGAgEEskoRznfGPd9nJprxHXqMsqShu/QEFye/O3svR+mZ32ZyVmfHXGdvXTHWtjc+Tw1Pe8DUOaay5TMs3Fpg3t/dEUaaG9YixZXyPfNQMvON71JDpiM1wX3s87XxaaJ2RgZThYoZRy3z4Va3ZdZrSWyj/cnNrD1nKJBj3XRqP9HuXvWiM/p/r0/YH9wMxgSRyCGGjXQrQpBj4VcRzk3jrsLMMcmO1u+QGfobWBgxpjK+HfJ/XVfqHaPonD72PE8Fo/jC4WwxuPkZ2dzy9lncVwoCH//+0fSd4X0EDu7dmJIg9Hu0XgtH3wOUR/aydr2F2gK78GhepjsO4nJvhPQFGuyzBb/Ozxb+5sh6zi7+BtMyzj5A7flo+bTOh9LGhn+fISMDDd+MowMh2x6T2scpEn
zCeWdd5L/7Yw188I5MZrHegEdIDEIhC2B9ygpHMf866+Hu+7q23/JkiM+WP9fpj0Y4ivPvkBcN5JD7973On+Ab7/wCg9dftF/q3lpPiI6oh38YtsvCMaDKdvrQnX8evuv+eGkH1LqLE1u3+xfx+M199EZa09uK3eN4eqyL5NnLzi0g/brCwwp+ZarmI2ujKSBAczBflcowg2rdvLK5z+P+qd+E/cR9AVSSprD2/BHa3FoGRQ5j0IdZLVflzH2db3GLv8LhPRWvJZRjPOcy6hly5KRp0ZZGbdtbhhU78qQkg1VDbzNLE4pL+9bxVy2DC688JDE7boi26kJPExneA2KYiPfuYgS72Upq+3+8Dr2dvzSPLdE39k7GeoMr6Sq804qs76dOPc4+1pvojP0EqACOqHYTjpDr5DtuozSrF8hhCASbxjWwABQ2/GbAUYGfzDMil016LrBzIoiCjOHGWgeJKxBN4Jsb7qEcLya/saAqF7P7tYvMjb3PjIsx8LddxPe+FLCwEC/suZ7KLYXi5qD0zIali6la+8dyCsN0A781nRCsX00dj9DsfdKWnveQZcDDQxgGhikhF1tv2Ja/h+GvU4H0hRu5IGqv5rtE7352iVIqAtV8/j+h7iu4ksp+0gpqQ7uoD3ajEfLZLR7Ekqvq36CLe330xBckajNSLyb98OG9nvIsU8l3znyyXhjaCv/qfk2cRlJ1tsS3sWGjqc4d9RvyXdMSJYN1+9m23PfRVmxCi1kHjsgLORmTidn5lmmgcHhQErJ+8GNPLSogprKCnoNQqvo4rExPfzB8nkc9/8LGQrREt5H8ZowVQuyCeakZl8TKKxpfzrFyBCK1bGz/ad0hpYDcSxqPmW+Gyn29vUPLeH9tNWsYdLqNko2tGMJ68nPYnaV/dMbqD7/LcpGn0hXZCWdoTeHvD6BV39JjjwFIVSius73PV7EunX8oLoKRzSaLLfn6X9TcNmljDv/fDNcpJcjPI4xpMFz9c/xSv1LaJ1dWGIGukVjRuWJXFVxDdZ+BoGRsKrtaRY33Zv0ShAIqoMbWdPxPFeU/RK7ahrGVrY9mzSaDUSwsvWZT6SRIc2ni0M2Mvz4xz/+MNuRJk2aDwPDMAWfEuzJ8dMyNoPBVgpAsqLtOeZNvhdxmIP1NAfn6U1biOn6oEMDXUqW1+xnb1s7ldlZg5T471Db08lDe1byRv124tJgXk4F146dy6SMka8upjF5tfFVgvEgxgHPokSiS53n6p/jpjE3AbCrayt/23PHgAFlTc9e/rDzNr4/8Zd4LL7hD3hAX7DP5WWl3QeDODMaUlLf2cVSq5vjD6MvaIvsYXH9T+mIViW3OdRMjs2/hQrPccltuhHh9fpv0hTq837sitXTWvcuJ7QL8u0zEEJQM3o8DctrhjyeIgSvrd/JKXPn9rU1GITOTsga/jlq6H6ezS3fQyARicninuhO9nc9wuzCh3FZygGoCzzUz4PhQAzqux6hPOOrKIqNlq4H6Ay93HuWKe9tPY/hts0h230RgfCyYdsGJCb/iRoMgz+8uJR/vbuOmG7eN0LAomnjuO2Sz+Cy95vYHKLgX1vP04Tjewc5sumlsL/zdnyv70ds20YwugOJIGRYaCz0UTMtC8MHxcFOyja2odVvx2kZjW4EUXfsI+85B80XDG4Aae55gWLvlbSEFg/6eX8C0XUHLXMgb7e8PuQUDCF5v30ZF4+6ErfmAWB/cDeP1txJa6QhWcyrZXHRqC8xwTsTAEPG2el/EpAIDHxKGA2DsNToljYEGjv8T4zYyKDLGC/V/ihhYOi7vyQ6MSPMS3W3cu3oR1F0ifHIw9S98iuseoD+RiFDxmhqX439kRrcTREoLKSmQOGh0yqoqezvCWFacxojOne63+C711+PfufviBnmqn/J6g52npZqtJQYNIZ3Jv8ORDazsu4y9sd8NMVKiKPgVUI0RX7B2PB7TMn7A8TjVP/le5z43vZBz9kS1qlc2UJ07bfh9K/
Rtmg75pQkPrCwIXGu6SBKMzYln/1VVczv3jJovY5YlKqn/s24sRtM45rPZ/ZXIxjHhPQQm/wbCOlhShwlVLpGD/Dkfmn9/dS//BAXbm7CHTGNHDGpELat5/W573LGZb9CFByi8TdBQ2gXi5vuNU85uQhkfsfN4X282XQvZxR9DYCm0L6h7m5A0hypQkr5iQyZ+FQjhfn6oHV8Qvh0KqGkSZPGpKMjZYC5c5KSXCUZDH+slagRxnYYg/U0h8b2ltaDagNvb2n92BgZNrXXce27DxExYuiJCekLtZt5vnYTvzv6Qk4rmfSRtCNqxFnWspXmSCc5Nh/H5EzCpn489SvWdazjtabXqAnWYFNszMuex6L8RWRYM5JlVravHGBg6MXAYF3HOuJGHE3ReKH+ySHLdce7eLf1Dc4ovHD4Rh3QF2wuqUBrjxM3Bm+Dqgh2NrZy/Aj7gu5YM8/VfJWYkeqhEdI7eK3+Vs4suYMSlzkJ29TxEE2hAyeQEiUq8UdrcGp5eC3FdNuHd2s3pKQ7FIHMA1y9+7tLD0I43syWlu+jEUPp91BKGUfXW9nU9B3mlTxhnld0yxAGBhNd9hDRG3EoZTR33T/MUQUtXfeT7b4ITTnYMy4R9E2Kfvf8uzy4JDUcVUp4feMu/MEwf/viBQhdH5HgX/tJL9AX+jDw+PH6bejvvoKmeIgZflo0NysvqECdKHEoMSKGhWXxMWyZXcy86j1kv2jDCAYA8K0O0b7QSSTXkjhzMzRMCEncMMv0vg+HkYjPHwnbA5tADNRh7/WOAIP6UC3jPBNpizTy1z23ETOiKWUD8Q7+ue+XfHnMTylzjSestxE1ushSehildaAJmQzpCBsqe+M5dERSPVN6Yo1s73yUmu43iMsIObbJTMi8nELnvGSZfV3LCOrtSd353nmh2U6dnngL1Z1LqXhgHV0b3ySs+xMFQI0aCEMiFYFuEUQb9iKVYkRDA4FomNrSoYQzBe+31xKbPw6lvBLWm1uL1neyc1E+KQ9EonwvKxu+yIpgOVGpJbe36RqtIS8BfSX5tjfIvX8zGat24qcvqMZITI4UIZPbdBmHpUvxVO+g+XJj0FmJ5jdQQgbSFoVtW2gLhcA2uMu5lNAWDBKKxXAEg1BbC5MnH/I45rXGV3mm7t/EZN+9UOIYxZdHf4V8ewHE44Qfvh/Pc3cwWwljV+Kms1KCWDxK15LldG38Bt4TzxhRdou17S8kQ3AGnBcGm/1vclL+F7CrLiyKjYgxiEEmgSZsaQPDxxApB7Xrj7iOTwqHtTSp6zq//e1vmTNnDgUFBWRlZaW80qRJ8zEhkjo40zM9wxZXUNEUi+lS25+DDNbTHDpum/WgP/5u6+G5Wh5pDCm5ZdXThPsZGAB0aSCl5Durn6EzGvrQ27GsdSvnvftTbt38EH/a+QI/2fww5737U95p3jRo+Y5oF8/XLeWR6tdY2rKBuDH05PBI83Tt09y5+052du0kqAfpiHXwauOr/HjLj2mJtCTLRQ+Y0ByIgYEudXriXezt2TnkqpVEsq5j5cEbdkBfoGZnpYRJDDi+IXHZrCPuCzZ3Pk3MCA46UBYIVrfdZ7ZbGmzteJzBJreG1Xw+OsKma/4oTaIM88woQjChJM80pPTnIFmm6rqeQiU66ERUwSAY20BX1FzB1RQPB0sdpioupIwT1WsGPS8TSShunpfLMpXhE6eLZDaGtq4g/1ra3yAjEYn9DClZsauGDbv3m4J/CQODlJJIbDs9oTcIht8kFj9AzHPpUjL+sQXiQxuePSvDSGlOZoLSwa6L8zhq5n6OdlYzyVbPbGc1p3q2YBcxdo0ehbjhBvM6oCARuN+PoCPQEcRRiaGgS0EcMxQo2z5vmPM3vwuHWjrk50MR1INDDsaFMF+BWCcAS1peIG5EB7lnzQreaPo3AJpw4lNCVFjaUZHJugBsQmecpRl7PyHDzsheXtp/FTv9TxLSW4gZARpDq3ir/mts63g4Wa49Wp0cjPe
/zXv/LxDIxx6DbdvoijaghSXaLgNjtUrDtkxq9uTSVe0gY2sQa2eEeMw08EWkwXFv7B7yGsWkQku4CWXuPHrvbUvIwB5InbyaEyNzJt0Z3sDmoJNYPwODeaXM/++OFtB833dh2zbcWmZif4W2IjcbThvFiosrWX9qCW1Fpu6QSzM9sBx7JDnPDW5wElHzWlv2+aGjg7DSN6uvzcrihRkz+df8BbwwYya1iTlI3DDM8KCODtizxyx8kL5rScs7PFH7aIqBAaA+VMevd/ySYDgAd9+N/60XcfQaGA7AInS8asTs65cuNZ/H+NDGgP40R6qGXQTSZQx/rAmAyb7jUgyQ/VFQmOw7btDP0qT5KDksT4bbbruNe++9l29+85v88Ic/5P/+7/+oqqri2WefTUkvmSZNmv8yttTYykn6RPayfYgJgMIk3wIzbnqEg/U0h86ZE8bx8NoNQ36eYbczt3SEgn4fEqtaqtgf7Bj0MwnEDJ3najZyzZgPL9XetsB+frDhn6ZApgApzAFndzzMrZse4u5ZNzI1o9xsk5Q8WvM6D1e/giHNiakuDTItHm6d/Dkm+So+tHYC7O3ey/MNzwOkeCmYHgfdPFD1AN8a/y0Ayl3lbA9sH9SbQSDIteViVayEYj0HPW7UOISV3gP6ggXZXqBpyOJCCD4zeQysXpX6wUH6gn1d7ww5UJYYNIU2E9G7EEBMDn5uEZ+G7lCJhMyMC76N6zl15nReW78T3Rg4cxQCLpw/Ff72576NTidkZAzbVn9o2cAF2351KlLSFdmCxzqOPPc5dLUPbtQCBZ9tFlY1J+GibEfKoSc0qjCNvXGjNqkL37fe24t5njZhenEs21GFbkjG5jdwyZzlzK7ciyIMttcX89TquaypGkvDPX9jRo9pyNL1NoLhV4mXQNcMOzJDYAssw77OgbftMyjCvB9ce21kP9dM2wXOgQ01JJ61Oqpitre9yMOY6S3JVvZeOw2D2Y5qVocmwJQpxMtK6d5oxaVGyVgTovk0T8rKuI5gU08G8wG3dQIWdGKDDkdNU4X9MIwMHs1DV7xz2DIZFnNCusm/AgMDFZ1cSxd2ESMqNVriHmJSY0fXOuJGDKvqocwSTXov9EcIUKWkrF92oBXNP8UiOyhXO8jt6kGJSHqsFva7MljXdhfFruPwWkuJG+FBvS5663W0RnAt3wa2sTj2tiBrI2xxj8Kf2fedtQsPjVoGx2g7cbR1gDOOzLcxcVM96+aMwp/lSOaM6DuSxGPxEvHZ6W/oUaM6YJ5Hr6EmZpi/A7U979KuDy1M7G4LoaxoggzItZVSpe/mlVOm8AbTaGrNRHZAbpafSefuZ0ZHNTe8NQZiYLdU4Fm9kc5jDeK5qZNnadWwxXJRmzoByFZV9losPD53LjsLUkP2VowZw4y2Ni7p6YLWVnNjQwOUlAzbd5kaC88k/1YwUIVBTKoYGPhjney99+dM2eYnJvsMDIFiBw3TfUS8FmyBGIUb/HjrQsR7+7YRpNR1qJ5hdBZM7IrZH8zPuZAt/iVEjVBKfytQ0BQbC3LSuk4fS4azKY+kjk8Ih+XJ8K9//Yu///3vfPOb30TTNC6//HLuvfdebr31VlasWHGk25gmTZrDJTPTHGwnmLLdgkvzDbCACxQsipXj8y4zN6zstzJ6CIP1NIfO7JJiThk7esiV2e+eeCy2j0lO7+qe9mE/V4VCTc/gRogjxZ+3vIxuyEEH9YYh+dOWF5PbXm5YzgNVL5meFkh0aQ6+/LFuvr/xHprDH25bl7QuQRniZ9XAYEtgC22RNgBOzT91yHAJieTUglPNTA+WDLxaxpDHVFCodI0/eOMO6AuyNm/k2mOPGrL454+fTa7XndIXGA4b28T7vNP0Z95r+QdN/eK0e9Fl7KBNMWR8+HGSImie4SY5mqqq4ocT8xlXlJv42LwZVEWgKoLbrz6d4ob9qanrFiw4aPx1XHYM63oqhJnWEaDQfSEOrZwU32izNQgUKjK/ldh
HkOU8f5Byvahku80JgKpmYhESNXEfiMSkulczxwpYLaYifjgeZ1b5Hn5z2cMcXbkHTTFQBIwvrOfWc5/m0op3yNtsejpIqdMjX+W9S8r4znnn8+Wiy7nReSm/KT+F9z+XRfPl65ITLodlLO73w2gtA719VD8446NREoKd8qhY8roceJ0kMMpSj5SSzhkTsIg4UoIalmiB/ve5QEpBXDdX2Pf3rMMqDKzESR09SzQMbMRpiR7ggXEITPZOYrjRuIKg2DkKMF32c7Qu5rr3MMbWRLG1nUpbM3NceyiydCRaYxDXW7EQGFSksvc6aPpWADojuxH17zPv1b2M+0UT2b/uIvOP3RT/uoO5v6ziqFf2U73b9GZQhDLsfVi0qhMhgS1bsDRF2eIeRcDpwOcJkp/TSW62H7s1SlyoBHQnZGRBMEhFbRDFMDh+207mZFQxL2sf8zP3MdrZjFXEKHPYTC2XjnbiUkGX5t2nW/vuXQNB2LASNsxnKWwMb2QsX91OPOH10BHdz3+On8nDrcebBoaEgaO13cc7a6bynmMcWy4yDeqqsOGzH4t3VYy+qYl5obXscrw9lclj5PR089ggBoZeQhPHo95wAzQ3921saxt2HFMXqqMz1km22s0FGRv4QcEb/KBgMV/PW8J8VxXeth70d03hXJU4uk1h3ZWlvH7dJN6dOpaVZRWsnzmKVddXsPbKUoSt3xe6dKmpkXIQJvtOHNLAIBAU2sfjs5r9QYa1gGsrfk2xI7XvL3SM4ZqKX5FlKz7o8dL8F+jVZPigr08Ih2VkaGxsZOrUqQC43W78fjM+7KyzzuLFF18cbtc0adJ8lCiKKfKVwFbTxA2h6xjrPor+6yajnBP4XMUvybWVwObNIx6spzl0hBD88ZwzuG72UTj6GROKvR7uOOs0Lp425QPVX9Pdzh+3vM0PVj/P3VuXUB/0H3Zd2bbhY+ElkizrICugR5BN3XsQQ9x+QoGtoWqklBjS4JGa1wYtZyCJGjFeqH/vQ2wpNIebhzQc9NIWNY0M0zKmcUHxBQBJw0Tv+/E5x3NC7gnmNqFwUv4ZQ9ZnYHBC3qkHb9wBfQFVVdxSnMktpy3E1y+lVobTzrfPOI6vLTompS8I6n4WV67itabfsLHjP6xpe4LHqm7khdofE+8X+lHomIEYcoItcGsF2NUMlKG+1ARNc7wpxlD3ww/y0CmTuf3q0zl+ciVzxo7i6hOO4j8/+CynWmJmyrr+HHdwd2G7dnCPIafV9H5RFRczCh8l13ka/YdObutEphc8iM8+M7mtwHdzYvX/wOugYlFzyfN8AQCbVobDOh2bkGQqEXKVCLlqhFwlik/EUEScTJeptaH4Qnz91JcRSFSl36pz4v+f520Um7l6Golt4qUzJ3JXxkk0RHsFQQU7wnn8seF4VhapRK5dCICm+PDa5uNZFe3XXvPdK47GbZ2ePJY3KzSk54ciIE8zPU/ULBWrYiQn42ok9ZkQAgotjUBfyIJFGDiJYU+8nMSwCdPwEdYHeoVIaRCJ1xGJNzJYJvYxjp6EwWZwrYkKWzvWROaIMc5cJtjrzQAPYZ5L7/toezOjHTYsig1Ddg9+8v0wjADE4+gP3s+cO6txvhdBCfe1QQhQwpLiFZ3k/b+H4KGH0AzLkIYLDEnBhgCOqhbo6KBL9RDJ1Bhb1oB7ZpCmy9zUftFH9AaBOjNOVGrELVawWMj0xxjT2Mq0LbVYEnoiipDk2bqY7qvjivIzzW0re7NlQNhmocPlIKxbCOlWIoY1GQoBUOw6ZchzF4Zk1IY2rKrp6dCcH+PZnrmJug8MrZCs3TqGLUUdUF4OgE0rpGLHmRR5voTXvoAM+wlUZP+KKYUvJD1vALqjUYTStyjc/1uWAnZ1thGMHdzYmXqZdfK0Lq7PWcEkexNqwmPOq4T5jGcH1+x4P9m32xQr6y8cxVsF49ncXUx9JIOGiI/tPYWsCZRSW5lJ+zUnpR5gyZKDtmGS7ziK7OMHWQQSCFR
OLvhCyvZcexnXVv6GL4/5K1eU/ZwvjfkL11X+jnx7JWnSfBw4rOWykpISGhoaKC0tZfTo0bz22mscddRRvP/++9gOcMlMkybNf5njj0/JN+/959Nccf31BMbfiD/WilvLIDNhHWfz5sMarKcZGTZN4/snHcdXF85jd2s7Nk1lXG7OsHHnh8Kft73LH7e8najHdLu8a+sS/m/6Iq4ZO2fE9R2bPwavxU4gNrjrtyEl55RO/UBtPhi67D88HYiUEiklDZE2WiKdQ5YzkKxu38bnKs864m3sJcOagYIyrKHB1y8LxNlFZzMjYwZLWpbQGm3FZ/FxTPYxjHGPSdHtODHvdBpCtaxsX4KCmnCPNQfql5d+gTLX6ENr4AF9gfKPe/n89ddz9Q9uYFeT6Vo8Nj8Hq6am9AW6jFHVvZKqo0y3daOfAOKe7mW82/xXTiy4GYBpWRexu+uNIRogmZF1OUIINJx4LaMIxPYPWjKcYyW24CjoFZEPh7Hccw9nlJdzxty5pmdGRwf8/Z5UoyjAwoWQn3/Qy5Hr/AwdobeGKSHw2foyBVjVLCbl/Z6o/iPC8f1oig9nIvtEf2xaCePzn6O24zYC4Tcxp0AKGY7TKMn8ERY1N1k2y34Cgdga8+okPAKEACsGFmHBYzsBgAz3GjJEcMCxADAktrUxMl0twHg68jp4KMc0PKVO7sywjEdbZjN34koKEplD7JZyyrblIM6PENZrUISDHPdlZDsuRIgfJffXAgbDKYmIxMTM2RWhv1KLbhtoUPJYTOOkQxtDONynk6AeYBRQBESMPl0QKSVN3Y9Q67+HqF6fqGM0ozK+To7rzGS5WGwNc917WNk9Ovk0isT1yNMCTHbsoyu6jQz7DEptAfzhgR4a5vGgzNpp7i9V+qazg/VKEkVX4O67cWx8H10YyTp6wxRUjKT4pFUJwtKlVNY7eP8sA6kNvE42fxx7Rwx7cxAUOx1uN0Vl7Wy4pJiO8alhC3IOhLBS/J4fm8OB9LeR2WgQLLRiD8QIZ1iT19QmJG3h9bA5H7W6HovQiUmV/TOykErqtyCQZKumgaXAUUamxUNHLDDgGtgDMSxhgwK3mW5zU2klsdBQ0wyBIWFrrQf6ictqYYVRfB7y++m7tbWZgo17zSwoTS4Hl6xcyfPHzaC90oWuqyiKQdiw0NLtZXR9A/G//gXy8qDLNHyRlTWs8GOho4gzfTuwCD3FkCZMxUpGb2ohYMkBIDpqPO8VtdMVN42z/Z+xsGFhW08h40fNZF757hFl5lGFhcvKfsHbzfezsfM14gmvsCLHeE7K/zwlzsmD7pdlK057LnxCENJ8fdA6PikclpHh/PPPZ/HixcydO5ebb76Zq666in/84x/U1NTwjW9840i3MU2aNB+E/Hxz0N2rNh4Ow1134S0vxzt3LmQGoWOL6RZ9mIP1NIeHy2pletHI0lwNxcu1W/nDlrcBEiKNfb9EP9/wKpWebBYWHOJkNIFN1bh1xhl8+/2nEQiMXrGzRO1fGn8sJa7MYev4oMhWG+SGB/VmkAYYbQcX0vyoWJizkOVtywf9TCCocFWQb099nkY5R3Fl2ZXD1qsIhavKv8ixuaewqn0p3fEu8mwFzM85gSxrzqE3cIi+wFpezuTeifuOgX1BR7SW/TNdBHMGy+Yh2dz5IvNzr8Wuesm1T+DEgu/zTuOv6NOQN9PxTc28mEkZ55rXQwimZ32Od5tuG1ijND/P/dwv4N43zbhmzNXr4K73CG57HEPGsaqZeCwVaIqjb+eJE01F90Mgz3Um+zp+R8xoY+CKtyDfdS42LS9layS2l/aeR4jEdqMqmeiu83Hbjh1wD9ot5YzJu5+Y3kZcb8Gi5qOpA5+Vru57eg/Xd/6JvxVi+Lt/T5bv/8jSehgquYXil4iQxJaIE39/QnFCo37QKH+6DDtbug0KEpO7WLyaSM9S7K1urJkKSIVw109ps64gx+FDhEzND/faKMEFFoaaYKuoCCGwr96HwIp
BFMMuiHsPiLGXUOI6AYASO7zWlkOFrXWAl4QhIWhYKbZnJLfVdN5BXeDPKeVC8b3sbL2ZuNFBgeeq5PYCS4DTfJuojmbjjzvRhE6JtYMcrSvFoNAdXT9sCETMqEE3IqhqNgrqEEZE817PeN4B27ZhERo6ZtaJ5iIPDTN8RL0qjkCMsg3tZNf3YEmEoRhbNzFONLHj3IIB11aN6jhbIuhoWLCjeSJsvbSQjnEDdRGEgLZrncRacmDLNnQZwx4wyKzuQYsemCrXoGX1cxiL9yEEeJUQbbqbmlnZB9RqPsWVtpbEMQQXjvoe9+39UcLY2KfvoEV1Mi15eC1mHdI9FRGKM5SZWAiJ3RgH2Qc8FwcKNEYiZqhXQQE0NhK3KhRZ2/nlrsfpCVppneYm4tNwdMWJrtLwb3Fhmzi5T4emoMDcfxjhR8VoptTaNvhnfgkhSYZiClOuqSyjIz6UDo4ZXrKiOcTFh5GlyyIMZthqGeXaRLdhCklm2RWy1GG0eQzDNLhGIuY5Z2amvU8/rvyPaTIclpHhl7/8ZfL/l156KWVlZSxbtoyxY8dy9tlnH7HGpUmT5ghx+eXmakBisA6YP34HGhX6M4LB+qeZaCzO6+t38dq6nfSEo0wqzefiY6YxKjfjv920FP6+YxlKP0NAf1Qh+MeuFSM2MgCcNWoKPqudP21bwvr2WgBKXVncMP4YLiib8UGbfVAmRMexQ2wcILYmpRkuMTpo5jAvtGcnVti6hqxrZua4D9SWkB6iuqcaRShUuCqwKKmT7omeiczPnj/A0KCgoCkaV5cNLv61v6WT+rYAOV4no4uGNhqUuUYfutfCUBxGX9BUobDjzKGNjQZxmsI7KXPNBmCc71ScWgarW/9Bd7wZm+JmcuZFTPSdkzIZd1hmsC9URLm9PmV9WCLYF56Awz4abpoEjz6K/u5bNAeXEom3oURBGAYRpYGAdTs5znm4LEWmAWUEKeNUxcH0ggfY2PQ5onoTIrmOrpNlP5ax2T9JKd/e/TB1Hd/HDJfQAZXO4JN47IsozfkLihiYFcaiZmNRD5y4mYTCK5EEh0xaIYFA17/I8v0fY30zaR1CIkVEzGmc12oep8OdfdA0uUHGQWYmuhEgEnkXACViJFy1zV4kHH2d4IzjcC03vW9stVFsO6xExh9oaDAn2BZiGBs3olTXoCpWpBGlc5bjgHSIZkiCM2G8URQVmxKnNppJqa0DPRFvrApJj2FjbzifRRnminE4Xktd4J4hrhRUddxOrus8VMVNtuMY2kLvYVPijLMPjIfXFC8e60QA4kZkyDCQJEKgCCdex3kEQk9jIJG9sdEJ0UZrSxzvugKwgCJcBC0WVl88Cv84Z/J6dQIN83yM2tXB7GfNCWdIb6VoTSf7j8kimGODfk+DYQFnW5S4oYLiwciWtI8bOowtkm8lds5c2L03GUaSVdXDjMdqqJmTTdhnwe6PUbipE19dCN1XgSrApUZonOWjJ8eayFoikQkZwhmOGrLUvvCDMtckLi2+kDebH6EpZmaZsIk4k7PyqXAXJZ/xCpGJpHXItkopGJszFdr7JvdxI8je0EO01u9CFVYKXadQZJljTlbGjIFIhAwaKcw2PSvcdRHcdakT/qZMHwYxc9KdmWnuB8MKP+rG0O0k0qsmYR6nyTl8ZiApoUe0Qmaql1+4p50Gax2q0ChyjDEzeaXsF2V/y6WEoxtQhYEvEb0Ujq5nf/NFjMp7Eqetn8hyUxO88w4sX56Snhin0wyNO/749CJRmv8qh2VkCIfD2Ps9rPPmzWPevHnD7JEmTZr/KpoGN900fP70/oxwsP5pJRAMc8NdT7G9tgVFCAwpWbenjoffWsvt15zOqbMOQXDvI0CXBps7Gob5XLK2dXC39EPh2PwxHJs/hq5YmLhhkGF1fGTeAzcfdzI3PLEf61EdyP4i/AaEN/j46rmLAHO132Ox0jFoKG5isG0dXmNiKOJGnKfrnmZx02KiifRmDtXBWYVncXr
B6clrIYTgCxVfoMJVwSsNL9EZa0egMDNjJueVnE+xI9Wltbq5g5//6w1W76pNbhtfkssPrziFKeVHxsNlAIfRF+w/rghC78EwYSBKv+HEurYHWdP298SkXSeid7Cs+Q5qe1ZyStHPkkKCT9Y+x65gDjVBF5WRFpyxKN2ajT22PGJC47XGt7mg5Ey4+mp2jHoX64Od5KwLokb6pVO1KTTOXUXRlx/HMeOElDaF4o3Udz2LRKfQdTYu68AsBS7rWOaWLKY1+DqB8HoUYSXHtQiPdVrKPR6MbkgYGCR9LgXme1f4dVoCfyTf9+2DX89+RKIbD1rGwNRUcTlOoIlshGwbMCGWNnODw2pOpgqC0SG8GPrIt5qrn7HY9qSvCQeENShI/DOW4Vz2GYTQUATkPtLDmgtKWFI8gc64E58WZIarhin2Jhw7Y4jn7kWXYXTDNPZ1Hu3gwBADBUFb+B3K+Dwe2zzKbF00Ra34Y3Y0pS/EoEu3McNVjc/+fwC09jxPnx/VINdKhmgPLSbXdS5F7vPZ1/lXYkaA3mAFknsKyn2fR1XMle5O3UGm2jNkuES3YSduhFBVK17X+TQGnyNoWPAp0WQGAr9upWJFAC2h8yGUDNZdkGpg6EOwf2wWhVecw5hHahAoCGDU6nb2LsrB4tdRowbSKlB03XSxTtShy4OnDN5y3DROuO9lRIMCUkcCnoYwE15J/Y1Q0FCFBYEgPMaB9cIYJylbqYtlEpEWnCJCiaUDi9CxamOT+7UFX6Oz60cc5YC4XaBLgVXoCCnoQCVbHo8QCrNq9+Pz+vCHI4NcA4lNVTlrwnj4/e8B83ltkCvYITdC2Ozom0NL2a2M4jj7KLSwApMn49hfzXD3Qa4rkQrT6YTCQnNV/yAC1ppaNHSdNtPEoCSyrIw2hjZk91LsDEG7KRpqSIPa0E4e338LPf7Es6q6OT7vMuZmnZXsZwLB5whH1w1Sm/lMtHT+lLL8F82UmMP138GgGRa3eHF6LPdx40gIN36ChB8P667Ly8vj/PPP56qrruLkk09GSbvlpEnz8UfTzDRKixaZIkTLlg20fi9YYGowpK3fAPy/J95kV725wmEkVoV6U+j94MGXmVJWQHGOb8j9jxTLm/fxr91r2OlvJsvm4vzyqZxXNg2banbhCgJVKMlsCoNhUQ4UoYOIHmNV6z664xHGevMZ48kbZM8+PJaPPpXp9Moifr7oIn7y2MvEc/xkEETtMtA7fHz/qvOYPc4c1LdH/TSFm9CEIC77/yaZP8g2xWBNx2YuHDW0cNlQ/LPqnyxrW5ai/B3SQzxZ+yQRI8L5xecnt/fEu9ndvYmQ0YJNBdCpDm5hf3BaipGhubObz/72cbqCYYQ08EZCWPU4zXuCXP+7x3jgO1cyriSXD4UR9gVl/tfZEXp3yOqsipNCh7kqXBdczZo2U8tBJibhvSnWanreY0P7v5iZfS1xI862Xe8yZ20dYzc3Yw335ZKfba9h15Q81s57jQsKTiX2r3txvvIMYNA51YESlQgDpAKGVaCGdMJ/+CGOk78Il1+OoRi833gd/sja5NRmb+fduCwTmFv0LzQlVaxUEVbyXGeS1y+m/0Dauu4HzGdMRyRdwFUMVCStXf8k1/u1Qb0ZhuYQUo/2TouFRlHOX2hsvQJD6iiJeH9dCvAJ7JnHosTMsJHZ23ehVJQmrn7qgFRgkK0FGWtvhJV+dL0OkEiHgvSJfuVMTReZG0OfPwZtRRVIhepAJvJelYkFDeycXECPx8LmrlFk7QxxVHstwh3BSAiBBmY7iOeqiZVx0zIoMN3kY7rplmFRs8iwncL24HreD5UTk1ri+JJxtkbK7AoZDlNEL250JgxXQ/VzgrjemajXx7S8O9nU9HmkjCUNCFJChuN4yn19InodcTcZag8M4i0F0BDzoSaGyw2df8CKIFsL0Rh3E0XFLSKUqz2oayLEjB3YrNNozI0N63EAkqU5mxhTPh33zhpC3bWMfaaJ/LUBtP5
CmYbpVWAxzPvK7o+SvauHtrFDpZGUWHZUQWEhSvVu4j1+YnYVxZAp0TYCQZatxBRgXbiQnQsa8KkbcIoYY23NqTVK6GRG4v8GVe0/Tx5LExKt95oJSdv0Ttxr67FrJWg1Nfz5vPO4Zst2QKInrqcqwJCCX5y6CO9uU7dAlxGaQ8toX+CEpLCp+R406tk7Mcy4dWNAURCZCvvOysLeHid3bRdqqO966Q6FlqM8lEw8G57spw1zEAFrVc3DYj2JcOTNpOhj8ivwCaQDLNI0tMzYtg1KcwcX8cDcvCinG15eiZSSfT0baVFaCbr7xgohvZtXGu5FN2Ick2sKAAd6/g0JT6KBGISj64iFq7D85dlUT7ThWLrU9Fy76aa0oeHjQDpc4uA88MADPPLII5x77rn4fD4uvfRSrrrqKmbPnn2k25cmTZojTX4+XHyxKULU2WnGKdrtppU/bTBM0hbo4bW1O5PGhQOREv69bBNfPWfhh9qO329+mz9vW5o0IlR1t7OmbT+P713Hg8dfhdtiQwjBiYVjeathZ0KPIRVVKCwqnpCy7emaNdyx9VW6+ok6HpVVxu0zL6TQmTGgjrZINytad6NLgxmZZZS6BncB/zD4TLGP46fnUvv8GiKBLhxWC8U5XiyP3ws1plto0GO6YWtCoko96XYtBEnV+K74MHGtQ1AXquO9tqGzUrzY8CKn5J2Cx+IhZkS5Y/vPaIo2pow/u+JdPFj9N1ShcXTWAgAeeXMtltYWTqrdzbSmKuzxPhfcqMXKykgD4356y4dr8DvEvmCs53hWtT6MP9aIHEQY4OjsK9ASq8JbOp5CJBQBBiLZ0vkU072Xof/rIS589v1Bm2UNx5m8uh7L6ga4/1rith6SA28hMGwDV0WjekdyQL3ivLfpljtSptcC6IltZ1ntuRxX+vohXqA+gtF1xKREp1f8z1z1jKOgA1bDT0yvx6aVJ/cxpM6+7rV0xdvwWfIpc01DEX3GPotlLAdD0Kc34bAvpDjvFfxdfyIYfhWkjss+D5/nRqzHNSRFPd31Bl/pfJc7M45LyJD2Zi8x0ITBDfmrsO0sN0NkZBQQRGdbONBFonf1PH7xPDS/g5blLlri5mpuTmM3OY27k2XjKDRbXXilRFUchMY4aDrHYwo6Av1HxgIVV79zX9G5lWXBMSnHlgh2RgroNuxMKqjGbh2NQ6tEMlzWAInDYirrG0aEPe0/BmIpz6IQ4A+/Q3PPi+S7zfDeUkcxe0JQbmvBgpEMzTIQ1ESzydQ8aKoLw/DTHd1M0HCwODiasOxzdy/wBzgzvJludmGzTmP3uOFF0KUUxOiA2bPxvvYK1ASRSCKZlhShTDUqUSNgCfSAQ2LNFkx9spZNFxfTNtYzoN6cXQHGP/8exGJoGblE7NCZrRN1agjZ68chUF0Z5J12A5ywiHhuFu9t+TcL3Q58Sih5nZJGlriPneFuFgI90S1E9NoBx+0lMNdFeGV1MnPL3Fde4fnzz+fuQDdv7d2HlJL5ZaV8cc7RzO7sSIrLdkerQRq0zRmYrUiis++oeipW52NRPViEnVGvdbDn0lz2n5qJNaCjRgx0m0LUq+LbHcL2n9dJMbAdgoD1Mx1zWWhbhkcNJw0NujRT5b4/oYzZW1x4VXDWBji3sYHnCqcN0JtQMCi3djCxWkJVOz16gI5oEzXzcpGDxOS81fwos7NOw6Y60Y0OhvMWA5CPPgrbqgBz4WN/SyeroiqrMouw5+VyamU+8/31qDU1fTtt22Z6Plw9eLhemjQfFoct/Hj++efT1dXFU089xaOPPsq8efOorKzkqquu4tZbbz3S7UyTJs2RRlEOKkL0v8zepvYhDQxg/sDvqE1d8dnd2sb9K9fy1u69GFJyTEUZn5t7FJMLDm+iuKK5ij9vM10ie70Uelu0tbOR321+i1tnngbAjROP5e2GXYmpT1+7FQQWReHz4/rSF75Yu5GfbPjPgONt6NjPdcvu498nfAWXZg6S44bOHdt
e4qma91M8JY7Lm8DPpl/04Xo39HMLtQKVPgf4+on89XMLzVswF3uFSljRk8aG/qhCodw1cgXuNR1rhs0YoUud9Z3rOTb3WFa0LKM51jBggat3wP7ovoeYlTkPRTeI3v9Pbtg7+GqUNRZFLH0X/UcdqMce++G7ux6kL9AUKxeW3sFL9T+jIbSlbzehcXTW5czKujS5rTW8Y5iVZohEOojf+VssO6rQhCQ+jOtnUV0ndK9F89mgHBAQLLHSMcNF1Kdi9etkru/BUdtnoIlsXobHWEXP+QM9jAQQ0Wtp7nmbvITwYC+bOurZ0F6LTdE4rmAs+Y7UCZwuQU+mlhMp7zJhbFBE3725u+t9Xqr/PT2JlXUAj5bDOcXfptRlxmo7bJ/BTBepgyFNAceIRNoEhk+AInA5UtOTWq2TyM3+08CLdXxT0sigqIUs/M/75FwY5Jm8aewM56IKg6NcdSzy7aJ4bwe2Z7YDHoSwIGWM6FwrGBLRrw0y0QbFmgM33cQy/xMUrOsceOwETTEvY4VALDyR+GcKkeFnGEytUqJT7DX1fpq7nmFVj/lcOkSEImsnKgYtcQ8dupu6WCbvN/2UY0c9QI7rLKo6fo4ugwxczlOwqUX47MeY9QZfIhjbzVDs6/gtea4zEUJhYf5NbN37EzaFRuFVQ9hEnJhU6dQdgMKZhdcAEI7VEpYWloYHht10RyzUxrwUaqZWgJJZBqwf8vgAetgJr76K0tiMS8ujJ96E0A9YRbepON1FiFgQQiFy9sYJFHqY8fB+AsUOGqf5CHs17IEYhRv9eOtC2N0l0NyMAFzlUwlPzOTtrxcRbK/Bo7sYl/cZppVfgtViekPE9B5iCJZ0j6fS2kKZtRW7EqfbsLEvkktNLBuHahqj9YOk8YzlCnrmZJGxKbEhHGbco49yZ3m5mUWiNyvMQw+m6MDEjE7aZ7uJDiouC9EcC91zS8hc7cdjLSYa3sa4B5voKbHRNt1N1KtiDehkb+gmu8GB5u7XrxyCgHVXrIeXm/bwjnIKJ/t2Ms9TjV2JUx/18lrnePaPySb//X1M9o1GEVYueGkZ4gz4T+HURNYWk0mOJr7QtgLbUxXAVDqijQgE1bMG71/jMsru7nVM9h2DzTKZSGwLQym8ai0q2vKdIKwYUrJkdz135U9iX0kRhpSIGDy/vZMFEyfyxzPPwnL/fX1il0uX0n3icbxp+PHHQoz25DIvt+IDZ7RKM0LSngyHjsfj4brrruO6665j69atXHnlldx2221pI0OaNGkOypa2JpbWVQNwbHE5k7KHd9X/qHHZhnd7VoTA7ehbrVpeVcP1jz9rulMnQipe2raDF7fu4I/nn8mpEw6+ankgD+9ePWQYhC4lT+3bwLennoxDszAls5B7F17B91c/R0MokCxX4srg10efy2ivKShoSIM7tw++kqtLg4ZQJ8/XbuCycjPl5R3bXuKJ6pUDfteWNu/gG6sf5u/zPv/h6DPE43D33YfsFmpdtpKv7Qpzxylu4trA9ujS4IzCkadjDeth8/yG+GEXCCKG6fb+Rs07SGVwL1ohIEwXNf7dlP/zJSZU7xw2HaBEIg35sXF3dVtyuKTsj7SEd9MU3okmbJS752BXB66mDsfoF1pQd+9FIvGoITriTloLPeyakkePx4arK8LYzc0U7esgu70LrD4Uv2lI2Px/RXSPd6TU1zrfjWdHiAn/Nq9Nd3QHGatDtC90Ec0deL0kUBN4KGlkaA518dWVT9AdXUeFs5WoVLlzawmnlZzI96YtQk2kNZH0CvINnrFBR0ERXgDqgtt5av9t/Yx95n7d8TYeq/kh11XcSa69DEVRyAheSmzxP7CviSHCfTeZtAvCs+xknXPToV3YfplDNK2CWHAtBQ+2syB7J2JsnKDXSmG0g5w9HbiarCh2t5nBQi0gPKYa64oo1jUxRKhfGxyC6Cw72rkuKNR4cdF03LOCTFzfwPhNjdgifSEuEZvGxqklHPOF2xAFBYzWAwQ
aN9Md25G8Br0aHeW+r5BpN/uXrf530RHMcu5NcdMXAlpibpZ1j2FbT5RjAVVxMS73T2xvvj5hyOqdjCmowsn43D8hEt9Xa89rJGPsDYnmN1CiEsMqiPsUIjTQHd2GxzaZPOdsriz9Ms/W/Rm/3hfi4BRxzig4l9EZ5wFgyDBbIr2T1dT7IG5VMRD0SAs5wNGOWdTLZ4gn0gmnYnpWzVmcC9u3J7aYQotdJQ4a5nqJ+DSsfoPcDd3EmzOx6BroOpaeMLnbu2mZ5MFXF8RX1y/MKXEo0eE3UzdmZiLGjCHnxEVcM+7iIW8du+LEo2XRFW9nV7SAXdFUPRiBQqGjAgCHZQxDu/QDqMQvPR3ijhGJy0bHFVF/thymXohdehb0rMW3NYo/uhfdiOCqNV/9W5vl6ks7e6gC1g3hVnSpk2vrIiQUVtSXoEYNDKuCx9VDOCeHDVPymbwfDBlEiUoufGY9pxVtZefkAsIeC+XhNvI2daHu10ELg830Zqo5KpuenKGN8bHE70em51oCwUeHKKWSvX5GMhxrb2M7d+dPYk92YdLlpHdNZPn2ah6bUMbV118Pd90FwE5/E3+/83u8MH9CMhRqlDOTO+ddwgTfh6T/k2YgaSPDoRMOh3nuued45JFHeOWVV8jPz+fb3x6Z8FGaNGn+t/BHwnzlzed4t646aUX/f6ve4fiScu4+6Ry81uHdTD8qJpTkUZjlobG9a9A+3ZCSRTPNbAUxXeeWZ18ibhgp3g+6YTpTfuf5Vzimogz3QQwXB7LT3zyszkJIj9Ec7qLMba6SLMiv4K0zvsqqlmqaw10UOX3Myh6VYgTY09VCQ8g/7HEXN2zlsvI5tEa6eLJ61eDnj2RtRxVrO6qYlVUxovM6JB59NGWQGtXjbM+ys2tiJa68AhZasnGvWZcycD26ycJFS9t47ISc5ECq1wvh6rJzmOitHHEzypxl6HKI3IGYk4MyZxkArV1+RMbw9fXc+y/Y1UiG20GLvweJpMGTxcb8MgI2J95IkGlN1YyJ9aD2utd+BO6uuowBAlUMPyzItY8h1z5myM8VMfhKJICjNUrB2gCKV0ECbleU18+YyIayUnon4gLJ1llFXP3EMrR9CU8BGaepJ4NYtobUpekeHZXoVkHErbHbXkTsxCKKlvTpQGS8H6T5DO8Q52qu7sUMna+u+BOn5f+HYkcnujSPf2HxWpa3beZ3m+HbU09L7DOYeF1/JDG9CVUpZ1nr42joVFibGKW1Y1N0woZGdSybqlg+K9v+zVn5X4VHHyXz3S4CQdUMW+hffUjiXTURbf0fB4i3haMb6Q4tBuI4bHNx9qbQTGQOEdu20SxnsjXYjghK5u/fC5g6B5vI4+jM0TiEAMNAC3uxbI2DMvAeFyGJ/T0VZd2vYOFCXKNt+LNgxUmjWXlCJa7uCJaITsym0uO2YdOsiAJzwmJRvcwqfIyG7mdo7H6WmNGJyzqOEs+VZDn6vKp0nMxwVjPW1jzAOJetdXO8Zwc1sT4DbabjOGYUvUxD10P4Q++CUMlynEKB5ypsWmG/7ziIpSVGxsoQ3nVhlH5x+4ZDITDTjjy7DsomA1DhPZOvuU9nX+BZ2iN78FiLGOu7GFXp67Olkk2bMdCdHyDstRBzqPSEzfKW91cwc0ITm6K5hGVviEyfsODUzibGbd8LzQEMJN2yjZYpblbdXJ4StlI7L4vW98Oc/ksVkZkJnVF8Nd0Ec2z05A38PbH749hr68y0jWNMDYODhQoIIViQcxavNj446OcSg/nZpl6JVc0l23kGbcGXGWrFPT/jKripcETistZzKzBabxyigEQVNrI98+CmBaiPPkrJkhhNoTWE4n2ZKTTFQa59Oi4tP1nvoXqBuTQHk4N1nLRmJwXr/WjhvvslZleYPL2BnfMvArcXsam6b7/6KDPraw6oTSASxgBl0mQ2nV437LELHObvkt06jdyMn9DS+ROSHk4Jg45NnYh3YylgGvaWh2BPaeGg9UkJjy5
Zz9W3fR7Ky9m3eQ3r2muZ0mPhxbnjkIlQuLpgJ9e++wAvnPIVcu1DaXykSXP4HJaR4dVXX+WRRx7h2WefRdM0LrroIl577TWOO4SYpzRp0vxvc+Pi51jeYP4o95+QL62r5qY3n+PB04ZecfkoURTBty84gW/e+/wAzWlFCKaWF3D8VDOd4Nu799EWHFz1WwLhWJyXtu3gkhlTBy0zFFk2F1Xd7UMargUDxRgVIZiXVz5knTFj6Alzb3sjujmQWdaya9CUmP15rX7TkTcyNDWlDE73xrr5zswsto3KRhU96LFdWPS9/OiKM7mwy2rG9YbDWBSVS2o8lGScwetyD4F4D2XOIs4oPJbx3sNr46zMWXg0D13xwRXFix3FjHGbk24jkAneZsQQ0iau5jA5q3eAz8fY4hxquyM8PXGeuRrVj9XFY7n96FGI9e+muLs2zJrHS/va6ewOUZqfyWlzJuBxDm6UawjtYVtgOXEjQpFjLBO981GVgQaAvV3vs7z1MepCWwEocU5hfs5lVLhnDSh7KOjG0LnoC1b5kZguwhbFTv0VMyiraMUa06kJZxEyLLjVCBXWFirbW4iMK4HdkpjdRUOTFfXmOOPHNONICAtKKWgNeNnaNpbWU0s4tTwD287dhON1ZKwJ0XyaZxCdAci0m/pRi+s3c07Bv8mymVod/QXf5mXvZWnrXwhET8BrtYPQBqRRPRBV8SKlQXX3CuY5duFWIsnyNhFnnLWJPC3Ahg4LPKnAtm10h5dgyLDZsn6PmplCch1WdRRqwpvFuPE66v030RNenHDRFoCOzTKRkpyHsGhFcNNNdD34D7Y9uyxRj+hXpylYuSFQx4nWPEQwRNzaDobAIHUNWWBOb5DdGDKMsnQpX9wZ4fefsRPXBFIRdHv7+h4FwXG58+mPqjgo8V5BifeKIa/ZKM+paLHHB72uioAMLURYpIa+OCyVVGb9eMg6iccpejZObEk7gy37KSGDzGUR3BsfhWObkhNRRVEYnXEBQyaJFcNMxBRBw/QMKld1AhDbvQzfzhCzxzfSqLtojjsxEHiVCMVaN1nvdxFt3oitK5uYEaSj0kbNSdkD7leQ7D/aRtuFU8h54j3UzFx6bFE89WHs/hhhr4ZuVVCjBvZAHIsrG3XmbFO8FQ4pVABgfs457A/uYmtgeVJTpddAe3zuRYz3Hp0sW5n1U0Kx3QRj2+nzajANKWNz7sBuGWUWHIG4rNrzAnaihBk8PapPdIEMgeaDq6/GsmgRJUuWEHn3ZWLdrajCgl3NRrhcIxewjscpfOIlrn7xfcwnQaJGJcKQpo6ClFSsaGPS6pdh0RdRFp4Pi5cxlJEFJJpWDgsXkn/p+Vh330DECKaEMYLpIVLiHE+BvTy5LctzAw7rHDq7HyAS24yqZOB1XognfCxK6KfJcks9BcP+Mte3BZBSIufMYcu7zwNgj8Tw9oTxe8x7w0DSHYvwxL7VfGXiCYd2rdJ8MNLZJQ7O+eefz1lnncWDDz7IGWecgcUy9MpFmjRpPhq6YhH+U7WZbR1NuDQbZ5ZOZHpO0X+7WSlsam1kaX31oJ/pUvJObRVb25o/NqETJ00fwx1fOJvfP/su+1s7AbCoCmfPncS3LjgeTTVnk/s7/ckUl4OhKgr7O4f3HhiM88qnsqZt8NSTqhDMz6sgyzb4ytpQVLhzcKpWgvrgzvqKEMzKNlfmdwWGTotpItnd1Tii4x8S77yT/G9LuItvHJXBrlGm0GQ84dkRNXRuXfcchQuuZkE/t1BVKBy7rYdjL775iDRFEQpuzZkwMhz4/QoyNE/SU2SibS5b5U6QcsCkSRpQ+FKILJdpUMjPdNJ8VSV7qgpRhIEhleT77Kl1HH/VNXD0BLjrLqSUbK1u4sGv/pzFo2eY81EDfv/kEm6//gyOn9E3LYobUZ7efwfbu5ajJCaiBjpuLZMry39Cvr3P2LKh4xVebfgjol9McV1wK0/W/JDTi25hasZnRny9HJZsQpGOgR8
Ykrz1XabonLCYK2wVe3AKKLJ2UmzrTBa1tMfRwgadrii57kxkoIfxgVr0rQpb60bh8EZRVINw0Eo8qjGVGrKfa4LTj8GpltLJWpSQRAsYxDP6hBZ7H89RHnPSu6PjOSa5B48xVwQsyN7F+y2bOLn4aCQDs7P0r1ciTH0DJJXWJlz9DAzQZ5zIUELMe2Uz7MpHyjhx3Xy+oyUqoaOs6D4F1W/gWBvFUqsTjK7G4zgJtm3Df+95dJ+9kz51FvOEwrEd7G++nIrCNxGaxsunVPKfrHlM2lDHpC0N2Ppl7ojYNdZNzmeMo5DSPc3IUDd6oqZYiUr4KEuyDfa1Mey1OtIIgmpnWgOc/FYNr3+mLMUgoWCuBJ9XkqofcSj4IxtQxNDTJUOCSz14ysAkiTCrzC2ClkRei8GeW4elwnQ97xeOFKKdmq7n6YnVY9dyKPWchbt3wgy4tSxcqpce3c9gXi21R2cwfa1peAnGm3E+EoQroGScwahQoE/rwgP2N8Iou0KgZBN1qoS9VhpmD56pyDAEe764gJw1jah79uCy5xKQbbSNdqLGzEwrMaeKXphLhWc+9HoTHWKoAIAqVC4t/Ra7u9axrvMtArF2cqxFzM7+DKOcqamaLWoGU/L/za72n9EQfB0p4zi0AiZl/4wMxwEC8IcoLtsdfodcNUKnYdAtbf2yt0gylCAuJUp3eCU+56KUem0XXojtgwhYJ+6XyOaVaJEYrpYoztYoSj9tDEMVBHOs6PktsHw5ysSJyFt/TPStn2NdPTDEiAUno5zxR8jPxwpcUf4jHq66jZgRThgazGWLDGseF48a6P3tsM3AYZuRujFQn/KnMUw6TgCPwxSEbrSr9PQTFbbG4inlDCRvNe5MGxk+IoQ0Xx+0jk8Kh2VkaGpqwuMZWRxmmjRpPjxWNlVz/ZIn6YlFUYXpinzv9pWcNmo8f1hwHlZ16AHyR8mS2ipUIQbNgADmBHdJXdXHxsgAcOK00djyVR7bsZ7uaIRZ+SVcOf4onP1CH3LdroOKROY4R2YMADi/bBqP713Hts7GlGumCoFFUfn2tJNGXKdDs3J5xVzu2/3uIENvUFG4qMwcKHotvfHvMpl6DswV5F6dcpc2shCQg2IYsHx58s/3nDp7y/PRRByPLYKmGMQNhe6IDV1q3LvrPRYccw2Ul/eFTixbZg5qj0C2lO1d22iONKKQGk5phj9LtndvoyFUT6GjiMvnL+Cqv62g4sRdyZGAxGxGV52b4/aGsBSbz2JHQYCKi3bzxeZq1m4cR4ffjccVYsaU3ZQVt7HLn8O0Kd+D8nKqlq9he00z07ROFldMQyZcJcKxGN/6y/M8dutVjC4yNTdebbiXHV0rkBK6uyzocRWnN0QPfh7a9yNuHvc3bKqTsN7NG41/TrSxb8rY+//XG+5mnGcBNnW4NHwDGec9kxUtd3LgxM7mj2MJSzxaIYpQCc+ejCqeJ4qGho7aOx2UIKLSFFk0eiBowxJMeBpIA00aBLtSvXeEAKfDBTt3Eq/dhlEhUIREjRjEZGrfF8ZCY/BtKnyXkqHtQJdiQMq6XjTFwDC2AkcTN2JmC4dIc6gDMb0Vh6WCMot/4IJ0b50tOkVr28AD0Xgdhk3QeYWTyPjUxZrgAhvW7TGyHjPdwXWjC95bBXPdGLlqIhkkCTOSJKLvoif8Fm7HKTRFOvBnu1l64ljeO34Mru4I1micqFWjx23D1xlC/HsTOPMxEOg2CFzuJDohtQ2hBTbs22M4n/NBBKwKnLNzPS0LJGtdZfROlkbbmzknu43MwxhRGnK4bBEJ5HDqJQeQCLPSFBeZjoU0d61gh7WAtfmVxH0q0zP2cFSjhrd1Rt8+27bRct8tLD35vWTQDki2d/ydCZk3MCnrSwAIoTAn53LeavrroIcO5jjwnXgWrNpNHBvWTon3JwFEWGL4FLAKiErUNgOlQSdit4ADOr1ummZZCeWY4ps2fxw1aqBbFSI+DUVI9rbrzP3a1+D227E
2NJBJIYaaR9DagyI0fJZCPJZ8MyUljChUoBdFKIzzzmKcd3gvprjRzbt1ZxLS2+h9ziOxGpY1fpYp2d+n1HtlSnlDxmgILqYl/h5SM8ixzKGIU1Hpe44lOkJIMtUQPhkihooALOj9nrdUz4G2iJ/XG1ewP9iIx+LiBPdsxovMYYOaBvDoo7BlC+zcSX6dacyKuVS6sq3oFoEakzjboribIoimJijdCYaBK/s4lKv/RtfZv0a270dEJNgzcJZ8GZf3q/R3ZxvlmECZ/Rqauu9hlK0eXSpURcYzzXMjXsshZmmypXqsnV6Zx5pmA8MYJJuUIjhvvhkGREeqwTdqGXg/DBeSmSbNB+GwjAxpA0OaNB8fWkLdfO7tJ4jo8YQrct8Pxqv7d/LbDW/zg6NO/u818BNMKB7jS+8+xdKmfWhCQSJ5t20vf9mxnL8eexHHFJirwiePHY3TYjFXDNxxpFM3BbiCKnRpCCE4a/KEgxxtIDZV48Hjr+J3m97iqar1hPU4AliQV8G3p53MxIzDy1px4/iTqOlp5/WGLUlhO0NKrIrK72ZfRrEzE4AJviLAQBWpKc1EMr2XZEZm2WG1YUg6OlLcal8sduG2Bclyhuhvx8lwhOkI2VnVaqZFE3PnQlUVUkraO+p5du1faHMrlDgKOTn/GPLtOYfVnO2BbabbsDCGHLxu79pOoaOI3Aw3Pz3ns3zn/qfwVjTiygmhx1Ra92Qyz1rBzMJ3k/tUT64HJIV5HZx5ysoBddZ0/Yep2d+BOXPY/NhLANjjUTyREAG7OfEXCHTD4M8vLueO688mGA+wruN1Gvdm0FKTRUZ+F6pmUL8jF0U1GD2rlk2dbzM7+wx2BN5Fl/EBx+0lLqPs7HqPqRmLUrZ3x9qo6VmPTXVR4ZqDcoAhZ4LvLHYFXqY9sjvFeKFFBQoaufaJABgZ5qqtRBBDIyZlX1hSwm7lqQ5BMIhFSKwWjVgsTpvNw7YxJYTdVkQTjGuppyDUwajiTAgHoNOPszpGT7mVsFVLTsZ1FOIogEZIbwIg1+45qIhWhcecBNi0YiJ6DQo6iux7HiSgJ8xzFjUbKeNoYuhJsXtVBFX0TZYGMzD0EhlvofNyF1lPQlxvQAL2VXH8Z1p7r1Ti2KBJ6Aq+gttxCtlWH7JXDO6AsAaAqetrk4KdBnJQA0Mv4QkWjNyrUP78KJHYNiwEuGr3Sk4+fTtduh2PGsGtRgCV1q67KcgYmfB3vq2YmuDQYSiKAJtyiGPOA8Ks2mM5fMv7ebZl5ZuBIj3waNexLBxfxp0XV2L95/0QDhOKt9D91hKs0wuI5FhSbontHX/FbSml1HMGAEdlnkd7pI4NnS/08wCSCBROK/omvmuPh84/4nh2OaJBRxXmM6B09psgxyUiJNFCOkT9+GdOoH5eD5Uvt5C/zo/NH0+66kd8Ki0zvXRM6IL8HBg7FkpKUBsayPOUpxpSDwhB+LBYWX81Ib2VpNJkEsnmttvJss/FbTVDyELxRpbVf55gfD8y4eZd2/0829rvYkHhvbit5QC4bfPp6HnSvFYCbANCEVSctj4vibebV/O7HQ8n73MhBM/XL+HEvKP5xvgrk79rw9LUZIZybNmC1taOoQo6Kp1EfKnPQk+eDZs/RvbeEDQ2QsQUanQs+jn2wgvRc/cgiaNpo5NaDP35246/McX2a6ZmRDAS3jWTXQ1Ud23kFf1uTis5hFDzzEzz+038Np4e7+S+zBIaOgJJoWkwDQy5PjfXfcYMb8nfuA2HaiGkxwjbLARcqX2BKgQLckeuVZTmMEkLP6ZJk+aTxGO71xPR44PGzkskD+9aw1enLsRt+e8LKi4sLuPXq98d8nNDShYWHeFJ6wfg/617g2VNVUCq8Sasx7j+3Sd5+6wbyXO4cVot3HzSXG7f/po52k8gXTpkRfls6bFku0buyQDgsdj48VGn8Z1pJ9Mc7sJjsY84ROJALIrKb2ddwsbOWl6p20R3PMJYTx7njJpJhrWv7vk
5Y7EpSvLcD1zB1QQsKpx22O3YH2zmhfrl7O1uwK05ODl/JvMjmSmO6ZEMlSxnX+72/mQ6wsT1xIAwMxNdGmzq3E5LtI336xy05TpYw0aerXuVL4+5mhPzUmPGDwWJHDJ9ZS/9mzV3Qikv/PhGXlixje21zbhsVhZdPo7pdhA/fS9ZrsfTM2ydugyhyzABi51otM8YYNVTDQMCwYptZghSXWgn9Xt82F1RppywB8Mwr1nB6FZ0XWHnylKqMrcwO/sMuuPtiZjrweOKFVS6Y+3Jv0PxAE/VfJfWyJ5+x9aYlX0hx+Z9PrlNU+ycOeou1rb+g+3+54nLEAKVosx5lLsbsCY8I5zdBnGfG5XuxPcqkj1YxKeBBGebND1bwmGycrPYIAwCdyiMHlcFQDymsHt5KXL7qWRH98GK3QgU7C2mkSHktR3w7YCUBmoitn5K9qnsaHt7yO/AkAqlHjMlYr7nEgKRZRgoZl+bfMxNA0O281Q0xRSaVJUsdKN9sApxro2iJI5vGTOPjtKdOIkMLIvpMVSbM5fK8nL0He8jAdeaCP7THQfE7kviCMKJTA4n5x/Nw9UvD1qnYsCM7e1ke80wgJ58C9EJw+u0xCdmo5WXE9nyb0DiXBPFfUYEt7W/MUXH3/P4iI0MXouVtriLbK1nwPNtSAgbFka7cg+tsn5hVsFIjK9Z89memZ8MZ+n9zpbtrOG3OZn8IBFm5Y+Y1y1nVQ91Z2QMqHZ7x9+SRgYhFE4p+AqVzjw2dTxLWO8hw5rLrOzryXXOM93vAVVYCEsVgRyQTlcKiEsFkRBZLdq3nezfdaO3KDhaYwNc9Uv2dBLN+StMbDGfB4cDKivhF78wjQyHEipgGKYBNxIxV8UzMw/L0ysUb8Af28VQWVZAsqPtl8wqvBcpJcsbbqI7Vosi+ozTZj3NvNfwRRaVvoQQKhnOs2jw/5q43sJArQOFLNclWFTzPtjXXcdvtz+UqnGQMDa83fw+pc58LildxEF55x3YvRs6OhDCoKPCNDAEiuxsn1BAwGUnPxKgbEs7XgldlQpZ1ZjXcfduWLIEcfHFaJahM0ftCexnsvU3uNQoQpD02AIosXewo+u7RPV3saoHmY4pCsyfn0xT62io41+fO5079/p5fuVWonEdi6ZyxuwJ3HzOMWR5nLB5M0p1NRN8+axrr2XdhFFJ0Ufzqgo0oXJF5ZyDX6s0aQ6DtJEhTZpPOO+37B9WnC+sx9ne2czs3FFDlvmomJ5byNyCElY31Q0ImVCFYF7hKKbkfHgrMCPBHw3x5L6NQxhvzIwSj+9Zz81TFiKl5JnWdSiWgQm4FE3wQscGvmkci/YB3PcdmiWZReJIIIRgeuYopmcOfV9U9bSgYwyZkhFgi7+WEtchunz245WGVfx2++MIBAYGCoIlLRuYRz63SR1NmKaGCj1Iq3QM2gYpodhtmJoIHR3s66mhJWq6l0ds5g693989ux+iwjWKclfJiNppUw5unPNaUmOpPU47l580M7VQW1vKn+7uDNqkn5qaPNatG0t7uwePJ8j06XsYO7YWq+pFFXZ6WlI1L6KDDEYjRsLwYCggwJtn6gwkbzcBKgZjjt5PdXU+lILPkj+kgQHAQMdnNTMFGEac+/dcR9joSplaGMRZ3fY4uoxzQv4Xk9utipN5eTdzdO6XCOt+rIoLCzbwfLPPS2XFCsZ//hL2+O8bcGwpBHG7gkPNgEAIYjH8+VZco1uxVfZledAsBuMWVtF5dIT47kuwvP56cqEo5lAGiOj1djmxxENa4jmDPR1/JKI3oUoDi99ASaSti3pVin0XY1VNr54c55m0Op6jI/TWAa1VsSgZlGV+L7kl230lzYE/cWBvoPoN1JDEbjUnJe0TZvHC4h1cctYSMCSqXybj9qMehUjMyutLPsNxcyuQO54CBEoI1IBEz+h/br3eGub3mW/P4rMVZ3H/vueTWVYAFBS83SGOtpclM/tUleWRoe9HVQf/DTEMQXuXl6K5c5G
bTWOIEpIoAYmRkXp9daPT9CoaQUpbq1ZCkcVPY8xHlqUHc1HWDHcJGVZsio4rsdrdn6geoDOyCYFKpn0aGvaUMKsNwsqOrILkSnfKOUnJUys3cfOpX8RTXk5o4zMAZK3toe4034D7pjtWjW6EURU7hoyxs/lG9geXoutOBIJQZD9bmq5hfPa3KfyPHXbuxDphOtWZjTibg+h+jVbNS0TR8CghsvUA3lAYmzMLumJ4N3UiXToxj0pM9g+FkdiMOGqTxNHUDBsfNz0UpkwBtxuysg5uKGhqMifTy5cPFF6cPx+OPz7V6+EgxojmnjeGPx7gj5ppOTvCG+iJ7Rw0dEgRENUbaAguoch1IoriYHTeY+xtvoqYXos5PTH9hLyORRRn9YkePle/BGWIsEsJPFv3NheOOhlVDBMmahjwxhumZwKAy0OzzcqSs0fzuDaXzogTdBCaZNyJjVwSX8X0ewOUeZxmetDGRnj99YOG5a1tvo8JtsHFcFUhmeTez8qWNzm24BCMIscfnzQyAPgeeZgfXX8937noBPw9YXwuO7becIjNm01BZGCsN4+QHuPuSeZzryCISwOXxcZdcy+h2JVx8GOn+cTzpz/9id/85jc0NjYyffp07rrrLubMObiB6bHHHuPyyy/n3HPP5dlnnx3RMdNGhjRpPuFYFXVA9oMDsSgfD00GgL+dch43vPEsKxtrURODUV1K5hSUcM/J5/6XW9fH9s7mYTMxGEjWtZmpqda07me7v3nIcg2hAG/W72RRychDJv6bbO4cXHSyF00obPbXcGrR9BHVu6+7gd9ufxyZ+Ad9xoD3jSY2R5uYYTNFS0dvrWL1zMGNGEJAjB6klBgrllHTY4pjRWxm7Hl/JJIX6hdz09hrR9TWkD541pD+dMYGETo8kAPcXcu2FPGPjtGsXDkZRTEwDAUhDHbsKGPChGq+c10lQihkbdmM1EDEIaxZ6bI5UqqVSKx5pjfH3v0B8ivaBzcKKaahoa69FYBx3oW80fhnosZg5yewKU7GekzPj1VtjxM5wMBgljL7nXXtz7Aw9zo0JdVVWBUWXFq/MJV+K3FUVbGw+WwCGY00BV9KNV4Ygmw5HlusBmIxpEVDbdxH9zE5CC21FYoClft3U/PCfYz2eMAfQEqBFjSIxgWd0g0SfGoQq6ITk1b0hOeAIqzMU39H9Uufx/F+FUqo1/Ua7J7RZJ00Dk5ogvx8hFAZn3sPDV0P0Bh4gIhehyIc5LouoMR3Y0r6xDzvlwmE3kh4FvQZGkRUoCm52BMrn36Lg1WvjWPW5l3MD29HCUnT8V5IQoqVJzpnERwbg2Mz6b9yLCKD9fYSQZ8X0iWjTqHYkcsT+99gZ1cNCgpzsiZxZf50cm3/SJZb2zyDz6gHpuAz0XXBmh1jmD3BAZmZKIobwwiY1y4sBxhULWrxiAwMAC7rLNzWckYpVbTF7fTopuifVcTJ0wKoikK265JkeUPG2NJ2B1X+JzAS6fw0YWe8vITRPT3J47+XVTisTk40rrOrsZWj5s6FjWY5LWxgCejEMoYeGu/vvItNXWsJykx6Mx+EkQTiDmLbf0f2OxOwqnmowoIlcyzfHTuV9a4yvOEwtnicqEUlYLfzq4Z/c9YL+6GrC0U3kEKiOg2kVxDJUTGsAhGViDagB5SoAoEO6OkxJ/9f+crwBoZ4fPgUksGg+SwuXmzqN5x0Erz33kGNEYo4uNG110NjZ+DNYTOyGBJ2+t+gyHUiAHbLaAqzH2RT+100h3egCo1y90kUZV2PIvrc/Lf69wyrI+CPddMc7qDQMUyIXEcH7N2b/DOUUcifCiewWuaxMGs38/P34NSiVHdl82b9BP7QdSpnlXczW2wzjQwA+/aZYpZZQxv/bewgbgg0RWJIkQjZStWaiMY2AIdgZMjPN7+r3u80HIa77sJWXk7e3Lnmb0xHB6xcmZLaWQjB9HMu49ELzuLl2i34YyFGe3I5rXgyDi0t3P9RIjgCwo+Hsc/jjz/
OLbfcwl/+8hfmzp3LH/7wB0499VR27NhBXt7QGmhVVVV861vf4thjjz2sto7IyHDJJZfw17/+lczMzMM6WJo0aY48p5SM5c363UN+nmN3MTmz4CNs0fBk2B08cdblrG9uSGaaOLa4jOm5g+d8/m9hO4j7ooL4/+ydd3gc1dWH35nZvqveu9xly71XbMCYakrohN4JJYSULyEJJY2QEAihhNAJnRC6KQaDce/dklwky+q9rbbvzHx/zGqllXZlSdi07OtHj6WZO3fu9HvPPed3MAXK7G6tRUSI6FGiE0R2t9Z+54wMRzoHKmAIkxbxSLxXvTbiTJQswkf5IuOrZXSiRHpdJ3llzRweHt7QoBN0sGcP7tJ9wZncfRPStdRjvdjZVjzothpEQ8hscPg29D0HVc7DVLoqMIkmxsZOwCSZQgbZzt1+mnYlQgIoitbxVFXt/5KSXLZtmcGEnD1Yq2swJRjwNHrZmZYfFH0EzcCABEvmavdVpbME4QiRNJKlI3BcJk7P+jnvVP4h4E2inTttnkvg9Kyfow94cext/zSoid6bLn+RovZPmZhwev877zUTx1NPcdp119E26io2NL2A099EnCGPOeKZWFy/AK/2XvMaJCRFJqnEwcR/VdE40YYnToex3U/Krk6sVW46ZRekjkRsb8CVoqdNtrC1ehiOOHOgnSqp+nbyTc3E6HODgzDzmjWMUafjEXLwGpsRkDDps9ATA59/qf0ERPREnZ6s2GvJir0WRfUhoAs7qJbEGEamvUWj/WlaOl/GJzeilzJISTmHWPM+bRCmKGR98THX71mLvFvPJsMYUhI70EkynU4TzW2xDKORhe3vgfEwEt3eMqox/JUwGQpDlsxLnsS85EnIqjY3LgpiH48asyuP5ZumsGTmdmRFQBIDWg+KQJvDykvLT2TpvFg4uAejfjQuzxbtipv6mpwSbFf2aZWseGhyrcOntGMzjCDOMD7knAmCQH7SwxxouJAknYckXdcAVzOM5yTci0HXnSVpa/2vqHJ8FngatTKy6uVA/bMkeeJIMGnhW76YWAQn9GNn0GZ9ExICxrKAocGj4EehZ/iOtl5BVf3sbX0Np2rosab7f/16mXb3XlKsWqf9wfzJ7BBzUFWRdktPAVWVp3TzOdXzHLqumCavis8k0bjEhn2qCX+ciK5NIXaHi9j1bmL3+rTj9Xph/37swwQqGu6g3bsDSbCQaj2N7JiL0UsJwYwJFGvvO1VV6fS341ac6AQ9cfqkboFIRYHnnoN//hMKC/saLnoZIzIuPJM9/K6ft6FAqlkbkLR42yOf/ACtvo7g7+X2z1lVd1fg3Gvvo11tb3HA/iWn5jxBjF67DwbyzTliGZdL8/IIIOfmsWdaMndPeZ8sa5sm2CtAhrmdeemlvHxwFkWjF4LohtJAyFhdnXZ++jEyWPWxqECnbMCLRPc9o2IS/JgFH8mmQXgCXnyx9gwX9/iWlZeHGBX6EMgykq3Tcd2Y+QPfV5SjzzeUwvLBBx/kuuuu46qrrgLgiSeeYNmyZTz77LP88pe/DLuNLMv88Ic/5N5772X16tW0tbUNer+DMjJUVVVRWFjIU089xemnH6EzESVKlK+FM/ML+efe9dQ428MO2n4y8biv5KZ/rJicmsHk1G+XYaEn4xMySDZZaXKHj51XUFmSraX2Mkv6fgehKipmaWgzBqqqsrymiJfLNlFqbyTeYObs3MlcPGwGNr3pyBV8BeYkj0YniCF6FD2RVYVFaeMGXe8+e2W/M1FbJ6TjqHAQJ1pJNyVxygdFfHzGuD6GBhGRM1uTEN58Gp/a7XVSPLnvfaWq4JLDu632x5SEqbxf+27E9SIiE+O6PTlavM08XfYoZY5uw59BNHJW5nmc2GOQXVbdwjmddbw9ehalCb3bK7Dp9eVcHtuCAEzNzWSV5xBbMzWBLm2+W0AxgjpO4kczZwGQkRhHTT+HqCJgMHR/9kfFzOGyYX9nU/N/OdS5FYDhtunMTDq
XNPPIYDm/Gl4zoCdOf9sRy0SaiYvPz+eUWbO7Z+I+fVJz97XZwOdDFsGVrEc2idiq3diqex2kAOAHoxFdYga1mQIORULy9tBHQaDeF4eMkYtMC0MGYYIgYNKlY9JFMMb2SHPYpdYvhjEs9UQSbaTH3U563O3dCxUFrD+Fzk7YuxeT309WaiI1De14vQaq60JnXlUgPysJVqxAL/pR81VUm4gcG76Tqeimhm9LTxG8Xh41p6gdLP3wBIoPZ3HKrG3kpDXhchtYtbOQTzdNY/b4acRaTbBxI0bdaHz+GryGWpRYCXqYnqzGeSTGXBOy30r72xS3/AW/0p2CMkY/mimpf8NmGBZcZjVOoiD9YxrsT9Hm/BBF9WIzTict9npiTPOC5Tq8+3sYGELDRWSDSJv3ALHG0UiCiflp8bxc1hb2fACkxtooyEyB/XsxiLF4lDZtltHUpaGgoqqasKZJikMSzLj9VTTLImHNbYEUrZ2yQgpQn2xjhZAdwcVQYFLFYRyqTJwoosoKilFAdCvaOe1SFEWb/fZadfjizOjaNA0Mv+qi6tObabg4KTgYP9T2CNX2V5mW/irm174I3tsOv50D9l2UmUwUZ2XjSRQYFl/FKZV5ZNTKWmaFrgwEBw/C6NERzxlr1qBrbibzghOp9qwIU0BFRMeohJ9q7aT/CUlRAKcSD4DL38zqunvo6x+j4pZbWV13L6flaFk95qdM4ZCjJuw3V0BguC2LJGNcn3UhuFxB/QwA34g8LpqwjgxLO4LQfYW7jG4/HLmRZ1xjwZ/fXYffHxSBjERh4kWUNn8a8GAIvWfdqg5FFZiRdE7/be2JTqe9h/rzUunJELKMRPlu0NHREfK30WjEaOzraeT1etm6dSu/+tWvgstEUWTx4sWs7xFi1pvf/e53pKamcs0117B6dWQttf4Y1F23du1aHnjgAc4//3wuueQS/v73v2Oz2Ya04yhRohwdLDoDr510KT9e+w5bGqt6LNfz04kLuXjklH62jhIJnSjys4mL+OWmZX3WSYLAsJgkTs3RZpBPyBzFXduEsPG/oIWDnDwELwZVVfndzmW8Xr4l6CnR6nXycNHnvFOxg5cWXE2CcXApBgdDvMHKD/MX8MKhL/usExGYlTyK8XGD1/qw6kz9ege0JlpR5k2CTTsZbsumtrGZ054uplJOZXdcPo5YA8lpLUxyl3OOnA96KzpBQlWhZGIG7Ynhp/ON0uCNMrmWPKbGT2N727aw7T0hdTHxhngAPLKbv+37A63eUNE/r+LhP1Uvo881cFxgkN3e6Ubvl7mgaA21MYnsSc6lw2gm1uNifFMFGfYW5AXj0UkimdZYxlxwDs0pZhz1HkSfgGxWyctN4LHjzyLNoqnvnzTsLDbseAO9MYKYo6gyK3teyLJ08yjOzA4/k9FFvD4Th7854noVyLUO8D0zkJk4d8CIIAjaTHNGMh0pkb21VFXQxBxHjcYbZ8HRroWE+A29w8QEmnxm7C8+TUKP/Tc02dncBl8qCbgtMcwflcjJlk7imrtnOiku1jr2l102sOMMR5d42z//GRzYTRyTTWu7k1LJxp6kPOwGMzE+F+ObDnNCmoEEvxvsdvyKExWBzssNYbUmWmQbdlcTObEDbEPA2JVhb+GXkwv58w6BzcVjg0r1ApCXnshPL1ykGXzKyxEEEZtpEY4FFkzGA/jkavRSJgm2S4i3XhiirF/rWM7upt/22X2nr5QNtVdwXPa7Qb0LAJM+n9zEP5Kb+MeITS9re00TcAyDN06H3yTQ6a0gzjia2c01pCVINLbpUdSeRnbNMHLN4mwkUYSNG7Hq8/B62pBNAr7Y0HtGQiXTuhhBEPApHhTCG+yN7X4klxLM9rE7Pxlqwh+HoCicuG8vdoOBOEVAddrxxUuokoChVCauSrv/RVREQUFNFGgYmUjezmYUQcFj6SD5Uy+1F/bUj1Dwyc0cKPkJE9doM+se2cWWlt08Yz6REjEPalWoFZD0fr488RA
PCGaSN3QbgOTaWrbn5/JiajJVosQIUeTiTgeTXJ6ghgfFxUz4dC7yyX7qXKHfBYNgY0b6Mxh08QDkmG18aU9muLGpT8iEqkKVN4GJCdoNe7BjWRgDQ9cVk2l076bVU0aCcTinZszjg5pVtHk7+4jyqqhcnn9G+BPfE7NZG3gHDA1q+T5mp5YFjQq9kRWBmVm7YHkPEVKdrk9qyd4YdRb8RApXFfBiQI1wT0VEp9PeQ0uWaNkx1q3rG+LyNWQZiTIEjmJ2iZyc0L7X3XffzT333NOneFNTE7Isk9brXkhLS6OkpCTsLtasWcMzzzzDjh07vlJTB2VkEASBn//85yxdupSrrrqKCRMmcOutt6LrZSG77bbbvlKjokSJMjgyLLG8cdLlFLc2UNJWj0VnYEHGMCy6vumUogyc84dPwqfIPLBrJe3e7tnT+enD+cusM4LhBKnmGK4ePYun923oU4cAnJM3kRGxg0+huKr+AK+XB9yTe3yZFFQqHC08uPczfj/12OpY3DT6JPSixIuHVuNRtJz2IgKnZk3h/8adOegYbIBFqVPY1nog7DoBGGbNJP7y68D+GOY9e9HXJOJsd5NOO+lNO4NlY2OMmKdq7vBmyUTtsGTWLB7ZJw64y/YzPnbMoNsKcN3wG3il4mXWNq0Odmp1gp6T0pZwTta5wXKbWtbR7G2KWM/7Nf9l3kUPIjU3o99QjM+vGQMy7C1k2EMNE6IoIHYNIMaOZcott7BJEFhTU06r20V+XAJTUzJDzr9BsjDGNocyX98ZLkUB1WPltHEXD/r4F6Rey2uHfxw2ZEIFLFICmZYBerQMZCbOaOweAGRkYD7xRD6fu5cxu3eQtr0dnbt7YOE3idRPiSPnlH/AW2tpqdYG7z6ThCum7/svrtlD58oPSbBoM+n7a9r5Y2cWh+Mzgjnndx8WeMaQyBNXnMCwT97uNnqsWaN17L9Kx33cuG6xOUA8eICN2XP5KGl0iGv/trSRZKU4GP3lW0F3frFRpSw/hQTViRQI7FVVqPXHcdCbzQRL5NSZIfQKWznvwGYKzjyDFypd7K9sJMZq5PQ54zhz/nisB/cHBeRAy65gW3Intn7Ogaqq7G/5B92qHT3WIeNV2qiwv8nI+OsG1t4A7d7SyCtFgZYpVuK2aNffXbaa25fW8GzlEg5UZgWLmY1eTp69lYSM92BPDpSX4w/ojbROs4YYcLoeLadP096J6OkCiN4uPQ/tm2BMSYWajrBl410OMtpbEXUCxMfjkTyoRlkbfygC3UFIoKgSEgoumwjjxuGrKQJFwFzjR9cm40/s7n+ryIhfrsWnHI9etFHRXsGT4hIOGDMDlyGggeSTqHs7nVL7eyQXFMCePciiyFqbhf8IsCxLC0vYAfw3LYUfChL3HChDDMzai+vWMfWU3+HJETnc8QJ+xU6yeT6p1pNCjjPRkIRDNnLAncoIU2PwnlVUKPck0+qzBVMLt3kPRTy3XbR7y0kwDidWb+Uvk27nb/tepLije7sEQyw3jTiP6YkDeBeZzdpzXK1d25jqw8QccOMcE95oIIkqhTWVcKize2F6ujag74cW1woEpKDHSV8UWt1rSLUOwTs8LQ3OP18Tn2xrG1iWkSjfLEfRyFBZWUlsbLdVOZwXw1Cw2+1cdtllPPXUUyQnDy31dxdD8p8pKCjgmmuu4cYbb+Shhx4KMTIIghA1MkSJ8g0xNiGVsQmRRVyiDJ5LRk7l3GET2dRQgdPvpSA+jbyYvm6gv5h4Iladgaf2rcfp1wbjRlHHpSOn87OJxw9p36+Xb0GKpF2gqrxftYtfTjwFq+7YpScVBZHrRy3mkmHz2d5Sjl+VmRCXQ7LpSFOmkVmcNpW3q1Zx2NEQMhMlBDrBN4xciqDXwy23sP7//kh7x56w9dg7PRysbKQgPw1pwXGIc07BX/85Yp/ZLVAQODPrxCG1Vy8auCL/Ks7O+gFlnQcRBJHRtjFYdKEdzJ1t2/v10LD7O6jyVpN3yy3
oy5oRln8WNmZcECArJU6bPezh7moATsgZ0W9bLx3zC94oE9nbsQZJp50HVQWLmsaNk/7cR5xxIGRaxjI18Qdsa3kraGjoaraEjvNy/zK4Co80E2e1wuLF0NSkdeIrK7kg5gb+fcIDxC1pxdDhR+dV8BskPLESNv3VnOhOhPJylIA2f9nElLC6HCO2NqAEYuodTg9/6sziUGw69Mg1LysqHq+fuz8s44UbrkV49NHuClat0jr2Q6WoCDIyoLYWgL1CDBNKd5FcW8nepFzsBgsxXieFzRUojlZaR2SRWFGGTjDgTDbi2mdk77As4iUnAiodihmvqgcU0syRU+mFECZsZfyyN/lrfj4s6SEg99Df+sZ6z59/RCOLy1+Nw1/eTwmFeseKQRsZNM+HSOog0DTLysitmtGxw7OPce92cuuF71O+IJXapgSMBj/DMuvQSQrGYhfuD/6MCStOvzbQbJ4ZfsDY5N6AovoQBUNg30qfNiiGrr+1d/FsWy4W0zacbn2fskmdnegUhRiTCfwKTZkxiCO8mBp9+K0iQvD1JSCbBRqn2lidPomC5oWof9O8jgQ/GJpCjQwoKrHbXcjY0Ys2tnpEDiRlhonhFphaf5DGNgH/zBh0MTHst1qo8XqYUnaIDyeORxXF4DP+kioz94RFnPLRJ91VrFqF8fzzGZ34s7DnDCDDejzjLX+kxJXMJ62FJOg0kdlWv5lsYxuTbJVkWTXBQ6MU2+c89UYrE6jbnMwDk3/CYUctVc56bHoL4+NG9J9RoicJCTBiRNDIYGjqIOO1Dmovig1raDCVeMl7uxUafN0Lhw/XBvT9oKjd2XD6L/MVEMV+dSGifD+JjY0NMTJEIjk5GUmSqO+hQQJQX19Penpfw2lpaSnl5eUsXbo0uExRApMrOh379u1jxIj++yFdDNrIUF9fz7XXXht0pbjiisEpdUeJEiXKdw2jpGNBxvB+y4iCwK2Fx3HNmNnsaK5GUVUmJWYSYxi6bkJ5Z3NYA0MXXkWmyd2J1XbsjAxd2HQmFqQeHeFKo2Tgb1Nu5h/7/suqxp1BL41sczI/GnU20xMDHgc6HQ+SjWviKUxuKGNiYzkmf/dsrUsy8LaYxq9+9ztIS+NCxUeZp5YdbUWIqoiKgoCIgsqNIy5ihC33K7U7Th/HlIRpEdfLqr9fbQ4Av+oHnY6pf/gVN3vjyD+wm/F13cclCOAzmsi79AI467QhzZpfMPxneJVb2N7yKR7FxZjYmaSZ8gddT08Wpd3IMOsMVjc+S5unClHQMcw2i4VpN2DRHSH+ORL9zcQ1NsJddwWLJr/0NjddeRdvm7ZTY/oSzB70Qj4npF7OhFopONseo9OE1kqnpvTZnaCo5O9uICYgkFjkNVIenxFiYOhCUVTKK5spMSUzNj+/e7C9bt0R09ZFRFE09f4RI8DtxufxUt6u7TvD0UqGIzRLiSBAmUMhMSYGnU6HN9dE5o52DixJpZmY7nKImKVYRsUcN/C2fAUBuSMeJv4jl1EH6HXRg5yY06h19g3d6sKTbMC48EzYWI5fdSN5FIb/u4n07A5aJ1nwxUnoK2USdjqxVHnxmuowGUYACi3TLXiTI+lsqKiqjEvuwKXoMAvegER89+DREyfhN0kIAacX/eYt3HRiAX/7qJy+hhEVg07FZrCAvRniNSFN+2gTe3+Zjr5TQfSoKMZA+IYo4Ko2QjMIEV3vQRdIkSpatO/BBt2IsCJxgqowoaECr6zH4/MjTpjAnv0lKIDJ5yPW5abd2m1wEYB/up2cMsjnwCDFU5B4A7Q+xkhzA61+C6oqMFXnQC8qjIi7FIte8zIZFrOE4rb/RKzLJCWQZp7cZ3meNYM86xC0nUQRTjwRDhyA2lqkTi/mRjOZL7TjydZhnxwQ32xXiNnhxlTlI96VrV0v0AyFixcf8T0Qa5xMjf2V/ssYBpedKcp3F0E9CtklBrm9wWBg2rRprFixgrPPPhvQjAYrVqzglltu6VO+oKCA3bt
3hyz7zW9+g91u5+GHH+4TptEfgzIyvPbaa9xyyy1MnjyZnTt3kpv71TpsUaJEifJ9w6IzMDdt2JELDoBUUwwVnS0Rs1YICMQbjpBO4FtKnN7Kb8dfTqvXTqWzEZvOxDBrRp/wi/pmO35zDJ/nTeKL3AnEeN0YZB9eSY/dYEIVRH6ZmoqApij+m3E/YlvrXlY1bsbuc5BryWBJ+nyyLcc+w8rImDGU2PdGNDQYRANZZu27mRBr4aE/3cADL3/OP7YfxOpxYZB9jCvI56arTyc196t5JBlEE7OSlx654CDIs00jzxbZyDJkws3EhZltNz/xDJfk58Osa7tn2z/4MGRgHKOLpX3mOBxJVujl0WLt8BPr1ZNg1QRES5Lyobb/ptU1dDB21qzufTidR0xbF5HWVm17UYTCQtokE3y2JWJxVQWHywtLToSmJnLxUta5AWOHH2+8MXCfqRhEK2fn/HFwXirHUEDOosvCICbgVVrDrheQSDIfOT97bzKtJ2CSUnDLjWHXxxvHYrv859DxKOKmD/HjRQAsVV4sVX2NGjpB0xRTCkZTs7SD3vdLV2tjDKOQRBMmVFR0uFQw4A8JWZEFHVWTExi1KbBZeTnXWZZiXKrjnyv30mrXBv46SaZwVgKZq1IRnE7w+YjrUGhDR8tUK+hEfPGhA1dFhdwkATZvRifG4FfsKDrwJocaHEQv6EQbelHztHNG8DaL9bgwytr50KHHPXUqvpKi4Hqj3xdSXgWqOtphCM/B6PhrMYix7Gt7Cp2ghZIZxHhGxl/JyLjLg+WSjeMYHnMyZfblhPqTa35TM1N+gigcZfHChQvh008142ZrK3GHLCh5nQj4MVXZexQUMNvjMB5q0/7s8oI47shGvRTLqZSKf8GntEKfkAmJBNNcLIaBzQpH+R5wFMMlBsMdd9zBFVdcwfTp05k5cyZ///vfcTgcwWwTl19+OVlZWdx3332YTCbGjx8fsn18wGOn9/IjMagn9pprruHPf/4zt95666B2EiVKlChRBs85uZPZ1FQedp0kCByXNoo4g/nrbdRRJsEQQ4IhJuL6xDgLDS1aHKwqiHQYQ40q8THmEMOEJIjMSJzAjMQJx6bB/TA/eRGf1H2AT/H2MTQICCxMWaylsgyQlhjDX289i5YOJ/UtdhJjLaQlRj4X/3MMYbZdGDeOedf/gW2H/kKDpxYxIKqmoJCgWhkXNzl4vxjTUxBq2/ptQnKiFdy9wqPc7vCFj0RPJXpRRLzoQp6sT2JKYxkTmsox9RjcuXV69qYMw3zyiSw8fQI88QR6TIyKWYCUsIT9MaUoqp8sywTGxZ2EURqCAOwxEpATBT3D4q5kX+tDYdZqmgN5sYPXBhEFHQsy/8XqmhsChobudIA2fT5z0h8OGk90T+9B/vLtsPWogboses2QZTvnGpSGSKEbKiPjtY64XjQzPOY4yuyrcasigqoiBPylQKBmRjwzdvR4fp96isuvu45Lbz+dHXVFOHweJqSNJt5ghWc+hhpNGdJql3G0Rw7XEIAZzZmwazmiYEISbbgyPfjjexoZRFSDSpxxWvD+np5mY2ebEkyN24Ve0TxNzCYJg15CkaSQsDyPLtSjQwDSrDZtcN2TATwHgiAwLO5C8mLPpdN3CFVViDEM75OdRRAE5qX9mlh9LkVtr+NVND2LeMNwpibdSI5tXrjqvxppad2GgtJSdLW1JJYl4Ta340rwoBgEJK+ItT0Go9OCgKx5MHQZGAbwTIiikYnpz7Kr7kp8SgsgBcwmMlb9aMam/PXoH1eUKL248MILaWxs5K677qKuro7Jkyfz8ccfB8UgKyoqEI+BjsegjAw7duxg1KgBxvxFiRIlSpSvxGnZE3i7Ygdbmg6HeDNIgohVZ+Dn45d8g637ejhr0QSeeXsDSpiwEVEUOGvR129MiEScPp5bR/6Mxw4+iFtxIaKl+VNQmJowg7OzwsfyJ8ZaSIz9bnqkHFOGONser9Nx57g/s6ttK0UdO1F
VldEx45iSMxL9G/cEix83OplHt0aYbRcE0lNjKRyTBSv3h640DTEEqpcwV5LqY/i0cawsieWLnPBeOg+esQBausXtREFiZOpiRiZeMLQ2hOMYCMgNj7sKp6+Kys7/BFz8BU2YUDAwJfVvWPV5Q6o3xjCMk3Pfo6pzOY3urQiIpFvmk2Fd2D3TrdORdv2TrJx+ENv6chK3OYJioSogm0VsC6+EU+6AtDSSgEnK3exq+n0gPayIJrWpMCbhJrJt3aJ8s5Ovo9qxDY/iQEUOZrsQEInLmUr88VNhXUAAOJCiVczPZ+qsgNZF6WbYuFFbH3in6WQdxjovhma5T8iGqoJtv4eRLx0AuyZQaZRSsZ81Hb2uFJ+iue/HG6czfMytmGJfDRqKfmCQeVYQA1mPug2xPlE7T+PyNM8xaetWhickUtrSjFOvp8Pc9/6+cPwEaO71rAziORAFHbGG/scPoqBjUtJVjE+8lE5fLZJgwKpLG5K48IDpMmSKImRnI9XWYq2rw1zrQUVBRIeg00N2umZgMJsHHDbUhc1QwKzsz2lwfEC7eyuioCfRcgJJ5oUIA9WQiPL94BvyZAC45ZZbwoZHAKxcubLfbZ9//vkh7VNQI+Vc68WGDRuYPXv2gCp1Op0cOnSIwsLCITXq66ajo4O4uDja29sHJKIRJUqUKF8XHtnHk/vX8OqhTbR5XegEkZOzCrmlYBF5tqRvunnHHIfLy/W/f42y6uag+j9oBoactASeufsiYqxD1704FrhlF5ta1lPlPIxRMjE9YRZ51v41PaIcgfr6rz7brijw0592b5+fz1PZ83nxzQ0IQncWElEUEAWBB+45n6kTc+G++7q9JywW+Nvfhq7J0Gv/hy67nuvufgW3xxeSPlIFTppTwL23nIbw5z8fnf1/A9i9B6jpXIZP6cCmH06WbSl6aYgaHoPEr7jZXP8LGhyr0Xf4EN0qOksyBcN/SXbsqX3Ku/2NVHW+j9NXjVGXTLZtKVZ9dp9yHb46NjU9y8GOL1DwYxRjKIw/k2lJl6JXdPDoo6HeN+FwOuHzz7X/DQY8iXqa8+20FZppm6zpR+jaZRJ2OMk5kE9MmSOYbpGMDHj1VdTUZDxyI5JgQi/Fa+veeCMkc8jmE87ijhUH8HhlRFG7xwVV4amWLYxPj0VobYXDh3FNGM/y0oN8NnwYyyZPDG4vCgKT0zN4+ZzzMf71r9/Z+7Bf/P5QQ6aqgterLdfpwGDoTjUyiLChKIPn+zoe6zquYb/7I+JQjdQBFLebQ3f9+jtxjgZsZBg1ahTDhw/n2muv5bTTTsNq7euaV1RUxEsvvcRzzz3H/fffz+WXXx6mpm8f39ebOkqUKN8fZFXB7nNjkQwYpP+tDk6ny8O/39/M25/vpMPhwWYxcvaiCVxx5kxiv2UGhijHGEX5arPtvQZh6i238GmzyKtvb6a0vBFREJg3cwSXXzCHMSPTYc8eeOSR7u0XL/5q2SV67Z9bb6UyKYvn3tnAZ+v34fPLZKbGcdGpU/nBSZORioqO7v7/B1EUP05/LQYxBoMu/qjVKytefKobg2hF7Dkj3XvQGol9+7TsBoEBq09x4jTbcSf6kQ0Cep+V+PZ49I4e3fSEBLj+erjyyvB11teHCKZiMuG89Ao+sktU1reREGvmlNljSVvxEfznP1BSorV3/HhcsbE8fsapPFdXg9PnI8ls4dKJk7h+6gzM+/Z9/+/Do2HIjPKV+L6Ox6JGhiPg8/n45z//yWOPPUZZWRmjR48mMzMTk8lEa2srJSUldHZ2cs4553DnnXcyYcK3x4X1SHxfb+ooUaJE+T6hqipen4xBLx1bF9oo31/CDMK47joYPx6fT0YUBSQpYLTYs0fLWtEz9jyQyeRY7F9RVPyyjEGvO3b7j/L1caRB69y58NprsGxZMKUpdHtD93nDZWTA6afDj3/c/0z6iy/2NXDk52vCjV2CqStWwFtvda/X6eCSS+BXv9Les7KMQQq8Z//X7sO
vasiMMmS+r+OxoJHh3j8dHSPD3Xd+J87RgI0MPdmyZQtr1qzh8OHDuFwukpOTmTJlCscffzyJ38Fcrd/XmzpKlChRokSJ0ouBDMI2buwrMDl/viaU+F3ff5Svl/4GrV1eD59+qhka6uq6wyJAG/ynB/QATjppYK76fv/AQjYC6RsB7b4rLIThw6P3YZRvjO/reCxoZLjnKBkZ7vluGBmG5HM7ffp0pk+ffrTbEpb8/HwOHz7cZ/mPfvQjHnvsMdxuNz/96U957bXX8Hg8nHzyyTz++ONBxcwoUaJEiRIlSpQgQ8haMVixt2/1/qN8vYRL0dpF7wwfa9dqBokuPYD4eJg3b3Cu+gMVTB0xQjN8GI0wcqTWzuh9GCVKlKPEkDwZvk4aGxuR5e7csnv27OGkk07iiy++YNGiRdx0000sW7aM559/nri4OG655RZEUWTt2rUD3sf31XIWJUqUKFGiRAnDQOPm4diIvX3T+4/y7eRou+oPJGTj88+j92GUbwXf1/FY13ENv/voeDKU3fvd8GT41hsZenP77bfzwQcfcODAATo6OkhJSeGVV17hvPPOA6CkpISxY8eyfv36AWfD+L7e1FGiRIkSJUqUfvimxd6+6f1H+d/gSMaL6H0Y5VvA93U8FjQy3HWUjAy/+24YGb5T5kiv18tLL73EHXfcgSAIbN26FZ/Px+LFi4NlCgoKyM3N7dfI4PF48Hg8wb87OjqOedujRIkSJUqUKN8y0tI0lfxzz/1mxN6+6f1H+d+gv5ANiN6HUaJEOep8p4wM77zzDm1tbVwZSN1TV1eHwWAgPj4+pFxaWhp1dXUR67nvvvu49957j2FLo0SJEiVKlCjfGY40CPu+7z9KFIjeh1GiHEtUEL5q/MB3KP7gO2WefOaZZzj11FPJzMz8SvX86le/or29PfhTWVl5lFoYJUqUKFGiRIkSJUqUKFGi9EA9Sj/fEQbsyfCPf/xjwJXedtttQ2pMfxw+fJjPPvuMt3rk9U1PT8fr9dLW1hbizVBfX096enrEuoxGI0aj8ai3MUqUKFGiRIkSJUqU7z2KoqW59Hi0DBUJCdHQiihRogQZsJHhoYceGlA5QRCOiZHhueeeIzU1ldNPPz24bNq0aej1elasWMG5554LwL59+6ioqGDOnDlHvQ1RokSJEiVKlChRovzPUl8PX34J69f3FYmcMwcWLvxui0QOxngSNbREGQxHwxPh++jJcOjQoWPZjn5RFIXnnnuOK664Al2P1DlxcXFcc8013HHHHSQmJhIbG8utt97KnDlzBpxZIkqUKFGiDIBoZypKlChR/nc5UtpVpxNWrNB+vq50l0fzuzQY48n33dAS5ZggHAVNhq+s6fA18pWefq/Xy6FDhxgxYkTI4P9o89lnn1FRUcHVV1/dZ91DDz2EKIqce+65eDweTj75ZB5//PFj1pYoUaJ8/RyyN7OntQ6zpGduWj4WneGbbtL/DtHO1DFDVVU+qzrIS/t2UGFvJdVi48KRE1k6bCx6UfqmmxflW4rD76CooxhFVRhpG0mSMbJQX5OnmYOdpUiCxLjYsVh1lrDl3q5azeuHP6fV14lOkJiZVMBto88lwRBzrA4jhA5fBysbV7KlZQt+1c/omNEsTl1MtiX7a9n/9xlVVTnkKKfeXU+MPoaxsQVIwiDfL34/PPooFBcPrPyaNdDcDLfccmwMDUfzuzQY40mXl/T69UcuO0RDS3HHfpbXfUG5sxKbzsr85FksTJmHSfpqYd5+xc+ejhIcfgdZ5kzyrTlfqb4oUY6EoKrqoG0iTqeTW2+9lRdeeAGA/fv3M3z4cG699VaysrL45S9/edQbeiz5vuZljRLlu06T28HPNrzH6vqy4DKzpOfH44/j2jGzEAThG2zd95wjdbx60qszVeFoYFfbYXSiyMzE0SQav56ByncJVVX5v3Uf8UbpbiRBQFZVREFAUVXmZ+TxzAnnYZS+UwmgvrOUdpaxvO5T9nceQC/omZE
4ncVpJ5BgSPimmxaCoiq8WfUWn9Qtx6/KAAgIzEycwdXDrsAkdedfd8luni57ns0tW1ED/rU6QcdpGUs4N/tsRKF7tvcXO55ga+v+PvvTCzqemflzsiwpx/S46tx13Fd8H3a/PdhWMaBLfsOIG5iZOPOY7v/7TKWziidKn6LKVRVcFquL5cphlzEtYerAK3rxxZBvgd3nYq2pnS0FAo5YHXHtRk6vsjKho9c3ef58uOyyr3oY3XyF71LE+gZqPFEU2LtX+72wkDbZzn57KZ1+B6qqYtGZGG4dRpqpx/MyduygDC3vVH/I65XvICKioASXZ5szubvw59h01gHV05vVjRv49+HX6fQ7gstGWPO5ddS1pJlSh1TnseD7Oh7rOq4Rd/4JyWQ68gb9ILvdlP7pzu/EORqST9GvfvUrdu7cycqVKzH1OFmLFy/m9ddfP2qNixIlyv8uXlnmspUvs64hNFTLJfv4884VPH9g8zfUsv8BujpeA+nIgVbu0UfpcHXw8+3Pcsn6B/hz8X/4w97XOWfNH3l437v4FfnYtvk7xruHinijdDcAcsDWrwT+X1dbwVN7N31jbftfYlXjan5X9Ec2tWyhxdtKvaeBD2s/5te776bGVfNNNy+E1yvfZFntR0EDA4CKyuaWLfzjwGN0zRmpqspD+x5hS8u24KAdwK/6ea/mQ/5T+XZw2Uc1G8MaGAB8qp9f7nzyGB1NN0+UPkGnvzOkrUrg35NlT9Lh6zjmbfg+0uJt5U/F9/e5jzv8HTxy4HGKO0oGVlF9fci3oAkvvz/ezStXJHNgdhI14+LYO8vEA+fLPHZ8BvQcRK1Zo21/NBjidwm/P3KZV18NMTB4vH42t8K9lVZuPRzPI52pHBJt2rNVWqqFZrS20la8jS0tO9mfKLJsXiGvnTKNZfNz+cJSyQF7aXf9xcXaPgbAQXsZr1e+AxBiYACocdXxYvkbAzvuXmxu2c7jpc+GGBgADjkquGfvX/ssj3IMiWaXODLvvPMOr7/+OrNnzw6ZSSwsLKS0tLSfLaNEifJtQlXVb603wPLqfexvb4y4/h97VnPJiKn/E7O9PkVmVX0JhxxNxOnNnJheSKJxaDMaA6JXxwugNSOZrWNScMfaGKtYGb77EMLhw8H1alERb/7lFjbOywjZTlYV3qxci4DAbWPOPHZt/o7xfMnWoOdCbxRUXti3jZsnzBnS81nnbuT9muWsb9qKT/UxwprPGZmLmZ446Su1WVEV9rSXUOqowCjqmZE4mRRj0leq81hxyHGY92s+YXvrblRUCmMLWJp5MgWxo4JlWrwtPHfo30Bop15BwSW7eKL0aX43/q5j3tYWbxvLapazumkDbtlDjiWTUzMWMy9pZvD6d/g6+LT+07DbKyjs7SiitLOUkTEjKbHvp9i+L+L+Pq5bzmkZJxOjt/Hy4c/6bVuNu5laVzMZ5mNzncsd5Rx2Ho64XlEV1jSt4bSM04ZUv6Iq7GzbS7mzGpNoZFbiZBKNfT1U/Iqf5fUr+az+S5o8zcTqYzkhdT6nZizGLH21mcdvis/qV+CW3X0GrF28VfUOvx43AM/jL78M+fNvcwXqRsfQ89XUJYOwaXgls1PPYNrrH3evXLUKzj9/sM3vS4/vkqqqHGpqZa1f4MuENJS4OE7NTuV0VyuWmh5Gla5Bfjhvil7GE6cqcmdLJjuJQ5FUkGBPp8CbHbHcUZjDWY1aiIQK2GsP8ebpp7Bi5NgeFQp8ONzFovYD/GGDgsUfOClr1sCSJUcM31hevzLowSAoKla7B71XxmeQcMQYWdu8icvzLwwb8hSpL6eqKq9XvoOAEGLEA+290e7r4POG1ZyZeUq/bYsSZSgMqXfe2NhIampf9xqHw/GtHbBEiRJFw+338cyurby4dwd1jk4SjCYuHDuRGyfPIN5kDruNrCh4ZD9mnf5re8Y/q94fdCMPR4fPzfbmaman5n0t7fmm2NFSwU+3vkqztxNJEFFUhfv3LuPmMSd
y1Yjjjv4Oe3W8/EYDjy3M5LWYdkTqgXoUVMadkMPfDFcT/8Ir4HbT5O0gaXMxceNiaE0INYCowH+r1nHZsBNIMNiOfpu/g5S2t4Q1MHTR6HLg9Puw6genP1LuqOTuPX/Dq3iDg4t99oMU7zvABTlncm720AZrta567i95jFp3AyIiKir/Ln+Tk9KO46phF4a433/TbG/dzd/2P4aqdhsPdrTtYXvbbn408mrmJ88CYFXjmj4d7y4UFA47D3PYUUGeNfeYtbXe3chde/5Mp98RbOshRwWPHXyG/fZSrsq/GEEQ2N2+B1kNP1gELbxgW9sORsaMZFvrDiRE5AiDS78qs6ejiDlJM2nxHtlLoKSj4isZGWRVwaf4MIqGPt+PWndtv9sKCEcsE4kKZzV/KXmcRk9z8J59ofwNzshczCW55wTvWb/i5/6SR9jbURK8H5q9LbxZ9T4bmrdwT+EvsETQshgQ35Bo7uaWLRENDCoq+zsP4PQ7+z82RQnRH6hLjaNmbDORegGKCs+bDjAtPx/Ky7WF69bBuecO6JhlVWF7axH77OXoBR2zkiaSZ80M+S6pqsqaqnr+mj2G0tSM4BO8ya7wRGwW/7n6VBJfeRncbm1FpEF+L+PJU4Yx7MKPovTwqAn8vv3TLSzMzCO+QguP2J+Rir5RhpGhZ8LuMfF57Bj+e4rKZR/0mCQZgKGlylVDTEsnY7fVMmpPAwZ3tweG16TjwPhUmlNKsA7Xwlzcsptltcv5vGEVbb524vSxHJ+ygNMzTsai0/pyTZ5mql2Rnx8VlU3N26NGhq+JqPDjAJg+fTrLli3j1ltvBQh+NJ5++ulo6sgoUb7FuP1+LvvgTbbW1wQHOK0eN0/t3Mwnhw7w1jmXkNDD0FDrsPPw9rW8VVqER/aTbLJw+bip3DB+JqZjrBrtVfz9DsIAvHI/bpDfcnyKn8/rStjVWolB1HFC+ljGx2eFdMJrXW3ctOl5PIHj7Bpk+FWFh0s+JcFg4+ycQcTVDoReHa8nF+Xxhq0Z0GbYu9hnr+ZWq48Xrr0G8dHHqHO1ISAweVcVXywc06daWVXY3HKAJelTjm57v6UoqoLd58AoGcIKdsUbTdh9nojbG0RpSF46/zz4Al7FE3Ktun5/o/I9ZiVOIduSEWnzsLhlD78reoi2wIC058Blef2XWHVmLso9e9BtPRb4FB+PHXwGRVX7uN8DPFn6b6bGT8Cis1Dvrg87w9eTek99iJHB6XexumkLVc56YvRWFiRPI8M89Jjm58tfpdPvwK8qyKo2CBNQkQT4tH4lsxKnUhhXgE/p/10nCEKwjE/xgyD061brV3wAGEUDnsDvkUg1DU2bot7dxJuVy1jTtBm/KpNoiOfU9EWckbkYXUDY1KY7stFxIGV60+l38Lu9D+Hwa6KAPe/Z92s+xaazcnaWNrD6vGENezr6xuSrqFS76ni7+kN+mHfeoNswJHHCo5g+8Uj3DBASehOW1taQtpeMyQSaIxYXBWiXm2HWyd1GBqcT2togMbJAKUCNq4F79z5OnbsJSRBRVXi54gPmJU3hjp3m4ICluq2Dv2aP4WBq6HtMUVUa7J3cX9HC/dddB4880r1y1Sp8556Nw+/GqjOjRwwxnjjSMnh7h49wXQ5BVRjfVsEhg5UpMTHYXW4azTFM3VfJJ7MKUUOuj4DDa2RltsxlgzG0+P3M+bAEy/qtYR9bg9tP4ZYaUsofhoWLcV9wDr/f/xCHnZXB91e7r4P3aj5ka+sO7hr3f1h0Znzqke8Bn+o9YpkoR5HvkJHgqzKkUcKf/vQnTj31VIqKivD7/Tz88MMUFRWxbt06vuzVQY0SJcq3h1eKdrKlrrrPO05WVSo62nhk6wbumnc8ANWdHZz5/r9pdbuC3gRNbid/37aWdTWHefHkCzBIx04Bf2pSNsurIrv86gSRwoT0Y7b/Y8lBewM3bfg39e4OdIKICjxzcDULUkfxwLQLMQeyZ7xevlEztkT4Kj11YCV
nZk8+erPIvWatOrLSeMnWHHYQJqsKBzvr2DjayJz8fNQd5QCML6pm5YLRqGLfuS45jC6DV/HT7OnAqjMRq/8Ks4XfEmRV5q2qz3m3aiWtPjsCAtMTx3FZ/mmMsHWreZ83YgIP71ob1pAmCQJnDxuHbpCzneWOKsqdVRHXi4h80bCWy/JDB0yqqtLibQMEEg1xfWab1zZtCqwPz7Lazzkr65RvhVv5ttZdOGRnxPV+1ce65s0sTltIjO7IgqQ9y2xt2cMD+57BrXiRBAlVVXm14gPOyjyRK/LPGbSXV6u3je2te/ApEgp6evY+/SoYRYUvGtZQGFfASNuIfuuSVTlYZlTMCFY0fNFv+a6yx6dN4t3qdRHLmSUjhXH5AzugHtS6Grhz9/04/d3u+i3eNl6peJdi+0F+UXATkiAyNmYsNp2NTn9n2HoUFOYkDX7y6suG9ZogX4R353vVn3BaxokYRD0rGiL3WxUUPm9YzcW5Pxj4e3YoqR6bm496+sQxMaPZ1LI5ojdDsiGJmCMZcDyhhtCY1Cxgd8TiigpmwaoZPHrS5VUQAa/i47e7H6HF246qaoZ00Gxl6xu3s+eTMiYbtPfnJlVHWVom4SwCsqLy0e593HXmCVgDg3y37KXkw2f5Y/oavIKMUTRwuqGQizvbMYh6AJpGjkfdfqhPfQAxPhcGv5dOpw5yUmlr186J2esj1umm3db7u6XS4jLCrJkDM7QEtCYm72kjco8HzJJJe8euWcP+slVULlZRdaHvHAWValcNH9Yt57zss0g1JmPTWSPqLoiIFMSM7mevUaIMnSH1TOfPn8+OHTvw+/1MmDCB5cuXk5qayvr165k2bdrRbmOUKFGOEq8W74q4TlZVXi/ZHRz03L/lyxADQxcKKhvqKnmrdO8xbet5wyZh0RkQwzhmioLAD/InkmQ6hroExwiX38v165+nyWMHCMxgah2qtQ0H+cPuD4Jl1zTujxguAlDtaqXW1X70Gtdr1mpvQWa/s7ySILKp+QDMmkWSMQYVFZPHh60zfIdyYvyw4O8e2cfjBz7k9JW/47w193Pqynu5fevT7OuoPnrH8zWjqir3F7/AC4c+oNWnXV8Vla0txfx0+0OUdJQHy145dhq5tjikXgNTSRCINZi4ddK8Qe+/yRN5hrGrLY2elpBlXzRs4OZtd3P91l9z/dY7uXX7vXzZuDGkzK72YoSIDtLgVbwcsIfvoH/dNHlbEBG0gYoi4pElPLKEX9FmRkVBoilwDuYlz404AANI0MczJtABr3bVc1/Jk3gUbdZPVuXgtu/WrGBZ7cpBt7XF24asiijBcyv0+AGPIlLtqgMg25JFYey4YNaFnoiIJBgSmJageQnNTJxGvD4uYtlJcePJMGsG2uuHn4mlH+PQzSPPHvRxAbxQ/maIgaELFZVtrXvY2LwdAJ2o49LcSwHC3mPHJR9HrmXw4So724v6fXc5ZBfljkqA4P0QCafswqsMcLZ3sOKEq1bBFVfAb36jGR2cvQxkXQaJu+6C55/Xfu6668hlX3yRJcmL+j0Hp2WccmTDmDHUC2uKlIjPYwk74w+aJ8Pi9AXat6QHitFAjauBKmc9chjviTWN22j0tCKrSkDbTkBFQFHB0uGmvqUCj6x53GxMyezXy9GvKDR1OmHWLDyylw3NO6lvqcRg1wbaHsXLl1Vr2di8K+jtYcuOPGGhV/wIApiMejAasVq7Q4eM3vCeAiNswwZuaAloTXT6XcFFjWkxrJo9ho9PncD6k4bTlGHDo3hRAn0F+67NzPn0YNjqFFQ+b1gFaM/XzLg5Ya+XqmreHyemHoOwyyjh+R8Tfhzy9NeIESN46qmn2LRpE0VFRbz00ktMmDDhaLYtSpQoR5k6R2e/7yeHz4vT58Pp87Ls0L6IA1wReG1fZIPF0SDeaObZ4y7CpjcgoA1odYGZpLmp+fx26knHdP/Hik9q9tDk6Qx7bhVUllXtpNFt/wZaRp9ZK198/zO9wY5eQgL
ppgTMkhEBAYM3tBMpIrAgpZAsi9Y58ysyP93+LK+Wr8Ipd+9za8tBbtz8OMXtlUfhYL5+trWWsLZpR1iBLb8q8/jB/wSXxRlM/PfUyzh3+Hj0AddxEYGTckbxzmmXkWOLG/T+Ew3x/a5XVDXE9fztquU8evDf1LqbcHl1uLw6alwN/OPAC7xX3b8g4NGmztVCcUcFLd6vdu8n6uPxqypuWY9X0SGrIrIq4lV0uGQ9fkUJnqc8ay4npB7fpw4h8O/K/MuDs9cf1X6JqkYesr1d/Wm/mgnhsEoW/KoIYQ042jKX3F3nTSOuJ8eSDYCEFDQixOpj+cWYO9CJmnOqXtTzszG3oxc1jyhV7Z70TTIkccOIa4N1mnQGXpx9J6NjskP2btOZ+b+Cizk1c9agjgmgw9fJttY9EQ04IgKfN3R7T8xKmsXto24n29zdhlhdLBdkX8AV+VcMev+DJeEIz41JNGEQB6iN0ks0V1FUyrDybzmHx4TRfJoyAWdaZtdKLSXitm1wMPyAsUdF8OST2o8ygPtszRpGPP8h1+ZchoiIgIAY+AdwUtrisPd+HxISNA+JAOKmTVyQfRGqKoQMXLvuMcmbyXnDF8LGbkNlhdrODQcf5qatv+Pmbb/nmk2/5f2alcFsKAC72/ejBQpB7+dB55NRVZW2QJYRU2oKYj/GEUkUSLSaISGBMkcVHsWLiorklvEpIrIq4NWLOGU3h52aSGSS4GNSYTZiGA88n6hDVSE3OxE8HlIEfXCdxxDOIVzg1OwpfQwthEtd2ENrosHdjE8n8cqY43hAfyEfHF7I5zvm8FH9XF47dQ6fnF+AXad9Wz2Kl9E764ltDu+11e7rCJ7f1QcEmlq1b7miaj/a9RLYdzid7bVHcaIiSr90aTJ81Z/vCkMOqpZlmbfffpviwMt03LhxnHXWWeiOcZx2lChRhk6mLYb9LZ6IHWWb3oBFr6fe2Rl0VwyHAtQ7j/1AeHpKDquX3sp7h/ewu6UWk07PqdljmZGS850Vmd3SXI4kiBEHJAoqO1srWZwxjnkpoznU2RjWICEAmeYEMsyDH4xGpNes1WifGVUHkU61ikquOQ2qW5EEkTlJBaxvLsFrkIJhILKqMCE+n18XXhDcbmXDbra3lvWpT0HFr8g8vO99npj5o6N3XF8TK+o39clv3oWKSmlnJVXOerItmttzksnCX+adxt0zF9PocpBgMhNnGHrIQYyUhMurx6T3hb1mggAej+au2+618/Lh92jotNLYaUNWNEOHJMqkxnTy4uF3OT51NjF6GxPixrKheVvE/RpEPaNihkVc3x8H7dU8UvJfyqtK0Ptk/HodE4ZN5baCc0kxxQ+6vqkJE/Ephh6GntAT4ZZ1zErs9ri8LO8SMk0ZfFT3Mc1ebUZ7dMwozsk6i7GxBcFyO9v29ev10OJtp8nTQpopecBtdcjuPu0LRUDtMRcUo4/hnsLfsrt9DzvadiKrMmNiRjMjcUbQ7buLD2pW0+hR0Qt6JFEBFXyqSIevg/XNu1iSPjdYNt5g45/T76DT5+RAZzWJhljyrP0r4fdHh8/e7wy6gkqLJ3QANil+EpPiJ9HqbcWv+kk0JCIJQw/Hmxg3jl1txRHbYZHM5Fs19/sTUhfw4uHw6QEFBBalzhtYqEQv0VyvqOO+jgxWtugRRQFF8SOUyxgMNv526Q8ofObB7oFoXR3etAxKpHjWk4zTZGNKbgxz1CaMtVWaEaKrbGkpjBqFKzOL8tEFSEmJjJAEpE2but3zAYqLmf9ZEuMv/Curm9ZS764nVh/L3KQ5ZFuyjnw8oGkIzJmjeUgAlJdztmMp5swbefnwf1ANTQgC+P06hhum8ts5VyMWFQXbcchRzRtj/DT4uq93q6+Dp8vepMXbzhX5ZwFg9zvDGhhAwKcPvQ9OzU7lP+XhvU8kUeCkcaOIMRlRW1qodtWjqOBTdDSoMdg9ZkDFYdDjMuqoctYz0pYLGzZw03WX86NfvAI+FdSudqj
Y9SasGXGkJsfAjlIsQHZmPAd8Djosoe9rAci3JbIoYyS88N/uFRYLxMf3bXCPEHOfrPBy6gkU2fN77B/cTWYqPslHOBkOXnA80179GKNowK14KNhRx6YTh/epNl6vhb01uRysripHJZ3a5niS4+3oJAWn20BjayyKouM/+/Zw1shxYc9nlChfhSFZBPbu3cuZZ55JXV0dY8ZoAl/3338/KSkpvP/++4wfP/6oNjJKlChHhx+Om8Tda1aEXScJAhePm4goCCQYzRglXVBwsDeiIJAXMzQxsMFi0xu5ZOT3JwxLGkCcvRTo0F6YN4vXD29ElfvqMqjAtSMXHl1V/65Zq4AbbseqNXjmWzAa/H0GraoKsixS0e4KzlrF6M3MyZnK53nT2GOvQhJEFqSO4/oRS7DpugVFP67dhogQVmtCQWV3+2HqXK2km7+ee+xo0ebr6HcgCtDu6ySb0AGcVW8YdBaJcOxuqeNQcyIFaQ2AGrxmqqoZGOraYzmgaC6565u3UdNho7EzNCZbViRq2+OQZZH1zTtYkj6f+ckzeLPyA9p99rDHd2r6CUPSYzhctps3XvgNC/ZWsdjdLT7oNW3kpYmfcM1VfyY2p28Huj8OOaqRVYjkHaAC++yHmZmkeV6KgshJ6SdyYtrx2P12dII+bIq4gQx4BzsoPlJ5VQWjGGr4EwWRSfETmRQ/MeJ25Y5qltevAwR8qoRPDt3Pc4feYmHKdIxS6D1n01uYkjCKr0qCIb5fQ6qISLopJcK2R+eZX5Q6h7erP8Lpd4W9Z8/MXBI0zExPmMpLh99CUUPfc6qqvWdnJ84Y2E57ZyuQRrGqzQuowSwFqqri9fr5/TPr+bfZStcV8CjwUpGTlzMnBTwEVN6v7iA2No7HfzCBzHXdnh9yfT3PnHQaf0ePv6wayqpJi7Hx86VncyYyPPVUSGaFxjmj2e2tpMpVR7w+lmzzMDLMGcHvzBFZuLDbyADw1FOcfN11nLzgfppcHXT4HORYU9BLOtizR9s/mvDkAfth9k4Kf/7ervqM09IXkGJKRFYUtLPd97ntjDHhMemD62Y2VHN8QSErS8pCviCSKBBvMfPTkxcAoGzcgE9R8MoSbqOeTlvXO0rAi45to3OYv7tcW1Rezs59b2D4YQPedTaU/WZtoG+TkaY52GjXMWd3LRa7NrkyS8ph55SxqKIYDHmTVZURsck8s+BipL1FoQafuXP7ij720kCqNGRSJA7r6w6vCoBK3eZ0Yn46D9aXkOWopMxxiFG7Glg5ZwxuRY9B58dq9CIKAicEQiDa3O5gdQ6XCYer93tapcEZXq8hyjHgaIQ7fIc8GYbUO7322mspLCykqqqKbdu2sW3bNiorK5k4cSLXX3/90W5jlChRjhIXjZ3InMzcPp9xURAYHp/ILVNnA2DS6Th/1Pg+8eJdKKrKpQWTj21jv0V4PD7WbStjxboSKmtbj7xBPxyXOrpft2qjqGN6Uj4AmZZ4/jnzCuIM2qBHJ4jB0JFbxiz+ypklPLKfldVlfFBezMH2pu5ZqwDi4cOkl3Tg8Wn26J5u135ZpLPThr6oJNiZqne380BCI6ua9tHk6aTe3cF/KzZwyfqHqHQ2Bett9XRGFLPsoqNHfOq3gcPtbfx+zUpOfvUFTnv93zy4cS0NjlCxuixz6hE77mmmoacCPBJ6UcLhMVJUm0aLw0JXJjanV09ZYxI1bfHB0IxKRyuNnVZ0okyyrZPcxBZyE1tItnWiE2UaOm3UOLXZQpNk4q7Cn5AaaHuX+zXA4tQFXJh75uAa6vfDiy/S+H+3MXFrOUZ3aHYDg9vLqE37qP3FLfDii1r5AbLffrhf/QhJkNjfWd5nuSiIxOnjwhoYAGYmTgyrDwPabHe2OZ2kI7jd9ybZkBDyTIVDJ+gjr4zA6sat/RofnbKb7W2h2RR2tx3i3j0vc9n6v/KjLY/xbtV63PLQVOetOjNzkqaF1YQALXzopPQFQ6p7oNh0Vu4a95NgaIw
kiMEwmNMzFnNW1snBsp/UrcHllzQBzsC10MQHRTyygQ9qBiBo3mvA2Jmczn8PekLSIXahqjC2/gAVnSrExKAC29xm4jqaQ0IhVBXsdjf/+ecy1DGaV40KbLLEsHfXHvw9ytbbO/nZux/zjirCddcFlxd1lPLua3extXknjoYK6g/t5F+bH+OX2+/Hd4SsIkHS0jSByi7cbi1zw333kbx+C8NL69B/uQruu09bHjBw1Hua2TshnfbE8M+UAKxp0jyktJSL4Z8vVRQoKUxHDXw3xcOH+fuEfH5+6nFkJcRq2xv0XDhjIm/+6Ifasj17kA5XoKo6VGB3YVYvMWKBbRNzUVXt+fIpMpZnXySvvRbD0laMt9dgvK0G4w316GZ00ppgwbVnZ3Br/f793LzwB/z3xKu5aex8rhszh2cWXMyyJTeQWVYZNLQEOS6M7kEvDaT98YVEHEGqAp4mM1a/EWbNIseSiYEYOpuMlBWnUFydyc7Duewoz8Gm5nJ6xhIA0qy2fgWEJUFgePx3y5j/XSYaLjEAduzYwZYtW0joIWqSkJDAH//4R2bMGKDFN0qUKF87BkniudN/wIt7dvDi3h3UdHaQZLZwYcEErpk4ndge7vI/m7qAdbUVlHe0BmPvu2afzxo+ltOG9U1T+H3krY+388Qrq3E4uzvcMybm8dtbTyMpYfDCkwvTxjAyJpVDnU1hjQ2XDZ9LjL57tmFKYh7LT/wZX9SXUN7ZSKzewpKMQhKNg0/r1pM3Du7iT1s/p83bLUY1KzWHh6fPIj0waxVjMHLh51t4nRmU5idj1AdS5PklfH6J0ZX1HL9nF5ji8CkyG5v3s/GU+fhV6LZhqzS6OvnNjpd5fs5tCIJAvi2NA521EY0tkiCSPgRX+WPFmsrDXLPsbfyKEgxdKWlu4vld23n17AsoTNFSGJ6cPpcPa9eGrUNEZGpCAcnG+GPWzhkp2dh0Bjp9UNaUDE19QwYWZ2tChlV2Pya9j+yENk1qMFDEqPMTb3ZR1RZHlb1bWyPTnM5Dk+9lV1sxpY7DGEUDMxMnkzqI8AAgKIwnF+2lztUS0Z1dRaXK1cSYNWs05f1bboEBhGPqRV2/rvqqqvYJLRgIp2Qs4MPalbhkT1gxw4tzTx90CFetuwlZFdCJ4QeiAO3eUGNbg7uNt6vWsLphD35VYWrCSM7LXcBwW3c6P4ff1W8QBkCnv3tw83L5F/yr9MOg94EA7Gkv5+3qdTwy9SZihpD15fL8c9lnL6PZ0xI0KHalCz0xdR6T4wsHXedgybNm88jUP7C9bQ/ljkpMkolZiVNINoYq/K9s2IyCiqJK+GWJ0Bl1lbXN27ldvbx/z5NeA8byzFGoW8KL2AqqQmFbBe16C6Sl0tnppknRY8SLze/G3uN8q7JMZnkxDUk5pMXE0OH2cBiByRWH+HjchF7pE+EvK1Zzxm3XocvPp7Z4Mw2tlRz/oZsxu2ox9tDK8Ri38tmcGk69+Dd902iGoysDRg+9CcrLQ2fre5+SEZmsWRj5ThQEMXgfGkUdkTwZAPZMzsJ8uPtZ0D/7LFdedx1X/vQaZEUJ9RAMeFOoqoo/EHawY0JO7yppTbCxozCHk1s0IVDJ6+Ost3dSnx5Lydh0OmOM2OweCorrSKvroCXOSFKXXTk5GV59lYnr85k4a5bmBVh0GJ57o+85mT8//DnupYGkmBJB7v/Jdbi9kJCAw+enpkPCrwgYfd1GWJfXyJelOqoKOhkZbyLGYOSskWN550BR2LBLWVW5dOzkfvcZJcpQGZKRYfTo0dTX11NYGPqRaGhoYOTIkUelYVGiRDk2GCUd106azrWTpvdbLsFk5t2ll/FC0TbeOLCbFreT4XGJXDF2Kj8YWdiv8NL3hfc+28Xfnu4bXrJtTwW33vM6z/31coxhhZ8ioxMlnppzJT/f+gZbmsu7lwsilw6fw80FJ/TZRi/qWJJx9MLQ3j1UxC/Wf9hn+ZbGKs5zdLBi9myMGzaQarKRjMQ
Vn6ynMjmBnSOzabeaiXO4mHywitzmNkbkaW7nlY4mto7NoiXBSmhHURtYlHTUUdxRxbi4HM7Jns3HteFj/CVB5KT0Sd+adJZOn48bP34PryyHDF0VVcXh83Ljx+/x5aXXIAoCI2NyuCz/dF4sXxaizSAikGiI5ZZRFx7Ttpp0em4unMf9O78ILOm+DpIgMDI2mcVZmjt8gj6RzLj2EAMDwd9VMuM6iNeFDsZEQWRyQiGTE77CADEgjKeoimYMUKBOTqLYk4/dbyXG5GCs8RDpQktQ+Z3iYm27yy47YvUzEsfzr9I3IhoaFBRmJ04adLMTDXH8YcJP+EvJU9S6G4PLTaKRq4ady9zkwXsVyYqCoor4FQVJ0MJbukJbVLTsGD0V5/d3VPGTbU/gkr3Be+uTui18UreFe8ZfxoJU7VnMsaQHVegjkWvRjBJF7RX8q1R7F3QZ/brOXHlnA48eeJ9fjQtz3yqKNrD2eDQtl4SEEHfwBEMc90/8JR/VrmRl43ocfhfZ5nROyVjE/OQZX5umjiiITEuYyLSEyOElLrm36n9o22RVwaf4kfpL2dxrwCglJwLhjQwxPhcmxYckiWA00maMQejUxA0Nsh/0oWXNio+ODhdpqalU1zciCAJmr5dYt4t2S6ihu8nhZGd1LdOmT6f9v08yoqkNgPZ4M35Dd/uNHj/Cyi9QijyICxZoRoT+jHg6nWbo65Ga0+F3cchRhV+RSTLGk21O676u8+fjO2US/n1PRqxSVuXgfSjQZQAJZ2hQaU800zZzFHFdaSa7vCny85G6BvmtrVroXmCQr6K9p3eNz6IzyYhV8iAIKqoq4JU1z5VPjy/k/4pSkXZtDe4tra6DtLqOkBYIQFtuGtQGnquusc4RDC2MHaud23D00kAaF6NnRYcQ1vsFQC+JZCTFwv5Wilsbgs+rR9993VTAq8g8vmcDD84/A4A7Zy1iW30Nh9vbggY/SRCQVZVrxk9jXlZe5PZHObr8j4VLDLh33NHR/cDdd9993Hbbbdxzzz3Mnq25V2/YsIHf/e533H///Ue/lVGiRPlGiDUYuXXyHG6dPPg85d91/H6ZJ18Nn4ZMVlQOV7fwxfp9nLJw8AOuJKONq0ceh9svU9Jei04QOTFjHOfmTh94nOwQUVSVv24P7/4rqyrVjnb+M3Mal7a3IxQXMy9tGCtqDpDT1Ep2kxYq0uWOPjd9GIZAx3tzmp7liwqIHAuvsqW5lHFxOYyPz+O6EUt4qnR5SOy2AORYkrlt9NKjfdhD5oOD++j0hncbl1WVyo521lVVMD9H66hdlHsyY2Lyea96JQfsFZh1Jo5Pnc7pmQuI038175OBcP3Y2ciqyqN71+DuoakyNy2fB+ecGQyXcNGCTgrfWxEE0EkKbvpP7Tdoegjj6QQJ0WDhv+5plHuzQABBL6AqKjudo8lPreQKsces6Zo1sGTJEWdd4/WxuL1WDPrOsDoibp+JZOPQQlbyrVk8NvVu9nYcpNpVh01nZVpCISbJeOSNw9BlqFVUEUVVEQN+sKrSrbKvD2SMUFSFe/a8GPCk6L5uXZ4Hv9/7Mv9NuIsYvYWFqTN4ofxdvIqvj7FFRCTPmsEom3a/vlu9PqJ+goLCp3XbuWXUmcToA5oq9fWa9sD69aEpFC0WLdRq4cLgNYrR27gg9wwuyD1jSOfn6yLfmkVxR2nEEK5kYwLGI2WX6DVgHJlgIDbGRIe9b9pCvaIZFDIy4sFjB5MRVdXi4r2Srm9ZQBcwSHgkCVUTHcEYIYzI6XTBJ59gberOGiDJCn5CjSQqKi7ZjXWg3kI6HVx2Gcrixbz28q+wbdqFwaOFXNR5mtjprWDCaVeSd8pFkJbGJFUhpTyBxl4in13YdBbmJE/W/hBUDIKKN6A/0NOTBEAvytSecwJ57nUD9qYQBYG24TmsXTIMk047V5oBT8Wk86NXFPLMOXDLTdheeQXr2yU4ZXfYu0AF0i3JcP3Z2oIeoTERmT+/f+NNLw2
kU+Q2HlbCa5WIosDps8cRYzbCxo0ctreiquAy6PuIT8qqyrLD+4JGhiSzhXfPuYxXinbw5v69tHvcjElM5orCqSzOG/GdFdH+ThI1MoQnPj4+5EZUVZULLrgguKwrVcrSpUuR5b45cKNEiRLlu8S+sgZa28OnhwKts7Jq08EhGRmeP7iOv+5djigIKKqKB4Vl1Xv4rLaE5+ZdwfiEASp/D4GD7U1UObTOpyAqGAwygqAi+0V8PgkQ+LCmlEsDs1YJa9ZwWs5YDrQ3Uu1oR1FV0iwxjI5LJs4QGHjMn8+/M8qR/ZHPFwhsbynn8oCO35XDT2RifD5vVq7jgL2GWL2FUzKmcnrmdCy6oQ3ajgWlrc3oRRFfhLRxAnCwtTloZACYkjCGKQnfTDiRIAj8qHAul4+expq6Q7hlPxMTMxgeGzqwbvM3BWfNw6Gq0OJvDL9yqPQQxhMEgT3DzqB8e7e+A4AQcG8ub8iheukE2Leje/tVq+D88/vdxdqGg1R36kmyGIk1ekLELzu9ehodZpbX7OXs3ClDOgRBEBgfN4rxcf0LJDa62/hD0UvsbT+EoqpIgsDkhFH8etylxBs0Y5NVb+6xhYCi9r0YmYFwlJ1tZdS4msPuS0WLKf+0bhs/yJmPTWfhFwXXcF/xkyiqGvR6EBCI1Vv52Zirg323ss66fjVi/KpMnbuFGCEtZBa7D06nJg64YsWRB1dfEUVR+G/VWtY07kUURE7JmMbJGUMXB16auYi9HeFTSArAmZnHH3kg1mvAqN+6mWuvPpUHH/6kT1FZpyMhwUpKipatICU+EXDgFvV06kIHjD5RQgDS0+OhuYF4UUBQtD63J8z5FQWBiV9+ASUlIcvrM2LZOy0rGAIwam89qbV2hK6wth7eQqqq4lX8GERd2OP+Y8s7bJltQ5g5D2unB4NXxmuQ6LQaEaSDPJ9gJh7NKy1Fn0WDO2Cc7vEsAsTr0oKhS8OsOawWN2FQFfyKiBI495LQ7eWTF5sPt0zt/z7syfz5lBU6EexFYby1QBIVxsQlg06HcPnlWKfnsOY/DzBuTw1GT7cBx2vUUz+1gMVX3A+ZgbCLU0/V3kfr1vU1ts2dq2kwHCkMpVfmjrT2Zu6dPom7t9UgiQKy0u3dNCozmZ+ce5wWClJeHgxh3Toqr0/IDIBX9qOqavD6xRqM3Dh5FjdOHnxK2ihRhsqAvwBffPHFkQtFiRIlyvcEr69/sTlVBY934IJ0XVQ6Wnhg73KAYEcBtBlJt+zjV9ve5r0Tbg7p3Dn8Hj6u3kOZvZF4g4VTssaTY03sU/dA8MgyoGIy+zCauoW/BEHzgHbYTVpWkcCsFUuWYFm1iknr1jGpn86UadUD0K+RAeINoa69UxNHMDVxxJCO4+sizmgKG8vahQrEG80R139T2PRGTskpiLh+ILNXRzVzSS9hPCU3j7XrTQiqJ3x5AdYcMHNxfn73TOW6dXDuuX1V2nuwvbUCg05BRqTNY0IvaQNonyyiImDUyWxtLh+ykWEg1LtbuWLDfVq4h6A9W7KqsrVlPz9c/0denfNbYg0WMk0pDLNmcshRE7GuE9JnAlDVQzhVUFRi7G70PhmfXsIeY0KSRKp7lJmeWMjDU+7kyf3vcajyACa/wozMQi6c9APiTHHBcnF6S8RML13EYIBHHw2dQe6PQepoDIaD9lpu2vIInh6ihVtbD/DI/vd4dtZPSB2ClsvspEmckbGID2pXdoerBP6fkTiRMzIXhd3Oq/hxy16sOpOmCdAr1ePSpTrE20/h6edW0RYwWEuiwMwl05hV2YpYXwt2OyZgeH4q/+mIQ+31zNn1ZnILcjGb9NDQQLaisFky4tLr6TCFvnckQeAHaUnEbVoDDQ3oBB1eQebQ6CTeuHZGiPDh7qlZ5JbZOWlzbDDUw/3lFzybp/KauwyX7CXBYOW83FlcPvw4TIFMJJ1+J1ta9wKaIGNnbKhRRFFVnj30FneMuZJqZxMbWg4gYEAvyuhEzev
Gr4p4ZYkiXw0lHRUUxOayMGU2r1W8h1fxIUqhRi8RkQnxBWSYNe2bru/SQAb5Jet+FdGQCrC3Y3/w9xnjjsf9owSePPA2zqZ69F4/foOeCflT+XHBxRgNPTzR0tJQzzuPXSdMY2vZVgS3m7FpI5kxYgaSNIh7vlfmjqV71lFw9jm8WOtmX1UjcVYTp80cy6kzCzDuKwkKSiYaLTR5HGwa0zd1sIjAuMTUqIfCt5CjIdz4vRR+XLhw4bFsR5QoUaJ8qxiRl4JeJ+Hza55ZikFFlUB0a7OtoiAwYUzmoOt9q2I7YiAesjcKKmWdTexuq2ZiQjagzczesfl1nLIXnSCioPJIyQquHDmPn4w9adAdiZFxSdgsCjpTX2VxQQBbrJsJSandC9PStNnjc8+FtjYtFtZk0nJ+9xjszUgaSaWzf/f6xekTBtXWoXLQXsfy2p20+1wMt6ZyatYUYvVDMwQsHVXAXzdEnjUz6XScOGxwaRa/DaQZjyzYmGlKPWKZAdNLGM8zZRqOz8LrcgCgQk1NK5w/q9vI4HRq92BiZANbvbMdsyEQ3iII+JSAi3jAC9ts8FHn/moZYo7E7/e+GDQwdNH1mLpkD38ueYU/TbwWQRC4bvgP+M3ux/oM8kUERsXkMi/gTh6vt5LQ6mDSzioKi0JnWj1GHUXjskhb2q2zU3OomNde+BNjd+5jaqCswgq2xr3C3DOvxHTCYkhLY0nGNDa17CccIgJjY3NJf+fjEAODy+WlxGviYPIwzJmpzC1IJnHf7lC39UHoaAwURVG4ccsjeMNkRejwOblu0z9497i7Bl2vR/GxobGONrcZo86HJCgoqoBXNrC1uYUOn5OEHgPMWlczz5Z9whf1O/GrMjE6M2dlz+XS+XMw9Ur1ePp113HyKz+iuKQWt8fHyBGpJMRb4YH98GVg8s5uZ1xuHuMvuZgtnx7E4dAG/TabkUsumM0EhsObb4Ldjh5YmJfCn5PSESQJVVWDHnFjUlO406hCSwvY7dhEK7tyRTbPG4YPETUgKigKCqIA1qzjEabMhEcewS37WFm/l6pP2nDN14yTrV4Hzxz8gg1NB/jnzGsxSnpWN2454vnc1LxH+7+lJPDYCXgVHd5eDjOiILKxuZiC2Fxi9Fb+r+Am/lT8OB1eAY8sIghg1vkYZk3mlpFXhG48gO+SX5Gx92P4FgT6hHJkmDJIMYzhC0lFNaskGCxkW0Zj1YV+Pxx+N/+3/d9sbS0LhjjK5XvJbVjL36ddTYZ5gJMAXZk7ujwz3G5GvfUqv8vPh7ldWhMN8MD7Ic9XQUIKD8Wk0BQX06dKBZXrCmcObP9Rvl6i4RIDx+l0UlFRgbdXvOrEiZEFdqJEiRLlu0CszcQZJ47nP1t2YJ/gxZcWiJJ2g+WgjoSDRpaeOPh3Xa2z/YhlapxtTEzIpqKzmds2vYJP0Qwd/h5uzc8dXEu6OY5Lhg3O/VEnCljMMt4wH6quGTybJcxKUex3cPejUUt4u3IzatDRNZQ4vYV5qZFn1o8Giqrwl6L3eKtyUyBlneYh8uj+j/nT5EtYMIT958TGMSszmw01VX1DhQU4bfhoYgzfnvCOgZJktOKWdZh1/pCwia7f3bKOOEPvnOpfgV7CePrUZAwGCa83cnhlQqJN62TT1TaV7dX7eb+iGaffy7j4DM7OndQdtgPoJD9iBLtb1/2tl45tSGdxR3kkkXwAtrR0u7JPShjNHybczDNl71DqqAJAL+hYnDaLq4efpWky+P3M/mgL17+5EZ/at+1Gj58p2w9zYs0yKHIiKwq733mSEX6PJn7Xo2x7exM733yWWStXwfz5nHDhBbwbl8/e9sMhhg4RAUkQ+XH8bFjTnY7vYHUH97VmUB6TjtAuoB5s4MEv67nxuhO5YKlFm2kNpDAcqI5GF6qqUuduw6f4yTQnohNDNQRerfgyrIEBAAFafXY2NJUwO3lwz/nblRsp7qhGRcLrDd1ng7uD58o+544CLU1rjauZGzc
/jMPvDoaZ2P0uXin/nB1xpfx97hx06wIeOwFxQl1+PhO6xAm3HdLECYuLQ1KzCh3tnLcgnzOvOYWDpfUgCIwcnorBoIOV7pDwh7TqKn5z223kW+PYXVuPRa/n5LGjOGnUcAxXXx0s2+kz0Oozs2NCNn6l2yAsqzp8TTqq3/ehPluIkJ9P8baVuGQf44uq+GzumKDXg4LKnrZK3qnczIX5cznkqD3i+XQrnsB+ur4FEbRfIMTYbpJiaXMn0+ZzBh+fTp+RHFMmBjHCO/YI36UjhYQpPaaFS9pruHL90/gUOahl0up18ui+z9jdVsWD0y4Oenf9bvcbbG891OM4NapdLfx46zO8MveOPvdvRIaQuSN35hxy50+CIu1bp6gKYkBb5abxszkzf+zA9h0lyjFkSEaGxsZGrrrqKj766KOw66OaDFGiRPk+MPu04TwZtzEkj71qAkehn3GzM0mIH3wGhBRTDP11vLrLwKvlm5DVyAn5njmwmgvzZwxKLLLU3ohXjZwfXRCgqL1qwPV1EWuw8KfJF3HnjlcDHbTunp1R1PP0rBsGXedgeenQGt6q3ASEdvy8ip//2/4Sr86/nTzr4FIu1nd2sqm6l4GhCwWWlx7EtciHWT/4tIjfJMnGeFRVoc1pwmrwotd1hxU4vQYMet/RTbfZSxhPZ+/gxMXjWf7JLmS57x0uCHDqqZM0Dwg0A9va+lL+b/vbdMbaAJX3K3fxcNHnPDb7Iuakat4kqWZbvwMLrczgU88OFK8cXiS0Cy10InRKd1LCaP4x7RfUuppwyi4yTMlYumZOAyk/9cXFjI/PZ3tradi3xyhbJlbRAE8+id3rxJlMiHt8FyraQMjh92BdswZdczMP3HQ9Tx3+lA+qN+FWtPZPjB/GjSNPp+CTDcFtGxo7+GNLOuW2NFBCp+T++eQXZN37A+Zdd52m+t/FAHQ0AFY1FPHE/k8oczQAmlHy4vz5XDZsYfD9tqZxb791qCp8XLt10EaGd6s3RcxGIqsKH1Rv4fYxZyAKIk8e/BCHz43cK42pgsqe9nI+XnQ2Z7S2HXnAaDZDRgbU1mrGh7w8eOQRDPn5jOsySKwp6c6WkJwMdXXatikpJL37Dj/Oz4eusnXV8OpLsKXb06BGkig5LpXWBGufV5c+2U9NchUdHS5sM6ZTsfpNTRDR4yPW4cIea0JVBVQ0EdJ3qrZwYf5cYnVHfr66dB4mx4/sN52srCpMjtfC5Vx+Lz/e8gwdPi2bSs+tdrQe4v6it7l34kWRdxquHUHNuMjtlYTuYdD9e5fhlf19vIpUYGV9Casb9rMwrYBKRxOrGosiHlOls5l1TSUclzpAvaYwmTv6JaB58hOdjoLkTB7dvY4mt5NEk0UzMAyLGhi+tUQ9GY7M7bffTltbGxs3bmTRokW8/fbb1NfX84c//IG//e1vR7uNUaJEifK1o6oqf9zzKYIgoPYOghNgc2cFa+vLWJA+OE2Bs3Mn8+zBtWHXiQhkWuKZkqiJS61vLO1XlK3BbafW2U62NSFimd4MJLpiqLGcJ6ZP4JMThvGPfR+zs/UwkiByYvoErhqxCIN4bETguvArMi8dWh12Xdd3/c2K9fx07OAyV7y3vwQBQcuP0auDIAAOn5cVh0o5Y/Sx9dI42sxPnsCd9mW0d1oAJUS7AETiY5zMTz6K4S29hPHYsIErrr2FTRtLaW11hKRtE0WBESNTWXrmFHjoAQB2tlRxWHHRYTWh9ngm3LKfH214lU+W3EaqKYbx8Tn8px8bmSDApPj8o3dcvTBI/WchUFXQRdCUyDCHMYAFUn4C5FhSkBSR1Y3tbOmMxS6ZSMmC89MsFLgEOHgQWlvxejrI9umozEqkNi2OPQVZdNhMxHa6GV9STUZ9O43uDqy2FCguxvzGf7ntssu4YcRpNHjasEomEo0xfXQ0NjepVMRmBAwMoYiiwGtvbGTeQ5fCIHU0Pq7Zzj273wgZCLf7nDxxYDmVjiZ+O6HLSHHk99J
QXl1NHnu/612yF7esGWZXNeyKqF8hAMsatnHGQAeMI0ZohgaLpfv8RJrBHjmy2xtohPbN8ZYdwL5vF5IoEaezIDhDQwMqRsey8oTRYc+JqoI0r1M7vlgbMgoCKpKokCg4UY0iqqqlUXX79DS6tQxz42KHB7cPl70FIMmgfY9GxmQyNWEUO1pLg+KjXUiIDLOlMyVBSwW5vG4HrT5H2NOkoPJZ3U5uHn0qqT30RI6EJIiMjRlGsf1Qn/aqqna9psZr7+1qZyvbWyv6qUvgvartLEwrYEfboSPud3vroRAjQ7Pbwetl29nSVIleFDkhczRn5o7HrAsYp3toIA1UUPJv21bxyK71SKKAgkyTy8FtX77H3uZ6fjltYVST4VtIVJNhAHz++ee8++67TJ8+HVEUycvL46STTiI2Npb77ruP008//Wi3M0qUKFG+VvZ3NHKwoyniekkQeK9iz6CNDCNiUrh+9AKe3L86RHBNEgRERH4/5cygS+ZAPBSkQXYkhttSSTRYafGG79AJCMxPHR2yrN3r5s2Du/ms8iB+VWF2Wg6XjJ5MhjW2z/bxBht3TThvUG06GlS7WmmL0EkFbYZpa3PZoOttcjmQAjHP4ZNzQpOrf8HL/tjecoi3KzdT5WwmxRjL0uxpzE0ZfXRFF8OwsbmU9s6uMAMxYFzops1uZnNzGYvSB589JSxhhPFS6g/z2D+v5IXnV7Pisz14vTJWq5Ezlk7h0svmYjq4D8rL8ch+yuxNbJs4rI+SuoqKV5Z5s3wbPypYyNzkcVglEw65b+pAAL0gsTh98tE5pgiMi82nqL084ph4euIADVI9Un4COP3w6J5Utrdma8ZPVaWiUmCv2cCDP5lMfsAg4FNkEtq9PP3D+WyfmBdS5dZJeQwvb2TY2nryuxYGwhqMaWnkWHqk0Oulo/GlNx7FFL6HqygqRcU1mqL9rIHraPgUPw+VfACEn6BbVrON8/PmUhCbxQlpE9nbUR7+XAVYmjl49fwMUwKlnbURJwhjdCZMkp4mT0e/Apkq0OK19x0wrl2rnUtZBknSDG7z5mkDxqSkUIOEqmrGhK6yRqM2OhZFuP56APzr1rC3rZQqZ2PQU8AkGSg0ZJDR1Zj0dHadkIJqCP8eEQQQEmUwy5jtDnSCiCBohhSvQRcsoxMVrAYPaUat5qmJYzEIRrwBwdYuw0Iw3Aq4IOfk4H7uHn85d+58hr0d5d3aBapCnjWN+yZdGxwI72orj5hKFTRDQ1F75aCMDAA/zDuZH299Fp2kYNb5NYFjFdx+PbIicFHuYkDTn+gPWVVpcmvGKJH+382yqgQz7gFsaazg6tWv4fb7UNC+IytqDvDP4rW8cvxlZFp6HNMANZA+Kt/HI7vXIRplBEkNJCeVkWT4194NjEtM5azh4wZ+oqJEOQYMqRfjcDhITdUEoRISEmhs1NJcTZgwgW3b+hFyihIlSpTvCK2e/geOsqrS7B7a4PK2ghO4f9oPGBWrvUclQeT49DG8ctw1zEzuVotelD4GMcJIRQDyrUmkmwfX6dKLEteNXBR2nYRArN7E2TlTg8sOdbRw0rtP84ctn7O+voLNDVU8tmcDx7/zFOvrIs/8fN3oBjAo1w/BmyIvLj5i+krQOtW5sfGDrldVVf5e8iHXb3yK5bW72N1Wycr6In6y9d/cueM1/MqxDTt8q2z3Ecv8t2zX0d1pbwHpp54ipf4wP/v5aby/7Ke8897tvP3u7Vx/w/FYyg4EldTbfS4UVWXLuLwwlWoDkB0tmvuCUdJzZ+GFiAghz05XmsyfFZxLjL5vmFO9q40v6vawtnEfLn//IQ9H4q7Cy7V7rWeYlar9mCUjvyy4ZGAV9Uj5qaoqDx+MYWeHKfg3gKqoeFxe3v7DG/hHaWk1JUGkPCeZxAgpeMvyUqi5tFcIw6pVfQv20tHoNPYfHiZ2hWf00NEAujUawrCluZR2X+T3qCSILK/dCcC52fMwS4bw1ggVUk1xTE0c2W8bw3F
OzqyIpgMRgbOzZyEKInF6K0YxcliUiEB2b2+U3qPw3nkcuwwSt96qzVZv2QKbNsHWrdr/W7Zoy2+9Fa68EvmKy7n3ghG8W2DGZeyO+XfLXtbJ1TRmJ8H06TB6NOMOhU952hNJlJA2bcam196NHpOOTlt3aJMQyI6SF9MtMPjTgstREVDU7kvR9fsIaw5LMuYEy8bqLTwy7RYennozF+YezwU5C/nLpOt5auYdJBu7v1064cj6BQPWOOhBhbMTt2yg02ui0Wml0WGlyWml02vE5TdQ49SMCxnm+IjfWtDuw1yrlgY4yRgTEkIZjq7UnA6fl+vXvB40MED3Oat1tvPj9W+H3b7U0cInrnrW6dz44uP6eAI9tnsdksmPIPZqiAiSyc9ju9f138Ao3wzqUfr5jjAkT4YxY8awb98+8vPzmTRpEv/617/Iz8/niSeeICMj48gVRIkSJcq3nDxbQr/KCZIgMDw2aUh1C4LAGdkTOSN7Ij5F1rwYwgySL8yfwatlm3DInpB0lwTadXPBCUNyibwofxbtPidPHfwSRVUQAjP16eZ4Hpp+CQmBVJOqqvKjL9+l2e0MOQ+Kqs0eX/fFf9l43s1Y9f27iH8dZJoTyLMmU+FoCnvNBAQWpQ1+Vn5CZpLmn6hqtYSiIkoqEzP6ZmEobmykpKkRi17Pgrx8LL00G76o38vLh7TZy67Zu65O6Od1e3gtPodLhy0I2can+NnYVEqb10mONYmJ8TlDdomt7XSiqkI/omgCtZ1D99AISxgldR55BHJz0Y0bR4zVCg4HFBVBRbcBSxJEto3NpTneFrZaEQFTDwPSgpTxPD79Fl4+/Dkbm/ehqiqTE0bww7zj+wxCO31u/rj3LT6v2xucFTZLBq4ZcQKXDVswpPObYornxdl38qeil9ndXoaqqkiCwJSE0fxm3GXEGgag5dIrVKHJlMCqQ+HTfaqyQn5bOXX2fLJjYlBR6NCbmbi3ms/nFYRqM6jAfiPJF8yFbVX9hzX00tHIyFQpalERVAFBVYjxudArMj5RosNgIn6sTTtfAR2NIKbIAqJdMfj90REwQoiiyNMzbuf6zf/QPFUCj6UgQLIxlmdm3n7EusJxZtYMVjcUsbFZy7LR9f4QEBgRk84Vw48HNAPWqZkzeK9qQx/3f9Ce37Oy52o6GgHvBI/s45CjgSpnC7KqkGSwMULwk7BihebVMycwIO+61tOng9er1aHTgcGgeYM88gjMn8+mkyayTmqERaNYddxIbJ0eDF4/XoNmHChfW8FPqsyIwIR2HfllLZQP7+tFIqhQEDcMW0kZ8qEynLILQYC9hZl9tDwEASqc3WlW5yRP4k8TbuPxg69T5apHRRtUL06bxQ3D++pv1LlbWd24l3VNxaiqikP2kW5ODPGamZ8ylveqN0e8RkZRz5SEwWfxefnQ2h7fcSHk2yAi8HL5WhZnjCfJaGNhWgGrGvaF9aaQVYXz8mZox+Nq1+qJEDIiAC6/5hXyfsUeOnyesEYJv6qyvbmK4rZ6xsZrIRC1znZ+seUdNjUdDpZLNFj41cSTOTO3O3ytxF4HUt/9d4nbHnTUa15F0ZCJbxXRcIkB8OMf/5jaWk1h9u677+aUU07h5ZdfxmAw8Pzzzx/N9kWJEiXKN0KGJY7j0kewpr4sbLpJWVW5cNiUr7wffT+zM6mmWJ6ddyV3bH6dSmdrsLNklvT8rPBkTskaP6R9CoLAjaNP4IK8mXxeX0ynz8OomDTmpIwIMXZsb6qhuLUhbB0KKp0+L+8dKuLi0ZOH1I6jiSAI3DjqJH6149U+6yRBJFZv5pycGYOud1PLfuLSHXQ0mRFtPkSjAioobgnZoSMmrZP1TftYmj0NgBq7nds/WsaW2u5OuUWv56dz5nHVlG4PkdfK14WEy/REDaz/Yf78YCfxk5rd/Hnv+yGzvsNtKdw3+UJGxaYP+rgskql/8TYBbLrBC5sekZ5K6k6nJny3bl2I0j46nWaQCMSrx0+axoa
RXvCHnxFXUDkxMzQEYVxcLn+ceGW/TVFUhdu3Ps+etsoQgTqX7OXR/R+jonLF8IX91BCZFFM8D029eUjbAn1CFcpicoCDYYvaFDdmQaajxQEZqTh9EkK1glHxE+Nw02HrkX5vvxFxk5XOWX5NNLC/sIZeOhopchEJPguTmg9R2FaBKaBToAIenZ4dI7Og/jxNrLALi0Vz9Y7AMFv/aVIVVWWYtbtMjjWFjxb9nuW12/i8fieSILI0a9agxR57ohMl/jrlCt6t2sRbVRuocbWSZLBxZvZMzsuZg0XXbWy5Zvip7Gwto9xRH7xnup7jUzJmsCChAB59FIqLcfg9fNlQhEf2Be8ul99LpbOZKYnDyDcnwZNPaisKCzUDjyBoxp1eBh4A1qyhbfeHCCeloepEVFHAHhtqwFlfmMxVZXYSjTHkWtI5+YNdfHT6OCpGdBvEVRUQ4LLOPPjPUyg9BtW7J2aFPUed/lBj0IT4Ufxz+m+OeG73th/mJ9ufxCt3z+TXVrfwQfVG/jzpamYkaaF5c1MKGB2TSWlnXdhB/mXDFmLVhTkn/eBXZModjRHXK6js7+jOlvHL8WdQvLaGBnd3WExXetBrRhzHpIRcAEySHlUVehifNbrfpxJmnWZ4L2qrR0RA7mG60gh4NahQ3FbH2Pg0On0eLln1PPWuUI2QFq+Tn295G6Ok4+SsgKijJEd8fwsCCFLUwBDlm2dIRoZLL700+Pu0adM4fPgwJSUl5Obmkpw8OOXuKOFpaLHzyifb+GR9CS6Pj4K8VC4+ZSoLpw7eFTBKlCh96fR5WF5xgCa3g1xbPCdkj8QghQ74fz/tdC764nnqnPYe2gla3OivJy9hVFxKuKqPKgVxGXxw4m1sbirnUGcTcQYzC9NGh3R8h0qi0cZ5uZEH3vtaI3fQQBOw29cWWbfi6+bE9An8dryXh0qW0dljQDrMlsp9ky4m3jD4rAJOvxfJ5Eef6gJVCPYRJasfyeZDNMi4AhkFnD4fF7/5OjX20E6i0+fj96tWYtLpuHiClvb0gL2u3/juenc7btmHWWdgbeN+7tzxep8y5Y4mrt3wNG8edxsppr76GP0RYzD08eTuomu51XAMxDp1OrjxRrj99hAl/BD8fqiu1n4WLkS66SZurSvit9vf71NUEgTybUmckjV4L5UNTQfY1RY55OfZ0i+4IHdOcMDwtdIrVEFKjpymT4+sCfnptRh+v6hDUPwIVQZMH1iwD7OAAkK1HsGlvePcHt+Rwxp66mgoCtNK9jNCV4mrw4Yg97hp9CpilospZQdRb74ZoaVFEycURU2srh/Rx9GxmYyLzWafvabP4FJAe8ecljW1z3ZLMqayJKPv8qGiEyXOzZ3Dublz+i0Xozfz2PRbea96PR/VbKLd5yDHksrZ2XM5IW0ywksvBYU6t7aU4ZX91MbFUWzKoVNnwmJzMq61GhoOkeFoxNjl9XHwIK0ZOWx2CHyixNNptDA7O5YzDJ2ktHe/Y037yzhRauWzJeHj7VsTrVRNyyRxTw3tPieCW2bp2ztpSI9l37g0Om0mbHY3I4sa0dv3Q1weOkFCQGDX+AxaE/u+I1UVTGLoM9Dobuf1ylV8Vrcdt+xldEw2F+QuYH5K93PoV2R+u/vfIQYG0LwCFATu2v0iby/4LSbJgCSIPDztGu7Z/Robmw8Ey+oFiUuHLeSq4Sf0e13CIQkiekEKm/a1i57PdpopltcW/IjXD2/kg6oddPo9jIlN55L8ORyXNiZYbk7yaPSCDp+qpbrsehKUoLebyvFp4wNtEOhOntPzRdvtX2GWNC+3tw7voNbZHsETDx7cu4IlmQUIgqBpsvTnNy8Q9WT4NnI0wh2GuP1jjz3GX//6V+rq6pg0aRKPPPIIM2fODFv2rbfe4k9/+hMHDx7E5/MxatQofvrTn3LZZZcNap9HpQdhsViYOvXovez/16moa+XaP7yG3eFGDqg47zhQzbZ9VVx95ixuPHfeN9zCKFG+2/z
n4G7u2rgcl+wPdAJUEo1mHjnuTOZl5AfLZVhiee+k63itbBvvV+zF4fMwITGTy0fNYHpy7tfWXlEQmZUynFkpg3cX/SrY9P0bMhRVJeZbECrRk6XZ01iSMZGNzQfp8LkYZk1hXFx22M7WgcYm/rVhC8v3H8Qvy0zOyuDaWdM5YWT3ec6xJNPpDXwqe/cRAafXwAir5ur63r5iKjs6Irbt7xvWcX7heHSiiFVnDDGE9EYniEEvl3/uX4Hm6hvau1BUFYffw+uHN3LLmJP6OSt9GZUYzyflBDqjfUPFUWFUQvyg6hwQfj888YS2oxkzNE+Gurq+ngzp6Zong6rCE0/wg5t/xCc1e1hTX0ZIelRJ4vdTlmKUBt+d+bK+qF+xOZfsZWtLGfNTv4HMIb1msgtHJaI36vB5/H2K+pBQVZWM3CRwthFrEIMdUV+nCXF/33CFUXmpULYndGG4sIaFC+HTT2HvXjLa3cTjoCwbOgUzgk9A1asoVu3+yXQpCNX7tGvpdmuz88cdd8RDvXfihdy46V+0eB3Be7xLJPB3Ey8iwRA+TOabwqIzclHeIi7KWxS6oodQp93nokZx8XH8TKpLM4IpcAVVoCRtFKOnHeS419dilLRz3llZzZ2GcezMGBns9+1uE3hKMPPoafOZuubjoBFo/O4atk7Pi2gQqD/nFJDXU776vwgIyIpIco2d5Fp7oIz2/BwWHRTE5qKiUpaTyIoTwt/nggBtvu53VaWzkR9teRS7zx0MG9nVVsaOtlKuyF/MNSM04cfNLftp8oR/H6qoOGQ3Kxt2cUrGdADiDVb+Pu0aDjsa2dtegUHUMytpFDF6c9g6vLKflQ3FlHc2EGewsDh9AknG7ntFEAROyhjPJ7W7wz7jkiBySsakkGXxBgs3jDqe60cuQlaVsDoQcQYLVwxfyNOlnxMuBOO41LGMjdM8QjSticDF74O2LDZwfMtrisOU0VCB8s4WyjtbGBaTxKjYZPa3N4avVoX8mMSogeHbyDdkZHj99de54447eOKJJ5g1axZ///vfOfnkk9m3b19QY7EniYmJ/PrXv6agoACDwcAHH3zAVVddRWpqKieffHKYPYRnwF/lO+64Y8CVPvjggwMuG6Uv9z33aYiBAQim93r2vY0snDqSscPSvqnmRYnyneaLqlJ+vu7D4N9doRCtHhdXrXiTZWdcyaj4bo+sOIOZGwrmcUPB/55x77isYf3qUiiqyoKs8GJ83yRGSc+c5NG4/T6semPYztbWqmquePW/+BUleA9sraphc+W7/OL4BVw3S+v4OnyROojQ1cG0+7SB32dlZf2er0ank6LGBiampXNa1hReKP0yrDeDJIgsTp+ATpRo9Tooaq+OeKwKKp/X7R20kWF26jCeNG7C69EFDQ1BVDCa/MxOOwZGrR4pGTGbYfhwOP545IKxeCUdRlVGLCqCw90xyRQXs/rhe9lUIKLXdQ+SBEFFFRV+vu013j3+x4N2p/YofgikJvXJIrKiDWz1koxO0k6IV+k7qP9a6BWqYN69nUtvP4Xn7v+gT1GHZCZlZCaxCVYo30+qqqLTGXCgw24INRxIosDYEemMzk+FVwcQ1pCWpmU5aG0lzmDG47YzsqKRuuRY1k0ZTkNSDKnNduZuL2N8pwK6wP5aW7Xt0o7cV8mxJvPKvNt5t2oLX9Tvwav4mZKQz7m5c44YTvGtoodQZ6ffzUfx06gpzgiKjnY96mKjQMa7HqpzU0iqtqOosEWxEu/uDOn3yYqKgMrPV5Tx4Q1XYnziCWL1Vhx+FxN2VbNqUWgWIDXgvZ8Vkw633MLmplUM29Ee2LXQ58XkV2S8ih9l3lzezIxBkECi2+jY9b9fEfDT7Q3w1+I3QwwM0K0n80L5Z8xPKWRMbDYVzoau5L9hT5eIQKWzr7dcnjWFPGv/XoLbW8r52baXafM5kQQRRVX5W/GH3DDyBK4esSj4zr9mxPGsrC/G3cubQhJEbDojlw4L/a5XOpp5pnQ
ln9Tuwqv4ybEkcUn+XM7LnRkSSnjdyBMxiDqeL1uJIyAUaxAlzsyezu0FpwXLuWV/4BxEptbVESx7pPGjJ/A+unHsPO7Y+E74QgLcOHbuEWqK8r/Egw8+yHXXXcdVV10FwBNPPMGyZct49tln+eUvf9mn/KJFi0L+/vGPf8wLL7zAmjVrjo2RYfv27QMqF7WcfTWqG9rYWhI5ybckCrz75e6okSFKlCHyyK51YQeCKuBXFJ4p2syf5576DbTs28eetip0Ri8+j4G+szEqOr1MUUc1M1O/PYaGcnsL/9izmmUVxfhVhWSTlctGTeP6gjnBcBhFVfn5B5/gU5QQQc2u3//6xWpOGTOKnPg4qp1t/c52C0CFQ3N59sryETuJPlmr5+K8uXxQtY0Wb2dI3aIgYBT1XDNScw8eSJYJ7xAyUcxJGcmoFCtlLQ5cLp0WCgIgqJgtfkYnxTAzaVj/lQyWXikZMZloO+ci/v1ZNSt+uwmvx48t1sxpF8/m4mtOwfTyC+B241cVWj9fTmLafFoSbAg9lK9kVaHJY2dZ1U4uyA/v+hmJ8fE5mlu024iiinS9FTw+PTpJxmb0MS4u+2gc+eAJk/Lz/FvOwGj5Aa/8/RM6WjVVfKNZz9LLFzB12Gx4+79gtyMB8/JH8A9PCoIoBdymtUFjalIMv7/1DNizp1uPASKHNdTXa+kUExKwtbbSIXrZmxNLu9VMfk0L+dUtIIA/LYeYkSlQUqJ5MiQkaNvV1w/I0BBnsHL58IVcPkQNjG+cXkKd/swcalbFdhsYeiDKKuMbqrDHWyAG2u0uWnUmJjaU80XuBNQeg1kV6HR6WOk2c3J+PrmedqpdTRTurWHVglEgCSGeSMmGBApic0AQ2XXmXFZOTmXi7moK99ZgcvuC9bpNeooLszjphnvxJycir7oXFBUFFVFUNLkBVfOCUBGC2XuqnU3saKY6HokAAIfrSURBVIucClgSRJbVbGJMbDZWydSvS7+C2m+2jkjUOFu5efPz+AID7q73p6yqPH7gM5KMMZydoxmJ820pPD37Ou7b8x672yuDdUxOyOPOwrNIN8cHl5XZG7hqwxO4ZF+wzipnM/cXvc+utgp+P/H84BhHEATGxOSTLg1jS6tW78jYZMbFjMAodR9TgtGMGPCWjESCQfNkmJqUQ1FbbcSyVp2BfJsWNrU0t5BdLTU8f2BT0BuzSz/ihyOmcW7+pLB1RPlmEYg8ZTGYOgA6enlNGo1GjGG0XLxeL1u3buVXv/pVcJkoiixevJj1Pd5ZkVBVlc8//5x9+/Zx//33D6qtAzYyfPHFF4OqOMrQqG2O7GoLmmW7urH9a2pNlCjfL1x+H9uaaiKuV1D5pOJA1MgQoNrZht4oIwhevB59D8uMis7gx2xSqHK0fYMtDKWso5lzP30eh98b7Kg1uR38ffdqtjRW8fRxF6ATRbZWVVPZFvk9KgoCb+3ey48XzCXJaA3Jed4bFUg2aS660zMzWVdZ0ScTSBcmnY4xAd2iBKONZ+fcyF+L3md1Q0mwMz45IZ+fj1tKvk2byUsy2kgzxVHvDt9eSRCZnjh4Y4AkiDw28zKu3fAsjaZOFL9mgBF1MmnmWP4x87KwGU++Ej1megHazrmIW+78jNbmTpSA8aWzw8WbT61k+9os/vqbKzE+9QRtXgd+RWba3go+nR8+Fn1t44FBGxmOTxvPb7csp3vyuLv755dF4sTkkEHI187Chd1GBkB4+mnOuu46Ttt8Lwf3VCH7ZIYXZmGxmWDlSm2AHyCh+jA/+fPN5PkS2FFShU6SWDBtBEvmjsV8cF8wPWiQSGENX36pGR8KC+ncsYcDzTrc9SakBBXVoCJ4BaRWgQ6nk8ZZOlILCqCpCUaO1LZbtQrO75tx4HtHL6HOzqzJCP5DYYvaZDcGxYeuwwzZqXSobQgdCibZS4zXTUevVKGSKFDb2A6zZpF86BDppkTq3a1YOz10BoQfFUUbvvxkzA+Cz+0pGTN5vLO
WLxeOZtWCUdg63Ri8Ml6DhDPGwnFpkzBl5qBT5KAhVUFAUXo/9yrJRk3zpc7d1u9pkFWFGpeWOlMUpBADSG9UlSG9Y16v2IBP9Yf1AhOAZ0pXcmb21GDdY2IzeX7ujRx2NNHo7iDdHE+2pa/Gyf1F7+Pye+lpKu767aOanZyWOYW5KVqa2I+rirl13X9DJlZLO5r4yYZ3qOhs5eZxWmags/LG81jR2ojHEqc3MT9d8xi7ZPh0Xi7dHNb3QwAuHT4DU8CAIQgCv5myhNNyxvKfQzupdrSRaYnj3GGTmJ489KxDUY4xRzFcIicnJ2Tx3XffzT333NOneFNTE7Isk9bL2JuWlkZJj29Gb9rb28nKysLj8SBJEo8//jgnnTQ4j8ljoOoU5auQHNd/7KEkCqRESOMVJUqU/ok0+OuJ0+c7Ypn/FVKMWm50nUFG0suoSmAWRwzMjAoCKaZvz/voT9s/CzEwdKGisrqujA8rizkzr5Dajs4j1lXTocUvn5Ezngf2rIhYziIZOD5dc1u+aPwE/rVlMx5Z7nOvCcClEydhM3RrWKSb4/nbtMto9tipc7WTZLT1GdSKgsgVwxfwl6K+bvIC2izDJcP6F6uLRL4thfcW3c6H1bvY2FwKwOzkkZyWOfHoix32muklP58XPq0OMTB0F1U5uLeaD3fZOSc/H7VkNwCTiqv4bO7YPin2VAb2bPfmy7pSZDVSZ1ygyuGgyd0ZNCJ97URI+anPz2fsrFmat8Cmw1o2h/JySEnRNC4AkpOJfe8drszP17JIJCRAaxM89ECoBwNo+wjnbdDzmokiay35PDNmOONbD1FYW4FJ8QaLeiQDH3lTuOKZX8Ozz/afGvP7SC+hTjkmshCrXvEjCGAQdWA0ojOb/r+9+46PqkobOP67M5PeewGSkNBC7yFUKVJEBUFERERFXVdY29p9bewqtlV37b2LvSsqBum9905oKUBCep255/1jkiGTRhJSJsnz/XxGmdvmJDczc+9zznkeVKZ1ZIqzpeL3j0VXBPh6gJczmqbRz78j+7NO4GN2IktZf68dvdpwS8wlDAg4l6BwYngci5LXk5iTgm6AbG9rj7kBA+4mF26MHg9Yk16OD+3Lr8kbq7j/0Zjazjr8PqDkO6EqRs1AoIsPAGcKssgtdMLTtbhC3hdNg7wiJ04XnP+zuLzVp/dX+X5XQFL+WVILMglzs09sGukRSKRH5cnpU/Iz2Jhe/QiNH09sZHBQRwotZh7e8Iv1flFVDEi8tHMZkyN70MbDlxjvQG7oNJD396+3O17pNJLH+o6z5ZOJ8gzgv3FXcuf6b7DourVKBBpmpXNxeBf+0fWiCu3qG9iOvoHtKiwXLd/x48fx9j73OVPZKIYL4eXlxdatW8nJySEhIYG7776b6OjoClMpqiNBBgcTFe5PbFQI+46eqvRD1KIrLh1W+yzaQgisda0LNZSzqjJhkqFAegBKDQmJwc/ZnYyiPGvSMmO5m3cFl7dzjGGZ6YV5LE0+VGUngQGNrw9v4/LIboR6nf+mMdTLejEd4ubNXd1G8p9dS+ym2ZSWrXus9wTcTNbepWAPT96bdAU3//QDuUVFGA0GlFJYlGJch47cM3hopa8V4OJV7cX79Mg4TuSl81niaoyaAQ3r0GCTZmB+ryvp5B123p+nKu4mF66MHGCrAd9gyvX0WvoPIOHOhAoBhlIKxW9freeKe+PwPXwIk8GIW2GxtSSjl30iOA2IC6x9/oj1p49WOxXGohSb004wtk0TJH4sVbbkZ6nExIqBArBWdCitENGhQ/XbloqNtb5GZcqds7/yvEn38mR5cA9WBHXD01yAs26myGAix+SKVmDkuuBgtPOVxmyJyl3gRwe64ORkpLi44lSmYoMJFAQEeUFhIcEezhgMeei6oshYcfqAi7OJkQM6whprj7hRMxDrE8E7Ix4ixV3D1ehEiKtfhf3cjC78t+9c3ju8iF+T1lOoF2NAY0hgN27pMJG27ufyHsy
JGcuatH1kFOXYjRAwoBHjFcbkttZAZpRHCJ282nAwO6nSkQQWpTOhJJFjUl4uBcXO6MqAu3ORLc+JrjTyCpwoNBtJys0976+2vJp9Q9fue/xMYXa16y1KJ6VkNNnylENkFledtFfTNH44upPbulo/7x/uPYYIT1/e2rPWln+hq28wd/YYwajwjnb7jgnvwtLxd/Dt0a3szzqNt5MrE9t1o6+/jE5oCTRlfVzoMQC8vb3tggxVCQwMxGg0kpqaarc8NTWV0NCqy18bDAY6lHyP9O7dmz179rBgwQIJMjR3D1w/hr899QXFZostCVDpfMrLhnWnT+fK6xgLIapn0DSMmUbMwZUkcyv54PYqqCTDeivlbDAxv8/l3LHuCwzauSSZpTfYd3UdTZi7TxO30iq9IK/aUYg6ilP51l6z/u3a0NbHm6SsrDJD5ctsqxRTepwbln9L56GEufvw5t6VHMi2Jirr4RfO3NjhjAi1v0iMa9uO1XNu4Ye9e9h9+hQezs5M7NiZXtV8mZ+Ppmnc0/USpkT059eT28goyiXSI5BL2/TBz6X2ZTmbRLme3kI3z0orJdgoOHs6G/z8cDIY6eAVwt7MJFyK7PcxoOHt5MblbfvUukmGGly0G5v6wt5kgnnzrAkzy+azqIzBALfcYv13DebaMnSoNcBgquJSsNw5yyszjF9pBrKd7If1236f5yuN2RKVS9TpvmMLE6f044cv11O+vyjP2Q3Nw4Pwtn6wYztOQM+OEaw9eJpcl3MBNINBQ9cV980ejaebi3XESil3d1wCgog8zwgRT5Mbt3eawq0xl5FelI2XkzseporfccGuvrwz8B+8c+gPFqdsoVhZcDe6cHmbOG6IHoOb8dzIpnu6TOUfm17HrFuwlCR/LA3ATgwbQE9f6/StQGc/QFFkNlFkNqJp1nKPutJse1i3qZ2hQZ05knO6yhLAbd39CXWt3fdSiKtPtUl7jZqBNiUjI9IKqg+MGNA4U2YbTdO4ruMAru3Qn9T8bJwMRgJdq/7cDnT15JbOlQekRTPXBNUlnJ2d6devHwkJCUyePBkAXddJSEhg3rx5NT6OrusUlvtOOB8JMjig2PYhvP/YNbz341qWbDiARVe0CfLlmvH9mDKyp0QzhagjNycnBgS0ZePZ4xT7luthUuCcZmJ8l06V79xKjQ7rwofDrueNfctZfco6UqCLTyg3dxrKuDaOM6oqxM0Tk2bAXEWvtFHTiPKy9qYaNI1nJo7jhi++xVKmukRp4qy7Rwwhslz5xsva9eDStt3JLi7EoGnVlvf0dHZmZs/6H+ER7Rlc6yoSDqNcT69LQS4eXq7kZld+A6ppGuGRAdbedKCrTziFlmIKnU0Y0DBoWkliTy9eGXgt3s6Vl7mrzvCQGL5O3FrlemeDkQGBDpDU1GSCWbNg7FhrjoPVq+1GGODubk3cOHz4uWkPEybUfNuqlDtnAzr4se+kAUslo08MBo2BA6Ot1ycl58ymstKYLU0liTpvvvUScrJ78uev2zEYNDRNw2LRCQr1Y+C1szAt+x2yrT3o7SM1im68im4FAWw/YM0b1LNjODdcHsegHlE1T9RZBWejE6Fu1Y8mCXb15aFuV3FP7BTyzIV4mlwrLeHYxbsdbw24nQ+P/Mmy09bykOFuAUyPGM7lbQbZrlGjPAM4N6JAQ6nyVRY0OnjVvnrItMg4vjy2jsJyFSNK3RQzstbXyUGu3gwO7MTatIOVjmyyKJ0r2llHe7Xz9K32WBaliPCsGDwxaBph7ufveRaivt19993Mnj2b/v37M3DgQF566SVyc3Nt1Sauu+462rRpw4IFCwBYsGAB/fv3JyYmhsLCQn799Vc+/vhjXn/99Vq9rgQZHFRM20CevO1SLLqO2azj4iynSoj6MHf4IG76LAktx4DuoaOMCq1Yw5RvxEkzMjuub1M30eH0C4jk7cGzMOvWXAPORsf7PPJyduWyyK78eHRXpdm5LUoxo8O53u6BEW35ZvY
M3liznsX7D2HWdXqGhXLLoP5c3KlDpa+haRrezq3ghqkhlOvpNW5Yz4QZg/j2nWW2Es1lKaWYeE08rPsTsF6g92vXlU8vvZ+E1L3kWYro4h3GiJDOld4I1cTo8M7EeAWSmJNW4W9GA67rMNCxzndIiDWJ4tSp1ikIBQXWG3hf34o3nLXZtirlztmEgAI+OeliG1lZqvR+bsY1JblByvW4V1oasyUql6jT6YP3uO/mm7n6+qGsWrqXosJiOnUNZ+DgjhhXLIc3/3tu37176Tx3Lm9fdBHFZgsaYDKV/F3v3FnzRJ31wNlgwvk815ztPUN5vMe1WJSOWbfYVVQoNSK0E/7O7pwtqnyUmZvRifF1CFSHufnx2oAbuGfzp6QV5ZSUsNQxagbmdhrLZW3r9h1+f7fLuH7NG2QW51UYtTc9YhD9ShLsxge3J9zdm5S87EqDHCaDgUmR3evUBtEKXOhIhjqYPn06p0+f5tFHHyUlJYXevXvz22+/2ZJBHjt2DEOZ74Xc3Fxuu+02Tpw4gZubG126dOGTTz5h+vTptXpdTVWXNruVyMrKwsfHh8zMzBrNbxFCNG8/7tjD478mkFtUbOu9DvL04IUplzAwsolK1okLdqYgl2l/fsiJnEzbxV/pReLMDn15ot+4Knu4rGX+ZJRYg/ryS7ubsPw5t3Dfkys4tDvJlkBNM2goXTFkfA8evLk3xtdePbf/mDH1XqkgNT+b29d+zZb0E7bh0gZNY2Z0fx7qNRZjfVfYaG7KnbPtwyfx2Ff7ycrKR9M0lFJ4ebly730TGTKkk/WG+OWXz+3fAOfMoX38ccVpLXbJN8+eS9R54MC5RJ2hodCpU9XbljV0qHVkSzOw9vRhbl3zKRalbCMESt9TLw6Yxpjw2Dofu1i3sPL0PhJzTuPj5Mao0O74Oruff8dqnC7I4pPEVfx6cgu55iI6eoUwI2ow48LsRxFvPnOC65Z9QrGu2/1cutJ5Pm4SkyJ7XFA7WquWej9W+nN1v+UpjBcYuLYUFbDzrYeaxe9Iggy03D9qIUTV8ouL+Wv/Yc7k5tHO14dhHaIwtfQM6K1AZlE+Hx/YxDdHdpBVlE8HnyBmd+zPhHZdJIjQ1FJT4dFHzz13daXw2uv5eVsWf3y9nvTT2YRHBjLxmnhGd3TB+N679vP5588//xD/OtpxNomtaSdxMZq4KLQDwW7VZ9FvNSo5Z8XX38i6XDdOncokKNCbuEEx1p7v0h73RjpnDslshldesU/UWRVdh127rP/u1q1mI0xiY605OqrKo+GADmad4sNDa1iash+lFIODY7i+QzxdfcObumkXJDE7nff2r2PxyX2YdZ344Ehu7DyI3gGSN62uWur9mAQZWrGW+kcthBBCOJTa9PSW1Yx6b1scOWe1YzbXLFEnWPM4QP0k6hSimWup92O2IMPN9RRkeLt5BBnkk0oIIYQQjaM2JRlLVVdmUTQ8OWe101SJOoUQDq0+S1g2Bw4/kuHkyZPcf//9LFq0iLy8PDp06MD7779P//7WOrxKKR577DHefvttMjIyGDJkCK+//jodO3Y8z5HPaamRMyGEEMLh1KanV3pvHYOcs7rT9Zon36zNtkK0MC31fqz05+pxU/2MZNjxjoxkuGBnz55lyJAhjBw5kkWLFhEUFMSBAwfwK1OD+dlnn+V///sfH374Ie3bt+eRRx5h3Lhx7N69G9fWUDZJCCGEaE7q0tMrmpacs7ozGMC/+vKRddpWCNG8KC68uoRDDw2w59AjGR544AFWrVrFihUrKl2vlCI8PJx//vOf3HPPPQBkZmYSEhLCBx98wNVXX13pfoWFhRQWFtqeZ2Vl0a5du2YRFRJCCCFaFOm9bX7knAkh6llLH8nQ88b6Gcmw/b3mMZLBob8RfvzxR/r378+0adMIDg6mT58+vF2mVvCRI0dISUlhzJgxtmU+Pj7ExcWxppokOgsWLMDHx8f2aNeuXYP+HEIIIYSoQmnvbXi49f9ys+r
45JwJIYSohkN/Kxw+fNiWX+H333/n73//O7fffjsffvghACkpKQCElBuaFxISYltXmQcffJDMzEzb4/jx4w33QwghhBBCCCGEaL1UPT2aCYfOyaDrOv379+epp54CoE+fPuzcuZM33niD2bNn1/m4Li4uuLi41FczhRBCiNZB160lCwsLwcXFWsJQerGFEEKI6rWynAwOHWQICwuja9eudstiY2P55ptvAAgNDQUgNTWVsLAw2zapqan07t270dophBBCtGipqbBsGaxZUzHhX3w8jBghCf+EEEIIATh4kGHIkCHs27fPbtn+/fuJjIwEoH379oSGhpKQkGALKmRlZbFu3Tr+/ve/N3ZzhRBCNDfSM1+985UuzMuDhATrQ0oXCiGEEJXSlPVxocdoLhz6SuCuu+5i8ODBPPXUU1x11VWsX7+et956i7feegsATdO48847+fe//03Hjh1tJSzDw8OZPHly0zZeCCGE45Ke+fMzm+GVV2DPnpptv3IlpKXBvHkSaBBCCCHKkukSjmPAgAF89913PPjgg8yfP5/27dvz0ksvMXPmTNs29913H7m5udxyyy1kZGQwdOhQfvvtN1xdL6xEiBBCNDd5RcV8smELX2zeyZmcXMJ8vLi6b0+u6d8TZ7nps5Ke+ZpbuNAuwKCUItnkwz7PtuDnR++Ofvgd2AmJief22bPHut+sWY3fXiGauV3pqaxKTgRgWHh7Yv2CK93uj+P7eXfPenakp+BqNHFpZCy3dI2jradv4zW2CelK8fXBHXy4dxOHMtPxcXblyg49mNO1P/6u7k3dPMcko/ZEI9OUUs0oJtIwWmpdViFE65FbVMR1H33F7pTT6CUf61rJf/pHtOG9a6ZIoKG2PfMAsbGts2c+NRUefdT2NKcYnj3qz7p0ZwwGDeufmOKy6QO5dXw7jO+9CwUF5/afP19GgghRQ5lFBcxd/j0rkxMxaBoo0FGMCI/mlWGT8HI+l6z8v9tX8NL2lRg0zfZZb9Q0PEzOfDluFp19gy6oLUopCi1mXIwmNE27oGM1BF0p7l75M98f3o3GuY5do6YR6u7Fd5fMItjdsymb6Fia0ai9lno/Vvpz9Z71JEbnC+sEtxQVsPXjh5vF70hCWEII0QK8s3qjXYABSkbmKdh47CSfbdzedI1zFOV65gGIiqJgylSyr78RNX06REXZry/tmW9tli2z/dNstvCvfV5syLTe6Oi6QimFUvDj5+v5ZOVpuPlm+/2XL2/M1jYqi9JJL8yj0GJu6qaIFuK2Zd+xJuUoYL2J1ktunVcmH2Heih9s2+05e4qXtq+0bVfKohS55iLuX/NLndtwOj+Xx9b/QbfPX6DLwv/Q76uXeWHbCvLNxXU+ZkP449h+vj+8G7AfOW5RipS8bJ7etLRJ2uVwzGb4+GNrsDghwT7AAOdG7T36qHU7s3yeNbhWVsJSggxCCNECfLFpu91Fpx0FCzdta7S26EqRnJvN6fzcRnvN80pNtZ8i4erKrsuncYtrBwZ/v4sRH61m0tJEvhk6ETVvHpSdcrdypXX/1kLXrb1eJQ4VurI1zwPdUvnf1zefrCGvfUf7AM3q1dbjtCCFFjMv7VzKoB9fJO7HF+j13bPcufZbjuakN3XTmj2lFGtSjjJ/42L+b/1vfHt4R6sJ4mxPS2ZVylEslXx+W5RiWdJhdqdbP3++OrQNo1b5pbtFKbalJXMg40yt23AqP4fLf/2AT/ZvIa8kqJBemMcrO1Yzc/FCChzoXCzcv8062qMSFqX4MXEPOcWFjdyq89OVIjknm9N5jfC9WDpqr6ppgeWtXGndXgINoh61svGfQgjR8ph1nbS8/CrXKyA5K7vB26GU4sM9W3hjxzqSc62v1y0gmH/2HcbodjEN/vrVKtMzD7BxxHhu/WWn3bKTpzN58rMEjo3px1033wwvv3xu5fLlMG1aY7S06Z09a9frtd7sj6ZpVDW7srCgmH07T9InLu5cfoa8PMjIAH//hm9
vIyjWLcxZsZD1p45SGjoxK51Fx3ezLPkg34yZQ7RXQJO28ULlmYtIys3Cy8mFEHevRnvd7OJCblr6FetPHcdUcgP96YEtPL3lLz4cdXWVeQlaihVJiRg1rdIgA4BB01iZnEhX/xBO5mZhUdUH707mZtLRN7BWbfjv9lWcys+p0AZdKbacSWLh/q3cENu/VsdsKMdyMqoOqGP9PjyTn4enk0uV2zQmpRQf7dzCG1s3kJxT8r0YGMw/Bw5hVGQDfS9Wkk8nKSCAPwMDOe3sTAeDkTGnT+GZnHxuH8mn0+BaW3UJGckghBBN4GxuPu8t3cidH/3E/Z8t4rdt+yi2WOp0LJPBgI9r9RdUQZ4edTp2bfxr/V88tvZPW4ABYHfaaW5c/A3fHdrV4K9fpXI985aISB5afxxdKcwGhdkZ68NkDch8/OcmDgeEXVDPfHZRITtOpXI4I73Km3OHVWjfC1jgcv75zUopayIxux0LKt+4Gfrp2E7WlgkwlLIoyCku4pltfzZJu+pDbnERT2z8g/7fvMTYX94i/vuXmfbHR2w+feLCD67r1oojSUnW/1fyHrp/zS9sKnkts9Ixl9xEpxfmMWvJQgocbLh+/Tv/50PpFsFunlWOZChVPkB0ODOdBRv/4pYl3/Lg6t/YkHrC7jOpWLfw9aEdVQY5AD4/WHEknFKKgxlp7DiTQnZR440cCHP3xkDVuSKMmuZQyR//vXopj61cYgswAOw+c5obf/2OH/bXIj9QTZUbtae7uPBMr94MDwjiKc3Im0Vm7i0opL+3L8svvax1j9prbK1suoSMZBBCiEa28fAJbnvve/KLi0FZy/H+snUvncMCeeeWK/HzcKv1Maf16cF7azdV2sNj0DSu6tOjPppepYMZaby7a2OF5arkG/GxNQlcEtUZF2MTfO2U65nf3zaa02uTMbtiDbWX/MqUEXQncC6Cn9fu5vY69MznFhXx1OplfL13F4UlQaOOfgE8NGQEIyPb1/uP1iBc7ANWPdp78fX2qof4OruY6Ny9Daw/ar+iBVV5+vjghirXKWBJ0gEyivLxda79e7cpFVksXPfXQralJdl9dmxJO8mMhE/4bPS19AtqW/sD1zDZ3ImcTH47vq/S62aLUqQV5PHz0T1cGdOz9m1oJgaHRvHc1qpzmOhKMSQsEoBpMT35ZP/mSrczaBodfQLpUibx4wd7NvHE+j8xlIyUMGoGFh7YxpTobjw35BKMBgN5xcXVTk1RWKdTlLXk+CGe2rCUg5lpALgYTUzv2IMHB1yEm8mppj96nUzv2JPVKUcrXWfUNMZFdMLb2TFGMRw8m8a72zdVWF76vfjoygTGx3Ss3+/FcqP2Phs4kHdSTgHWUR6ldKWYs/8Af145jchPPj63Q2satScalIxkEEKIRpSVX8Dc93+goNiMUtYLuNKL+4OpafzfF7/X6bi3DBlApL9vhbmqBk0jNiSIWQP7XGjTq/XD4d0Yq8lEnllUwIqTiQ3ahiqV65lPM7pgcQFbZ5hW5gEUOUNKRnate+bNus7sn75h4e4dtgADWC80b/z5W/5KPHxBP0aj8fOz3gyW6KedwT/QE4OxkksGDSbPGISHpyusW3duubs7+Po2fFsbyZGs6vMuKOBMuRux5uDXY3vYcuZkheCkrhQWXfHk5lqO0KhlsrltKUer7ZgzagY21MeICgfWOzCMgcHtKv38NGoaQ0Mj6e4fCkDPgDBu6GKdtqCV287ZYOTpQZfYKkKsTz3O4+v/RIFtlELpVIvvDu/ivT3WoLCnkzOeTs5Vtk8D2nn62J4vPnaAOX9+w6GSAANY85V8sm8r1y/+GssF5mI5k5/L0hOHWZN8rNLgx6VRXRjVNqbCWAajpuHr4sZD/Ude0OvXpx8O7Kn+e7GwgBXHKw+Y1Em5UXvmiAheyMis8j2mAe8VFrb4fDqOonS6xIU+mgsJMgiHlpNXyIoth/hr4wFOn21+F3BClPfT5j3kFRZVOuLAoiuW7z3
C8bSMWh/Xx82VL264mhsH9bNNnfB3d+NvQwby8expuDs3bO9SekH+ecudpRdUnTeiQZXrmXfNy0YZodIRtyXLcszF1hEQdjtW3zP/x+GDbExJqnBuS5/NX7W0eUydMBisvc0lnJJO8MI/4wgJs95oGI0GDAYNNJhwRV+unzsKdu48N+oDYPDgFlWDvbo54KXM55kr74i+S9xZ5dBzHcXWtCRO5mbW7GDlks1ZlM7BzDT+PHGAX47uYVVyon2P+MqVxHz8OcZqpolpgLPBWNMfp1nSNI23LppiGzFi1DTbjWlccASvDr/CbvtH+o3h2fiJtrwLTgYDEyNi+WHC9fQODLdt9/6eTVXe4Crg3d0b0JXCaDAwo2PvKpMpKuDaTtYgta4U89ctsS0vS1eKdSnHSTh+qDY/vk1ucRF3L/uFgQtfY/bvX3P1r58z4LPX+GD3ZrvPTaPBwJsjr+DBfhfRxsNaws/N5MSMTr35+dLZtC0TEGlq6fnn/148W5/fi+VG7Z3q3p3MgqqnsliUYsOJkxAXd25h6ag9Uf9kuoQQTU/XFW99t5pPf9tIYbH1AsSgaYyP78L9s8fg5tKwN0xCNJRdJ1IxGDQseuXfFArYc/IU7QJ8a31sHzdX7h0zjHvHDKPYYsHJ2HgX59E+/uftwWrv41ft+gZT2jNfcvHltX0ruLSrdhdXV1Ote+Z/OrjXrnZ9WQo4knGWfeln6BJwYXXsG8WIEdYe5xJhi77h3X/PYVOWK/t3J+Hi5sSQkbGEt/O3Bhjeftt+/+HDG7nBdbcjPZkP921kw+njuBiNTIiI5doOfQlyO5eLItDVk2O5Z6s5Co0yVSI5J5uPdm/h9yMHMCsLQ8Ijub57Xzr71+1vKqMwz1YusSqZRQW08ajBjVuZZHNFFgsJJw6yw9uFLT2jyfR0wzcnn96HjnNJfo6tZz765CmmnM7hqyG9Kj2kWemMaduxdj9UM+Tr4sYXY69h65lkVqUkogFDw9rTKzCswraapjEtpifTYnpi1nWMmlbpjeyW0yerzbOQkpdDekEegW4e/KPHYFYkHWF/xhnb30PpEUe17cCUaOt0u91pqRzPqTroZNQ0fjqyh7GRtTtnulLc+Mc3rE89Yff5mVlUwGNr/kTXdW7sfi7xpJPByC3d47ilexzFugWTZjjvzXxTiPY9//ditG89fi+WG7Vn9PcHqg/6uJpMLTqfjmg6EmQQDun1b1by4S/2c2B1pfhtzV7Ss/L57z+vcMgvFCHOx9XJCevlW9UXf9ZtLkxjBhgApsR04+kNyyjWLRV+MqOm0d7bn/7BbRq1TTalPfMlN82eqcl0cDdxsJILeOvmGh3TUmvdM59ZWHjeHu/sQscrrVapkBAYOvRcArGCAoyvvcrAqCgGxsWBnzsc2AqfrLP/PYF1v5CQxm5xnXx7ZAf3rv0Jg2awDSV/ddcqPj2wmS/HzCLa21oxYmpUT17ctazygyjo6BNIqLt3g7Z155lUZvz8OXnFxbabx5PZO/hy3w5eu3gS46JqfzPe2SeY3WdPVVmxwMlgpG1NAgzlks2tzzzFS8N7sT8i1G6zNd1iSDiewkeHswkxOOFsNDLzVB5/ZWZzxsc+YaFR0+juH8qwsGaSy+QCaZpGn6Bw+gSFn3/jEqZqPpNqkhvB1WS9DfB2duXr8dfy4d7NfHloG2fy84jw8uXaTn2Y3qGX7XWyzlMa0qIUWXVIArkq6ShrU45Xuf4/m1dyTZdeuFbyMzk58EiXKZ278sy65RRbqvhe9PWjb0jNz/d5lRu1F1xcTKfAAA6cSav0ikMDJnTuVOtRe6JupLqEEE0sIzufT3+rmCgHrIGGtTsT2XkoudL1Qji6i3t0qLZnw8PFmYEdqu9ld0R+rm7896JLMWiaXfZzg6bh6eTMqyMvb9rA4IgRtn8G+3gyfdd6Opyp/HMk+nQSl29dZb+wBj3zXQOCqp1/a9Q0ov2aUUnHGTMgNtZ+WWIifPEFvPGG9f/
lAwyxsdb9moGUvGzuX/dLyZx1+4RomUX53LXmR9uy9p5B1hwq5S7wSvOqhLk27CgdXSnm/fmjXYABrDd1ulLckfAzmYW1732c2alvlQEGo6ZxRVR3vJ1rcMNRJtlcsW7hyT7RFQIMpfa3CeGlnp1sz3v6h/JIrkuFaREjwmP4YOT0Kofxi+pdFhVb5e/OoGkMCYu0K/Po6eTC3B7xLJt8K7tm3M2iS29kZqc+doGMDj4B1Z4Po6bRxa/2o2p+S9xvK19amZziItYkVx2EcFR+rm78d/TESr8XPZydeeXiy+r3e7FcPh1t3TruGTa00k2NmkawpydX9ejeovPpOJRWNl1CggzC4azdkYjZUvVNmNFgYOnmg43YIiHqT1xMBANj2lV5oTZvXDyuTs1zkNklUZ35ddL1TOvYnUgvXzr4BHBbz0EsnnIjXeo4nLvelPbMA05GA33C/LlmywrmrP+TgccO0OXUCQYeO8BN6xO488hWgp3L3PDUsGf+mu49q/z+N2oal3XsTICb45RWA2tyuBsSvqTTp8/R6dPnmLPkazaeKkm0ZzLBvHm239t5DR1q3d7UPP5+vz68zZblvTyLUuxIT2bPWWs5t++P7IJiU6UXeLrZwKqk4+QVFzVYW9clHycxK6PS4e8Ka+K9Hw7utl+uFKuTE/nv9pW8umMN+86errBvr4BwHug9EqVbfw7drFkfFo32nv481Hf0+RtXLtlcWlAQeyOqeb8YNH53c7Ilm9M0jaGHkxkT3BEn3YRmNtLdO5yro3vj69K8qnU0tmLdUuXoqeu69MXfxb1C4FNDw4DG3b2H1fr1gt09mRDZqdpg6swuvWt93HxzcZXvxbLbOCKLrttVbShvQkwnfpl2HdO6dCPC24cOfv7c1ieOxdNvqP+pc+Xy6ZCYyOj8fP576UQC3O2/e/qEh/HljOn4HDzYovPpiKbTPK4ERKtSWFx1KSUATYOi4qoTRQnhyAwGjVdumMRT3y/hp817bLkZfNxdmTc2nqvjK5+b3Fx08Q/imaHjm7oZlZsxA9LSYM8eOoUFYtA0nE6k0mbfFsB6sxMV5EfvqMhzvUu16JmP9vXn2VHjuG/J72hYb1RLczR08g/kiWE1uGFrRD8e2c0dK3+0lbcDWJp0iL9OHuLl4ZOYGNnFGjCYNQvGjrWWNlu9umJJwsGDrSM9mskUiVKHs9OrSHl4zpHsdGL9QjiQcQaLwhpo0BSaZh3CoJS1LEkhZk7l5xJVTZb+C3Eks/p8EEaDgcNltknKzeLGJV+xN+N0SQ+q4rmtyxjXriMvDb3cNpReKcX+s+kovfSmwvobMSiNM3kFZBQWnH8kQ7lkc9mdewKJVW+vK8i1WJPNJSZyKj+XJccPsWZPGEUl5Xt3pZ3i5oTveKD/CG7tEVf1sVohpRRf7d/J29s3cOBsGiaDgfFRHbm972A6+Qfatgt08+DbS67l3lW/si713CiASC9fnoofR786Tl/7d/xYDmaksS/jDBoaCoVR01DA88MuIcLLt9bH7BEYyrcHd1W5XgO6BzrW58umk0m8tm4tyxOPoitFr9BQbh04kLEdO1TYtktAEE9fNK5xGlYunw5vv83Em29m3K23sP7ECbIKCojxD6BjYECzz6fTHDWn6Q4XSoIMwuF071D5POlSZotO95jqtxHCkbk7O/Hvq8Zx1yXD2HvyFM4mE70jw3AyOe7c0hahtGd+4UK0lSvpGBZIdIg/aTn56LqOr4eb/SiSoUOtAYZa9Mxf2aUbvYND+WTnNradSsHL2YXLO3Xh0g6dbfOfHUFOcSH3r1lkV96Okn9rwH2rf2FkeDTupTfNISHW2ulTp1ozjxcUWOft+vo2214vPxc3zpcfxa+kJz3A1YPE7Axrb6vSSoIL52iAj0vDzWP2d62+R18pZdvGrOvM+vMLErOtZTfLTodYfOIgD639jReHXgbAsqQjfH1oR4Xj6Sgyiwp4YsOfvDvqyuobVy7PSEhUFG5rDpM
froGhkjCOBjEZXuDnh1KKtcnH0JXCVFQMJUGG0t75ZzYuY2JUZ9rV4ca1pZq/Zgnv79xsC5CZdZ1FR/bz59FDfH7Z1fQOPnd9FOHlyxfjr+FIVjqJWWcJdPOgu3/IBQ3R93N144fLZvHjkb38fGQPOUVF9AwM49ouvengG1CnY07p0I3nNq0g31xcYWSGUdMY2S66TsGLhrLk0GH+9sMPaJz7W92Rmsrff/yRB0cM56b+/as/QEOqJJ8OL7+MKSqKwXFx1ikVJ05Yp0g043w6zVJlc+7qcoxmwnGueIQoEdMmkP6x7diy70SFDPwGg4a/tzsj+1WMFAvR3AR4ujOkc1RTN6N1Kdczb1y9muCyN8n10DPfwT+Ax4ePqqcGN4xfj+4j31L58GMF5JqL+e34fqZEd7dfaTCAfzPKK1GNK6J68P6+DZWu04AgN08GBkUAMC2mBxtPn6h0W6OmMTysvS0g0RAuatceTydncqqYkmFRikkdrPkzlpw8yKGstEq305Xi+yO7uKfPcNp4+PD5gW0Yy4xkKX/MJScOcjo/lyA3j6obVy7ZnLdewJCMUBJCU61BmbKBBl3hclpxQ/84OHuW0/m55JQkEyysZJqYQdP46uBO7u5Twyk7Ldy2U8m8v3MzYB8asyiF0i08sPw3Fk293i6IkJyTzc8H9nEk4ywBbu4YOxnoGhh8Qe1wNTlxVcceXNWxxwUdp5SPiysfjJ3K9b9/TV7JtIjSEVZd/IJ4ftgl9fI69aHYYuH+339HKUXZSRK2wNjyFVzauTOhXl6VH6AxlBm1Z5OYWDGoUFYzyqcjmgcJMgiH9O+/T2Tec19z8PgZjCUXKBZd4evpxsv3TMW5mc5ZF0I4iBbYM18byXlZmDQD5iqS/pk0A8m5WY3Slq2nk/jm0E7OFOQS4enL9I69iPZp+EBGd/9Qro7pzeeHttotN5QMAZ/ffxzGkr+FydHd+OLgNramJdv1tBo1DVejiQf7jWzQtrqanHhiyGj+uXSRbYh6Wbf2GkiktzX55Orko9WeWwWsSz3OlGgfTuZmVlvmUAGpednVBxnKlYhl7VoenHMjRx58j6T+Ovltrd/hWrHCZ69iaHYoF9/eC55/1nZDme/sTJZ75SNBknOzq37tVubr/TsxlqmEUpauFHvTz7A3/TSxAdYgwpd7d/Dg0j9QnCtJ+fa2jczu3ofHh45yqCpdA0PbsWr6rXx5YAdbTiXhYjQxLrIjF0d2cKgKEisSj5Ken1/tNj/s2cPfBg5spBZVosyovbJVX6pUh1F7ovZaW3UJ+WsSDsnf252PHr+WVdsOs3LrYcwWnT6d2zI2rjOuLhde3k8IIYAW1TNfG2HuXlXehAKYlU6YR8OWZNSV4pG1f/Dp/q0YNQO60jFoGm/tWs/D/UdxU7cBDfr6AP8eMIFo7wDe2buOU/k5gDUZ4l09hzM09FzpRBejiU/GXM2L21fy2f6t5JqL0ICRbWK4v89FdPQNrPwF6tHUTt1xMznx77V/kZRjvfH2cXFhbu9B3NyzzO+qBveNpZu08fBhd3pqlYEGDQhxP0+PbLkSsSQmEl14mrefvpH33ljC6sUHsDiBl3Lm8kv7Mevmi3DatwcSE3EvyQ2xqUMkqorgXpi75/l/oFYiNS+nymogpVJyc4gNCGZLajL3//V7pZOBPty5hRg/f67r3qdhGlpHfq5u/K1HE96c10Bqbk616w2aRkpO9ds0ihacT6fZqo/qEBJkEOLCmYwGRvTtwIi+MjVCiBZL162J4woLrcOu/fxaxUiCpjYhoguPrl9MgaViol0NcDM5Mb5dp4o71qNP923h0/1bgXN5A0pvdv+9cQld/YMZHBbZoG0waBo3dYnjhk4DOFWQg4vRhL9L5RVA3J2cebjfKK7u0JttZ5IId/ciLjSi0XqDT+Zk8tSWJSSZM6FkhkIm+fxv90oGhLelT1A4AMPD2vPB3srLQIN1pEZ8qPX3Or1jT347tq/S7Yyaxoj
w6OpHMZSqJNlc+5tv5l/PzyA3p4Dc3EL8/D1xcjLaJZsLcvPAw8mFjZ2iKz2srhRXXsCQ/IMZaXyyeys7z6Ti6ezMZdFduDSmCy7G5nn528bTx5ZMtiptvazBwfe3b7JOOdAVduP6Sz5e39yygVndejvUaIbmoI139cFXXanzbtOoWvmoPdF0muenrBBCiOYtNRWWLbOWvivfuxIfb71paeG9K0opVpxM5LM920nMPEuohydXde7BuKiOtmH6DcXL2YVn4idw58qf7KpLGEtuYJ6Jv+Rc0scGoJTirV3rq0y7aNQ03t29ocGDDLbXMxgIc6/+xiA5N4v7Vy9ieVKibVmEly/z4y7mojaV3yTXp9uWfU9yXskUFtt9oSKvuJgbEr5k7ZVzcTU5MSI8mnaePhzPyaz0OBMiOxNaMjrhovBopkR359vDO+3OhVHT8HF25bGBY2rWuCqSzREVhUdcHB5+frDrbIVkc5qm0f3yKWT6WDDquu3v0ICGjuLefsPrnPDvm/07uXf5b3aVXpYeP8J7Ozfx2cTpDZqos6FcHBHD+zurCCApa26Djn7WUTXrk09gsSg0ZZ1eUzrNRtOt/z+ZnUVafh6B7jUIIgmbIRERhHh6cjo3t9Jgj0HTmNy1axO07Dxa6ag9R6Lp1seFHqO5kCCDEEKIxmM2Vz9PNC/P2iOakNCi54kqpXhk1Z98snurrWdyf/pplh4/wqiIaN4cO7nB5yFPat+NEHcv3ti5llUpiQAMC2vP37sPYkBwuwZ97ezioipvgsF6U7jldFKF5XnFRaw4cZS84mK6BwbT0b/hpykAZBYVcOWiT0nJs88PcDw7gxv//JpPxk5v0IDI9jPJbEtLrnS4ra4pMooK+PnoXq6M6UGeuZgzuXnYJuKXbl/y78QypS41TeP5IRPpH9yW9/ds5FBWGh4mZ6ZEd+Nv3QcRXm7KjEXX+fXIfhbu28bx7EzaeHpzdeeeXBbdBWMdk83F/H0eP2Sd5b9bV/HnsUOYdZ2uAcHc3nswYyM71un3dSTzLPcu/83uJrD033vTT/PY6gReGjmxTsduShtTkqxBAoPCLtFCyY+ZlV9Iam4OIR6e6Lo1wACgYf//0n2cm+mIjqZkNBh44ZIJ3PDNt1jKBMZKA7T/GjOaQPfKR0OJVk6mSwghhBANwGyGV16xvwmpzsqV1puWefNaXKDh58P7+GT3VuDczU9pB8Vfxw7zzvaN/L13XIO3Y1BIBINCIhr8dcpzNhrPUzzSmuywlFKK93Zs5j/rV9qSBQIMCm/Hf0dPJMSjYeftf7F/G0m5WRXaW3qf99zm5Xw3cVaDvf6u9NRyF6hl7i6Vtfd0V3oqV8b04PvDu8gvnQZTtsHKut+u9FPsSEuhR0AoYN33mk69uaZT72rbYNF15v31E78e2W8LjJ3IyWJN8nF+OryXN8ZMwqkOyeZ0o5Ffjuzjz8TDFOnWd8GO06f4Yt8O4kLb1WnEwWd7tlaZmsKiFD8d2sMjg0YS4Na8bgZ3nT5l/aAoG2Cg5Llu/d/etNOEeHjibqw6f5WGhkkz4OHU8nNcFVksfL1zF19s305qdg5tfXy4uldPJneNxVTHEWOD2rXj+2tn8u7GjSw+eAiLrhPXrh039e9HXLuGDdAK0Vy0rKs2IYQQjmvhQrsAg1KKJOXBIfcwTMFB9OoegsfubfY9n3v2WPeb1XA3cE3hzW3rq1ynFLy9fQO39hrYYudLuxpNjGgTzYqkI5UmHTRqGpdFdbE9/2T3Nv61+q8K221IPsHVP37BomnX2QUl6ttPiXuqDIjoKLacSeJUXg7BDZSk0M3kVEmAofTfCl23VgQBWGGbzlH1386m0ydtQYaa+mzvNhYd2Q+UCYyV/D/h2CE+2r2FOd371zrZ3EubVvK/LWsqvN6S44eZ/dvXfHv5TAy1fB/sSjtVbdUMi1IczkxvdkEGDyenkulNVBmh8yiZ5pRbXHmJ2lJmXSe9IJ+gFjxdotBs5oZ
vvmXd8RO2oOaZ/Dy2JCfzx4EDvDbp8joHGjoHBvLs+PH12l7Rskl1CSGEEK2GUgpdqQbPAUBqql3vZmaumWc3aWxKzcFgOIiuH8DkZOTaBy5n+rxL0d55xzqvG6z7jR3bpDka1pw8xptbNrDm5HEMmsaoyGj+3ncg3YPq1qZ96WeqXqlBekE++ebiBs2L0NTu6j2UVcmJ6Mq+IKNR0/BycuGG2P6AtS79ixtWVXoMi1IcyTzLTwf3Ma1L9wZr6/lu2ADyzeffpq68nFyoOmhgvX0yadbpNeWndFTmeHZGrdvw4e4t1a7/YFdJkAFqnGwuq6iQ17atq/R4ulJsOZ3MipOJjGjbvtJtquLt7HLeBImezfC9dUmHzny7f3eV64PcPegdEgaAr4srZ8oGeMrRwFbdo6V6d+Mm1p84CZyLyZT+SSw5dJgvtu9gZu9eTdM40foode4P8EKO0UxIWlEhhGiFkk5l8tQbvzPquv8xbMaLTL/zPb79Yyu63kBfYMuW2f5ZWFDME8sK2XLGelNU+prmYgsf/Os7ft2UDjffbL//8uUN064a+GrvTq754UtWHE+k0GIm31zMb4f3M/mbT/nr6OE6HdOinyd7U8nQ9pasV2AYH108nUgvP7vlPQPC+HrCTEI9rMkJd5xJJb2g6rr0GvBn4sGGbCp9g8IxVtOb7uPs2qAlP1Pzqi+Jp6HZghztapAoMdq79gngErPOVjmaQwHHsjNQ5S+AS5PNhYdb/18umLnq5FGKq3svKPj+YNU31VW5LCa2ygCDBrT38aOLf1Ctj9vURka0Z0BYmypHdjwYP9zWMz+5c9cqtzNqGiOjovFwbn6Bltr4eMvWin+TZXyydWvjNUaIVkaCDKLRHTl0ig/eWsqrL/7Orz9uJj+vqKmbJESrkngyjRse+JhFy3ZRWGSdu30i5SzPv5vAv19bVO1FWZ3ourWKRIm9ZyzsyXdHt1R+c/HZsz9jie0KUVHnFq5ebT1OIztbkM9DSxejwG74tUUpLLrO3QmLKLJYan1cJ4Ox6oQEJctbdojBKj40gr+uuJkfJl7Hu6OmsnjSHL6bOIsOvucSOhaf5/ergCK99uegNmbH9qv2pnV2bF+cjQ2XqNPf1a3a9Zqm4edi3WZU25jzHm9gSO3njZ8vN4K3s0utp/fsP1vNiB7bNmm1OibA2KgO9A0Or3CTXfrskUEjm+VUJKPBwPsTpzK1cze7Yf6hHp68NOYSpnTuZlt2bfdetPHyrhAcM2gaJoORfw4a0mjtbgrFFguncnOrXK+AYxlVJ58Vor6VTpe40EdzIUEG0WgsFp3/PPUTt1z3Fp9/spqfvt/Ei8/8ytWT/8u2zYlN3TwhWo0X3l9Cbn6RtX56idL7p99W7GHdtqP1+4Jnz9rNy1512gVVzYiJtJQMju9PhrgyiQ/z8qzDrhvZjwf2YK7iBlZhDULUZTRDXGi7cwcpf1Cgvbc/bq0gKRtYb5B7BYYxul0HOvpWrBYRGxCMSzU38AY0+oe2acgm0iMglAXx49HAdtNW+v+xER35R8/BDfr6I9pEY9QMVQamdKW4tCSHxcTIzgS4ulcapDKgMTQsstLf8/lc2bF7laM5jJrGlR1rP13Fy8nlvNnS3Qy1n9nrZDDy8SXTuLpzT5zLVGmJ8fXnvXFTGRVx/kCMo/J0dua5UeNZP/tWFk66iu+nzmTVrFuY3Mm+bKKPqytfT72asdEd7IItvUJC+WLKdLoGBjd20xuVyWDAw7n6z1A/t+qDd0LUK1VPjzp49dVXiYqKwtXVlbi4ONavrzov1Ntvv82wYcPw8/PDz8+PMWPGVLt9VSQng2g0n36wgt9/3QZYAw6l8vOKePiez3n/89sICm644aZCCDiVls3GHceqXG80aPy8dAeDekfV34sWFto9zTW6AAXV7qKUAj/7YfS2HA2NKCk7G5PBUOWQbg1Iyjn/HPjy5vaNY/mJI+dKDJYttaDgH/0G1bHFLY+3iwtXx/b
k411bK4wmMGgarkYTV3Xp0eDtuLpTL+JDI1h4YBuHMtPxdXHliuhuxIdGNHiv+Km8HMxmhWa0BgRLX67030q3lm2M8Q3A1eTEh2OuYtbiLzhbmI8BDU2zjr7p5BvIS8Muq1MbbukxgJ8O7yUlN9tuVI9R0wh08+DvvQbW+phd/IPsS2yWVbK8b3B4ndrr4eTMU8PGcv/A4RzJPIunkzMxvv71dq5Sc3NIL8gn3NOrThUwLpS/mzvxbaqvDBPs4clrEy7nTF4ex7MyCXR3p523TyO1sGlpmsaV3bvzyZatlSYBNWgaV/VouDwuQjiKL774grvvvps33niDuLg4XnrpJcaNG8e+ffsIDq4YbFy6dCkzZsxg8ODBuLq68swzzzB27Fh27dpFmzY1D+jLSAbRKIoKzXzzxfpK85UopSgutvDL95sbv2FCtDJnzlY9fBTAoitSz9T+prlaLi52T3t2C8FgqGZ+e6AX7TqGWkdAlOXa+BfyIR6ediM+ylNAWLnyiUnZ2bywehU3ff8dd/z6C38cPIi5XJAiLrwdTw0fi9F6C4hRGTCggdL4R994ruho3yvZ2j0UP4JREdGAtYqCSTPYEte9f8mURsuQH+ntxwP9LuLtUVN4bsglDA6LbJRh92uTT4DSUJaKr6UsGkZlZHXyueBh94AQfp44m/HhnQk2ehNq8mF2h358M+FaAt3q9rsKcHPnu8tnMrlDV5xKhuqbNAOXRnfhh0nX1qmyRlxYW0Jcy+xXvqfOAtNje9apvaV8XFzpHRxGB7+AejlXO8+kMv3Hz4n75A0mfP0h/T56jbuX/Ep6ftVJFptaoLs7fULDWlyAIS03jy0nkjiSdrbS9bfFDSTY07PCCByjphHl68v1ffs0RjOFAJpuusQLL7zAzTffzA033EDXrl154403cHd357333qt0+08//ZTbbruN3r1706VLF9555x10XSchIaFWrysjGUSjOJp4mrzcwirX67piq0yZEKLBBQdUfyNgNGiEB9fzhaifn7V0XcmUiaHBRbzh5kxBflGl0yam/WMcJicTrCuTdd7d3ZqZvpFN6hTLU2uWVZkQ08fFlZFR0bbnvx04wO2//oJeWrVD0/hp3z76hYfzwRVT7BKtXdO1F6Miovlm/y6OZWUS7O7BlE7daO/rV9lLtWouRhNvj5/M5tRkfj28j7ziYroGBnNFx654tvDkdXBuagZKA0v5EbMamqZZp1OU2JSSxOxfvia3uKh0Cz7csZVlRxP5/PLphHp61akdIe6evDDiEv41eAxpBXn4u7hf0O/faDDw3Mjx3Pjrt+go9JIraE23jtKY12cQMb61T1LZUHannWLKd5/Z5QAx6zrfHdjN5tQkfp56Xav4e2xqabl5zP/tL37fe8A2uik2JIj/G3cRAyLa2rYL9PDg25kzeHHVan7YvYdCiwU3k4kre3TnzsHxeDdB4Fq0YvVYXSIrK8tusYuLCy7lOnQAioqK2LRpEw8++KBtmcFgYMyYMaxZU7F0cGXy8vIoLi7G3792n8UykkE0CpPp/AmxTE4NlzRLCGEV6OdJfJ/2GKsYSWDRFZePqueh5wYDxMfbnnqkpfL8M5fg7WftUTU6GTEYrDdKU+eOZeq8sbBzJyQmnjvG4MEVMtM3hgA3d/oGlgzXLnttUNLjOqpNNC5Ga7z+eGYmt//6CxZdt134lg7T3ZKczL+WLa1w/FBPL+b2HcQzF43jnwOHSoChGpqm0S80nEcGj2TBiLHM6ta71dzQDWkTWWZevVbuAWalc1E7a5nHvOIiblz0LXnmYtvAAL3kj/d4dib/+PPnC26Ph5MzEV6+9fL7H9GuPV9PvoaL2kVj0DU0XaOzXxAvjrqEewYOveDj16dHlv9ZaaJXBSRmZfDhThmR2dByCou45qMv+aNMgAFg36kzzP7kG7acSLLbPtjTkwXjxrLlH3NZ9/db2fKPuTw+ehS+ko9BNGPt2rXDx8fH9liwYEGl2505cwa
LxUJIuRLgISEhpKSk1Oi17r//fsLDwxkzZkyt2igjGUSjiIgKJDDIizOnKx+GrWkweGinRm6VEK3T3TeM4pZHFpKZnW/rodc0DaUUU8b2om+32meeP68RI6DMULuYdb/zyRfXs/JIEUf3JOHh7c7wK/oT0i7AGmB4+237/YcPr/821cCR9LNsTExCM2koozoXmtdBM2v8snsfj40Yia+bG59t346uVKV5mXSl+Hb3bh4YNgzf81QKEKK8MA8vpnXszlf7d9oCBqWMmkb3wBAGh1nn5/90cB+ZhZXnL7EoxYaUk+xJO01sgOOUcOwTEsb7l0yh2GLBonRcTY6X9DSrsJBNp5KqLvui4KOdW5nbV/KpNKRvtu0kMa1iOdXSgMPzS1by6XVXVdjPxWTCxSS3PaLp1Ed1iNL9jx8/jrf3uTx2lY1iqA9PP/00n3/+OUuXLsW1liN/ZCSDaBRGo4Hr5lR+k2AwGvDz92TsJb0auVVCtE5tQnz54OlZTJvQF19vN5ydjHRuH8yj8ybwzxtHN8wc85AQGFqmV7KgAKc332Dk/iVcP9iHacOCCTmwDRYsgJdftk/yOHSodf8m8MOePWAATdcwmA1oRRpaUcm/lYYZxeKDhwDYkZpSZZlDsA6rPpiW3lhNFy3M4/Gj6OgbUCHLuK+zG6+NvNz2vt1xOsWuvGFldp5ObdjG1pGT0eiQAQaAlPMleNUgvcBx8zK0FD/t3Fdlgn1dKTYcO0larpwH4YDqsbqEt7e33aOqIENgYCBGo5HUVPvP/NTUVEJDQ6tt7vPPP8/TTz/NH3/8Qc+etc+NIyE90WgmXNaHgoJi3n3jLwoLim3L20cH8ci/p+LpJXPjhGgsQf6e3HHdRdxx3UWN96IzZkBaGuzZc25ZYqL9tIjyYmOt+zWRHaklwwm10v+VCcCU/HPXqVNMAzycnTFoWrWBBvdWUpayNvKLi/nzyCHO5OXRzseHiyLbn/cmuTV6cf1q9qemA9q5LiIFGUWFPLh0MR9dNhVN03B3cjrvtN+6/h0WWSx8uXsHn+3czsnsLMI8vbi6Ww9mdO9pmzbUUrmZTOcqwVRGgUn67hpcdkHV+b1K5RYWEeDh3gitEcKxOTs7069fPxISEpg8eTKALYnjvHnzqtzv2Wef5cknn+T333+nf//+dXrtlv2NIBzOFdMGMn5ib9avPUhuTiHtY4Lp0jW8UbJzCyGamMkE8+bBwoWwcuX5tx861BpgaMIhrlo1VTAAUNgqZUzo2InFhw5VuWlbb2+6BDnOEHVH8N3e3TyyNIHc4iJbgCbI3YP/jZ/IoDYNMG2nmTqTl8f727cAJYGuMsVKdBQrjh9lc2oy/ULDuSS6M29t21jlsVyNJoa3i6p1G4osFm788VtWn7BWsVBAdmEh85f/xaKDB/ho0tQWPRw91NMLV2WiAHPlgQYNuvs3zYir1qR7WDDHMjKqrPrj6eJMqHftK50I0dDqc7pEbdx9993Mnj2b/v37M3DgQF566SVyc3O54YYbALjuuuto06aNLa/DM888w6OPPspnn31GVFSULXeDp6cnnp41f29JyFU0Ojd3Z0aM6soll/chtlsbCTAI0ZqYTDBrFsyfD2PGWKtGlOXubl0+f751uya+aekVGlp1zyVYbyxCrHWmJ3TsSNegoArl0ko9MGx4meR9YtnRRP65eJGtAkLpCJC0/Dyu/+EbDqWnNWXzHMry44kVyqCWZdIMLD5yEIBewaGMjeqAQWm23CGYsQUmbu8fj5dz7efvfrx9K6tPHLOrMln67w1JJ3l366ZaH7M5cTIaualH/3MBHmX/f60I/jnIsRJVtkQz+/euMsBg0DRm9OuJcwsOdolmTFf186il6dOn8/zzz/Poo4/Su3dvtm7dym+//WZLBnns2DGSk5Nt27/++usUFRVx5ZVXEhYWZns8//zztXpdeRcKIYRofCEhMG0aTJ0KGRnWHAyurtYylQ40VP7yLrG8WE2ZJyeDkTE
xHQBrYrFPr5zGIwkJ/Hpgv+2mOczTi4eGD+eSTpLctqz/rV9jSzhalq4UZl3nvW2beXLkxU3UOsdSZDFXv4EGhWZr1QNN03ggbjgbjp4ks6AQhUJDQ7NodAwIYHa3PnVqwyc7tlY5F16h+GTHNm7rH1enYzcXt8fHs/vUKf46fgScrEleNQVascZ9g4cxqJ2MvmlofduF8+DFI1iweBlGg4ZFV7ZRUIOi2nH78PjzH0SIVmbevHlVTo9YunSp3fPE6qaw1oIEGYQQQjSZIl1x2mDCw9cfX3fHy8sS6evLDX368P6WLZWuv3foULzLJFzycXXlfxMn8kjuRew7cwZPZ2d6hIRgdKDAiSPIKy5mc0pSlestSvHn4UMSZCjRNyS82vVmXadfaLjt3zf/+AM5hdYRImXziBxKT+fhJX/y0vhLat2Gk9lZ1a5PzslGKdWiRyc6G428PXkyS48c4bvduzmTl0eHgACu7tGD7k2UnLY1uiGuL4Oi2vHF5u0cOJ2Gn5sbk3rGMqpjtHzWCsdVdhjYhRyjmZAggxBCiEZXUGzmlb/W8MWG7baboSExEdx18VC6hTvWxfrDIy4ixNOTtzZuJD0/H7COTrgjPp6runevdJ8gDw+CPDwas5nNSnXJMUtZarBNa9EpIJChbSNYc/J4hd+LUdMIdvdgbLR1RE3C4UMcyThb6XF0pfhp317uHTyUNmXKn9WEv5s7qbk5Va73c3Vt0QGGUkaDgdExMYyOiWnqprRqsSFBPD5hdFM3Q4ga06iHnAz10pLG4fDhvscffxxN0+weXbp0sa0vKChg7ty5BAQE4OnpydSpUyuU6RBCCEeUX1DEvv3JHEk8jV6HeXbNldmi87ePv+P9VZtsAQaANYePc807X7DjZEoTtq4ig6ZxS/8BrLn5Fn6bdR2LZ1/PiptuqjLA0Jyczsnhrh9/ZchrbzPktbf558+LOJ1T9Y1kffF0diY2MMi+WkcZRk1jWERkg7ejOfnf2Il0CbAmDjUZDJg06yVcoLs7H19+Jc5GIwBrThyvtjqHAtadPFHr15/etXuVOUWMmsZVXXvU+phCCCFapmYxkqFbt278+eeftuemMgld7rrrLn755Re++uorfHx8mDdvHlOmTGHVqlVN0VQhhDiv4mIL7364nB9+2kJBSTnX0BAfbrphOKNHdm3i1jW8P3YfYH1ixZscXSnMFp2nFy3j05umN0HLqudkNNIpMLCpm1Fv/jp0hL99+71dHqkfdu/lpz37ePfKyQxrH9Wgrz+3fxzzfvu5wnIN643wnN79GvT1mxt/N3d+nDaTZccSSUg8jFm3MDC8LRM7dMa1zHVRTZKLnq9oSmXm9OnPrwf3cyTjrN1oCqOm0c7bh1v7Daj9QYUQorVQivPWF67JMZqJZhFkMJlMhIaGVliemZnJu+++y2effcaoUaMAeP/994mNjWXt2rUMGjSo0uMVFhZSWHiuzm5WVvXzDIUQor4opZj/1A+sWnPQLuFdSmom/376J4qKzEwY17MJW9jwfti625aoqzxdKTYfSyI5M5swH68maF3rUGQ28/fvfqw0UbWuFDd/8wM77/5HtT3iF2pix86czM7i2dUrUFhvji26jovJxIsXX0L3YMeaNuMIjAYDo6KiGRUVXeU2wyIi+WBr5TlEwPp7Htwuotav7e3iwtdXzuDlDWv5YvcOcoqKcHdy4qqu3bl9QDy+rm61PqYQQrQWTVXCsqk0iyDDgQMHCA8Px9XVlfj4eBYsWEBERASbNm2iuLiYMWPG2Lbt0qULERERrFmzpsogw4IFC3jiiScaq/lCCGGzc9dJVq4+UOX6N97+izGjuuHkZGzEVjWutNz8887Jz8jLb/AgQ0ZePl+u38HiXQcoLLbQv30brhnUmw4hAQ36uo7grXUbqy2JaNZ13lm/kVsHDWzQdtzSdwBXdO7Kj/v3cjovlwgfXy7r1LlOJRaF1Yio9sQGBrE/7UyF/A0aMK1rN4I9al7
rvCwfV1f+b9hFPDhkOHnmYtxNTpJo7zzyi4pZl3icgmIz3cNDaOvn09RNEkKIBufwQYa4uDg++OADOnfuTHJyMk888QTDhg1j586dpKSk4OzsjK+vr90+ISEhpKRUPaf3wQcf5O6777Y9z8rKop2UHRJCNIIlS/dgNBqwWCq/wcvKLmDr9mMM6Ne+kVvWeDoE+bM35VSVtc5NBgPhvrVLSldbR9MyuPbNLzhbJuBx5Ew6X23YwQtXT+Ti7h0b9PWb2vrj55+Tv+7YiQYPMoA1SeacPjI1or4YNI0PJk9h7i8/sTE5CUOZMqFTYrvyxMgLT5ZnNBgkEHQeSineX7OZV5auIa/IOi1OAy7q1J6nJo/Dz11GfgjRqkh1CccyYcIE27979uxJXFwckZGRfPnll7i51e0D2sXFBRcX+XIULYtSil27TnLwYCru7s4MGtQBb2+5iHE0uXmFdtMkKt0mt7Da9c3d1QN78cO2PZWuM2oal/TojI9bw5azvPfzX8nIsx9RYdEVGnDPF7/yV9TN+Hu6N2gbmpKbk9N5t3F3Pv82wjEFeXjw5VVXsz01hY1JSTgZDFwU1Z52PtKL3lg+XLuFZ/9YbrdMAcsPJDLn42/58qYZmIwyCkSI1kJTCu0Ccypc6P6NyeGDDOX5+vrSqVMnDh48yMUXX0xRUREZGRl2oxlSU1MrzeEgREPKyMpjxZoDZOUUENUugLh+0Y12AXEy6SyPPfoth4+cRtOseWFMJiOzrh3MtdcObhVlxZqLmPZB/Lmk+m2io4IapzFNpHe7MG4fNZj/LVmN0aDZbu4BogL9eGDCiAZ9/T1Jp9h5svIqRArrVIHvNu9mzvD+DdqOpnTDgL4kHDpc7TY3Daj76IIii4W/DhzmaHoG/u5ujO3SAW/Xhg0ciYp6hoTSM0SuhxpbQbGZV5auqXSdRSl2J5/ir/2HuTi2QyO3TAghGkezCzLk5ORw6NAhZs2aRb9+/XByciIhIYGpU6cCsG/fPo4dO0Z8fHwTt1S0Jp9+s453P1mJ2aJjMGjouiIwwJMnH5pMbMewBn3tvLxC7rzzUzIycoFziWfNZgvvf7ACN3dnrpwqWb8dxfixPXjvoxUUF1sqJAk2GjR69GhHRETLzwnw94vi6BsRzqfrt7I3+TQ+bq5c3juWKX264eHi3KCvfehUerXrDZrG4dPVb9PcDYpoR6inJynZORULbyto4+1FnzbhdTr2+qMnuP2bn0nPy8eoaViU4olFS3hw7Aiu6dfrwhsvhIPbfOykXXne8owGjT/3HpQggxCtiV7yuNBjNBMOH2S45557uOyyy4iMjCQpKYnHHnsMo9HIjBkz8PHxYc6cOdx99934+/vj7e3NP/7xD+Lj46tM+ihEfftl8Q7e/PDckEi9ZJ55+tlc7vq/L/n4tRsJCmi4BHZ//LGTtLSq69p/8slqJl3et0UnEmxOfHzcefz/ruDR+d+h6zq6rmyjT4KDvXno3olN3cRGExfdjrjoxs+H4+N+/h51b7eWPaUur6iYnMx86xMD5wINCtAhOzOfgmIzrk61u0xITD/LnM++pbgkqWRp4sFCi4XHFy0hwMOdcV1adr4LIQrNlmrXK6UoOs82QoiWRaZLOJgTJ04wY8YM0tLSCAoKYujQoaxdu5agIOtw4hdffBGDwcDUqVMpLCxk3LhxvPbaa03catFa6Lriwy9WV7muoLCYHxZt46ZrhzZYG9atP2S7Sa1MVlY+Bw+lEtulbr2Sov7Fx8Xw8bs38+MvW9i56wTOLk4MH9KJi0d3w82tYXvxW4MDyWf4dfNesvIKiA4J4LL+sXiXCSzERbfD192VjLyCSve36IrLenVprOY2id9376egyIIJa8eIKolBahZrzCEPM4v3HuSyHrX7PXy4fgtmXa+0eoimwSvL1zK2cweZwiVatG7hwVWW6QXr93Xvtg07ylEIIZqSwwcZPv/882rXu7q68uqrr/Lqq682UouEOCc5NYOUU1l
Vrtd1xZqNhxo0yGCxqCoDDOe2aUbjq1qJ0FAfbplzUVM3o0XRdcWT3yTw1ZodGA0aGhoWpfPSLyt5/rqJjOgWDYCzychDl47kvi8XoVExWfMVfbvRtU1Io7e/MSVlZmMyGDDrOgaAcp2qRoPGyYyqP9uqsvTAkQplE0spBftOnSEjv0Ay64sWLdjLk0t7dOHnHXsrBBoMmoa7sxOTe3dtotYJIZpEK6suIWlthbgANRm1dL5KAheqd++IansF3dyciIkObtA2COEIPli6ka/W7ACsoxHMuo5SUFRs5q4PfiLx1Fnbtpf27sIrsy6nQ8i5/Bd+7m7cOXYI86eMafS2N7ZgLw8setXBR4uuCPbyqPVxVQ2ugJrRNZIQdfboxFH0i2gDWEt+GjUNDfBwduKtmVc0eAUdIYSDUap+Hs2Ew49kEKKm8nILObI/BZPJSEyXMEyNkIMgLMSHoABPTleRE8Fg0BjYt32DtmHiJb1YuHAt+flFtnwQpTQNpkwZIEPwRYtXbLHw4V+bKl2nsAb7Pl+1lQeuGGlbPio2hpFdoknJzKHQbKaNnzdOxtaRu2Rc1478a9FfVc4LdzWZGBtb+9wJw6Kj+HrrzkpHM2hATKA/fnJzJVoBTxdnPpx9JWuPHOOPPQcpKDbTs00ol/fsgqdry875IoQQEmQQzV5xsZkPXv6Tnz9fT2FhMQC+/h5c+/dRTJw2oEHn/hqNBmZNG8QLb/xZYZ3BoOHsZOSKS/o02OuDNZHgs89O56GHviIzMx+TyYCuK3RdMX5cT66f3XBTNYRwFCfTsjibm1/leouu2HDwRIXlmqYR5ttwiVkdlberK49dMpqHf/zDbu64QdNQSvH4paPxrEOVj9lxffh2+y50S8UxDQq4bWic5GMQrYbBoDE4JpLBMZFN3RQhRBPTlPVxocdoLiTIIJq95x7+hhV/7LKblpCRnssrT/5EYUExU68b0qCvP2lCb9Izcvnoy7UoZb2osFh0vDxdefKhyYQEeTfo6wPEdgnni8/nsnz5Pg4dOoWbmxMjRnQhMjKwwV9bCEdgMp5/9p+zqXWMUqipqX26EezlwZsr17Pp6EkA+ke24dZhAxkcXbebog6BAbx51WTu+PZnsguLMBkMWJSOUTPwz1FDubR7y06oKYQQQlSqPqY7yHQJIRrH/l0nWf77zirXf/RqAhOm9sfdo+GGJmqaxo3XDOXycb35a9U+snMKiGwXwLBBHXCuqvybrsPZs1BYCC4u4OcHhgtLkeLsbGLMmG6MGdPtgo4jRHPUxt+bqCA/jp4+W+mcf4OmMaqH1KQvb1iHKIZ1iMJckp/BdIGfQwBDYyJZdeff+H3vARLTzxLg4c4lsZ3w93C/4GMLIYQQwvFJkEE0aysX78JoNFRZPaGwoJhNqw8w7OLuDd6WwABPpl3er/qNUlNh2TJYswby8s4td3eH+HgYMQJCWnZWeyEagqZpzLtkMPd8+EuFdUaDhre7K1cO6tEELWse6iO4UJark4lJPWLr9Zii9g6knmHL0SRMRiNDO0YS7O3Z1E2qHw0QqBdCiIak6dbHhR6juZAgg2jW8vIKrdnEqpGfV9Q4jamO2QwLF8LKlZWvz8uDhATrY+hQmDEDTPL2FKI2xvbqxPyri3nu+2VkFxTalseEBPDc7In4eTbfsolZ+QUsXLGV79btIjO3gMggX64e2pvLB3TFYJAcB8JeRl4+93z+K6sPHrMtM2ga0wf25IFLRzTfBKcSqBdCNFcyXUKI5iOmcxgWc/VhvZjOYY3UmiqYzfDKK7BnT822X7kS0tJg3jyHCDRkZObx14q9ZGbl0zbcj2GDO+Hi3PTtqkxhQTGrl+zh5LE0fPw8GD62Gz5+tS/DJ5qvyQO7MaFPZ1bvO0pWXgHRIQF0jwhp1skGz+bkc93/vuD4mQxbgsa9J0/x6Od/sGbfURZcO0ECDfXAousYNK3J/laUUlh0VaP8ItXRdcWtH3zPrqRU++VK8fm6bRg
MGg9fNrKKvc8vt7CIP7YfIDkjixAfL8b17Njw1RIkUC+EEM2KfAKLZu2iCT1458XfycstRJUr32gwGujULZyYLk0cZFi4sGKAISqKov4DsHj74JqbjbZuHSQmnlu/Z491v1mzGrWp5X3+zXre+nA5uq5jMBhsCS3nPzSJvr0cK1v25jUHeeq+L8nJLsBoMqBbdN587lduvGMsU2YNburmiUbk4mRiZPeYpm5GvXnl11WcSDsXYAAo/bhbtGUfI3vEML5P5yZqXfOmlOLXbft4f9km9iSdwmQwMLJrNH8bHUdseHCjtOFYWgZv/bWeX7fuo9BsJjLAl1lD+zI9rmedgkdrDx1j+4mUStcp4PN127h1ZBwBnrXPkbFo2z4e+eoP8ovN1qSeus5TP/7FY1NGc3nfrrU+Xo0080C9EEIAJfW06+EYzYRMYBPNmpu7C0/8byYuLk5oJRdjpb1QQSHePPTs9KZsnnVoZ9meF1dX9k6ezq0+nRj40w4GfbqSK9Yc4/uRl6PmzQPXMvXjV6607t9EFv+1i9ffW4rFoqMUtrwXObmF3P/Y1xw/md5kbSvv+JHTPHr7p+TmWIfIW8zWNpvNOm/95zeW/b6jiVsoRN0UFJn5YcNuLHrlVxYGTeOr1fL3XVevLF7DfQsXsTf5FABmXWfJ7kNc88rnbDhcseRpfTt0Ko2rXv6MHzfvptBsBqxBh3//sIQHv/rNrmpSTS3bd6TaHBsWXbGmzDSKmtqceJJ7F/5KQbG1nWZdRwEFxWYe+uJ31h06Xutj1ki5QL2uKxJ1D743xvCVd2/29xqOHlku6F0aqBdCCAehKVUvj+ZCggyi2eveN4r3f7mL624bRd/4GAYM68gdj07izW/+QXCYb9M2btkyu6dbRl3CjN+2s27fcVswMjH1LI999gf/PZIBN99sv//y5Y3SzPKUUny4cA2VjRpWSmGx6Hzz4+bGb1gVvv9sLUpXlV6Qaxp89tayOl2sC9HUMnLzKTJbqlyvK8XxMxmN16AW5PCpdN5IWAfYT3O16AqzbuH/vvqjwT83nvzhL3ILi+yCSKX/+nnLXlYdOFrrY1YVkLLfpvbZw95duhGDplXakaZpGu/8tb7WxzyvcoH6XAs8cjSQW3b68eaOYt5bk868jw9z+8Fgsmff5FCBeiGEaM0kyCBaBL8AT2bcfBFPvXE981+exYSp/XF1d27aRum6NTlV6dPISO7beAyllN2w59J/vf/nRg4HhkFU1LljrF5tPU4jO5uRx/GT6VXml7HoinUbDzduo6qxYeX+KiuMKAVHD50iKyOv0vVCODJvd9dqe6UNGgT7tJCKAY3s+027MFYxHUFXcCI9ky1Hkxrs9VMys1l36Ljd90FZRoPGdxt21fq4/du3sZUkrYwG9I0Kr/Vx1x48VmUAQ1eKtQeP139QpkygXleKp08EsjnPmmfHYtZtn/sH96fwr4V7HSZQL4QQFZQmfrzQRzMhQQYhGsrZs3bZr/e1jeFUZk6V06mMBo2f1u+BuLhzC/PyICOjQZtZV46UR68midqac+I/0Xq5uzgxtnenam+Gp8Y3fIneZknXrXPzk5Ks/y93430qM+e881tPZ+U2WPPOd2yLrkjOzKr1cUd3jaGNr3elfzMGTWNMtw608/et9XHPp94/YssF6k8avFiX6VppQFm3KLZuTOSwa5BDBOqFEKICBegX+Gg+MQZJ/ChEgykstHuaZjr/yIoz2bnQrlyysYKC+mxVjfj5uhPZLoBjJ9IqDZoaDBqDBjhOYr0BQzvx6zcb0Su5+NQ0jcgOwXj5NN/yhaJ1u+PSIaw7cIyM3Hy7nmSDptE/pi2X9ottwtY5oBqWOQzz87Z261dz0Rbm591gzQz18az25Y0GjbZ1CAY4GY28M2cqt7z/LcfTMzEZDCisUyQGtG/Lk1PH1qm9gztGsmzv4UpHMxg1jfgOEfUbzC0XqN/hGo7RWFDlqDVNg51bjxEdF3cukXJpoN7fv/7
aJYQQ4rwkyCBEQ3GxL+kVplU9rxqsI6DaBvhYL6zKKjvHtJFomsbsGYOZ/+xPFdYZNA2TyciUy/o2eruqMnnmIH7/fjNKqQpVRpRSzLzlIhnJ0EhOnsrgm8Vb2bznOCajkRH9O3D5yB74eEqQp67C/LxZePc1vP3HOn7cuIfCYjMBXu7MGNqb60f1w8lkbOomOoZaljm8YuwE3l5SeR4Bg6YRFeRHj7YhDdbcIG9PhnWOYtWBo5XeuFt0xZUD6jZKJTLAl1/uup6l+w6zOTEJJ6OBkbHR9GoXVufPwpsuGsDSPYcrBEY0QEdx88iBdTpulcoF6s2e3kDVQXelwGgygp+f/YomCNQLIUR59ZG4sTklfpQggxANxc/P2nNW0hMTfWQ/0SFtSDx9Fr2Kea2T4rrBG6+cW+DuDr6+jdDYikaPiOVsRi6vv7cMs9mC0WgtYent7cYTD15O23C/8x+kkbSNDOTfr17Lk/d+SVZGnrWtuo6Tk5Gb7x7PsIu7NXUTW4U1245w3wvfo+vKdtO061AKCxdt4o1HphMRJr2JdRXq68UjV43h4StHU2S24OJklMBZWXUocxiRlsY9Y+N57o81GDTNlhvBaNBwNpl46qpxDf47fnjSKK557XPrKJWS1y9tyzXxvRgQ3bbOxzYZDYzp2oExXTvUS1t7RYbxn5kT+b+vfie3sBijQcOiK1ydnZg/9WL6X0BbK1UuUN8rxgdLQuWlOcE6kmFAfAzs2mS/ogkC9UIIUYHiwnMqNJ8YgwQZhGgwBoN1aG5CAgDa0aP8Z9oQrv0jh4KiYttNWOmF2kNXjSL05NFzwzwBBg+2HqeJXDmpP2NHdWPZqv1kZuXTNtyPIXEdcHJyvJ7TXgOi+eSPe1i7dC9Jx9Lw8fdk6JiueHlLD3pjyMkr5MH//oi5pORpKaUUmdn5PPy/n/noqVlyY3yBDAYNV2f56q6gXJnD3Ox8dqRYWJpsJMfoSu+uQYxuq+OXW6b07p49XB8QQMwNk3l/+Sa2HUvG2WRkbI+O3DC8P1FBDR9Ibevvw9e3z+TDFZv5acsecguL6BgayLVD+jCxV2eHe7+M7dGRoZ2jSNh5kOSMbEJ8PBnTvQMeLg2QaLlcoD4i+SB9B0azdeORCoF6TYNxl/YmOMQH3lt3bkUTBuqFEKI1kysVIRrSiBG2IANA9E/f8t20q/kwtZDFWw9QZDbTJ6YN143sS9/8DHj7bfv9hw9v3PZWwtvLjcvG92rqZtSIs7OJ4WMlCV5T+G3lbgqLzJUG6S264sCx0+w+nEK3mLDGb5xo2cqVOTyTUcD81XBIC7Tladm6NocPNFjwf+PpsXPpuSH0K1cybOxYht1yZRM03CrY25N7Jw7n3olN/3lfE+7OTlzWtxHygFQSqH9sznie93BhxV97ymymMeHyPtx293jYudOhAvVCCGFTH9UhZLqEEAKAkBAYOvTcBXBBASEff8B9UVHcNzLO2lNz9ix8+bH9hRFY9wtpuPnAQtSnwyfTMBoMmKtIygZw5ESaBBlE/Stb5lBXPLvNmYO4ocr8LVosOpoGjz23is8+mY3ru2+e23/5cpg2rTFbLGqqXKDe7dMPeeTmm0meO5odW49hMGj0GdCegEAva4DBAQP1QggBWKtDXOjgtGZULEeCDEI0tBkzrOXTys4VTkysGFQoKzbWup8QzYSXuwvqPBF2T3eXatcLUWvlyhyesLixI7vyOl9KQX5uIcuPFDE2KurcZ/Dq1TB1qvR4O6JKAvW8/DJhUVGExZUE6rdvgHXrJFAvhBAORIIMQjQ0kwnmzas+63lZQ4daAwwmeXuK5uPi+C58+GPlmfoBPNycGdQzqvEaJFqHcmUOE30jMBiOVplc1+RkJOnwKZAyh82HBOqFEC1Aa6suIWF7IRqDyQSzZsH8+TBmjDUZVVnu7tbl8+dbt5MAg2hmOkQEcflFPapc/49rRuDq4tSILRKtQrkyhy5hIVUGGMA6bcIn0EvKHDY
npYH6oUNrtv3Qodbt5XtUCOFISnMyXOijmZBPYCEaU0iIde7v1KnWnrOCAmt5LV9fGaormr3754whLMibhYs2kZVjvWlrE+zD36YNYezgRkgUJ1qf8mUOuwXh4upEYUFxpZtrGoyY1A92brRfIWUOHVtpoH7sWGsOjdWr7Uaw4O5uTfI4fLhMkRBCCAcgQQYhmoLBIENzRYtjNBi4YfIgrr10AMdTzmIyGmgX6udwZfhEC1KuzKHrts3c+q8r+e+9C9EMGqpkVIOmaSilmH3/ZfgHe1vn8JeSMofNhwTqhRDNVSurLiGfyEIIIeqVk8lIdNtAIsL86zXAcDzpLJt2HOPYyfR6O6Zo5krLHJZKTGR8T28efe9mYrq1tS1u2yGYe1++jqvmXixlDluC0kB9eLj1/3L+hBCOTqZLCCGEEI7jyPEzPPP6H+zcl2Rb1rVjGPfeejEdo4KbsGXCIZQrc8jbbxN/883E/3Yf2Wdz0ZXC28/DGvCSModCCCFEg5PQrxBCCIeVfCqTvz+0kN0Hku2W7z2Uwtz/+5zjSWebqGXCYZSWOSxVUuaQBQvw2rIOn6MH0P76CxYssC4vm+RRyhwKIYRoDHo9Perg1VdfJSoqCldXV+Li4li/vupqYLt27WLq1KlERUWhaRovvfRSnV5TggxCCCEc1qffrye/oKhCxQBdVxQWFvPJd+uq2FO0KjNmWMsWlpWYCF98AW+8Yf1/+ZKHUuZQCCFEIyktYXmhj9r64osvuPvuu3nsscfYvHkzvXr1Yty4cZw6darS7fPy8oiOjubpp58mNDS0zj+vBBmEEEI4rD9X7sVSRUlCi65IWLW3kVskHJKUORRCCNFKZGVl2T0Ky5VzLuuFF17g5ptv5oYbbqBr16688cYbuLu7895771W6/YABA3juuee4+uqrcSlXwak2JMgghBDCYRUWVl6K8Nx6M6oZJUISDai0zOH8+TBmjLVqRFnu7tbl8+dbt5MAgxBCiMZSj4kf27Vrh4+Pj+2xYMGCSl+yqKiITZs2MWbMGNsyg8HAmDFjWLNmTYP+uPINK4QQwmF1bB/M3oOp6JUEEjQNoiOCpESmsCdlDoUQQjgaXYF2gZ0iJSM7jx8/jre3t21xVSMOzpw5g8ViIaRc7qGQkBD27m3YkaDybSuEEMJhTb+sf6UBBrAG9K++rF8jt0g0G1LmUAghRAvk7e1t97iQaQ0NRb5xhRBCOKxRgztzzaQBABgNmt3/p07ow/iLujVZ24QQQgghaqQep0vUVGBgIEajkdTUVLvlqampF5TUsSZkuoQQQgiHpWkat103gpGDO/Fzwk5STmcSHODNxFHd6d45vKmbJ4QQQghRA7UPElR6jFpwdnamX79+JCQkMHnyZAB0XSchIYF58+ZdYFuqJ0EGIYQQDi+2QxixHcKauhlCCCGEEM3G3XffzezZs+nfvz8DBw7kpZdeIjc3lxtuuAGA6667jjZt2tiSRxYVFbF7927bv0+ePMnWrVvx9PSkQ4cONX5dCTIIIYQQQgghhBANpQ7THSo9Ri1Nnz6d06dP8+ijj5KSkkLv3r357bffbMkgjx07hqFMzqKkpCT69Olje/7888/z/PPPM2LECJYuXVrj121WORmefvppNE3jzjvvtC0rKChg7ty5BAQE4OnpydSpUyvMOxFCCCGEEEIIIZqErurnUQfz5s3j6NGjFBYWsm7dOuLi4mzrli5dygcffGB7HhUVhVKqwqM2AQZoRkGGDRs28Oabb9KzZ0+75XfddRc//fQTX331FcuWLSMpKYkpU6Y0USuFEEIIIRqQrkNaGiQlWf+v603dIiGEEMJOs5gukZOTw8yZM3n77bf597//bVuemZnJu+++y2effcaoUaMAeP/994mNjWXt2rUMGjSoqZoshBBCCFF/UlNh2TJYswby8s4td3eH+HgYMQLK1UIXQgjhIJRufVzoMZqJZjGSYe7cuUycOJExY8bYLd+0aRPFxcV2y7t
06UJERARr1qyp8niFhYVkZWXZPYQQQgghHI7ZDB9/DI8+CgkJ9gEGsD5PSLCu//hj6/ZCCCEcSxOUsGxKDj+S4fPPP2fz5s1s2LChwrqUlBScnZ3x9fW1Wx4SEkJKSkqVx1ywYAFPPPFEfTdVCCGEEKL+mM3wyiuwZ0/Ntl+50jqFYt48MDn8JZ4QQogWyqFHMhw/fpw77riDTz/9FFdX13o77oMPPkhmZqbtcfz48Xo7thBCCCFEvVi4sEKAId3NlyWGKH51i+Vg18GoyEj7ffbsse4nhBDCcTRh4sem4NBh7k2bNnHq1Cn69u1rW2axWFi+fDmvvPIKv//+O0VFRWRkZNiNZkhNTSU0NLTK47q4uODi4tKQTRdCCCGEqLvUVOvIhBJmoxNvHfHkx9WnMBjT0ACLRSemZwRPzr8O3++/hIIC68YrV8LYsZKjQQghRJNw6JEMo0ePZseOHWzdutX26N+/PzNnzrT928nJiYSEBNs++/bt49ixY8THxzdhy4UQQgghLsCyZXZP30/24+e1pwHQLToWizUB2JFdJ3jgod/Q58yx33/58kZpphBCiBqQnAyOw8vLi+7du9st8/DwICAgwLZ8zpw53H333fj7++Pt7c0//vEP4uPjpbKEEEIIIZonXbdWkSiR7RPEN1+chEquL3WLTuLuk2xMgYFRUZCYaF2xejVMnQoGh+5PEkKI1kFx4UGC5hNjcOyRDDXx4osvcumllzJ16lSGDx9OaGgo3377bVM3SwghhBCibs6etasisdcUVO3FpdFkZOuy3RAXd25hXh5kZDRcG4UQQogqOPRIhsosXbrU7rmrqyuvvvoqr776atM0SAghhBCiPhUW2j01e3qfdxcNDfz87BeW5mgQTUfXrUGjwkJwcbGeIxldIkTrUx/THWS6hBBCCCGEqJNyyam7xPihaRqqigtMi9lC31Hd4Owp+xX1WJlL1FJqqjWvxpo1dqNScHeH+HgYMUIScwrRmug6oNfDMZoHCaUKIYQQQjgSPz/rzWjp08N7GHXVIDSDVmFTo9FATM8I+ozsCuvWnVvh7g5lKm+JRmI2w8cfw6OPQkKCfYABrM8TEqzrP/7Yur0QQrQwEmQQQgghhHAkBoO1t7tUYiJ33NKf4ZMHAKAZNAwlAYdOfdvz72/uwrB797mkjwCDB8uw/MZmNsMrr9iVHq3WypXW7SXQIETLJ9UlhBBCCCFEkxoxwtrjXcL54w948J83M+vhyWz4YzvmYgs9hnSic79otF274O237fcfPryRGyxYuBD27LFbVBASzm6nUPKc3Gkf4kKblEP2waA9e6z7zZrVuG0VQjQuyckghBBCCCGaVEgIDB16rle8oABefpm2UVG0jYsDvyA4exSe/tL+phWs+8l8/8aVmmo3gkG5uPCjS1fe+TIRs/mgbXnXflE88vfr8f3u83OJOVeuhLFj5ZwJIVoMCTIIIYQQQjiiGTMgLc2+dzwxsWJQoazYWOt+onEtW2b39FfP7rzx5YEKm+3beox7n8rltWduwOmN18+tWL4cpk1r6FYKIZqKrqi2FnGNj9E8yGQ9IYQQQghHZDLBvHnWkQk1MXSodXuT9CE1Kl23VpEoURzelnd+OlrpphaLzonDp1l1EoiKOrdi9epmlTleCFE7Sun18mgu5FtICCGEEMJRmUzW+fpjx1p7u1evrlgScfBgaw4GGW7fNM6etTsnh30iKMjfWeXmBqOBDX/t4aKJcedGpeTlQUYG+Ps3bFuFEKIRSJBBCCGEEMLRhYRYh9NPnWq9GS0oAFdXa5lKqSLRtAoL7Z4We3hVv71SmM0Wa6nSskpzNAghWh6lLny6gyR+FEIIIYQQ9c5gkN5uR+PiYvc0KtgVo8mAxVz50GZdV/QYGG0dAVGWq2tDtVAI0dRUPeRkaEZBBgl9CyGEEEIIUVd+ftZpKyU8d29j3LSBaAatwqYGg4a3vwejJvWFdevOrXB3t45KEUKIFkCCDEIIIYQ
QQtSVwQDx8eeeJyZy6xXRxI3qCoDRaMBosl5y+wR4suDDW3BPPGhfJWTwYJn2IkRLpuv182gmZLqEEEIIIYQQF2LECEhIsD11+vADHpt3M/vnjmb17zspLCimU892DBnXA+cDe+Htt+33Hz68kRsshGhUrWy6hAQZhBBCCCGEuBAhIdYSoitXWp8XFMDLL9MpKopOcXHgF2rNwfDCc/YjGMC6n1QGEUK0IBJkEEIIIYQQ4kLNmAFpabBnz7lliYkVgwplxcZa9xNCtGhK11HahU13UKr5TJeQyV9CCCGEEEJcKJMJ5s2zjkyoiaFDrdubpM9PiBZPqfp5NBPyqSaEEEIIIUR9MJlg1iwYOxaWL4fVqyEv79x6d3drksfhw2WKhBCixZIggxBCCCGEEPUpJASmTYOpUyEjw5qjwdXVWqZSqkgI0froCjRJ/CiEEEIIIYS4EAYD+Ps3dSuEEE1NKeACcyo0oyCDhFKFEEIIIYQQQghRL2QkgxBCCCGEEEII0UCUrlAXOF1CNaORDBJkEEIIIYQQQgghGorSufDpElLCUgghhBBCCCGEEK2MjGQQQgghhBBCCCEaiEyXEEIIIYQQQgghRP1oZdMlJMjAuahQVlZWE7dECCGEEEIIIVqX0vuw5tRbXxtmiuECfzQzxfXTmEYgQQYgOzsbgHbt2jVxS4QQQgghhBCidcrOzsbHx6epm1FvnJ2dCQ0NZWXKr/VyvNDQUJydnevlWA1JUy01XFQLuq6TlJSEl5cXmqY1dXMcTlZWFu3ateP48eN4e3s3dXPEecj5al7kfDUvcr6aFzlfzYucr+ZFzlfz4ujnSylFdnY24eHhGAwtqzZBQUEBRUVF9XIsZ2dnXF1d6+VYDUlGMgAGg4G2bds2dTMcnre3t0N+KInKyflqXuR8NS9yvpoXOV/Ni5yv5kXOV/PiyOerJY1gKMvV1bVZBAbqU8sKEwkhhBBCCCGEEKLJSJBBCCGEEEIIIYQQ9UKCDOK8XFxceOyxx3BxcWnqpogakPPVvMj5al7kfDUvcr6aFzlfzYucr+ZFzpdoTJL4UQghhBBCCCGEEPVCRjIIIYQQQgghhBCiXkiQQQghhBBCCCGEEPVCggxCCCGEEEIIIYSoFxJkEEIIIYQQQgghRL2QIIMA4PXXX6dnz554e3vj7e1NfHw8ixYtsq0vKChg7ty5BAQE4OnpydSpU0lNTW3CFouynn76aTRN484777Qtk3PmOB5//HE0TbN7dOnSxbZezpXjOXnyJNdeey0BAQG4ubnRo0cPNm7caFuvlOLRRx8lLCwMNzc3xowZw4EDB5qwxa1XVFRUhfeXpmnMnTsXkPeXo7FYLDzyyCO0b98eNzc3YmJi+Ne//kXZPOTy/nIs2dnZ3HnnnURGRuLm5sbgwYPZsGGDbb2cr6a1fPlyLrvsMsLDw9E0je+//95ufU3OT3p6OjNnzsTb2xtfX1/mzJlDTk5OI/4UoqWRIIMAoG3btjz99NNs2rSJjRs3MmrUKCZNmsSuXbsAuOuuu/jpp5/46quvWLZsGUlJSUyZMqWJWy0ANmzYwJtvvknPnj3tlss5cyzdunUjOTnZ9li5cqVtnZwrx3L27FmGDBmCk5MTixYtYvfu3fznP//Bz8/Pts2zzz7L//73P9544w3WrVuHh4cH48aNo6CgoAlb3jpt2LDB7r21ePFiAKZNmwbI+8vRPPPMM7z++uu88sor7Nmzh2eeeYZnn32Wl19+2baNvL8cy0033cTixYv5+OOP2bFjB2PHjmXMmDGcPHkSkPPV1HJzc+nVqxevvvpqpetrcn5mzpzJrl27WLx4MT///DPLly/nlltuaawfQbRESogq+Pn5qXfeeUdlZGQoJycn9dVXX9nW7dmzRwFqzZo1TdhCkZ2drTp27KgWL16sRowYoe644w6llJJz5mAee+wx1atXr0rXyblyPPfff78aOnRolet1XVehoaHqueeesy3LyMhQLi4uauH
ChY3RRFGNO+64Q8XExChd1+X95YAmTpyobrzxRrtlU6ZMUTNnzlRKyfvL0eTl5Smj0ah+/vlnu+V9+/ZVDz/8sJwvBwOo7777zva8Judn9+7dClAbNmywbbNo0SKlaZo6efJko7VdtCwykkFUYLFY+Pzzz8nNzSU+Pp5NmzZRXFzMmDFjbNt06dKFiIgI1qxZ04QtFXPnzmXixIl25waQc+aADhw4QHh4ONHR0cycOZNjx44Bcq4c0Y8//kj//v2ZNm0awcHB9OnTh7ffftu2/siRI6SkpNidMx8fH+Li4uScNbGioiI++eQTbrzxRjRNk/eXAxo8eDAJCQns378fgG3btrFy5UomTJgAyPvL0ZjNZiwWC66urnbL3dzcWLlypZwvB1eT87NmzRp8fX3p37+/bZsxY8ZgMBhYt25do7dZtAympm6AcBw7duwgPj6egoICPD09+e677+jatStbt27F2dkZX19fu+1DQkJISUlpmsYKPv/8czZv3mw3L7JUSkqKnDMHEhcXxwcffEDnzp1JTk7miSeeYNiwYezcuVPOlQM6fPgwr7/+OnfffTcPPfQQGzZs4Pbbb8fZ2ZnZs2fbzktISIjdfnLOmt73339PRkYG119/PSCfhY7ogQceICsriy5dumA0GrFYLDz55JPMnDkTQN5fDsbLy4v4+Hj+9a9/ERsbS0hICAsXLmTNmjV06NBBzpeDq8n5SUlJITg42G69yWTC399fzqGoMwkyCJvOnTuzdetWMjMz+frrr5k9ezbLli1r6maJShw/fpw77riDxYsXV+hdEI6ntIcOoGfPnsTFxREZGcmXX36Jm5tbE7ZMVEbXdfr3789TTz0FQJ8+fdi5cydvvPEGs2fPbuLWieq8++67TJgwgfDw8KZuiqjCl19+yaeffspnn31Gt27d2Lp1K3feeSfh4eHy/nJQH3/8MTfeeCNt2rTBaDTSt29fZsyYwaZNm5q6aUIIByXTJYSNs7MzHTp0oF+/fixYsIBevXrx3//+l9DQUIqKisjIyLDbPjU1ldDQ0KZpbCu3adMmTp06Rd++fTGZTJhMJpYtW8b//vc/TCYTISEhcs4cmK+vL506deLgwYPy/nJAYWFhdO3a1W5ZbGysbYpL6XkpX6FAzlnTOnr0KH/++Sc33XSTbZm8vxzPvffeywMPPMDVV19Njx49mDVrFnfddRcLFiwA5P3liGJiYli2bBk5OTkcP36c9evXU1xcTHR0tJwvB1eT8xMaGsqpU6fs1pvNZtLT0+UcijqTIIOokq7rFBYW0q9fP5ycnEhISLCt27dvH8eOHSM+Pr4JW9h6jR49mh07drB161bbo3///sycOdP2bzlnjisnJ4dDhw4RFhYm7y8HNGTIEPbt22e3bP/+/URGRgLQvn17QkND7c5ZVlYW69atk3PWhN5//32Cg4OZOHGibZm8vxxPXl4eBoP95afRaETXdUDeX47Mw8ODsLAwzp49y++//86kSZPkfDm4mpyf+Ph4MjIy7EamLFmyBF3XiYuLa/Q2ixaiqTNPCsfwwAMPqGXLlqkjR46o7du3qwceeEBpmqb++OMPpZRSt956q4qIiFBLlixRGzduVPHx8So+Pr6JWy3KKltdQik5Z47kn//8p1q6dKk6cuSIWrVqlRozZowKDAxUp06dUkrJuXI069evVyaTST355JPqwIED6tNPP1Xu7u7qk08+sW3z9NNPK19fX/XDDz+o7du3q0mTJqn27dur/Pz8Jmx562WxWFRERIS6//77K6yT95djmT17tmrTpo36+eef1ZEjR9S3336rAgMD1X333WfbRt5fjuW3335TixYtUocPH1Z//PGH6tWrl4qLi1NFRUVKKTlfTS07O1tt2bJFbdmyRQHqhRdeUFu2bFFHjx5VStXs/IwfP1716dNHrVu3Tq1cuVJ17NhRzZgxo6l+JNECSJBBKKWUuvHGG1VkZKRydnZWQUFBavTo0bYAg1JK5efnq9t
uu035+fkpd3d3dcUVV6jk5OQmbLEor3yQQc6Z45g+fboKCwtTzs7Oqk2bNmr69Onq4MGDtvVyrhzPTz/9pLp3765cXFxUly5d1FtvvWW3Xtd19cgjj6iQkBDl4uKiRo8erfbt29dErRW///67Aio9B/L+cixZWVnqjjvuUBEREcrV1VVFR0erhx9+WBUWFtq2kfeXY/niiy9UdHS0cnZ2VqGhoWru3LkqIyPDtl7OV9P666+/FFDhMXv2bKVUzc5PWlqamjFjhvL09FTe3t7qhhtuUNnZ2U3w04iWQlNKqSYcSCGEEEIIIYQQQogWQnIyCCGEEEIIIYQQol5IkEEIIYQQQgghhBD1QoIMQgghhBBCCCGEqBcSZBBCCCGEEEIIIUS9kCCDEEIIIYQQQggh6oUEGYQQQgghhBBCCFEvJMgghBBCCCGEEEKIeiFBBiGEEEIIIYQQQtQLCTIIIYQQLVRKSgoXX3wxHh4e+Pr6NnVzhBBCCNEKSJBBCCGEaAYuv/xyIiIicHV1JSwsjFmzZpGUlFTtPi+++CLJycls3bqV/fv311tboqKieOmll+rteEIIIYRoOSTIIIQQQjQDI0eO5Msvv2Tfvn188803HDp0iCuvvLLafQ4dOkS/fv3o2LEjwcHBjdTSmisqKmrqJgghhBCinkmQQQghRKui6zoLFiygffv2uLm50atXL77++msAlFKMGTOGcePGoZQCID09nbZt2/Loo48CYLFYmDNnjm3/zp0789///tfuNa6//nomT57MU089RUhICL6+vsyfPx+z2cy9996Lv78/bdu25f33369xu++66y4GDRpEZGQkgwcP5oEHHmDt2rUUFxdXun1UVBTffPMNH330EZqmcf311wOQkZHBTTfdRFBQEN7e3owaNYpt27bZ9jt06BCTJk0iJCQET09PBgwYwJ9//mlbf9FFF3H06FHuuusuNE1D0zQAHn/8cXr37m3XhpdeeomoqKgKv5cnn3yS8PBwOnfuDMDx48e56qqr8PX1xd/fn0mTJpGYmFjj340QQgghHIcEGYQQQrQqCxYs4KOPPuKNN95g165d3HXXXVx77bUsW7YMTdP48MMP2bBhA//73/8AuPXWW2nTpo0tyKDrOm3btuWrr75i9+7dPProozz00EN8+eWXdq+zZMkSkpKSWL58OS+88AKPPfYYl156KX5+fqxbt45bb72Vv/3tb5w4caLWP0N6ejqffvopgwcPxsnJqdJtNmzYwPjx47nqqqtITk62BUKmTZvGqVOnWLRoEZs2baJv376MHj2a9PR0AHJycrjkkktISEhgy5YtjB8/nssuu4xjx44B8O2339K2bVvmz59PcnIyycnJtWp7QkIC+/btY/Hixfz8888UFxczbtw4vLy8WLFiBatWrcLT05Px48fLSAchhBCiOVJCCCFEK1FQUKDc3d3V6tWr7ZbPmTNHzZgxw/b8yy+/VK6uruqBBx5QHh4eav/+/dUed+7cuWrq1Km257Nnz1aRkZHKYrHYlnXu3FkNGzbM9txsNisPDw+1cOHCGrf/vvvuU+7u7gpQgwYNUmfOnKl2+0mTJqnZs2fbnq9YsUJ5e3urgoICu+1iYmLUm2++WeVxunXrpl5++WXb88jISPXiiy/abfPYY4+pXr162S178cUXVWRkpO357NmzVUhIiCosLLQt+/jjj1Xnzp2Vruu2ZYWFhcrNzU39/vvv1f58QgghhHA8pqYOcgghhBCN5eDBg+Tl5XHxxRfbLS8qKqJPnz6259OmTeO7777j6aef5vXXX6djx45227/66qu89957HDt2jPz8fIqKiipMFejWrRsGw7kBgyEhIXTv3t323Gg0EhAQwKlTp2rc/nvvvZc5c+Zw9OhRnnjiCa677jp+/vln25SF89m2bRs5OTkEBATYLc/Pz+fQoUOAdSTD448/zi+//EJycjJms5n8/HzbSIYL1aNHD5ydne3adPDgQby8vOy2KygosLV
JCCGEEM2HBBmEEEK0Gjk5OQD88ssvtGnTxm6di4uL7d95eXls2rQJo9HIgQMH7Lb7/PPPueeee/jPf/5DfHw8Xl5ePPfcc6xbt85uu/LTGDRNq3SZrus1bn9gYCCBgYF06tSJ2NhY2rVrx9q1a4mPj6/R/jk5OYSFhbF06dIK60pLXN5zzz0sXryY559/ng4dOuDm5saVV1553qkLBoPBlseiVGX5Ijw8PCq0qV+/fnz66acVtg0KCjrPTySEEEIIRyNBBiGEEK1G165dcXFx4dixY4wYMaLK7f75z39iMBhYtGgRl1xyCRMnTmTUqFEArFq1isGDB3PbbbfZtm+KHvfS4ERhYWGN9+nbty8pKSmYTCa7hIxlrVq1iuuvv54rrrgCsAYByidhdHZ2xmKx2C0LCgoiJSUFpZRtZMXWrVtr1KYvvviC4OBgvL29a/yzCCGEEMIxSeJHIYQQrYaXlxf33HMPd911Fx9++CGHDh1i8+bNvPzyy3z44YeAdZTDe++9x6effsrFF1/Mvffey+zZszl79iwAHTt2ZOPGjfz+++/s37+fRx55hA0bNjRou9etW8crr7zC1q1bOXr0KEuWLGHGjBnExMTUeBQDwJgxY4iPj2fy5Mn88ccfJCYmsnr1ah5++GE2btwIWH++b7/9lq1bt7Jt2zauueaaCqMtoqKiWL58OSdPnuTMmTOAterE6dOnefbZZzl06BCvvvoqixYtOm+bZs6cSWBgIJMmTWLFihUcOXKEpUuXcvvtt9cpKaYQQgghmpYEGYQQQrQq//rXv3jkkUdYsGABsbGxjB8/nl9++YX27dtz+vRp5syZw+OPP07fvn0BeOKJJwgJCeHWW28F4G9/+xtTpkxh+vTpxMXFkZaWZjeqoSG4u7vz7bffMnr0aDp37sycOXPo2bMny5Yts5vmcT6apvHrr78yfPhwbrjhBjp16sTVV1/N0aNHCQkJAeCFF17Az8+PwYMHc9lllzFu3Djb76LU/PnzSUxMJCYmxjalITY2ltdee41XX32VXr16sX79eu65554a/WzLly8nIiKCKVOmEBsby5w5cygoKJCRDUIIIUQzpKnyEyiFEEIIIYQQQggh6kBGMgghhBBCCCGEEKJeSJBBCCGEaGJPPfUUnp6elT4mTJjQ1M0TQgghhKgxmS4hhBBCNLH09HTS09MrXefm5lah3KYQQgghhKOSIIMQQgghhBBCCCHqhUyXEEIIIYQQQgghRL2QIIMQQgghhBBCCCHqhQQZhBBCCCGEEEIIUS8kyCCEEEIIIYQQQoh6IUEGIYQQQgghhBBC1AsJMgghhBBCCCGEEKJeSJBBCCGEEEIIIYQQ9eL/AQi0VlH0TtcoAAAAAElFTkSuQmCC", "text/plain": [ "
" ] @@ -383,11 +410,15 @@ ], "source": [ "plot_data(\n", - " data[['feature_1','feature_2']], \n", - " circles=true_errors ,\n", - " color=label_quality, \n", - " title=\"Messy Regression dataset with label quality scores\", \n", - " colorbar=True)" + " data_x=data[\"exam_3\"], \n", + " data_y=data[\"labels\"],\n", + " circles=true_errors,\n", + " color=label_quality_scores,\n", + " title=\"Messy Regression dataset with label quality scores\",\n", + " colorbar=True,\n", + " xlabel=\"exam_3 feature\",\n", + " ylabel=\"label (Y value)\",\n", + ")" ] }, { @@ -402,8 +433,37 @@ ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 112, + "metadata": { + "nbsphinx": "hidden" + }, + "outputs": [], + "source": [ + "# Note: This cell is only for docs.cleanlab.ai, if running on local Jupyter or Colab, please ignore it.\n", + "from sklearn.metrics import roc_auc_score\n", + "\n", + "label_quality_scores = get_label_quality_scores(np.array(y), predictions=predictions)\n", + "\n", + "true_errors = (data.labels != data.true_labels).astype(int)\n", + "label_quality_scores_residual = get_label_quality_scores(\n", + " np.array(y), predictions=predictions, method=\"residual\"\n", + ")\n", + "\n", + "if roc_auc_score(true_errors, 1 - label_quality_scores) < 0.5:\n", + " raise ValueError(\"Label quality scores did not perform well enough\")\n", + "\n", + "if roc_auc_score(true_errors, 1 - label_quality_scores) <= roc_auc_score(\n", + " true_errors, 1 - label_quality_scores_residual\n", + "):\n", + " raise ValueError(\"Label quality scores did not outperform alternative scores\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [] } ], From 87053989d2805a28a4afc529f81091286e3c1dcf Mon Sep 17 00:00:00 2001 From: krmayankb Date: Thu, 22 Dec 2022 01:16:36 -0800 Subject: [PATCH 014/258] tutorial draft1 --- docs/source/tutorials/regression.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff 
--git a/docs/source/tutorials/regression.ipynb b/docs/source/tutorials/regression.ipynb index 682a81ffb2..cc02658e80 100644 --- a/docs/source/tutorials/regression.ipynb +++ b/docs/source/tutorials/regression.ipynb @@ -331,7 +331,7 @@ "# Generate true errors\n", "true_errors = np.where(data.labels != data.true_labels)[0]\n", "plot_data(\n", - " data_x=data[\"exam_3\"], \n", + " data_x=data[\"exam_3\"],\n", " data_y=data[\"labels\"],\n", " circles=true_errors,\n", " title=\"Messy Regression dataset\",\n", @@ -410,7 +410,7 @@ ], "source": [ "plot_data(\n", - " data_x=data[\"exam_3\"], \n", + " data_x=data[\"exam_3\"],\n", " data_y=data[\"labels\"],\n", " circles=true_errors,\n", " color=label_quality_scores,\n", From c081913de21a45b54bb67427cd1fbbd28e498b7d Mon Sep 17 00:00:00 2001 From: krmayankb Date: Fri, 23 Dec 2022 13:07:04 -0800 Subject: [PATCH 015/258] merge conflict --- cleanlab/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cleanlab/__init__.py b/cleanlab/__init__.py index 663977fc04..46b87525d1 100644 --- a/cleanlab/__init__.py +++ b/cleanlab/__init__.py @@ -8,5 +8,4 @@ from . import multiannotator from . import outlier from . import token_classification - from . import regression From d80e0774ebdbf152793845e81eec2bb6aab340ea Mon Sep 17 00:00:00 2001 From: krmayankb Date: Fri, 23 Dec 2022 14:56:32 -0800 Subject: [PATCH 016/258] default modified for method in docstring --- cleanlab/regression/rank.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cleanlab/regression/rank.py b/cleanlab/regression/rank.py index f53a0fb47e..a6176a8de3 100644 --- a/cleanlab/regression/rank.py +++ b/cleanlab/regression/rank.py @@ -29,7 +29,7 @@ def get_label_quality_scores( Predicated labels from regressor fitted on the dataset. Array of shape ``(N,)`` consisting predicted labels, where N is number of datapoints in the regression dataset. 
- method : {"residual", "TO_BE_NAMED"}, default="residual" #TODO - update name once finalised + method : {"residual", "TO_BE_NAMED"}, default="TO_BE_NAMED" #TODO - update name once finalised Returns ------- From 02defb93943361b497033bdd8fec69e6a16e7fe0 Mon Sep 17 00:00:00 2001 From: Mayank Kumar Date: Fri, 23 Dec 2022 15:01:21 -0800 Subject: [PATCH 017/258] grammatical correction in rank.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Elías Snorrason --- cleanlab/regression/rank.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cleanlab/regression/rank.py b/cleanlab/regression/rank.py index a6176a8de3..4318d664df 100644 --- a/cleanlab/regression/rank.py +++ b/cleanlab/regression/rank.py @@ -15,7 +15,7 @@ def get_label_quality_scores( """ Returns label quality score for each example in the regression dataset. - Each score is continous value in range [0,1] + Each score is a continous value in the range [0,1] 1 - clean label (given label is likely correct). 0 - dirty label (given label is likely incorrect). From 4a0a9efdbf6a54c4bde3ed4588bcf5b97c3a1e9a Mon Sep 17 00:00:00 2001 From: Mayank Kumar Date: Fri, 23 Dec 2022 15:24:02 -0800 Subject: [PATCH 018/258] Update cleanlab/regression/rank.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Elías Snorrason --- cleanlab/regression/rank.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cleanlab/regression/rank.py b/cleanlab/regression/rank.py index 4318d664df..dc98059253 100644 --- a/cleanlab/regression/rank.py +++ b/cleanlab/regression/rank.py @@ -115,8 +115,8 @@ def get_score_to_named_for_each_label( ) -> np.ndarray: """Returns label-quality scores. - This is function to compute label-quality scores for regression datasets, - where lower score indicate labels less likely to be correct. 
+ This function computes label-quality scores for regression datasets, + where a lower score indicates labels that are less likely to be correct. Parameters ---------- From f2c5862864c76f72113a244e7dc2320c1b574d28 Mon Sep 17 00:00:00 2001 From: krmayankb Date: Fri, 23 Dec 2022 16:55:11 -0800 Subject: [PATCH 019/258] =?UTF-8?q?rank.py=20updates=201.=20added=20typing?= =?UTF-8?q?=20hints=20for=20scoring=20funcs=202.=20Removed=20try-except=20?= =?UTF-8?q?block=20for=20raising=20value=20error.=203.=20grammatical=20cor?= =?UTF-8?q?rections=204.=20knn=20and=20neighbors=20construction=20moved=20?= =?UTF-8?q?closer=20to=20first=20usage.=20Co-authored-by:=20El=C3=ADas=20S?= =?UTF-8?q?norrason=20?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- cleanlab/regression/rank.py | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/cleanlab/regression/rank.py b/cleanlab/regression/rank.py index dc98059253..99c42aba3e 100644 --- a/cleanlab/regression/rank.py +++ b/cleanlab/regression/rank.py @@ -2,6 +2,7 @@ from cleanlab.outlier import OutOfDistribution from sklearn.neighbors import NearestNeighbors from cleanlab.internal.regression_utils import assert_valid_inputs +from typing import Dict, Callable """ Generates label quality scores for every sample in regression dataset """ @@ -52,15 +53,14 @@ def get_label_quality_scores( # Check if inputs are valid assert_valid_inputs(labels=labels, predictions=predictions, method=method) - scoring_funcs = { + scoring_funcs: Dict[str, Callable[[np.ndarray, np.ndarray], np.ndarray]] = { "residual": get_residual_score_for_each_label, "TO_BE_NAMED": get_score_to_named_for_each_label, # TODO - update name once finalised } # TODO - update name once finalised - try: - scoring_func = scoring_funcs[method] - except KeyError: + scoring_func = scoring_funcs.get(method, None) + if not scoring_func: raise ValueError( f""" {method} is not a valid scoring method. 
@@ -121,10 +121,10 @@ def get_score_to_named_for_each_label( Parameters ---------- labels: np.ndarray - Labels in the same format expected by the :py:func:`get_label_quality_scores ` function. + Labels in the same format as expected by the :py:func:`get_label_quality_scores ` function. predictions: np.ndarray - Predicted labels in the same format expected by the :py:func:`get_label_quality_scores ` function. + Predicted labels in the same format as expected by the :py:func:`get_label_quality_scores ` function. variance: float, default = 10 Manipulates variance of the distribution of residual. @@ -135,19 +135,16 @@ def get_score_to_named_for_each_label( Contains one score (between 0 and 1) per example. Lower scores indicate more likely mislabled examples. """ - - neighbors = int(np.ceil(0.1 * labels.shape[0])) - knn = NearestNeighbors(n_neighbors=neighbors, metric="euclidean") - residual = predictions - labels - labels = (labels - labels.mean()) / labels.std() residual = np.sqrt(variance) * ((residual - residual.mean()) / residual.std()) # 2D features by combining labels and residual features = np.array([labels, residual]).T - knn.fit(features) + neighbors = int(np.ceil(0.1 * labels.shape[0])) + knn = NearestNeighbors(n_neighbors=neighbors, metric="euclidean").fit(features) ood = OutOfDistribution(params={"knn": knn}) + label_quality_scores = ood.score(features=features) return label_quality_scores From 7d97c26d35e039595e2e371cae2c5f42eb3efeb1 Mon Sep 17 00:00:00 2001 From: krmayankb Date: Fri, 23 Dec 2022 19:15:43 -0800 Subject: [PATCH 020/258] outre: code updated for name for second method --- cleanlab/regression/rank.py | 12 +++++------- tests/test_regression.py | 6 ++---- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/cleanlab/regression/rank.py b/cleanlab/regression/rank.py index 99c42aba3e..fe263f88ca 100644 --- a/cleanlab/regression/rank.py +++ b/cleanlab/regression/rank.py @@ -11,7 +11,7 @@ def get_label_quality_scores( labels: np.ndarray, 
predictions: np.ndarray, *, - method: str = "TO_BE_NAMED", # TODO update name once finalised + method: str = "outre", # TODO update name once finalised ) -> np.ndarray: """ Returns label quality score for each example in the regression dataset. @@ -30,7 +30,7 @@ def get_label_quality_scores( Predicated labels from regressor fitted on the dataset. Array of shape ``(N,)`` consisting predicted labels, where N is number of datapoints in the regression dataset. - method : {"residual", "TO_BE_NAMED"}, default="TO_BE_NAMED" #TODO - update name once finalised + method : {"residual", "outre"}, default="outre" Returns ------- @@ -55,10 +55,9 @@ def get_label_quality_scores( scoring_funcs: Dict[str, Callable[[np.ndarray, np.ndarray], np.ndarray]] = { "residual": get_residual_score_for_each_label, - "TO_BE_NAMED": get_score_to_named_for_each_label, # TODO - update name once finalised + "outre": get_outre_score_for_each_label, } - # TODO - update name once finalised scoring_func = scoring_funcs.get(method, None) if not scoring_func: raise ValueError( @@ -105,15 +104,14 @@ def get_residual_score_for_each_label( return label_quality_scores -# TODO - change name of the function # TODO - change name of function in test -def get_score_to_named_for_each_label( +def get_outre_score_for_each_label( labels: np.ndarray, predictions: np.ndarray, *, variance: float = 10, ) -> np.ndarray: - """Returns label-quality scores. + """Returns OUTRE based label-quality scores. This function computes label-quality scores for regression datasets, where a lower score indicates labels that are less likely to be correct. 
diff --git a/tests/test_regression.py b/tests/test_regression.py index 8a8154047c..dc29315f96 100644 --- a/tests/test_regression.py +++ b/tests/test_regression.py @@ -29,11 +29,10 @@ def test_assertion_error_for_input_shape(): _ = rank.get_label_quality_scores(labels=labels, predictions=predictions[:-1]) -# TODO - change name once finalised # test individual scoring functions @pytest.mark.parametrize( "scoring_funcs", - [rank.get_residual_score_for_each_label, rank.get_score_to_named_for_each_label], + [rank.get_residual_score_for_each_label, rank.get_outre_score_for_each_label], ) def test_individual_scoring_functions(scoring_funcs): scores = scoring_funcs(labels=labels, predictions=predictions) @@ -41,13 +40,12 @@ def test_individual_scoring_functions(scoring_funcs): assert isinstance(scores, np.ndarray) -# TODO - change name once finalised # test for method argument @pytest.mark.parametrize( "method", [ "residual", - "TO_BE_NAMED", + "outre", ], ) def test_method_pass_get_label_quality_scores(method): From 305d11bd271d29001e02d757d140bad91a749284 Mon Sep 17 00:00:00 2001 From: Mayank Kumar Date: Fri, 23 Dec 2022 22:21:46 -0800 Subject: [PATCH 021/258] Apply Docstring suggestions from code review Co-authored-by: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> --- cleanlab/regression/rank.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/cleanlab/regression/rank.py b/cleanlab/regression/rank.py index fe263f88ca..afd25c725c 100644 --- a/cleanlab/regression/rank.py +++ b/cleanlab/regression/rank.py @@ -24,11 +24,10 @@ def get_label_quality_scores( ---------- labels : np.ndarray Raw labels from original dataset. - Array of shape ``(N, )`` consisting given labels, where N is number of datapoints in the regression dataset. + 1D array of shape ``(N, )`` containing the given labels for each example (aka. Y-value, response, target, dependent variable, ...), where N is number of examples in the dataset. 
predictions : np.ndarray - Predicated labels from regressor fitted on the dataset. - Array of shape ``(N,)`` consisting predicted labels, where N is number of datapoints in the regression dataset. + 1D array of shape ``(N,)`` containing the predicted label for each example in the dataset. These should be out-of-sample predictions from a trained regression model, which you can obtain for every example in your dataset via :ref:`cross-validation `. method : {"residual", "outre"}, default="outre" @@ -37,7 +36,7 @@ def get_label_quality_scores( label_quality_scores: Array of shape ``(N, )`` of scores between 0 and 1, one per datapoint in the dataset. - Lower scores indicate datapoint more likely to contain a label issue. + Lower scores indicate datapoints more likely to contain a label issue. Examples -------- @@ -76,7 +75,7 @@ def get_residual_score_for_each_label( labels: np.ndarray, predictions: np.ndarray, ) -> np.ndarray: - """Returns the residual based label-quality scores for each datapoints. + """Returns a residual label-quality score for each datapoint. This is function to compute label-quality scores for regression datasets, where lower score indicate labels less likely to be correct. 
From e9468dd9e476f9821e3c0ca2e4e17b00cb02d6a5 Mon Sep 17 00:00:00 2001 From: krmayankb Date: Fri, 23 Dec 2022 22:42:56 -0800 Subject: [PATCH 022/258] Support for array_like labels and predictions --- cleanlab/internal/regression_utils.py | 26 +++++++++++----- cleanlab/regression/rank.py | 43 ++++++++++++++++++--------- 2 files changed, 48 insertions(+), 21 deletions(-) diff --git a/cleanlab/internal/regression_utils.py b/cleanlab/internal/regression_utils.py index 04576c4012..a62e85aca5 100644 --- a/cleanlab/internal/regression_utils.py +++ b/cleanlab/internal/regression_utils.py @@ -10,19 +10,31 @@ def assert_valid_inputs( predictions: np.ndarray, method: str, ) -> None: - """Checks that ``labels``, ``pred_labels``, ``method`` are correctly formatted.""" + """Checks that ``labels``, ``predictions``, ``method`` are correctly formatted.""" # Check if labels and pred_labels are np.ndarray if not isinstance(labels, np.ndarray) or not isinstance(predictions, np.ndarray): - raise TypeError("labels and pred_labels must be of type np.ndarray") + raise TypeError("labels and predictions must be of type np.ndarray") - # Check if labels and pred_labels are of same shape + # Check if labels and predictions are of same shape assert ( labels.shape == predictions.shape ), f"shape of label {labels.shape} and predicted labels {predictions.shape} are not same." - # Check if method passed is string - if not isinstance(method, str): - raise TypeError( - f"Passed method is not of correct type. Expected string, got {type(method)}" + # Check if method is among allowed scoring method + scoring_methods = ["residual", "outre"] + if method not in scoring_methods: + raise ValueError( + f"Passed method is not among allowed method. 
Expected either of {scoring_methods}, got {method}" + ) + + +def check_dimensions(labels: np.ndarray, predictions: np.ndarray) -> None: + if labels.ndim != 1: + raise ValueError( + f"labels have dimensions {labels.ndim}, Expected 1-D array as input for labels" + ) + if predictions.ndim != 1: + raise ValueError( + f"predictions have dimensions {labels.ndim}, Expected 1-D array as input for predictions" ) diff --git a/cleanlab/regression/rank.py b/cleanlab/regression/rank.py index fe263f88ca..c6250ae4af 100644 --- a/cleanlab/regression/rank.py +++ b/cleanlab/regression/rank.py @@ -1,17 +1,20 @@ import numpy as np from cleanlab.outlier import OutOfDistribution from sklearn.neighbors import NearestNeighbors -from cleanlab.internal.regression_utils import assert_valid_inputs -from typing import Dict, Callable +from cleanlab.internal.regression_utils import assert_valid_inputs, check_dimensions +from typing import Dict, Callable, Optional +from cleanlab.typing import LabelLike """ Generates label quality scores for every sample in regression dataset """ +EPS = 1e-30 + def get_label_quality_scores( - labels: np.ndarray, - predictions: np.ndarray, + labels: Optional[LabelLike], + predictions: Optional[LabelLike], *, - method: str = "outre", # TODO update name once finalised + method: str = "outre", ) -> np.ndarray: """ Returns label quality score for each example in the regression dataset. @@ -22,7 +25,7 @@ def get_label_quality_scores( Parameters ---------- - labels : np.ndarray + labels : array_like Raw labels from original dataset. Array of shape ``(N, )`` consisting given labels, where N is number of datapoints in the regression dataset. @@ -53,6 +56,11 @@ def get_label_quality_scores( # Check if inputs are valid assert_valid_inputs(labels=labels, predictions=predictions, method=method) + # Convert to numpy array and check if they are 1-D array. 
+ labels = np.asarray(labels) + predictions = np.asarray(predictions) + check_dimensions(labels, predictions) + scoring_funcs: Dict[str, Callable[[np.ndarray, np.ndarray], np.ndarray]] = { "residual": get_residual_score_for_each_label, "outre": get_outre_score_for_each_label, @@ -104,12 +112,13 @@ def get_residual_score_for_each_label( return label_quality_scores -# TODO - change name of function in test def get_outre_score_for_each_label( labels: np.ndarray, predictions: np.ndarray, *, - variance: float = 10, + residual_scale: float = 10, + frac_neighbors: float = 0.1, + neighbor_metric: str = "euclidean", ) -> np.ndarray: """Returns OUTRE based label-quality scores. @@ -124,8 +133,14 @@ def get_outre_score_for_each_label( predictions: np.ndarray Predicted labels in the same format as expected by the :py:func:`get_label_quality_scores ` function. - variance: float, default = 10 - Manipulates variance of the distribution of residual. + residual_scale: float, default = 10 + Manipulates scale of the distribution of residual. + + frac_neighbors: float, default = 0.1 + Fraction of datapoints that should be considered as n_neighbors to NearestNeighbors. + + neighbor_metric: str, default = "euclidean" + The parameter is passed to sklearn NearestNeighbors. # TODO add reference to sklearn.NearestNeighbor? Returns ------- @@ -134,14 +149,14 @@ def get_outre_score_for_each_label( Lower scores indicate more likely mislabled examples. 
""" residual = predictions - labels - labels = (labels - labels.mean()) / labels.std() - residual = np.sqrt(variance) * ((residual - residual.mean()) / residual.std()) + labels = (labels - labels.mean()) / (labels.std() + EPS) + residual = residual_scale * ((residual - residual.mean()) / (residual.std() + EPS)) # 2D features by combining labels and residual features = np.array([labels, residual]).T - neighbors = int(np.ceil(0.1 * labels.shape[0])) - knn = NearestNeighbors(n_neighbors=neighbors, metric="euclidean").fit(features) + neighbors = int(np.ceil(frac_neighbors * labels.shape[0])) + knn = NearestNeighbors(n_neighbors=neighbors, metric=neighbor_metric).fit(features) ood = OutOfDistribution(params={"knn": knn}) label_quality_scores = ood.score(features=features) From 48c3f5791863b406e6b447e18ffc3731e0e98fe5 Mon Sep 17 00:00:00 2001 From: krmayankb Date: Fri, 23 Dec 2022 23:13:07 -0800 Subject: [PATCH 023/258] doctring for method modified --- cleanlab/regression/rank.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cleanlab/regression/rank.py b/cleanlab/regression/rank.py index e0efe5521c..d57cd5ffec 100644 --- a/cleanlab/regression/rank.py +++ b/cleanlab/regression/rank.py @@ -27,12 +27,13 @@ def get_label_quality_scores( ---------- labels : array_like Raw labels from original dataset. - 1D array of shape ``(N, )`` containing the given labels for each example (aka. Y-value, response, target, dependent variable, ...), where N is number of examples in the dataset. + 1D array of shape ``(N, )`` containing the given labels for each example (aka. Y-value, response, target, dependent variable, ...), where N is number of examples in the dataset. predictions : np.ndarray 1D array of shape ``(N,)`` containing the predicted label for each example in the dataset. These should be out-of-sample predictions from a trained regression model, which you can obtain for every example in your dataset via :ref:`cross-validation `. 
method : {"residual", "outre"}, default="outre" + String specifying which method to use for scoring the quality of each label and identifying which labels appear most noisy. Returns ------- From 65e1a3cb854947ab556843e86e54e9291de30cef Mon Sep 17 00:00:00 2001 From: krmayankb Date: Fri, 23 Dec 2022 23:17:38 -0800 Subject: [PATCH 024/258] datapoint -> example --- cleanlab/regression/rank.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cleanlab/regression/rank.py b/cleanlab/regression/rank.py index d57cd5ffec..6ebe33af64 100644 --- a/cleanlab/regression/rank.py +++ b/cleanlab/regression/rank.py @@ -38,9 +38,9 @@ def get_label_quality_scores( Returns ------- label_quality_scores: - Array of shape ``(N, )`` of scores between 0 and 1, one per datapoint in the dataset. + Array of shape ``(N, )`` of scores between 0 and 1, one per example in the dataset. - Lower scores indicate datapoints more likely to contain a label issue. + Lower scores indicate examples more likely to contain a label issue. Examples -------- @@ -84,7 +84,7 @@ def get_residual_score_for_each_label( labels: np.ndarray, predictions: np.ndarray, ) -> np.ndarray: - """Returns a residual label-quality score for each datapoint. + """Returns a residual label-quality score for each example. This is function to compute label-quality scores for regression datasets, where lower score indicate labels less likely to be correct. @@ -137,7 +137,7 @@ def get_outre_score_for_each_label( Manipulates scale of the distribution of residual. frac_neighbors: float, default = 0.1 - Fraction of datapoints that should be considered as n_neighbors to NearestNeighbors. + Fraction of examples that should be considered as n_neighbors to NearestNeighbors. neighbor_metric: str, default = "euclidean" The parameter is passed to sklearn NearestNeighbors. # TODO add reference to sklearn.NearestNeighbor? 
From 7b15cbac2ffd280885d864a160c774a8b187c408 Mon Sep 17 00:00:00 2001 From: krmayankb Date: Sat, 24 Dec 2022 02:27:34 -0800 Subject: [PATCH 025/258] check_valid_inputs update 1. added support for list, np.ndarray, pd.Series, pd.DataFrame 2. check if inputs are numeric --- cleanlab/internal/regression_utils.py | 64 +++++++++++++++++++++++---- 1 file changed, 55 insertions(+), 9 deletions(-) diff --git a/cleanlab/internal/regression_utils.py b/cleanlab/internal/regression_utils.py index a62e85aca5..a7bfb4bbd1 100644 --- a/cleanlab/internal/regression_utils.py +++ b/cleanlab/internal/regression_utils.py @@ -3,23 +3,35 @@ """ import numpy as np +import pandas as pd +from pandas.api.types import is_numeric_dtype +from cleanlab.typing import LabelLike +from typing import Optional def assert_valid_inputs( - labels: np.ndarray, - predictions: np.ndarray, + labels: Optional[LabelLike], + predictions: Optional[LabelLike], method: str, ) -> None: """Checks that ``labels``, ``predictions``, ``method`` are correctly formatted.""" - # Check if labels and pred_labels are np.ndarray - if not isinstance(labels, np.ndarray) or not isinstance(predictions, np.ndarray): - raise TypeError("labels and predictions must be of type np.ndarray") + supported_types = (list, np.ndarray, pd.Series, pd.DataFrame) - # Check if labels and predictions are of same shape - assert ( - labels.shape == predictions.shape - ), f"shape of label {labels.shape} and predicted labels {predictions.shape} are not same." 
+ # Check if labels and predictions are of supported types + if not isinstance(labels, supported_types) and not isinstance(predictions, supported_types): + raise TypeError( + f"Expected labels and predictions to be either of {supported_types}, Got labels of type {type(labels)}, and predictions of type {type(predictions)}", + ) + + # check if labels and predictions are 1-D and numeric + check_dimension_and_datatype(check_input=labels, text = "labels") + check_dimension_and_datatype(check_input=predictions, text = "predictions") + + # check if number of examples are same. + assert len(labels) == len( + predictions + ), f"Length of labels {len(labels)} and predictions {len(predictions)} are not same." # Check if method is among allowed scoring method scoring_methods = ["residual", "outre"] @@ -29,6 +41,40 @@ def assert_valid_inputs( ) +def check_dimension_and_datatype(check_input: Optional[LabelLike], text : str): + # check if input is empty + if not len(check_input): + raise ValueError( + f"{text} is Empty, check input." + ) + + if isinstance(check_input, list): + if isinstance(check_input[0], list): + raise ValueError(f"{text} must be 1-D. List of List is not supported.") + elif not all(isinstance(x, (int, float)) for x in check_input): + raise ValueError( + f"All element of {text} must be of type numeric i.e., integer or float" + ) + + elif isinstance(check_input, pd.DataFrame): + if check_input.shape[1] != 1: + raise ValueError( + f"{text} must be 1-D. For DataFrame, second dimension must be 1, got {check_input.shape}." + ) + elif check_input.shape[1] == 1: + if not is_numeric_dtype(check_input): + raise ValueError(f"{text} must be 1-D and numeric type. 
got {check_input.dtype}.") + elif isinstance(check_input, (np.ndarray, pd.Series)): + if len(check_input.shape) != 1: + raise ValueError(f"{text} must be 1-D {type(check_input)}, got {check_input.shape}") + elif len(check_input.shape) == 1: + if isinstance(check_input, pd.Series) and not is_numeric_dtype(check_input): + raise ValueError(f"{text} must be 1-D and numeric type. got {check_input.dtype}.") + elif isinstance(check_input, np.ndarray): + if not all(isinstance(x, (int, float)) for x in check_input.tolist()): + raise ValueError(f"{text} must be 1-d and numeric type i.e., integer or float.") + + def check_dimensions(labels: np.ndarray, predictions: np.ndarray) -> None: if labels.ndim != 1: raise ValueError( From b9b91049da5062dae0c60ecf539f5ea214bcee12 Mon Sep 17 00:00:00 2001 From: krmayankb Date: Tue, 27 Dec 2022 15:51:12 -0800 Subject: [PATCH 026/258] tutorial removed --- docs/source/tutorials/index.rst | 1 - docs/source/tutorials/regression.ipynb | 497 ------------------------- 2 files changed, 498 deletions(-) delete mode 100644 docs/source/tutorials/regression.ipynb diff --git a/docs/source/tutorials/index.rst b/docs/source/tutorials/index.rst index 78e4efac5f..d040963629 100644 --- a/docs/source/tutorials/index.rst +++ b/docs/source/tutorials/index.rst @@ -14,7 +14,6 @@ Tutorials multiannotator multilabel_classification token_classification - regression pred_probs_cross_val faq diff --git a/docs/source/tutorials/regression.ipynb b/docs/source/tutorials/regression.ipynb deleted file mode 100644 index cc02658e80..0000000000 --- a/docs/source/tutorials/regression.ipynb +++ /dev/null @@ -1,497 +0,0 @@ -{ - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Label Quality Scores for Regression with Noisy Labels " - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This quickstart tutorial shows how to use cleanlab for finding label errors in regression data. 
Using the approach mentioned here, you can find label error in any regression dataset irrespective of modality i.e., tabular, text, image etc. \n", - "\n", - "**This example will take you through following:**\n", - "- Generate label quality scores for each datapoint in the dataset. \n", - "- Find label issue for regression dataset. " - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Quickstart \n", - "\n", - "Cleanlab uses two inputs to generates scores for labels in the dataset:\n", - "- `labels`: NumPy array of given labels in the dataset. labels[i] should contain label for `i`-th datapoint. \n", - "- `predictions`: NumPy array of predictions generated through your favourite regressor. predictions[i] should contain predicted value for `i`-th datapoint. \n", - "\n", - "If you already have predictions from your regressor, you can generate label quality scores for each datapoint using the code below: \n", - "\n", - "
\n", - "\n", - "```python \n", - "\n", - "from cleanlab.regression.rank import get_label_quality_scores\n", - "label_quality_scores = get_label_quality_scores(labels, predictions)\n", - "\n", - "```\n", - "
\n", - "" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 0. Visualization (can skip these details)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This is added just for reference. We will use this function to plot dataset, highlight points using label quality scores and true_errors.\n", - "You can skip this part and move to next section. " - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "
See the code for visualization **(click to expand)** \n", - "\n", - "```python \n", - "# Note: this pulldown is for docs.cleanlab.ai, if running on local Jupyter or colab, please ignore it. \n", - "\n", - "def plot_data(\n", - " data_x, data_y, circles, title, alpha=0.6, color=\"#1f77b4\", colorbar=False, xlabel=\"\", ylabel=\"\"\n", - "):\n", - " plt.figure(figsize=(14, 5))\n", - " data_x = data_x.to_numpy()\n", - " data_y = data_y.to_numpy()\n", - " plt.scatter(data_x, data_y, c=color, s=30)\n", - " for i in circles:\n", - " plt.plot(\n", - " data_x[i],\n", - " data_y[i],\n", - " \"o\",\n", - " markerfacecolor=\"none\",\n", - " markeredgecolor=\"red\",\n", - " markersize=10,\n", - " markeredgewidth=2.5,\n", - " alpha=alpha,\n", - " )\n", - " plt.title(title, fontsize=20)\n", - " plt.xlabel(xlabel)\n", - " plt.ylabel(ylabel)\n", - "\n", - " if colorbar:\n", - " plt.colorbar(orientation=\"vertical\")\n", - "\n", - "```\n", - "
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def plot_data(\n", - " data_x, data_y, circles, title, alpha=0.6, color=\"#1f77b4\", colorbar=False, xlabel=\"\", ylabel=\"\"\n", - "):\n", - " plt.figure(figsize=(14, 5))\n", - " data_x = data_x.to_numpy()\n", - " data_y = data_y.to_numpy()\n", - " plt.scatter(data_x, data_y, c=color, s=30)\n", - " for i in circles:\n", - " plt.plot(\n", - " data_x[i],\n", - " data_y[i],\n", - " \"o\",\n", - " markerfacecolor=\"none\",\n", - " markeredgecolor=\"red\",\n", - " markersize=10,\n", - " markeredgewidth=2.5,\n", - " alpha=alpha,\n", - " )\n", - " plt.title(title, fontsize=20)\n", - " plt.xlabel(xlabel)\n", - " plt.ylabel(ylabel)\n", - "\n", - " if colorbar:\n", - " plt.colorbar(orientation=\"vertical\")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1. Install dependencies and import them \n", - "You can use `pip` to install all packages required for this tutorial as follows:\n", - "\n", - "`!pip install cleanlab xgboost`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!pip install cleanlab xgboost" - ] - }, - { - "cell_type": "code", - "execution_count": 94, - "metadata": { - "nbsphinx": "hidden" - }, - "outputs": [], - "source": [ - "# Package installation (hidden on docs website).\n", - "# Package versions we used: xgboost==1.7.2\n", - "\n", - "dependencies = [\"cleanlab\", \"xgboost\"]\n", - "\n", - "if \"google.colab\" in str(get_ipython()): # Check if it's running in Google Colab\n", - " %pip install cleanlab # for colab\n", - " cmd = \" \".join([dep for dep in dependencies if dep != \"cleanlab\"])\n", - " %pip install $cmd\n", - "else:\n", - " missing_dependencies = []\n", - " for dependency in dependencies:\n", - " try:\n", - " __import__(dependency)\n", - " except ImportError:\n", - " missing_dependencies.append(dependency)\n", 
- "\n", - " if len(missing_dependencies) > 0:\n", - " print(\"Missing required dependencies:\")\n", - " print(*missing_dependencies, sep=\", \")\n", - " print(\"\\nPlease install them before running the rest of this notebook.\")" - ] - }, - { - "cell_type": "code", - "execution_count": 95, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "from cleanlab.regression.rank import get_label_quality_scores\n", - "import xgboost as xgb\n", - "import matplotlib.pyplot as plt\n", - "\n", - "np.set_printoptions(suppress=True)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 2. Import dataset and Generate predictions" - ] - }, - { - "cell_type": "code", - "execution_count": 107, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
exam_1exam_2exam_3notestrue_labelslabels
0537793NaN76.276.2
1816480great participation +1085.585.5
2748897NaN87.487.4
3619478NaN77.777.7
4489091NaN77.877.8
\n", - "
" - ], - "text/plain": [ - " exam_1 exam_2 exam_3 notes true_labels labels\n", - "0 53 77 93 NaN 76.2 76.2\n", - "1 81 64 80 great participation +10 85.5 85.5\n", - "2 74 88 97 NaN 87.4 87.4\n", - "3 61 94 78 NaN 77.7 77.7\n", - "4 48 90 91 NaN 77.8 77.8" - ] - }, - "execution_count": 107, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "path = \"/Users/krmayank/Desktop/Work/cleanlab/experiments/student_score_regression.csv\"\n", - "data = pd.read_csv(path, index_col=0)\n", - "data.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 108, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# Generate true errors\n", - "true_errors = np.where(data.labels != data.true_labels)[0]\n", - "plot_data(\n", - " data_x=data[\"exam_3\"],\n", - " data_y=data[\"labels\"],\n", - " circles=true_errors,\n", - " title=\"Messy Regression dataset\",\n", - " xlabel=\"exam_3 feature\",\n", - " ylabel=\"label (Y value)\",\n", - ")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In the dataframe displayed above, `labels` represents the noisy labels and `true_labels` represents the ground truth. Please note that, ground truth are usually not available in real dataset, we have added it here for comparision and to demonstrate our method. `notes` also has text information, we will model this a categorical variable. \n", - "\n", - "We will use `xgboost` as regressor for this tutorial. xgboost provides easy to use interface to process categorical variable. This is demonstrated in the code below:" - ] - }, - { - "cell_type": "code", - "execution_count": 109, - "metadata": {}, - "outputs": [], - "source": [ - "# XGBOOST automatically factors categorical variable, you just need to mark the columns as category\n", - "data.notes = data.notes.astype(\"category\")\n", - "\n", - "# XGBOOST takes data and label seperately, so you will need to divide data accordingly.\n", - "X = data.drop([\"labels\", \"true_labels\"], axis=1)\n", - "y = data[\"labels\"]\n", - "\n", - "# convert data to format \"DMatrix\" to make it compatible with XGBOOST.\n", - "xgboost_data = xgb.DMatrix(data=X, label=y, enable_categorical=True)\n", - "\n", - "# declare parameters and train the model.\n", - "params = {\"booster\": \"gblinear\", \"objective\": \"reg:squarederror\"}\n", - "boost = xgb.train(params=params, dtrain=xgboost_data, num_boost_round=50)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 3. 
Using cleanlab to generate label quality scores" - ] - }, - { - "cell_type": "code", - "execution_count": 110, - "metadata": {}, - "outputs": [], - "source": [ - "# using trained xgboost model to get predictions\n", - "predictions = boost.predict(xgboost_data)\n", - "\n", - "# get label quality score for each example in the dataset using cleanlab\n", - "label_quality_scores = get_label_quality_scores(labels=np.array(y), predictions=predictions)" - ] - }, - { - "cell_type": "code", - "execution_count": 111, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "plot_data(\n", - " data_x=data[\"exam_3\"],\n", - " data_y=data[\"labels\"],\n", - " circles=true_errors,\n", - " color=label_quality_scores,\n", - " title=\"Messy Regression dataset with label quality scores\",\n", - " colorbar=True,\n", - " xlabel=\"exam_3 feature\",\n", - " ylabel=\"label (Y value)\",\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In the above plot, we have colored each datapoint considering its label quality score. \\\n", - "Datapoints in the plot are same as earlier plot in the notebook. \\\n", - "**Red circle** represents that these datapoint were incorrectly marked when compared to true_label. \n", - "\n", - "Low scores for datapoints marked in **Red circle** and High scores for other datapoints justifies that method can identify the errors in the dataset. " - ] - }, - { - "cell_type": "code", - "execution_count": 112, - "metadata": { - "nbsphinx": "hidden" - }, - "outputs": [], - "source": [ - "# Note: This cell is only for docs.cleanlab.ai, if running on local Jupyter or Colab, please ignore it.\n", - "from sklearn.metrics import roc_auc_score\n", - "\n", - "label_quality_scores = get_label_quality_scores(np.array(y), predictions=predictions)\n", - "\n", - "true_errors = (data.labels != data.true_labels).astype(int)\n", - "label_quality_scores_residual = get_label_quality_scores(\n", - " np.array(y), predictions=predictions, method=\"residual\"\n", - ")\n", - "\n", - "if roc_auc_score(true_errors, 1 - label_quality_scores) < 0.5:\n", - " raise ValueError(\"Label quality scores did not perform well enough\")\n", - "\n", - "if roc_auc_score(true_errors, 1 - label_quality_scores) <= roc_auc_score(\n", - " true_errors, 1 - label_quality_scores_residual\n", - "):\n", - " raise ValueError(\"Label quality scores did not outperform alternative scores\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - 
"metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3.10.8 ('ENV': venv)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.8" - }, - "orig_nbformat": 4, - "vscode": { - "interpreter": { - "hash": "1ed33b5e6ac3d9870092cd802185bba6fb7a8302b6022e7097221f18c33cb7b2" - } - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} From c05f1fe6fdf1b2e41e3db24b27e6a8141e06a2e3 Mon Sep 17 00:00:00 2001 From: krmayankb Date: Tue, 27 Dec 2022 22:45:55 -0800 Subject: [PATCH 027/258] support for array_like --- cleanlab/internal/regression_utils.py | 126 +++++++++++++------------- cleanlab/regression/rank.py | 15 +-- 2 files changed, 70 insertions(+), 71 deletions(-) diff --git a/cleanlab/internal/regression_utils.py b/cleanlab/internal/regression_utils.py index a7bfb4bbd1..4e396f2d55 100644 --- a/cleanlab/internal/regression_utils.py +++ b/cleanlab/internal/regression_utils.py @@ -3,84 +3,88 @@ """ import numpy as np -import pandas as pd -from pandas.api.types import is_numeric_dtype -from cleanlab.typing import LabelLike -from typing import Optional +from numpy.typing import ArrayLike +from typing import Tuple, Optional def assert_valid_inputs( - labels: Optional[LabelLike], - predictions: Optional[LabelLike], + labels: ArrayLike, + predictions: ArrayLike, method: str, -) -> None: +) -> Tuple[np.ndarray, np.ndarray]: """Checks that ``labels``, ``predictions``, ``method`` are correctly formatted.""" - supported_types = (list, np.ndarray, pd.Series, pd.DataFrame) + # Load array_like input as numpy array. If not raise error. 
+ try: + labels = np.asarray(labels) + except: + raise ValueError(f"labels must be array_like.") - # Check if labels and predictions are of supported types - if not isinstance(labels, supported_types) and not isinstance(predictions, supported_types): - raise TypeError( - f"Expected labels and predictions to be either of {supported_types}, Got labels of type {type(labels)}, and predictions of type {type(predictions)}", - ) + try: + predictions = np.asarray(predictions) + except: + raise ValueError(f"predictions must be array_like.") + + # Check if labels and predictions are 1-D and numeric + valid_labels = check_dimension_and_datatype(check_input=labels, text="labels") + valid_predictions = check_dimension_and_datatype(check_input=predictions, text="predictions") - # check if labels and predictions are 1-D and numeric - check_dimension_and_datatype(check_input=labels, text = "labels") - check_dimension_and_datatype(check_input=predictions, text = "predictions") + # Check if number of examples are same. + assert ( + valid_labels.shape == valid_predictions.shape + ), f"Number of examples in labels {labels.shape} and predictions {predictions.shape} are not same." - # check if number of examples are same. - assert len(labels) == len( - predictions - ), f"Length of labels {len(labels)} and predictions {len(predictions)} are not same." + # Check if inputs have missing values + check_missing_values(valid_labels, text="labels") + check_missing_values(valid_predictions, text="predictions") # Check if method is among allowed scoring method scoring_methods = ["residual", "outre"] if method not in scoring_methods: raise ValueError( - f"Passed method is not among allowed method. Expected either of {scoring_methods}, got {method}" + f"Passed method is not among allowed methods. Expected either of {scoring_methods}, got {method}." 
) + # return 1-D numpy array + return valid_labels, valid_predictions -def check_dimension_and_datatype(check_input: Optional[LabelLike], text : str): - # check if input is empty - if not len(check_input): - raise ValueError( - f"{text} is Empty, check input." - ) - if isinstance(check_input, list): - if isinstance(check_input[0], list): - raise ValueError(f"{text} must be 1-D. List of List is not supported.") - elif not all(isinstance(x, (int, float)) for x in check_input): - raise ValueError( - f"All element of {text} must be of type numeric i.e., integer or float" - ) - - elif isinstance(check_input, pd.DataFrame): - if check_input.shape[1] != 1: - raise ValueError( - f"{text} must be 1-D. For DataFrame, second dimension must be 1, got {check_input.shape}." - ) - elif check_input.shape[1] == 1: - if not is_numeric_dtype(check_input): - raise ValueError(f"{text} must be 1-D and numeric type. got {check_input.dtype}.") - elif isinstance(check_input, (np.ndarray, pd.Series)): - if len(check_input.shape) != 1: - raise ValueError(f"{text} must be 1-D {type(check_input)}, got {check_input.shape}") - elif len(check_input.shape) == 1: - if isinstance(check_input, pd.Series) and not is_numeric_dtype(check_input): - raise ValueError(f"{text} must be 1-D and numeric type. got {check_input.dtype}.") - elif isinstance(check_input, np.ndarray): - if not all(isinstance(x, (int, float)) for x in check_input.tolist()): - raise ValueError(f"{text} must be 1-d and numeric type i.e., integer or float.") - - -def check_dimensions(labels: np.ndarray, predictions: np.ndarray) -> None: - if labels.ndim != 1: - raise ValueError( - f"labels have dimensions {labels.ndim}, Expected 1-D array as input for labels" - ) - if predictions.ndim != 1: +def check_dimension_and_datatype(check_input: np.ndarray, text: str) -> np.ndarray: + """ + Raises errors related to: + 1. If input is empty + 2. If input is not 1-D + 3. 
If input is not numeric + + If all the checks are passed, it returns the squeezed 1-D array required by the main algorithm. + """ + + assert isinstance( + check_input, np.ndarray + ), f"{text} could not be converted to numpy array, check input." + + # Check if input is empty + if not check_input.size: + raise ValueError(f"{text} is Empty, check input.") + + # Remove axis with length one + check_input = np.squeeze(check_input) + + # Check if input is 1-D + if check_input.ndim != 1: raise ValueError( - f"predictions have dimensions {labels.ndim}, Expected 1-D array as input for predictions" + f"Expected 1-Dimensional inputs for {text}, got {check_input.ndim} dimensions." ) + + # Check if datatype is numeric + if not np.issubdtype(check_input.dtype, np.number): + raise ValueError(f"Expected {text} to be Numeric, got {check_input.dtype}.") + + return check_input + + +def check_missing_values(check_input: np.ndarray, text: str): + """Raise error if there are any missing values in Numpy array.""" + + if np.isnan(check_input).any(): + raise ValueError(f"{text} has missing values, check input.") diff --git a/cleanlab/regression/rank.py b/cleanlab/regression/rank.py index 6ebe33af64..cd6c474c93 100644 --- a/cleanlab/regression/rank.py +++ b/cleanlab/regression/rank.py @@ -1,9 +1,9 @@ import numpy as np from cleanlab.outlier import OutOfDistribution from sklearn.neighbors import NearestNeighbors -from cleanlab.internal.regression_utils import assert_valid_inputs, check_dimensions +from cleanlab.internal.regression_utils import assert_valid_inputs from typing import Dict, Callable, Optional -from cleanlab.typing import LabelLike +from numpy.typing import ArrayLike """ Generates label quality scores for every sample in regression dataset """ @@ -11,8 +11,8 @@ def get_label_quality_scores( - labels: Optional[LabelLike], - predictions: Optional[LabelLike], + labels: ArrayLike, + predictions: ArrayLike, *, method: str = "outre", ) -> np.ndarray: @@ -54,12 +54,7 @@ def 
get_label_quality_scores( """ # Check if inputs are valid - assert_valid_inputs(labels=labels, predictions=predictions, method=method) - - # Convert to numpy array and check if they are 1-D array. - labels = np.asarray(labels) - predictions = np.asarray(predictions) - check_dimensions(labels, predictions) + labels, predictions = assert_valid_inputs(labels=labels, predictions=predictions, method=method) scoring_funcs: Dict[str, Callable[[np.ndarray, np.ndarray], np.ndarray]] = { "residual": get_residual_score_for_each_label, From d6ac6424c7b5b9ab4ba011e17da60d8e0c72cd45 Mon Sep 17 00:00:00 2001 From: krmayankb Date: Tue, 27 Dec 2022 22:47:13 -0800 Subject: [PATCH 028/258] unit tests to factor array_like --- tests/test_regression.py | 55 +++++++++++++++++++++++++++++++++------- 1 file changed, 46 insertions(+), 9 deletions(-) diff --git a/tests/test_regression.py b/tests/test_regression.py index dc29315f96..355752dee3 100644 --- a/tests/test_regression.py +++ b/tests/test_regression.py @@ -1,13 +1,25 @@ import numpy as np -import pandas as pd + +# import pandas as pd import pytest +from typing import Union, Sequence from cleanlab.regression import rank +ArrayLike = Union[np.ndarray, Sequence] + # To be used for all the tests labels = np.array([1, 2, 3, 4]) predictions = np.array([1, 3, 4, 5]) +# Inputs that are not array like +aConstant = 1 +aString = "predictions_non_array" +aDict = {"labels": [1, 2], "predictions": [2, 3]} +aSet = {1, 2, 3, 4} +aBool = True + + # test with deafault parameters def test_output_shape_type(): scores = rank.get_label_quality_scores(labels=labels, predictions=predictions) @@ -15,18 +27,43 @@ def test_output_shape_type(): assert isinstance(scores, np.ndarray) -# test for acceptable datatypes -@pytest.mark.parametrize("format", [pd.Series, pd.DataFrame, list]) -def test_type_error_for_input_types(format): - with pytest.raises(TypeError) as error: - _ = rank.get_label_quality_scores(labels=format(labels), predictions=format(predictions)) 
+@pytest.mark.parametrize( + "aInput", + [aConstant, aString, aDict, aSet, aBool], +) +def test_labels_are_arraylike(aInput): + with pytest.raises(ValueError) as error: + rank.get_label_quality_scores(labels=aInput, predictions=predictions) + assert error.type == ValueError + + +@pytest.mark.parametrize( + "aInput", + [aConstant, aString, aDict, aSet, aBool], +) +def test_predictionns_are_arraylike(aInput): + with pytest.raises(ValueError) as error: + rank.get_label_quality_scores(labels=labels, predictions=aInput) + assert error.type == ValueError # test for input shapes -def test_assertion_error_for_input_shape(): +def test_input_shape_labels(): + with pytest.raises(AssertionError) as error: + rank.get_label_quality_scores(labels=labels[:-1], predictions=predictions) + assert ( + str(error.value) + == f"Number of examples in labels {labels[:-1].shape} and predictions {predictions.shape} are not same." + ) + + +def test_input_shape_predictions(): with pytest.raises(AssertionError) as error: - _ = rank.get_label_quality_scores(labels=labels[:-1], predictions=predictions) - _ = rank.get_label_quality_scores(labels=labels, predictions=predictions[:-1]) + rank.get_label_quality_scores(labels=labels, predictions=predictions[:-1]) + assert ( + str(error.value) + == f"Number of examples in labels {labels.shape} and predictions {predictions[:-1].shape} are not same." 
+ ) # test individual scoring functions From 2b327c30a3fd78a1c81b803bacadea42183e0069 Mon Sep 17 00:00:00 2001 From: Mayank Kumar Date: Tue, 27 Dec 2022 23:58:51 -0800 Subject: [PATCH 029/258] Update docs/source/tutorials/index.rst Co-authored-by: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> --- docs/source/tutorials/index.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/source/tutorials/index.rst b/docs/source/tutorials/index.rst index d040963629..e0d63a7e06 100644 --- a/docs/source/tutorials/index.rst +++ b/docs/source/tutorials/index.rst @@ -16,4 +16,3 @@ Tutorials token_classification pred_probs_cross_val faq - From 4283a671c01a2753fcc6f299df69d2dd05666848 Mon Sep 17 00:00:00 2001 From: krmayankb Date: Tue, 11 Oct 2022 11:45:37 -0700 Subject: [PATCH 030/258] added basic regression ranking --- cleanlab/regression/__init__.py | 1 + cleanlab/regression/rank.py | 24 ++++++++++++++++++++++++ 2 files changed, 25 insertions(+) create mode 100644 cleanlab/regression/__init__.py create mode 100644 cleanlab/regression/rank.py diff --git a/cleanlab/regression/__init__.py b/cleanlab/regression/__init__.py new file mode 100644 index 0000000000..77e9b5a97b --- /dev/null +++ b/cleanlab/regression/__init__.py @@ -0,0 +1 @@ +from . import rank \ No newline at end of file diff --git a/cleanlab/regression/rank.py b/cleanlab/regression/rank.py new file mode 100644 index 0000000000..2cd91694c3 --- /dev/null +++ b/cleanlab/regression/rank.py @@ -0,0 +1,24 @@ +import numpy as np + +def get_label_quality_score( + true_labels: np.ndarray, + pred_labels: np.ndarray +)-> np.ndarray: + """ + Returns label quality score + + Score is continous value in range [0,1] + + 1 - clean label (given label is likely correct). + 0 - dirty label (given label is likely incorrect). 
+ """ + residual = true_labels - pred_labels + quality_scores = np.exp(-abs(residual)) + return quality_scores + + +if __name__ == "__main__": +## WILL BE DELETED LATER + a = np.array([1,2,3,4]) + b = np.array([2,2,5,4.1]) + print(get_label_quality_score(a,b)) From 53455bff8410e06cdd5afccf92c69ad921507ff9 Mon Sep 17 00:00:00 2001 From: krmayankb Date: Fri, 4 Nov 2022 14:17:40 -0700 Subject: [PATCH 031/258] minor fixes, docstring modified --- cleanlab/regression/__init__.py | 2 +- cleanlab/regression/rank.py | 56 +++++++++++++++++++++++---------- 2 files changed, 40 insertions(+), 18 deletions(-) diff --git a/cleanlab/regression/__init__.py b/cleanlab/regression/__init__.py index 77e9b5a97b..aab0b677cf 100644 --- a/cleanlab/regression/__init__.py +++ b/cleanlab/regression/__init__.py @@ -1 +1 @@ -from . import rank \ No newline at end of file +from . import rank diff --git a/cleanlab/regression/rank.py b/cleanlab/regression/rank.py index 2cd91694c3..2b4520b1b7 100644 --- a/cleanlab/regression/rank.py +++ b/cleanlab/regression/rank.py @@ -1,24 +1,46 @@ -import numpy as np +import numpy as np -def get_label_quality_score( - true_labels: np.ndarray, - pred_labels: np.ndarray -)-> np.ndarray: + +def get_label_quality_scores(labels: np.ndarray, pred_labels: np.ndarray) -> np.ndarray: """ - Returns label quality score - - Score is continous value in range [0,1] - + Returns label quality score for each example in the regression dataset. + + Each score is continous value in range [0,1] 1 - clean label (given label is likely correct). 0 - dirty label (given label is likely incorrect). + + Parameters + ---------- + labels: + Raw labels from original dataset. + Array of shape ``(N, )`` consisting given labels, where N is number of datapoints in the regression dataset. + + pred_labels: + Predicated labels from regressor fitted on the dataset. + Array of shape ``(N,)`` consisting predicted labels, where N is number of datapoints in the regression dataset. 
+ + Returns + ------- + label_quality_scores: + Array of shape ``(N, )`` of scores between 0 and 1, one per datapoint in the dataset. + + Lower scores indicate datapoint more likely to contain a label issue. + + Examples + -------- + >>> import numpy as np + >>> from cleanlab.regression.rank import get_label_quality_scores + >>> labels = np.array([1,2,3,4]) + >>> pred_labels = np.array([2,2,5,4.1]) + >>> label_quality_scores = get_label_quality_scores(labels, pred_labels) + >>> label_quality_scores + array([0.36787944, 1. , 0.13533528, 0.90483742]) """ - residual = true_labels - pred_labels - quality_scores = np.exp(-abs(residual)) - return quality_scores + assert ( + labels.shape == pred_labels.shape + ), f"shape of label {labels.shape} and predicted labels {pred_labels.shape} are not same." -if __name__ == "__main__": -## WILL BE DELETED LATER - a = np.array([1,2,3,4]) - b = np.array([2,2,5,4.1]) - print(get_label_quality_score(a,b)) + residual = pred_labels - labels + quality_scores = np.exp(-abs(residual)) + return quality_scores From 987ae0eefe29d5f7502c4e27a77e29e8c0a6eea2 Mon Sep 17 00:00:00 2001 From: krmayankb Date: Wed, 9 Nov 2022 17:25:35 -0800 Subject: [PATCH 032/258] tutorial added, added to docs index pages --- cleanlab/__init__.py | 1 + cleanlab/regression/rank.py | 2 + docs/source/cleanlab/regression.rst | 8 + docs/source/tutorials/index.rst | 1 + docs/source/tutorials/regression.ipynb | 338 +++++++++++++++++++++++++ 5 files changed, 350 insertions(+) create mode 100644 docs/source/cleanlab/regression.rst create mode 100644 docs/source/tutorials/regression.ipynb diff --git a/cleanlab/__init__.py b/cleanlab/__init__.py index 5746a49a21..753e47e82a 100644 --- a/cleanlab/__init__.py +++ b/cleanlab/__init__.py @@ -9,3 +9,4 @@ from . import outlier from . import token_classification from . import multilabel_classification +from . 
import regression diff --git a/cleanlab/regression/rank.py b/cleanlab/regression/rank.py index 2b4520b1b7..2fdde78299 100644 --- a/cleanlab/regression/rank.py +++ b/cleanlab/regression/rank.py @@ -1,5 +1,7 @@ import numpy as np +""" generate label quality score for regression dataset""" + def get_label_quality_scores(labels: np.ndarray, pred_labels: np.ndarray) -> np.ndarray: """ diff --git a/docs/source/cleanlab/regression.rst b/docs/source/cleanlab/regression.rst new file mode 100644 index 0000000000..1cae31915a --- /dev/null +++ b/docs/source/cleanlab/regression.rst @@ -0,0 +1,8 @@ +regression +============== + +.. automodule:: cleanlab.regression + :autosummary: + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/tutorials/index.rst b/docs/source/tutorials/index.rst index e0d63a7e06..d0f89d7489 100644 --- a/docs/source/tutorials/index.rst +++ b/docs/source/tutorials/index.rst @@ -16,3 +16,4 @@ Tutorials token_classification pred_probs_cross_val faq + regression diff --git a/docs/source/tutorials/regression.ipynb b/docs/source/tutorials/regression.ipynb new file mode 100644 index 0000000000..46f3b5ee38 --- /dev/null +++ b/docs/source/tutorials/regression.ipynb @@ -0,0 +1,338 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Label Quality Scores for Regression with Noisy Labels\n", + "In this tutorial, you will learn how to use cleanlab on regression dataset to: \n", + "- find label issue in your regression dataset\n", + "- generate label quality scores for each example in the dataset. 
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Install dependencies and import them \n", + "You can use pip to install all packages required for this tutorial as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install cleanlab\n", + "!pip install scikit-learn" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np \n", + "import pandas as pd \n", + "from cleanlab.regression.rank import get_label_quality_scores\n", + "from sklearn.linear_model import LinearRegression\n", + "import matplotlib.pyplot as plt " + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "def make_data(feature_size = (20, 3), \n", + " means = [8, 20, -10], \n", + " stds = [2, 5, 3], \n", + " bias = 0.8,\n", + " coeff = [2, 0.1, 0.5],\n", + " error = [-2, 0, 2], \n", + " prob_error = [0.2, 0.6, 0.2], \n", + " seed = 42\n", + ") -> pd.DataFrame:\n", + " \"\"\"\n", + " feature_size: Tuple of (datapoints, features)\n", + " \"\"\"\n", + " assert (len(means) == feature_size[1]), (f\"length of mean {len(means)} is not same as features requested{feature_size[0]}\")\n", + " assert (len(stds) == feature_size[1]), (f\"length of stds {len(stds)} is not same as features requested{feature_size[0]}\")\n", + " np.random.seed(seed) \n", + "\n", + " features = []\n", + " for i in range(feature_size[1]):\n", + " values = coeff[i] * np.random.normal(loc=means[i], scale=stds[i], size=feature_size[0])\n", + " features.append(values)\n", + " \n", + " true_labels = sum(map(np.array, features))+ bias\n", + " labels = true_labels + np.random.choice(error, feature_size[0], p=prob_error)\n", + " \n", + " data_dict = {\n", + " \"lables\" : labels, # You have these labels, which have some errors.\n", + " \"true_labels\" : true_labels, # You never get to see these perfect 
labels.\n", + " } \n", + " for idx, feature in enumerate(features): # adding names to each features \n", + " data_dict[\"feature_\"+str(idx+1)] = feature\n", + " data = pd.DataFrame.from_dict(data_dict)\n", + " col = list(data.columns)\n", + " new_col = col[2:] + col[:2]\n", + " data = data.reindex(columns=new_col)\n", + " return data\n", + "\n", + "def plot_data(data, \n", + " circles, \n", + " title, \n", + " alpha=0.6, \n", + " color = '#1f77b4', \n", + " xlabel = \"feature\", \n", + " colorbar = False):\n", + " \n", + " plt.figure(figsize=(14, 5))\n", + " plt.xlabel(xlabel, size=13)\n", + " plt.ylabel('label',size=13)\n", + " data = data.to_numpy()\n", + " plt.scatter(data[:,0], data[:,1], c = color, s=60)\n", + " for i in circles:\n", + " plt.plot(\n", + " data[i][0],\n", + " data[i][1],\n", + " \"o\",\n", + " markerfacecolor=\"none\",\n", + " markeredgecolor=\"red\",\n", + " markersize=14,\n", + " markeredgewidth=2.5,\n", + " alpha=alpha\n", + " )\n", + " plt.title(title, fontsize=20)\n", + " \n", + " if colorbar: plt.colorbar(orientation = 'vertical')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "data = make_data(feature_size=(100,3))\n", + "true_errors = np.where(data['true_labels'] != data['lables'])[0]\n", + "plot_data(data[['feature_1','lables']], circles=true_errors, title=\"Messy Regression dataset\", xlabel=\"feature_1\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The figure above represents a toy dataset we'll use to demostrate label scoring for regression dataset. In this example, lables are ploted w.r.t. one of the features of the dataset. \n", + "\n", + "Like many real-world datasets, the given label happen to be incorrect for some of the examples(**circled in red**) in this dataset. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Using cleanlab to generate label quality scores" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
feature_1feature_2feature_3lablestrue_labels
017.9868571.292315-4.46331915.61585215.615852
115.4469431.789677-4.15882311.87779713.877797
218.5907541.828643-3.37542317.84397417.843974
322.0921191.598861-3.41929723.07168421.071684
415.0633871.919357-7.06650412.71624010.716240
\n", + "
" + ], + "text/plain": [ + " feature_1 feature_2 feature_3 lables true_labels\n", + "0 17.986857 1.292315 -4.463319 15.615852 15.615852\n", + "1 15.446943 1.789677 -4.158823 11.877797 13.877797\n", + "2 18.590754 1.828643 -3.375423 17.843974 17.843974\n", + "3 22.092119 1.598861 -3.419297 23.071684 21.071684\n", + "4 15.063387 1.919357 -7.066504 12.716240 10.716240" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# start with checking the dataset generated\n", + "data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# Note that \"true_labels\" will not be available in real-life dataset. We have added here only for comparision.\n", + "X = data[['feature_'+str(i+1) for i in range(3)]]\n", + "y = data['lables']\n", + "\n", + "# initialize your favourite model and generate predictions \n", + "yourFavouriteModel = LinearRegression()\n", + "yourFavouriteModel = yourFavouriteModel.fit(X,y)\n", + "predictions = yourFavouriteModel.predict(X)\n", + "\n", + "# get label quality score for each example in the dataset \n", + "label_quality = get_label_quality_scores(labels=np.array(data['lables']), pred_labels=predictions)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plot_data(\n", + " data[['feature_1','lables']], \n", + " circles=true_errors ,\n", + " color=label_quality, \n", + " title=\"Messy Regression dataset with label quality scores\", \n", + " colorbar=True, \n", + " xlabel = \"feature_1\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the above plot, we have colored each datapoint considering its label quality score. \\\n", + "Datapoints in the plot are same as earlier plot in the notebook. **Red circle** represents that these datapoint were incorrectly marked when compared to true_label. \n", + "\n", + "Low scores for datapoints marked in **Red circle** and High scores for other datapoints justifies that method can identify the errors in the dataset. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.10.8 ('ENV': venv)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.8" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "1ed33b5e6ac3d9870092cd802185bba6fb7a8302b6022e7097221f18c33cb7b2" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 7f9372bc1139079fedea6d8b4213ce690dc176d2 Mon Sep 17 00:00:00 2001 From: krmayankb Date: Thu, 10 Nov 2022 12:44:24 -0800 Subject: [PATCH 033/258] unit tests added --- cleanlab/regression/rank.py | 5 ++++- tests/test_regression.py | 27 +++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) create mode 100644 tests/test_regression.py diff --git a/cleanlab/regression/rank.py b/cleanlab/regression/rank.py index 2fdde78299..54c38577f1 100644 --- a/cleanlab/regression/rank.py +++ 
b/cleanlab/regression/rank.py @@ -1,6 +1,6 @@ import numpy as np -""" generate label quality score for regression dataset""" +""" generate label quality score for regression dataset """ def get_label_quality_scores(labels: np.ndarray, pred_labels: np.ndarray) -> np.ndarray: @@ -39,6 +39,9 @@ def get_label_quality_scores(labels: np.ndarray, pred_labels: np.ndarray) -> np. array([0.36787944, 1. , 0.13533528, 0.90483742]) """ + if not isinstance(labels, np.ndarray) or not isinstance(pred_labels, np.ndarray): + raise TypeError("labels and pred_labels must be of type np.ndarray") + assert ( labels.shape == pred_labels.shape ), f"shape of label {labels.shape} and predicted labels {pred_labels.shape} are not same." diff --git a/tests/test_regression.py b/tests/test_regression.py new file mode 100644 index 0000000000..71de96cf3c --- /dev/null +++ b/tests/test_regression.py @@ -0,0 +1,27 @@ +import numpy as np +import pandas as pd +import pytest + +from cleanlab.regression.rank import get_label_quality_scores + +# To be used for all the tests +labels = np.array([1, 2, 3, 4]) +pred_labels = np.array([1, 3, 4, 5]) + + +def test_output_shape_type(): + scores = get_label_quality_scores(labels=labels, pred_labels=pred_labels) + assert labels.shape == scores.shape + assert isinstance(scores, np.ndarray) + + +@pytest.mark.parametrize("format", [pd.Series, pd.DataFrame, list]) +def test_type_error_for_input_types(format): + with pytest.raises(TypeError) as error: + _ = get_label_quality_scores(labels=format(labels), pred_labels=format(pred_labels)) + + +def test_assertion_error_for_input_shape(): + with pytest.raises(AssertionError) as error: + _ = get_label_quality_scores(labels=labels[:-1], pred_labels=pred_labels) + _ = get_label_quality_scores(labels=labels, pred_labels=pred_labels[:-1]) From 581c1f05263b25be10cf8509ae613b1e39b138c3 Mon Sep 17 00:00:00 2001 From: krmayankb Date: Thu, 10 Nov 2022 12:47:49 -0800 Subject: [PATCH 034/258] reindexed tutorial, punctuation fix for 
docstring --- cleanlab/regression/rank.py | 2 +- docs/source/tutorials/index.rst | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/cleanlab/regression/rank.py b/cleanlab/regression/rank.py index 54c38577f1..6f8ad48323 100644 --- a/cleanlab/regression/rank.py +++ b/cleanlab/regression/rank.py @@ -1,6 +1,6 @@ import numpy as np -""" generate label quality score for regression dataset """ +""" Generates label quality scores for every sample in regression dataset """ def get_label_quality_scores(labels: np.ndarray, pred_labels: np.ndarray) -> np.ndarray: diff --git a/docs/source/tutorials/index.rst b/docs/source/tutorials/index.rst index d0f89d7489..78e4efac5f 100644 --- a/docs/source/tutorials/index.rst +++ b/docs/source/tutorials/index.rst @@ -14,6 +14,7 @@ Tutorials multiannotator multilabel_classification token_classification + regression pred_probs_cross_val faq - regression + From 13ab45e381a87e08732ab657914bede36ea127e8 Mon Sep 17 00:00:00 2001 From: krmayankb Date: Tue, 15 Nov 2022 15:51:04 -0800 Subject: [PATCH 035/258] plots changed in tutorial notebook --- docs/source/tutorials/regression.ipynb | 99 ++++++++++++-------------- 1 file changed, 47 insertions(+), 52 deletions(-) diff --git a/docs/source/tutorials/regression.ipynb b/docs/source/tutorials/regression.ipynb index 46f3b5ee38..92bc78ff13 100644 --- a/docs/source/tutorials/regression.ipynb +++ b/docs/source/tutorials/regression.ipynb @@ -30,7 +30,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -43,15 +43,15 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ - "def make_data(feature_size = (20, 3), \n", - " means = [8, 20, -10], \n", - " stds = [2, 5, 3], \n", + "def make_data(feature_size = (20, 2), \n", + " means = [8, -10], \n", + " stds = [2, 5], \n", " bias = 0.8,\n", - " coeff = [2, 0.1, 0.5],\n", + " coeff = [2, 0.1],\n", " error = 
[-2, 0, 2], \n", " prob_error = [0.2, 0.6, 0.2], \n", " seed = 42\n", @@ -88,12 +88,9 @@ " title, \n", " alpha=0.6, \n", " color = '#1f77b4', \n", - " xlabel = \"feature\", \n", " colorbar = False):\n", " \n", " plt.figure(figsize=(14, 5))\n", - " plt.xlabel(xlabel, size=13)\n", - " plt.ylabel('label',size=13)\n", " data = data.to_numpy()\n", " plt.scatter(data[:,0], data[:,1], c = color, s=60)\n", " for i in circles:\n", @@ -114,14 +111,14 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 10, "metadata": {}, "outputs": [ { "data": { - "image/png": "", + "image/png": "", "text/plain": [ - "
" + "
" ] }, "metadata": {}, @@ -129,16 +126,20 @@ } ], "source": [ - "data = make_data(feature_size=(100,3))\n", + "data = make_data(feature_size=(100, 2))\n", "true_errors = np.where(data['true_labels'] != data['lables'])[0]\n", - "plot_data(data[['feature_1','lables']], circles=true_errors, title=\"Messy Regression dataset\", xlabel=\"feature_1\")" + "plot_data(data[['feature_1','feature_2']], \n", + " circles=true_errors, \n", + " color=data['lables'], \n", + " colorbar=True, \n", + " title=\"Messy Regression dataset\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "The figure above represents a toy dataset we'll use to demostrate label scoring for regression dataset. In this example, lables are ploted w.r.t. one of the features of the dataset. \n", + "The figure above represents a toy dataset we'll use to demonstrate label scoring for regression dataset. In this example, datapoints are ploted on 2-D space (in this case feature_1 vs feature_2). Each datapoint is colored based on given label. \n", "\n", "Like many real-world datasets, the given label happen to be incorrect for some of the examples(**circled in red**) in this dataset. 
" ] @@ -152,7 +153,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -178,7 +179,6 @@ " \n", " feature_1\n", " feature_2\n", - " feature_3\n", " lables\n", " true_labels\n", " \n", @@ -187,57 +187,52 @@ " \n", " 0\n", " 17.986857\n", - " 1.292315\n", - " -4.463319\n", - " 15.615852\n", - " 15.615852\n", + " -1.707685\n", + " 19.079171\n", + " 17.079171\n", " \n", " \n", " 1\n", " 15.446943\n", - " 1.789677\n", - " -4.158823\n", - " 11.877797\n", - " 13.877797\n", + " -1.210323\n", + " 15.036620\n", + " 15.036620\n", " \n", " \n", " 2\n", " 18.590754\n", - " 1.828643\n", - " -3.375423\n", - " 17.843974\n", - " 17.843974\n", + " -1.171357\n", + " 18.219397\n", + " 18.219397\n", " \n", " \n", " 3\n", " 22.092119\n", - " 1.598861\n", - " -3.419297\n", - " 23.071684\n", - " 21.071684\n", + " -1.401139\n", + " 21.490981\n", + " 21.490981\n", " \n", " \n", " 4\n", " 15.063387\n", - " 1.919357\n", - " -7.066504\n", - " 12.716240\n", - " 10.716240\n", + " -1.080643\n", + " 14.782744\n", + " 14.782744\n", " \n", " \n", "\n", "" ], "text/plain": [ - " feature_1 feature_2 feature_3 lables true_labels\n", - "0 17.986857 1.292315 -4.463319 15.615852 15.615852\n", - "1 15.446943 1.789677 -4.158823 11.877797 13.877797\n", - "2 18.590754 1.828643 -3.375423 17.843974 17.843974\n", - "3 22.092119 1.598861 -3.419297 23.071684 21.071684\n", - "4 15.063387 1.919357 -7.066504 12.716240 10.716240" + " feature_1 feature_2 lables true_labels\n", + "0 17.986857 -1.707685 19.079171 17.079171\n", + "1 15.446943 -1.210323 15.036620 15.036620\n", + "2 18.590754 -1.171357 18.219397 18.219397\n", + "3 22.092119 -1.401139 21.490981 21.490981\n", + "4 15.063387 -1.080643 14.782744 14.782744" ] }, - "execution_count": 5, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -249,12 +244,12 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "# 
Note that \"true_labels\" will not be available in real-life dataset. We have added here only for comparision.\n", - "X = data[['feature_'+str(i+1) for i in range(3)]]\n", + "X = data[['feature_'+str(i+1) for i in range(2)]]\n", "y = data['lables']\n", "\n", "# initialize your favourite model and generate predictions \n", @@ -268,12 +263,12 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 17, "metadata": {}, "outputs": [ { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] @@ -284,12 +279,11 @@ ], "source": [ "plot_data(\n", - " data[['feature_1','lables']], \n", + " data[['feature_1','feature_2']], \n", " circles=true_errors ,\n", " color=label_quality, \n", " title=\"Messy Regression dataset with label quality scores\", \n", - " colorbar=True, \n", - " xlabel = \"feature_1\")" + " colorbar=True)" ] }, { @@ -297,7 +291,8 @@ "metadata": {}, "source": [ "In the above plot, we have colored each datapoint considering its label quality score. \\\n", - "Datapoints in the plot are same as earlier plot in the notebook. **Red circle** represents that these datapoint were incorrectly marked when compared to true_label. \n", + "Datapoints in the plot are same as earlier plot in the notebook. \\\n", + "**Red circle** represents that these datapoint were incorrectly marked when compared to true_label. \n", "\n", "Low scores for datapoints marked in **Red circle** and High scores for other datapoints justifies that method can identify the errors in the dataset. " ] From 0eac77616732ceefd2d4e66c4f30a97d57459b43 Mon Sep 17 00:00:00 2001 From: krmayankb Date: Wed, 7 Dec 2022 22:30:18 -0700 Subject: [PATCH 036/258] typo fix --- docs/source/tutorials/regression.ipynb | 132 ++++++++++++++++++++++--- 1 file changed, 118 insertions(+), 14 deletions(-) diff --git a/docs/source/tutorials/regression.ipynb b/docs/source/tutorials/regression.ipynb index 92bc78ff13..b4770e47d1 100644 --- a/docs/source/tutorials/regression.ipynb +++ b/docs/source/tutorials/regression.ipynb @@ -30,7 +30,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -43,7 +43,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ @@ -72,7 +72,7 @@ " labels = true_labels + np.random.choice(error, feature_size[0], p=prob_error)\n", " \n", " data_dict = {\n", - " \"lables\" : labels, # You have these labels, which have some errors.\n", + " \"labels\" 
: labels, # You have these labels, which have some errors.\n", " \"true_labels\" : true_labels, # You never get to see these perfect labels.\n", " } \n", " for idx, feature in enumerate(features): # adding names to each features \n", @@ -111,7 +111,111 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
exam_1exam_2exam_3bonus_or_penaltytrue_labelslabels
0537793076.256.2
18164801085.565.5
2748897087.467.4
3619478077.757.7
4489091077.877.8
\n", + "
" + ], + "text/plain": [ + " exam_1 exam_2 exam_3 bonus_or_penalty true_labels labels\n", + "0 53 77 93 0 76.2 56.2\n", + "1 81 64 80 10 85.5 65.5\n", + "2 74 88 97 0 87.4 67.4\n", + "3 61 94 78 0 77.7 57.7\n", + "4 48 90 91 0 77.8 77.8" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "path = \"/Users/krmayank/Desktop/Work/cleanlab/experiments/student_score_regression.csv\"\n", + "data = pd.read_csv(path, index_col=0)\n", + "data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -127,10 +231,10 @@ ], "source": [ "data = make_data(feature_size=(100, 2))\n", - "true_errors = np.where(data['true_labels'] != data['lables'])[0]\n", + "true_errors = np.where(data['true_labels'] != data['labels'])[0]\n", "plot_data(data[['feature_1','feature_2']], \n", " circles=true_errors, \n", - " color=data['lables'], \n", + " color=data['labels'], \n", " colorbar=True, \n", " title=\"Messy Regression dataset\")" ] @@ -153,7 +257,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 24, "metadata": {}, "outputs": [ { @@ -179,7 +283,7 @@ " \n", " feature_1\n", " feature_2\n", - " lables\n", + " labels\n", " true_labels\n", " \n", " \n", @@ -224,7 +328,7 @@ "" ], "text/plain": [ - " feature_1 feature_2 lables true_labels\n", + " feature_1 feature_2 labels true_labels\n", "0 17.986857 -1.707685 19.079171 17.079171\n", "1 15.446943 -1.210323 15.036620 15.036620\n", "2 18.590754 -1.171357 18.219397 18.219397\n", @@ -232,7 +336,7 @@ "4 15.063387 -1.080643 14.782744 14.782744" ] }, - "execution_count": 11, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -244,13 +348,13 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "# Note that \"true_labels\" will not be available in real-life dataset. 
We have added here only for comparision.\n", "X = data[['feature_'+str(i+1) for i in range(2)]]\n", - "y = data['lables']\n", + "y = data['labels']\n", "\n", "# initialize your favourite model and generate predictions \n", "yourFavouriteModel = LinearRegression()\n", @@ -258,12 +362,12 @@ "predictions = yourFavouriteModel.predict(X)\n", "\n", "# get label quality score for each example in the dataset \n", - "label_quality = get_label_quality_scores(labels=np.array(data['lables']), pred_labels=predictions)" + "label_quality = get_label_quality_scores(labels=np.array(data['labels']), pred_labels=predictions)" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 26, "metadata": {}, "outputs": [ { From 1a65c9a209b9e66d6032d80a28a666460cb82f57 Mon Sep 17 00:00:00 2001 From: krmayankb Date: Thu, 8 Dec 2022 23:03:45 -0700 Subject: [PATCH 037/258] cleanlab outlier based scoring method added --- cleanlab/regression/rank.py | 116 ++++++++++++++++++++++++++++++++++-- 1 file changed, 111 insertions(+), 5 deletions(-) diff --git a/cleanlab/regression/rank.py b/cleanlab/regression/rank.py index 6f8ad48323..dcd9460c64 100644 --- a/cleanlab/regression/rank.py +++ b/cleanlab/regression/rank.py @@ -1,9 +1,16 @@ import numpy as np +from cleanlab.outlier import OutOfDistribution +from sklearn.neighbors import NearestNeighbors """ Generates label quality scores for every sample in regression dataset """ -def get_label_quality_scores(labels: np.ndarray, pred_labels: np.ndarray) -> np.ndarray: +def get_label_quality_scores( + labels: np.ndarray, + pred_labels: np.ndarray, + *, + method: str = "residual", +) -> np.ndarray: """ Returns label quality score for each example in the regression dataset. @@ -13,14 +20,16 @@ def get_label_quality_scores(labels: np.ndarray, pred_labels: np.ndarray) -> np. Parameters ---------- - labels: + labels : np.ndarray Raw labels from original dataset. 
Array of shape ``(N, )`` consisting given labels, where N is number of datapoints in the regression dataset. - pred_labels: + pred_labels : np.ndarray Predicated labels from regressor fitted on the dataset. Array of shape ``(N,)`` consisting predicted labels, where N is number of datapoints in the regression dataset. + method : {"residual", "TO_BE_NAMED"}, default="residual" #TODO - update name once finalised + Returns ------- label_quality_scores: @@ -39,6 +48,7 @@ def get_label_quality_scores(labels: np.ndarray, pred_labels: np.ndarray) -> np. array([0.36787944, 1. , 0.13533528, 0.90483742]) """ + # TODO - add error trigger function in utils. if not isinstance(labels, np.ndarray) or not isinstance(pred_labels, np.ndarray): raise TypeError("labels and pred_labels must be of type np.ndarray") @@ -46,6 +56,102 @@ def get_label_quality_scores(labels: np.ndarray, pred_labels: np.ndarray) -> np. labels.shape == pred_labels.shape ), f"shape of label {labels.shape} and predicted labels {pred_labels.shape} are not same." + scoring_funcs = { + "residual": get_residual_score_for_each_label, + "TO_BE_NAMED": get_score_to_named_for_each_label, # TODO - update name once finalised + } + + # TODO - update name once finalised + try: + scoring_func = scoring_funcs[method] + except KeyError: + raise ValueError( + f""" + {method} is not a valid scoring method. + Please choose a valid scoring technique: residual, TO_BE_NAMED. + """ + ) + + # Calculate scores + label_quality_score = scoring_func(labels, pred_labels) + return label_quality_score + + +def get_residual_score_for_each_label( + labels: np.ndarray, + pred_labels: np.ndarray, +) -> np.ndarray: + """Returns the residual based label-quality scores for each datapoints. + + This is function to compute label-quality scores for regression datasets, + where lower score indicate labels less likely to be correct. + + Residual based scores can work better for datasets where independent variables + are based out of normal distribution. 
+ + Parameters + ---------- + labels: np.ndarray + Labels in the same format expected by the :py:func:`get_label_quality_scores ` function. + + pred_labels: np.ndarray + Predicted labels in the same format expected by the :py:func:`get_label_quality_scores ` function. + + Returns + ------- + label_quality_scores: np.ndarray + Contains one score (between 0 and 1) per example. + Lower scores indicate more likely mislabled examples. + + """ residual = pred_labels - labels - quality_scores = np.exp(-abs(residual)) - return quality_scores + label_quality_scores = np.exp(-abs(residual)) + return label_quality_scores + + +# TODO - change name of the function +def get_score_to_named_for_each_label( + label: np.ndarray, + pred_labels: np.ndarray, + *, + variance: float = 10, +) -> np.ndarray: + """Returns label-quality scores. + + This is function to compute label-quality scores for regression datasets, + where lower score indicate labels less likely to be correct. + + Parameters + ---------- + labels: np.ndarray + Labels in the same format expected by the :py:func:`get_label_quality_scores ` function. + + pred_labels: np.ndarray + Predicted labels in the same format expected by the :py:func:`get_label_quality_scores ` function. + + variance: float, default = 10 + Manipulates variance of the distribution of residual. + + Returns + ------- + label_quality_scores: np.ndarray + Contains one score (between 0 and 1) per example. + Lower scores indicate more likely mislabled examples. 
+ """ + + neighbors = int(np.ceil(0.1 * label.shape[0])) + print(neighbors) + knn = NearestNeighbors(n_neighbors=neighbors, metric="euclidean") + + residual = pred_labels - label + + label = (label - label.mean()) / label.std() + residual = np.sqrt(variance) * ((residual - residual.mean()) / residual.std()) + + # 2D features by combining labels and residual + features = np.array([label, residual]).T + + knn.fit(features) + ood = OutOfDistribution(params={"knn": knn}) + label_quality_scores = ood.score(features=features) + return label_quality_scores From e8a9a495a3a7d87a92dfc1963c086ccd9f44b805 Mon Sep 17 00:00:00 2001 From: krmayankb Date: Fri, 9 Dec 2022 11:25:30 -0700 Subject: [PATCH 038/258] regression_utils created --- cleanlab/internal/regression_utils.py | 28 +++++++++++++++++++++++++++ cleanlab/regression/rank.py | 10 +++------- 2 files changed, 31 insertions(+), 7 deletions(-) create mode 100644 cleanlab/internal/regression_utils.py diff --git a/cleanlab/internal/regression_utils.py b/cleanlab/internal/regression_utils.py new file mode 100644 index 0000000000..57c99baedb --- /dev/null +++ b/cleanlab/internal/regression_utils.py @@ -0,0 +1,28 @@ +""" +Helper function internally used in cleanlab.regression +""" + +import numpy as np + + +def assert_valid_inputs( + labels: np.ndarray, + pred_labels: np.ndarray, + method: str, +) -> None: + """Checks that ``labels``, ``pred_labels``, ``method`` are correctly formatted.""" + + # Check if labels and pred_labels are np.ndarray + if not isinstance(labels, np.ndarray) or not isinstance(pred_labels, np.ndarray): + raise TypeError("labels and pred_labels must be of type np.ndarray") + + # Check if labels and pred_labels are of same shape + assert ( + labels.shape == pred_labels.shape + ), f"shape of label {labels.shape} and predicted labels {pred_labels.shape} are not same." + + # Check if method passed is string + if not isinstance(method, str): + raise TypeError( + f"Passed method is not of correct type. 
Expected string, got {type(method)}" + ) diff --git a/cleanlab/regression/rank.py b/cleanlab/regression/rank.py index dcd9460c64..8d53af9adf 100644 --- a/cleanlab/regression/rank.py +++ b/cleanlab/regression/rank.py @@ -1,6 +1,7 @@ import numpy as np from cleanlab.outlier import OutOfDistribution from sklearn.neighbors import NearestNeighbors +from cleanlab.internal.regression_utils import assert_valid_inputs """ Generates label quality scores for every sample in regression dataset """ @@ -48,13 +49,8 @@ def get_label_quality_scores( array([0.36787944, 1. , 0.13533528, 0.90483742]) """ - # TODO - add error trigger function in utils. - if not isinstance(labels, np.ndarray) or not isinstance(pred_labels, np.ndarray): - raise TypeError("labels and pred_labels must be of type np.ndarray") - - assert ( - labels.shape == pred_labels.shape - ), f"shape of label {labels.shape} and predicted labels {pred_labels.shape} are not same." + # Check if inputs are valid + assert_valid_inputs(labels=labels, pred_labels=pred_labels, method=method) scoring_funcs = { "residual": get_residual_score_for_each_label, From 98930fcdaf3ce6592aa6d6c771b4091f55450f68 Mon Sep 17 00:00:00 2001 From: krmayankb Date: Mon, 12 Dec 2022 09:40:36 -0800 Subject: [PATCH 039/258] pred_labels changed to predictions --- cleanlab/internal/regression_utils.py | 8 ++++---- cleanlab/regression/rank.py | 25 ++++++++++++------------- 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/cleanlab/internal/regression_utils.py b/cleanlab/internal/regression_utils.py index 57c99baedb..04576c4012 100644 --- a/cleanlab/internal/regression_utils.py +++ b/cleanlab/internal/regression_utils.py @@ -7,19 +7,19 @@ def assert_valid_inputs( labels: np.ndarray, - pred_labels: np.ndarray, + predictions: np.ndarray, method: str, ) -> None: """Checks that ``labels``, ``pred_labels``, ``method`` are correctly formatted.""" # Check if labels and pred_labels are np.ndarray - if not isinstance(labels, np.ndarray) or not 
isinstance(pred_labels, np.ndarray): + if not isinstance(labels, np.ndarray) or not isinstance(predictions, np.ndarray): raise TypeError("labels and pred_labels must be of type np.ndarray") # Check if labels and pred_labels are of same shape assert ( - labels.shape == pred_labels.shape - ), f"shape of label {labels.shape} and predicted labels {pred_labels.shape} are not same." + labels.shape == predictions.shape + ), f"shape of label {labels.shape} and predicted labels {predictions.shape} are not same." # Check if method passed is string if not isinstance(method, str): diff --git a/cleanlab/regression/rank.py b/cleanlab/regression/rank.py index 8d53af9adf..3784d290b0 100644 --- a/cleanlab/regression/rank.py +++ b/cleanlab/regression/rank.py @@ -8,7 +8,7 @@ def get_label_quality_scores( labels: np.ndarray, - pred_labels: np.ndarray, + predictions: np.ndarray, *, method: str = "residual", ) -> np.ndarray: @@ -25,7 +25,7 @@ def get_label_quality_scores( Raw labels from original dataset. Array of shape ``(N, )`` consisting given labels, where N is number of datapoints in the regression dataset. - pred_labels : np.ndarray + predictions : np.ndarray Predicated labels from regressor fitted on the dataset. Array of shape ``(N,)`` consisting predicted labels, where N is number of datapoints in the regression dataset. @@ -43,14 +43,14 @@ def get_label_quality_scores( >>> import numpy as np >>> from cleanlab.regression.rank import get_label_quality_scores >>> labels = np.array([1,2,3,4]) - >>> pred_labels = np.array([2,2,5,4.1]) - >>> label_quality_scores = get_label_quality_scores(labels, pred_labels) + >>> predictions = np.array([2,2,5,4.1]) + >>> label_quality_scores = get_label_quality_scores(labels, predictions) >>> label_quality_scores array([0.36787944, 1. 
, 0.13533528, 0.90483742]) """ # Check if inputs are valid - assert_valid_inputs(labels=labels, pred_labels=pred_labels, method=method) + assert_valid_inputs(labels=labels, predictions=predictions, method=method) scoring_funcs = { "residual": get_residual_score_for_each_label, @@ -69,13 +69,13 @@ def get_label_quality_scores( ) # Calculate scores - label_quality_score = scoring_func(labels, pred_labels) + label_quality_score = scoring_func(labels, predictions) return label_quality_score def get_residual_score_for_each_label( labels: np.ndarray, - pred_labels: np.ndarray, + predictions: np.ndarray, ) -> np.ndarray: """Returns the residual based label-quality scores for each datapoints. @@ -90,7 +90,7 @@ def get_residual_score_for_each_label( labels: np.ndarray Labels in the same format expected by the :py:func:`get_label_quality_scores ` function. - pred_labels: np.ndarray + predictions: np.ndarray Predicted labels in the same format expected by the :py:func:`get_label_quality_scores ` function. Returns @@ -100,7 +100,7 @@ def get_residual_score_for_each_label( Lower scores indicate more likely mislabled examples. """ - residual = pred_labels - labels + residual = predictions - labels label_quality_scores = np.exp(-abs(residual)) return label_quality_scores @@ -108,7 +108,7 @@ def get_residual_score_for_each_label( # TODO - change name of the function def get_score_to_named_for_each_label( label: np.ndarray, - pred_labels: np.ndarray, + predictions: np.ndarray, *, variance: float = 10, ) -> np.ndarray: @@ -122,7 +122,7 @@ def get_score_to_named_for_each_label( labels: np.ndarray Labels in the same format expected by the :py:func:`get_label_quality_scores ` function. - pred_labels: np.ndarray + predictions: np.ndarray Predicted labels in the same format expected by the :py:func:`get_label_quality_scores ` function. 
variance: float, default = 10 @@ -136,10 +136,9 @@ def get_score_to_named_for_each_label( """ neighbors = int(np.ceil(0.1 * label.shape[0])) - print(neighbors) knn = NearestNeighbors(n_neighbors=neighbors, metric="euclidean") - residual = pred_labels - label + residual = predictions - label label = (label - label.mean()) / label.std() residual = np.sqrt(variance) * ((residual - residual.mean()) / residual.std()) From e4e6307ad555a0347eae20166f72d6bc46e5ee84 Mon Sep 17 00:00:00 2001 From: krmayankb Date: Thu, 22 Dec 2022 00:23:12 -0800 Subject: [PATCH 040/258] unit tests for new scoring method --- cleanlab/regression/rank.py | 19 ++++++++-------- tests/test_regression.py | 43 +++++++++++++++++++++++++++++++------ 2 files changed, 46 insertions(+), 16 deletions(-) diff --git a/cleanlab/regression/rank.py b/cleanlab/regression/rank.py index 3784d290b0..f53a0fb47e 100644 --- a/cleanlab/regression/rank.py +++ b/cleanlab/regression/rank.py @@ -10,7 +10,7 @@ def get_label_quality_scores( labels: np.ndarray, predictions: np.ndarray, *, - method: str = "residual", + method: str = "TO_BE_NAMED", # TODO update name once finalised ) -> np.ndarray: """ Returns label quality score for each example in the regression dataset. @@ -64,13 +64,13 @@ def get_label_quality_scores( raise ValueError( f""" {method} is not a valid scoring method. - Please choose a valid scoring technique: residual, TO_BE_NAMED. + Please choose a valid scoring technique: {scoring_funcs.keys()}. 
""" ) # Calculate scores - label_quality_score = scoring_func(labels, predictions) - return label_quality_score + label_quality_scores = scoring_func(labels, predictions) + return label_quality_scores def get_residual_score_for_each_label( @@ -106,8 +106,9 @@ def get_residual_score_for_each_label( # TODO - change name of the function +# TODO - change name of function in test def get_score_to_named_for_each_label( - label: np.ndarray, + labels: np.ndarray, predictions: np.ndarray, *, variance: float = 10, @@ -135,16 +136,16 @@ def get_score_to_named_for_each_label( Lower scores indicate more likely mislabled examples. """ - neighbors = int(np.ceil(0.1 * label.shape[0])) + neighbors = int(np.ceil(0.1 * labels.shape[0])) knn = NearestNeighbors(n_neighbors=neighbors, metric="euclidean") - residual = predictions - label + residual = predictions - labels - label = (label - label.mean()) / label.std() + labels = (labels - labels.mean()) / labels.std() residual = np.sqrt(variance) * ((residual - residual.mean()) / residual.std()) # 2D features by combining labels and residual - features = np.array([label, residual]).T + features = np.array([labels, residual]).T knn.fit(features) ood = OutOfDistribution(params={"knn": knn}) diff --git a/tests/test_regression.py b/tests/test_regression.py index 71de96cf3c..8a8154047c 100644 --- a/tests/test_regression.py +++ b/tests/test_regression.py @@ -2,26 +2,55 @@ import pandas as pd import pytest -from cleanlab.regression.rank import get_label_quality_scores +from cleanlab.regression import rank # To be used for all the tests labels = np.array([1, 2, 3, 4]) -pred_labels = np.array([1, 3, 4, 5]) - +predictions = np.array([1, 3, 4, 5]) +# test with deafault parameters def test_output_shape_type(): - scores = get_label_quality_scores(labels=labels, pred_labels=pred_labels) + scores = rank.get_label_quality_scores(labels=labels, predictions=predictions) assert labels.shape == scores.shape assert isinstance(scores, np.ndarray) +# test for 
acceptable datatypes @pytest.mark.parametrize("format", [pd.Series, pd.DataFrame, list]) def test_type_error_for_input_types(format): with pytest.raises(TypeError) as error: - _ = get_label_quality_scores(labels=format(labels), pred_labels=format(pred_labels)) + _ = rank.get_label_quality_scores(labels=format(labels), predictions=format(predictions)) +# test for input shapes def test_assertion_error_for_input_shape(): with pytest.raises(AssertionError) as error: - _ = get_label_quality_scores(labels=labels[:-1], pred_labels=pred_labels) - _ = get_label_quality_scores(labels=labels, pred_labels=pred_labels[:-1]) + _ = rank.get_label_quality_scores(labels=labels[:-1], predictions=predictions) + _ = rank.get_label_quality_scores(labels=labels, predictions=predictions[:-1]) + + +# TODO - change name once finalised +# test individual scoring functions +@pytest.mark.parametrize( + "scoring_funcs", + [rank.get_residual_score_for_each_label, rank.get_score_to_named_for_each_label], +) +def test_individual_scoring_functions(scoring_funcs): + scores = scoring_funcs(labels=labels, predictions=predictions) + assert labels.shape == scores.shape + assert isinstance(scores, np.ndarray) + + +# TODO - change name once finalised +# test for method argument +@pytest.mark.parametrize( + "method", + [ + "residual", + "TO_BE_NAMED", + ], +) +def test_method_pass_get_label_quality_scores(method): + scores = rank.get_label_quality_scores(labels=labels, predictions=predictions, method=method) + assert labels.shape == scores.shape + assert isinstance(scores, np.ndarray) From af2454b06f2c0198b598f55fc231f13c00ef8cfc Mon Sep 17 00:00:00 2001 From: krmayankb Date: Fri, 23 Dec 2022 12:46:08 -0800 Subject: [PATCH 041/258] init merge conflict resolved --- cleanlab/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cleanlab/__init__.py b/cleanlab/__init__.py index 753e47e82a..fffef7d53e 100644 --- a/cleanlab/__init__.py +++ b/cleanlab/__init__.py @@ -8,5 +8,6 @@ from . 
import multiannotator from . import outlier from . import token_classification + from . import multilabel_classification from . import regression From be8afaaca28fc1302e5c4bb0a16b14cc7068c6d6 Mon Sep 17 00:00:00 2001 From: krmayankb Date: Thu, 22 Dec 2022 01:15:43 -0800 Subject: [PATCH 042/258] tutorial draft1 --- docs/source/tutorials/regression.ipynb | 476 ++++++++++++++----------- 1 file changed, 268 insertions(+), 208 deletions(-) diff --git a/docs/source/tutorials/regression.ipynb b/docs/source/tutorials/regression.ipynb index b4770e47d1..682a81ffb2 100644 --- a/docs/source/tutorials/regression.ipynb +++ b/docs/source/tutorials/regression.ipynb @@ -1,117 +1,215 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "# Label Quality Scores for Regression with Noisy Labels\n", - "In this tutorial, you will learn how to use cleanlab on regression dataset to: \n", - "- find label issue in your regression dataset\n", - "- generate label quality scores for each example in the dataset. " + "# Label Quality Scores for Regression with Noisy Labels " ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "## Install dependencies and import them \n", - "You can use pip to install all packages required for this tutorial as follows:" + "This quickstart tutorial shows how to use cleanlab for finding label errors in regression data. Using the approach mentioned here, you can find label error in any regression dataset irrespective of modality i.e., tabular, text, image etc. \n", + "\n", + "**This example will take you through following:**\n", + "- Generate label quality scores for each datapoint in the dataset. \n", + "- Find label issue for regression dataset. 
" ] }, { - "cell_type": "code", - "execution_count": null, + "attachments": {}, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "!pip install cleanlab\n", - "!pip install scikit-learn" + "Quickstart \n", + "\n", + "Cleanlab uses two inputs to generates scores for labels in the dataset:\n", + "- `labels`: NumPy array of given labels in the dataset. labels[i] should contain label for `i`-th datapoint. \n", + "- `predictions`: NumPy array of predictions generated through your favourite regressor. predictions[i] should contain predicted value for `i`-th datapoint. \n", + "\n", + "If you already have predictions from your regressor, you can generate label quality scores for each datapoint using the code below: \n", + "\n", + "
\n", + "\n", + "```python \n", + "\n", + "from cleanlab.regression.rank import get_label_quality_scores\n", + "label_quality_scores = get_label_quality_scores(labels, predictions)\n", + "\n", + "```\n", + "
\n", + "" ] }, { - "cell_type": "code", - "execution_count": 1, + "attachments": {}, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "import numpy as np \n", - "import pandas as pd \n", - "from cleanlab.regression.rank import get_label_quality_scores\n", - "from sklearn.linear_model import LinearRegression\n", - "import matplotlib.pyplot as plt " + "# 0. Visualization (can skip these details)" ] }, { - "cell_type": "code", - "execution_count": 21, + "attachments": {}, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "def make_data(feature_size = (20, 2), \n", - " means = [8, -10], \n", - " stds = [2, 5], \n", - " bias = 0.8,\n", - " coeff = [2, 0.1],\n", - " error = [-2, 0, 2], \n", - " prob_error = [0.2, 0.6, 0.2], \n", - " seed = 42\n", - ") -> pd.DataFrame:\n", - " \"\"\"\n", - " feature_size: Tuple of (datapoints, features)\n", - " \"\"\"\n", - " assert (len(means) == feature_size[1]), (f\"length of mean {len(means)} is not same as features requested{feature_size[0]}\")\n", - " assert (len(stds) == feature_size[1]), (f\"length of stds {len(stds)} is not same as features requested{feature_size[0]}\")\n", - " np.random.seed(seed) \n", + "This is added just for reference. We will use this function to plot dataset, highlight points using label quality scores and true_errors.\n", + "You can skip this part and move to next section. " + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
See the code for visualization **(click to expand)** \n", "\n", - " features = []\n", - " for i in range(feature_size[1]):\n", - " values = coeff[i] * np.random.normal(loc=means[i], scale=stds[i], size=feature_size[0])\n", - " features.append(values)\n", - " \n", - " true_labels = sum(map(np.array, features))+ bias\n", - " labels = true_labels + np.random.choice(error, feature_size[0], p=prob_error)\n", - " \n", - " data_dict = {\n", - " \"labels\" : labels, # You have these labels, which have some errors.\n", - " \"true_labels\" : true_labels, # You never get to see these perfect labels.\n", - " } \n", - " for idx, feature in enumerate(features): # adding names to each features \n", - " data_dict[\"feature_\"+str(idx+1)] = feature\n", - " data = pd.DataFrame.from_dict(data_dict)\n", - " col = list(data.columns)\n", - " new_col = col[2:] + col[:2]\n", - " data = data.reindex(columns=new_col)\n", - " return data\n", + "```python \n", + "# Note: this pulldown is for docs.cleanlab.ai, if running on local Jupyter or colab, please ignore it. 
\n", "\n", - "def plot_data(data, \n", - " circles, \n", - " title, \n", - " alpha=0.6, \n", - " color = '#1f77b4', \n", - " colorbar = False):\n", - " \n", + "def plot_data(\n", + " data_x, data_y, circles, title, alpha=0.6, color=\"#1f77b4\", colorbar=False, xlabel=\"\", ylabel=\"\"\n", + "):\n", " plt.figure(figsize=(14, 5))\n", - " data = data.to_numpy()\n", - " plt.scatter(data[:,0], data[:,1], c = color, s=60)\n", + " data_x = data_x.to_numpy()\n", + " data_y = data_y.to_numpy()\n", + " plt.scatter(data_x, data_y, c=color, s=30)\n", " for i in circles:\n", " plt.plot(\n", - " data[i][0],\n", - " data[i][1],\n", + " data_x[i],\n", + " data_y[i],\n", " \"o\",\n", " markerfacecolor=\"none\",\n", " markeredgecolor=\"red\",\n", - " markersize=14,\n", + " markersize=10,\n", " markeredgewidth=2.5,\n", - " alpha=alpha\n", + " alpha=alpha,\n", " )\n", " plt.title(title, fontsize=20)\n", - " \n", - " if colorbar: plt.colorbar(orientation = 'vertical')\n" + " plt.xlabel(xlabel)\n", + " plt.ylabel(ylabel)\n", + "\n", + " if colorbar:\n", + " plt.colorbar(orientation=\"vertical\")\n", + "\n", + "```\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def plot_data(\n", + " data_x, data_y, circles, title, alpha=0.6, color=\"#1f77b4\", colorbar=False, xlabel=\"\", ylabel=\"\"\n", + "):\n", + " plt.figure(figsize=(14, 5))\n", + " data_x = data_x.to_numpy()\n", + " data_y = data_y.to_numpy()\n", + " plt.scatter(data_x, data_y, c=color, s=30)\n", + " for i in circles:\n", + " plt.plot(\n", + " data_x[i],\n", + " data_y[i],\n", + " \"o\",\n", + " markerfacecolor=\"none\",\n", + " markeredgecolor=\"red\",\n", + " markersize=10,\n", + " markeredgewidth=2.5,\n", + " alpha=alpha,\n", + " )\n", + " plt.title(title, fontsize=20)\n", + " plt.xlabel(xlabel)\n", + " plt.ylabel(ylabel)\n", + "\n", + " if colorbar:\n", + " plt.colorbar(orientation=\"vertical\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Install dependencies and import them \n", + "You can use `pip` to install all packages required for this tutorial as follows:\n", + "\n", + "`!pip install cleanlab xgboost`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install cleanlab xgboost" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "metadata": { + "nbsphinx": "hidden" + }, + "outputs": [], + "source": [ + "# Package installation (hidden on docs website).\n", + "# Package versions we used: xgboost==1.7.2\n", + "\n", + "dependencies = [\"cleanlab\", \"xgboost\"]\n", + "\n", + "if \"google.colab\" in str(get_ipython()): # Check if it's running in Google Colab\n", + " %pip install cleanlab # for colab\n", + " cmd = \" \".join([dep for dep in dependencies if dep != \"cleanlab\"])\n", + " %pip install $cmd\n", + "else:\n", + " missing_dependencies = []\n", + " for dependency in dependencies:\n", + " try:\n", + " __import__(dependency)\n", + " except ImportError:\n", + " missing_dependencies.append(dependency)\n", 
+ "\n", + " if len(missing_dependencies) > 0:\n", + " print(\"Missing required dependencies:\")\n", + " print(*missing_dependencies, sep=\", \")\n", + " print(\"\\nPlease install them before running the rest of this notebook.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from cleanlab.regression.rank import get_label_quality_scores\n", + "import xgboost as xgb\n", + "import matplotlib.pyplot as plt\n", + "\n", + "np.set_printoptions(suppress=True)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 2. Import dataset and Generate predictions" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 107, "metadata": {}, "outputs": [ { @@ -138,7 +236,7 @@ " exam_1\n", " exam_2\n", " exam_3\n", - " bonus_or_penalty\n", + " notes\n", " true_labels\n", " labels\n", " \n", @@ -149,43 +247,43 @@ " 53\n", " 77\n", " 93\n", - " 0\n", + " NaN\n", + " 76.2\n", " 76.2\n", - " 56.2\n", " \n", " \n", " 1\n", " 81\n", " 64\n", " 80\n", - " 10\n", + " great participation +10\n", + " 85.5\n", " 85.5\n", - " 65.5\n", " \n", " \n", " 2\n", " 74\n", " 88\n", " 97\n", - " 0\n", + " NaN\n", + " 87.4\n", " 87.4\n", - " 67.4\n", " \n", " \n", " 3\n", " 61\n", " 94\n", " 78\n", - " 0\n", + " NaN\n", + " 77.7\n", " 77.7\n", - " 57.7\n", " \n", " \n", " 4\n", " 48\n", " 90\n", " 91\n", - " 0\n", + " NaN\n", " 77.8\n", " 77.8\n", " \n", @@ -194,15 +292,15 @@ "" ], "text/plain": [ - " exam_1 exam_2 exam_3 bonus_or_penalty true_labels labels\n", - "0 53 77 93 0 76.2 56.2\n", - "1 81 64 80 10 85.5 65.5\n", - "2 74 88 97 0 87.4 67.4\n", - "3 61 94 78 0 77.7 57.7\n", - "4 48 90 91 0 77.8 77.8" + " exam_1 exam_2 exam_3 notes true_labels labels\n", + "0 53 77 93 NaN 76.2 76.2\n", + "1 81 64 80 great participation +10 85.5 85.5\n", + "2 74 88 97 NaN 87.4 87.4\n", + "3 61 94 78 NaN 77.7 77.7\n", + "4 48 90 91 NaN 77.8 
77.8" ] }, - "execution_count": 22, + "execution_count": 107, "metadata": {}, "output_type": "execute_result" } @@ -215,14 +313,14 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 108, "metadata": {}, "outputs": [ { "data": { - "image/png": "", + "image/png": "", "text/plain": [ - "
" + "
" ] }, "metadata": {}, @@ -230,149 +328,78 @@ } ], "source": [ - "data = make_data(feature_size=(100, 2))\n", - "true_errors = np.where(data['true_labels'] != data['labels'])[0]\n", - "plot_data(data[['feature_1','feature_2']], \n", - " circles=true_errors, \n", - " color=data['labels'], \n", - " colorbar=True, \n", - " title=\"Messy Regression dataset\")" + "# Generate true errors\n", + "true_errors = np.where(data.labels != data.true_labels)[0]\n", + "plot_data(\n", + " data_x=data[\"exam_3\"], \n", + " data_y=data[\"labels\"],\n", + " circles=true_errors,\n", + " title=\"Messy Regression dataset\",\n", + " xlabel=\"exam_3 feature\",\n", + " ylabel=\"label (Y value)\",\n", + ")" ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "The figure above represents a toy dataset we'll use to demonstrate label scoring for regression dataset. In this example, datapoints are ploted on 2-D space (in this case feature_1 vs feature_2). Each datapoint is colored based on given label. \n", + "In the dataframe displayed above, `labels` represents the noisy labels and `true_labels` represents the ground truth. Please note that, ground truth are usually not available in real dataset, we have added it here for comparision and to demonstrate our method. `notes` also has text information, we will model this a categorical variable. \n", "\n", - "Like many real-world datasets, the given label happen to be incorrect for some of the examples(**circled in red**) in this dataset. " + "We will use `xgboost` as regressor for this tutorial. xgboost provides easy to use interface to process categorical variable. 
This is demonstrated in the code below:" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 109, "metadata": {}, + "outputs": [], "source": [ - "## Using cleanlab to generate label quality scores" + "# XGBOOST automatically factors categorical variable, you just need to mark the columns as category\n", + "data.notes = data.notes.astype(\"category\")\n", + "\n", + "# XGBOOST takes data and label seperately, so you will need to divide data accordingly.\n", + "X = data.drop([\"labels\", \"true_labels\"], axis=1)\n", + "y = data[\"labels\"]\n", + "\n", + "# convert data to format \"DMatrix\" to make it compatible with XGBOOST.\n", + "xgboost_data = xgb.DMatrix(data=X, label=y, enable_categorical=True)\n", + "\n", + "# declare parameters and train the model.\n", + "params = {\"booster\": \"gblinear\", \"objective\": \"reg:squarederror\"}\n", + "boost = xgb.train(params=params, dtrain=xgboost_data, num_boost_round=50)" ] }, { - "cell_type": "code", - "execution_count": 24, + "attachments": {}, + "cell_type": "markdown", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
feature_1feature_2labelstrue_labels
017.986857-1.70768519.07917117.079171
115.446943-1.21032315.03662015.036620
218.590754-1.17135718.21939718.219397
322.092119-1.40113921.49098121.490981
415.063387-1.08064314.78274414.782744
\n", - "
" - ], - "text/plain": [ - " feature_1 feature_2 labels true_labels\n", - "0 17.986857 -1.707685 19.079171 17.079171\n", - "1 15.446943 -1.210323 15.036620 15.036620\n", - "2 18.590754 -1.171357 18.219397 18.219397\n", - "3 22.092119 -1.401139 21.490981 21.490981\n", - "4 15.063387 -1.080643 14.782744 14.782744" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ - "# start with checking the dataset generated\n", - "data.head()" + "## 3. Using cleanlab to generate label quality scores" ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 110, "metadata": {}, "outputs": [], "source": [ - "# Note that \"true_labels\" will not be available in real-life dataset. We have added here only for comparision.\n", - "X = data[['feature_'+str(i+1) for i in range(2)]]\n", - "y = data['labels']\n", - "\n", - "# initialize your favourite model and generate predictions \n", - "yourFavouriteModel = LinearRegression()\n", - "yourFavouriteModel = yourFavouriteModel.fit(X,y)\n", - "predictions = yourFavouriteModel.predict(X)\n", + "# using trained xgboost model to get predictions\n", + "predictions = boost.predict(xgboost_data)\n", "\n", - "# get label quality score for each example in the dataset \n", - "label_quality = get_label_quality_scores(labels=np.array(data['labels']), pred_labels=predictions)" + "# get label quality score for each example in the dataset using cleanlab\n", + "label_quality_scores = get_label_quality_scores(labels=np.array(y), predictions=predictions)" ] }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 111, "metadata": {}, "outputs": [ { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] @@ -383,11 +410,15 @@ ], "source": [ "plot_data(\n", - " data[['feature_1','feature_2']], \n", - " circles=true_errors ,\n", - " color=label_quality, \n", - " title=\"Messy Regression dataset with label quality scores\", \n", - " colorbar=True)" + " data_x=data[\"exam_3\"], \n", + " data_y=data[\"labels\"],\n", + " circles=true_errors,\n", + " color=label_quality_scores,\n", + " title=\"Messy Regression dataset with label quality scores\",\n", + " colorbar=True,\n", + " xlabel=\"exam_3 feature\",\n", + " ylabel=\"label (Y value)\",\n", + ")" ] }, { @@ -402,8 +433,37 @@ ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 112, + "metadata": { + "nbsphinx": "hidden" + }, + "outputs": [], + "source": [ + "# Note: This cell is only for docs.cleanlab.ai, if running on local Jupyter or Colab, please ignore it.\n", + "from sklearn.metrics import roc_auc_score\n", + "\n", + "label_quality_scores = get_label_quality_scores(np.array(y), predictions=predictions)\n", + "\n", + "true_errors = (data.labels != data.true_labels).astype(int)\n", + "label_quality_scores_residual = get_label_quality_scores(\n", + " np.array(y), predictions=predictions, method=\"residual\"\n", + ")\n", + "\n", + "if roc_auc_score(true_errors, 1 - label_quality_scores) < 0.5:\n", + " raise ValueError(\"Label quality scores did not perform well enough\")\n", + "\n", + "if roc_auc_score(true_errors, 1 - label_quality_scores) <= roc_auc_score(\n", + " true_errors, 1 - label_quality_scores_residual\n", + "):\n", + " raise ValueError(\"Label quality scores did not outperform alternative scores\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [] } ], From ea2f723ab24af83b6c52da3e98d24dc6dfbe0a43 Mon Sep 17 00:00:00 2001 From: krmayankb Date: Thu, 22 Dec 2022 01:16:36 -0800 Subject: [PATCH 043/258] tutorial draft1 --- docs/source/tutorials/regression.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff 
--git a/docs/source/tutorials/regression.ipynb b/docs/source/tutorials/regression.ipynb index 682a81ffb2..cc02658e80 100644 --- a/docs/source/tutorials/regression.ipynb +++ b/docs/source/tutorials/regression.ipynb @@ -331,7 +331,7 @@ "# Generate true errors\n", "true_errors = np.where(data.labels != data.true_labels)[0]\n", "plot_data(\n", - " data_x=data[\"exam_3\"], \n", + " data_x=data[\"exam_3\"],\n", " data_y=data[\"labels\"],\n", " circles=true_errors,\n", " title=\"Messy Regression dataset\",\n", @@ -410,7 +410,7 @@ ], "source": [ "plot_data(\n", - " data_x=data[\"exam_3\"], \n", + " data_x=data[\"exam_3\"],\n", " data_y=data[\"labels\"],\n", " circles=true_errors,\n", " color=label_quality_scores,\n", From f9af6ebf271d3c9f7281f12da73cb2ae68efc70c Mon Sep 17 00:00:00 2001 From: krmayankb Date: Fri, 23 Dec 2022 13:07:04 -0800 Subject: [PATCH 044/258] merge conflict --- cleanlab/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cleanlab/__init__.py b/cleanlab/__init__.py index fffef7d53e..753e47e82a 100644 --- a/cleanlab/__init__.py +++ b/cleanlab/__init__.py @@ -8,6 +8,5 @@ from . import multiannotator from . import outlier from . import token_classification - from . import multilabel_classification from . import regression From 00bcf61c301d425a1fb6748ca2baaab47b25fcc0 Mon Sep 17 00:00:00 2001 From: krmayankb Date: Fri, 23 Dec 2022 14:56:32 -0800 Subject: [PATCH 045/258] default modified for method in docstring --- cleanlab/regression/rank.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cleanlab/regression/rank.py b/cleanlab/regression/rank.py index f53a0fb47e..a6176a8de3 100644 --- a/cleanlab/regression/rank.py +++ b/cleanlab/regression/rank.py @@ -29,7 +29,7 @@ def get_label_quality_scores( Predicated labels from regressor fitted on the dataset. Array of shape ``(N,)`` consisting predicted labels, where N is number of datapoints in the regression dataset. 
- method : {"residual", "TO_BE_NAMED"}, default="residual" #TODO - update name once finalised + method : {"residual", "TO_BE_NAMED"}, default="TO_BE_NAMED" #TODO - update name once finalised Returns ------- From 542e30f6840a0f138a0052558cfdca0f56dbd288 Mon Sep 17 00:00:00 2001 From: Mayank Kumar Date: Fri, 23 Dec 2022 15:01:21 -0800 Subject: [PATCH 046/258] grammatical correction in rank.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Elías Snorrason --- cleanlab/regression/rank.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cleanlab/regression/rank.py b/cleanlab/regression/rank.py index a6176a8de3..4318d664df 100644 --- a/cleanlab/regression/rank.py +++ b/cleanlab/regression/rank.py @@ -15,7 +15,7 @@ def get_label_quality_scores( """ Returns label quality score for each example in the regression dataset. - Each score is continous value in range [0,1] + Each score is a continous value in the range [0,1] 1 - clean label (given label is likely correct). 0 - dirty label (given label is likely incorrect). From 3958b583299b8be18bbd61ba2276516fe328cf0f Mon Sep 17 00:00:00 2001 From: Mayank Kumar Date: Fri, 23 Dec 2022 15:24:02 -0800 Subject: [PATCH 047/258] Update cleanlab/regression/rank.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Elías Snorrason --- cleanlab/regression/rank.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cleanlab/regression/rank.py b/cleanlab/regression/rank.py index 4318d664df..dc98059253 100644 --- a/cleanlab/regression/rank.py +++ b/cleanlab/regression/rank.py @@ -115,8 +115,8 @@ def get_score_to_named_for_each_label( ) -> np.ndarray: """Returns label-quality scores. - This is function to compute label-quality scores for regression datasets, - where lower score indicate labels less likely to be correct. 
+ This function computes label-quality scores for regression datasets, + where a lower score indicates labels that are less likely to be correct. Parameters ---------- From db0bb5d8147a204af6495e5c537aa0f226bfac33 Mon Sep 17 00:00:00 2001 From: krmayankb Date: Fri, 23 Dec 2022 16:55:11 -0800 Subject: [PATCH 048/258] =?UTF-8?q?rank.py=20updates=201.=20added=20typing?= =?UTF-8?q?=20hints=20for=20scoring=20funcs=202.=20Removed=20try-except=20?= =?UTF-8?q?block=20for=20raising=20value=20error.=203.=20grammatical=20cor?= =?UTF-8?q?rections=204.=20knn=20and=20neighbors=20construction=20moved=20?= =?UTF-8?q?closer=20to=20first=20usage.=20Co-authored-by:=20El=C3=ADas=20S?= =?UTF-8?q?norrason=20?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- cleanlab/regression/rank.py | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/cleanlab/regression/rank.py b/cleanlab/regression/rank.py index dc98059253..99c42aba3e 100644 --- a/cleanlab/regression/rank.py +++ b/cleanlab/regression/rank.py @@ -2,6 +2,7 @@ from cleanlab.outlier import OutOfDistribution from sklearn.neighbors import NearestNeighbors from cleanlab.internal.regression_utils import assert_valid_inputs +from typing import Dict, Callable """ Generates label quality scores for every sample in regression dataset """ @@ -52,15 +53,14 @@ def get_label_quality_scores( # Check if inputs are valid assert_valid_inputs(labels=labels, predictions=predictions, method=method) - scoring_funcs = { + scoring_funcs: Dict[str, Callable[[np.ndarray, np.ndarray], np.ndarray]] = { "residual": get_residual_score_for_each_label, "TO_BE_NAMED": get_score_to_named_for_each_label, # TODO - update name once finalised } # TODO - update name once finalised - try: - scoring_func = scoring_funcs[method] - except KeyError: + scoring_func = scoring_funcs.get(method, None) + if not scoring_func: raise ValueError( f""" {method} is not a valid scoring method. 
@@ -121,10 +121,10 @@ def get_score_to_named_for_each_label( Parameters ---------- labels: np.ndarray - Labels in the same format expected by the :py:func:`get_label_quality_scores ` function. + Labels in the same format as expected by the :py:func:`get_label_quality_scores ` function. predictions: np.ndarray - Predicted labels in the same format expected by the :py:func:`get_label_quality_scores ` function. + Predicted labels in the same format as expected by the :py:func:`get_label_quality_scores ` function. variance: float, default = 10 Manipulates variance of the distribution of residual. @@ -135,19 +135,16 @@ def get_score_to_named_for_each_label( Contains one score (between 0 and 1) per example. Lower scores indicate more likely mislabled examples. """ - - neighbors = int(np.ceil(0.1 * labels.shape[0])) - knn = NearestNeighbors(n_neighbors=neighbors, metric="euclidean") - residual = predictions - labels - labels = (labels - labels.mean()) / labels.std() residual = np.sqrt(variance) * ((residual - residual.mean()) / residual.std()) # 2D features by combining labels and residual features = np.array([labels, residual]).T - knn.fit(features) + neighbors = int(np.ceil(0.1 * labels.shape[0])) + knn = NearestNeighbors(n_neighbors=neighbors, metric="euclidean").fit(features) ood = OutOfDistribution(params={"knn": knn}) + label_quality_scores = ood.score(features=features) return label_quality_scores From 0ea2981a917fd70e8db52a8078b543c86b9edd38 Mon Sep 17 00:00:00 2001 From: krmayankb Date: Fri, 23 Dec 2022 19:15:43 -0800 Subject: [PATCH 049/258] outre: code updated for name for second method --- cleanlab/regression/rank.py | 12 +++++------- tests/test_regression.py | 6 ++---- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/cleanlab/regression/rank.py b/cleanlab/regression/rank.py index 99c42aba3e..fe263f88ca 100644 --- a/cleanlab/regression/rank.py +++ b/cleanlab/regression/rank.py @@ -11,7 +11,7 @@ def get_label_quality_scores( labels: np.ndarray, 
predictions: np.ndarray, *, - method: str = "TO_BE_NAMED", # TODO update name once finalised + method: str = "outre", # TODO update name once finalised ) -> np.ndarray: """ Returns label quality score for each example in the regression dataset. @@ -30,7 +30,7 @@ def get_label_quality_scores( Predicated labels from regressor fitted on the dataset. Array of shape ``(N,)`` consisting predicted labels, where N is number of datapoints in the regression dataset. - method : {"residual", "TO_BE_NAMED"}, default="TO_BE_NAMED" #TODO - update name once finalised + method : {"residual", "outre"}, default="outre" Returns ------- @@ -55,10 +55,9 @@ def get_label_quality_scores( scoring_funcs: Dict[str, Callable[[np.ndarray, np.ndarray], np.ndarray]] = { "residual": get_residual_score_for_each_label, - "TO_BE_NAMED": get_score_to_named_for_each_label, # TODO - update name once finalised + "outre": get_outre_score_for_each_label, } - # TODO - update name once finalised scoring_func = scoring_funcs.get(method, None) if not scoring_func: raise ValueError( @@ -105,15 +104,14 @@ def get_residual_score_for_each_label( return label_quality_scores -# TODO - change name of the function # TODO - change name of function in test -def get_score_to_named_for_each_label( +def get_outre_score_for_each_label( labels: np.ndarray, predictions: np.ndarray, *, variance: float = 10, ) -> np.ndarray: - """Returns label-quality scores. + """Returns OUTRE based label-quality scores. This function computes label-quality scores for regression datasets, where a lower score indicates labels that are less likely to be correct. 
diff --git a/tests/test_regression.py b/tests/test_regression.py index 8a8154047c..dc29315f96 100644 --- a/tests/test_regression.py +++ b/tests/test_regression.py @@ -29,11 +29,10 @@ def test_assertion_error_for_input_shape(): _ = rank.get_label_quality_scores(labels=labels, predictions=predictions[:-1]) -# TODO - change name once finalised # test individual scoring functions @pytest.mark.parametrize( "scoring_funcs", - [rank.get_residual_score_for_each_label, rank.get_score_to_named_for_each_label], + [rank.get_residual_score_for_each_label, rank.get_outre_score_for_each_label], ) def test_individual_scoring_functions(scoring_funcs): scores = scoring_funcs(labels=labels, predictions=predictions) @@ -41,13 +40,12 @@ def test_individual_scoring_functions(scoring_funcs): assert isinstance(scores, np.ndarray) -# TODO - change name once finalised # test for method argument @pytest.mark.parametrize( "method", [ "residual", - "TO_BE_NAMED", + "outre", ], ) def test_method_pass_get_label_quality_scores(method): From 9ab2092af94779c13a8745112f47d827f89b96de Mon Sep 17 00:00:00 2001 From: krmayankb Date: Fri, 23 Dec 2022 22:42:56 -0800 Subject: [PATCH 050/258] Support for array_like labels and predictions --- cleanlab/internal/regression_utils.py | 26 +++++++++++----- cleanlab/regression/rank.py | 43 ++++++++++++++++++--------- 2 files changed, 48 insertions(+), 21 deletions(-) diff --git a/cleanlab/internal/regression_utils.py b/cleanlab/internal/regression_utils.py index 04576c4012..a62e85aca5 100644 --- a/cleanlab/internal/regression_utils.py +++ b/cleanlab/internal/regression_utils.py @@ -10,19 +10,31 @@ def assert_valid_inputs( predictions: np.ndarray, method: str, ) -> None: - """Checks that ``labels``, ``pred_labels``, ``method`` are correctly formatted.""" + """Checks that ``labels``, ``predictions``, ``method`` are correctly formatted.""" # Check if labels and pred_labels are np.ndarray if not isinstance(labels, np.ndarray) or not isinstance(predictions, 
np.ndarray): - raise TypeError("labels and pred_labels must be of type np.ndarray") + raise TypeError("labels and predictions must be of type np.ndarray") - # Check if labels and pred_labels are of same shape + # Check if labels and predictions are of same shape assert ( labels.shape == predictions.shape ), f"shape of label {labels.shape} and predicted labels {predictions.shape} are not same." - # Check if method passed is string - if not isinstance(method, str): - raise TypeError( - f"Passed method is not of correct type. Expected string, got {type(method)}" + # Check if method is among allowed scoring method + scoring_methods = ["residual", "outre"] + if method not in scoring_methods: + raise ValueError( + f"Passed method is not among allowed method. Expected either of {scoring_methods}, got {method}" + ) + + +def check_dimensions(labels: np.ndarray, predictions: np.ndarray) -> None: + if labels.ndim != 1: + raise ValueError( + f"labels have dimensions {labels.ndim}, Expected 1-D array as input for labels" + ) + if predictions.ndim != 1: + raise ValueError( + f"predictions have dimensions {labels.ndim}, Expected 1-D array as input for predictions" ) diff --git a/cleanlab/regression/rank.py b/cleanlab/regression/rank.py index fe263f88ca..c6250ae4af 100644 --- a/cleanlab/regression/rank.py +++ b/cleanlab/regression/rank.py @@ -1,17 +1,20 @@ import numpy as np from cleanlab.outlier import OutOfDistribution from sklearn.neighbors import NearestNeighbors -from cleanlab.internal.regression_utils import assert_valid_inputs -from typing import Dict, Callable +from cleanlab.internal.regression_utils import assert_valid_inputs, check_dimensions +from typing import Dict, Callable, Optional +from cleanlab.typing import LabelLike """ Generates label quality scores for every sample in regression dataset """ +EPS = 1e-30 + def get_label_quality_scores( - labels: np.ndarray, - predictions: np.ndarray, + labels: Optional[LabelLike], + predictions: Optional[LabelLike], *, - 
method: str = "outre", # TODO update name once finalised + method: str = "outre", ) -> np.ndarray: """ Returns label quality score for each example in the regression dataset. @@ -22,7 +25,7 @@ def get_label_quality_scores( Parameters ---------- - labels : np.ndarray + labels : array_like Raw labels from original dataset. Array of shape ``(N, )`` consisting given labels, where N is number of datapoints in the regression dataset. @@ -53,6 +56,11 @@ def get_label_quality_scores( # Check if inputs are valid assert_valid_inputs(labels=labels, predictions=predictions, method=method) + # Convert to numpy array and check if they are 1-D array. + labels = np.asarray(labels) + predictions = np.asarray(predictions) + check_dimensions(labels, predictions) + scoring_funcs: Dict[str, Callable[[np.ndarray, np.ndarray], np.ndarray]] = { "residual": get_residual_score_for_each_label, "outre": get_outre_score_for_each_label, @@ -104,12 +112,13 @@ def get_residual_score_for_each_label( return label_quality_scores -# TODO - change name of function in test def get_outre_score_for_each_label( labels: np.ndarray, predictions: np.ndarray, *, - variance: float = 10, + residual_scale: float = 10, + frac_neighbors: float = 0.1, + neighbor_metric: str = "euclidean", ) -> np.ndarray: """Returns OUTRE based label-quality scores. @@ -124,8 +133,14 @@ def get_outre_score_for_each_label( predictions: np.ndarray Predicted labels in the same format as expected by the :py:func:`get_label_quality_scores ` function. - variance: float, default = 10 - Manipulates variance of the distribution of residual. + residual_scale: float, default = 10 + Manipulates scale of the distribution of residual. + + frac_neighbors: float, default = 0.1 + Fraction of datapoints that should be considered as n_neighbors to NearestNeighbors. + + neighbor_metric: str, default = "euclidean" + The parameter is passed to sklearn NearestNeighbors. # TODO add reference to sklearn.NearestNeighbor? 
Returns ------- @@ -134,14 +149,14 @@ def get_outre_score_for_each_label( Lower scores indicate more likely mislabled examples. """ residual = predictions - labels - labels = (labels - labels.mean()) / labels.std() - residual = np.sqrt(variance) * ((residual - residual.mean()) / residual.std()) + labels = (labels - labels.mean()) / (labels.std() + EPS) + residual = residual_scale * ((residual - residual.mean()) / (residual.std() + EPS)) # 2D features by combining labels and residual features = np.array([labels, residual]).T - neighbors = int(np.ceil(0.1 * labels.shape[0])) - knn = NearestNeighbors(n_neighbors=neighbors, metric="euclidean").fit(features) + neighbors = int(np.ceil(frac_neighbors * labels.shape[0])) + knn = NearestNeighbors(n_neighbors=neighbors, metric=neighbor_metric).fit(features) ood = OutOfDistribution(params={"knn": knn}) label_quality_scores = ood.score(features=features) From c078d67e3ed340e14669cf079076074e393a3b25 Mon Sep 17 00:00:00 2001 From: Mayank Kumar Date: Fri, 23 Dec 2022 22:21:46 -0800 Subject: [PATCH 051/258] Apply Docstring suggestions from code review Co-authored-by: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> --- cleanlab/regression/rank.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/cleanlab/regression/rank.py b/cleanlab/regression/rank.py index c6250ae4af..e0efe5521c 100644 --- a/cleanlab/regression/rank.py +++ b/cleanlab/regression/rank.py @@ -27,11 +27,10 @@ def get_label_quality_scores( ---------- labels : array_like Raw labels from original dataset. - Array of shape ``(N, )`` consisting given labels, where N is number of datapoints in the regression dataset. + 1D array of shape ``(N, )`` containing the given labels for each example (aka. Y-value, response, target, dependent variable, ...), where N is number of examples in the dataset. predictions : np.ndarray - Predicated labels from regressor fitted on the dataset. 
- Array of shape ``(N,)`` consisting predicted labels, where N is number of datapoints in the regression dataset. + 1D array of shape ``(N,)`` containing the predicted label for each example in the dataset. These should be out-of-sample predictions from a trained regression model, which you can obtain for every example in your dataset via :ref:`cross-validation `. method : {"residual", "outre"}, default="outre" @@ -40,7 +39,7 @@ def get_label_quality_scores( label_quality_scores: Array of shape ``(N, )`` of scores between 0 and 1, one per datapoint in the dataset. - Lower scores indicate datapoint more likely to contain a label issue. + Lower scores indicate datapoints more likely to contain a label issue. Examples -------- @@ -84,7 +83,7 @@ def get_residual_score_for_each_label( labels: np.ndarray, predictions: np.ndarray, ) -> np.ndarray: - """Returns the residual based label-quality scores for each datapoints. + """Returns a residual label-quality score for each datapoint. This is function to compute label-quality scores for regression datasets, where lower score indicate labels less likely to be correct. From d1518da1172be6e174d372c29f6b9af595df12c8 Mon Sep 17 00:00:00 2001 From: krmayankb Date: Fri, 23 Dec 2022 23:13:07 -0800 Subject: [PATCH 052/258] doctring for method modified --- cleanlab/regression/rank.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cleanlab/regression/rank.py b/cleanlab/regression/rank.py index e0efe5521c..d57cd5ffec 100644 --- a/cleanlab/regression/rank.py +++ b/cleanlab/regression/rank.py @@ -27,12 +27,13 @@ def get_label_quality_scores( ---------- labels : array_like Raw labels from original dataset. - 1D array of shape ``(N, )`` containing the given labels for each example (aka. Y-value, response, target, dependent variable, ...), where N is number of examples in the dataset. + 1D array of shape ``(N, )`` containing the given labels for each example (aka. 
Y-value, response, target, dependent variable, ...), where N is number of examples in the dataset. predictions : np.ndarray 1D array of shape ``(N,)`` containing the predicted label for each example in the dataset. These should be out-of-sample predictions from a trained regression model, which you can obtain for every example in your dataset via :ref:`cross-validation `. method : {"residual", "outre"}, default="outre" + String specifying which method to use for scoring the quality of each label and identifying which labels appear most noisy. Returns ------- From a819fe4fcbab3268ebd6e8c46fad2b22ce9b60da Mon Sep 17 00:00:00 2001 From: krmayankb Date: Fri, 23 Dec 2022 23:17:38 -0800 Subject: [PATCH 053/258] datapoint -> example --- cleanlab/regression/rank.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cleanlab/regression/rank.py b/cleanlab/regression/rank.py index d57cd5ffec..6ebe33af64 100644 --- a/cleanlab/regression/rank.py +++ b/cleanlab/regression/rank.py @@ -38,9 +38,9 @@ def get_label_quality_scores( Returns ------- label_quality_scores: - Array of shape ``(N, )`` of scores between 0 and 1, one per datapoint in the dataset. + Array of shape ``(N, )`` of scores between 0 and 1, one per example in the dataset. - Lower scores indicate datapoints more likely to contain a label issue. + Lower scores indicate examples more likely to contain a label issue. Examples -------- @@ -84,7 +84,7 @@ def get_residual_score_for_each_label( labels: np.ndarray, predictions: np.ndarray, ) -> np.ndarray: - """Returns a residual label-quality score for each datapoint. + """Returns a residual label-quality score for each example. This is function to compute label-quality scores for regression datasets, where lower score indicate labels less likely to be correct. @@ -137,7 +137,7 @@ def get_outre_score_for_each_label( Manipulates scale of the distribution of residual. 
frac_neighbors: float, default = 0.1 - Fraction of datapoints that should be considered as n_neighbors to NearestNeighbors. + Fraction of examples that should be considered as n_neighbors to NearestNeighbors. neighbor_metric: str, default = "euclidean" The parameter is passed to sklearn NearestNeighbors. # TODO add reference to sklearn.NearestNeighbor? From ac52da7294cfde7c47ca17760f85a19a5a8e9953 Mon Sep 17 00:00:00 2001 From: krmayankb Date: Sat, 24 Dec 2022 02:27:34 -0800 Subject: [PATCH 054/258] check_valid_inputs update 1. added support for list, np.ndarray, pd.Series, pd.DataFrame 2. check if inputs are numeric --- cleanlab/internal/regression_utils.py | 64 +++++++++++++++++++++++---- 1 file changed, 55 insertions(+), 9 deletions(-) diff --git a/cleanlab/internal/regression_utils.py b/cleanlab/internal/regression_utils.py index a62e85aca5..a7bfb4bbd1 100644 --- a/cleanlab/internal/regression_utils.py +++ b/cleanlab/internal/regression_utils.py @@ -3,23 +3,35 @@ """ import numpy as np +import pandas as pd +from pandas.api.types import is_numeric_dtype +from cleanlab.typing import LabelLike +from typing import Optional def assert_valid_inputs( - labels: np.ndarray, - predictions: np.ndarray, + labels: Optional[LabelLike], + predictions: Optional[LabelLike], method: str, ) -> None: """Checks that ``labels``, ``predictions``, ``method`` are correctly formatted.""" - # Check if labels and pred_labels are np.ndarray - if not isinstance(labels, np.ndarray) or not isinstance(predictions, np.ndarray): - raise TypeError("labels and predictions must be of type np.ndarray") + supported_types = (list, np.ndarray, pd.Series, pd.DataFrame) - # Check if labels and predictions are of same shape - assert ( - labels.shape == predictions.shape - ), f"shape of label {labels.shape} and predicted labels {predictions.shape} are not same." 
+ # Check if labels and predictions are of supported types + if not isinstance(labels, supported_types) and not isinstance(predictions, supported_types): + raise TypeError( + f"Expected labels and predictions to be either of {supported_types}, Got labels of type {type(labels)}, and predictions of type {type(predictions)}", + ) + + # check if labels and predictions are 1-D and numeric + check_dimension_and_datatype(check_input=labels, text = "labels") + check_dimension_and_datatype(check_input=predictions, text = "predictions") + + # check if number of examples are same. + assert len(labels) == len( + predictions + ), f"Length of labels {len(labels)} and predictions {len(predictions)} are not same." # Check if method is among allowed scoring method scoring_methods = ["residual", "outre"] @@ -29,6 +41,40 @@ def assert_valid_inputs( ) +def check_dimension_and_datatype(check_input: Optional[LabelLike], text : str): + # check if input is empty + if not len(check_input): + raise ValueError( + f"{text} is Empty, check input." + ) + + if isinstance(check_input, list): + if isinstance(check_input[0], list): + raise ValueError(f"{text} must be 1-D. List of List is not supported.") + elif not all(isinstance(x, (int, float)) for x in check_input): + raise ValueError( + f"All element of {text} must be of type numeric i.e., integer or float" + ) + + elif isinstance(check_input, pd.DataFrame): + if check_input.shape[1] != 1: + raise ValueError( + f"{text} must be 1-D. For DataFrame, second dimension must be 1, got {check_input.shape}." + ) + elif check_input.shape[1] == 1: + if not is_numeric_dtype(check_input): + raise ValueError(f"{text} must be 1-D and numeric type. 
got {check_input.dtype}.") + elif isinstance(check_input, (np.ndarray, pd.Series)): + if len(check_input.shape) != 1: + raise ValueError(f"{text} must be 1-D {type(check_input)}, got {check_input.shape}") + elif len(check_input.shape) == 1: + if isinstance(check_input, pd.Series) and not is_numeric_dtype(check_input): + raise ValueError(f"{text} must be 1-D and numeric type. got {check_input.dtype}.") + elif isinstance(check_input, np.ndarray): + if not all(isinstance(x, (int, float)) for x in check_input.tolist()): + raise ValueError(f"{text} must be 1-d and numeric type i.e., integer or float.") + + def check_dimensions(labels: np.ndarray, predictions: np.ndarray) -> None: if labels.ndim != 1: raise ValueError( From 8394ee16d6510f534592d0e918f06abe39ac75f4 Mon Sep 17 00:00:00 2001 From: krmayankb Date: Tue, 27 Dec 2022 15:51:12 -0800 Subject: [PATCH 055/258] tutorial removed --- docs/source/tutorials/index.rst | 1 - docs/source/tutorials/regression.ipynb | 497 ------------------------- 2 files changed, 498 deletions(-) delete mode 100644 docs/source/tutorials/regression.ipynb diff --git a/docs/source/tutorials/index.rst b/docs/source/tutorials/index.rst index 78e4efac5f..d040963629 100644 --- a/docs/source/tutorials/index.rst +++ b/docs/source/tutorials/index.rst @@ -14,7 +14,6 @@ Tutorials multiannotator multilabel_classification token_classification - regression pred_probs_cross_val faq diff --git a/docs/source/tutorials/regression.ipynb b/docs/source/tutorials/regression.ipynb deleted file mode 100644 index cc02658e80..0000000000 --- a/docs/source/tutorials/regression.ipynb +++ /dev/null @@ -1,497 +0,0 @@ -{ - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Label Quality Scores for Regression with Noisy Labels " - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This quickstart tutorial shows how to use cleanlab for finding label errors in regression data. 
Using the approach mentioned here, you can find label error in any regression dataset irrespective of modality i.e., tabular, text, image etc. \n", - "\n", - "**This example will take you through following:**\n", - "- Generate label quality scores for each datapoint in the dataset. \n", - "- Find label issue for regression dataset. " - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Quickstart \n", - "\n", - "Cleanlab uses two inputs to generates scores for labels in the dataset:\n", - "- `labels`: NumPy array of given labels in the dataset. labels[i] should contain label for `i`-th datapoint. \n", - "- `predictions`: NumPy array of predictions generated through your favourite regressor. predictions[i] should contain predicted value for `i`-th datapoint. \n", - "\n", - "If you already have predictions from your regressor, you can generate label quality scores for each datapoint using the code below: \n", - "\n", - "
\n", - "\n", - "```python \n", - "\n", - "from cleanlab.regression.rank import get_label_quality_scores\n", - "label_quality_scores = get_label_quality_scores(labels, predictions)\n", - "\n", - "```\n", - "
\n", - "" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 0. Visualization (can skip these details)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This is added just for reference. We will use this function to plot dataset, highlight points using label quality scores and true_errors.\n", - "You can skip this part and move to next section. " - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "
See the code for visualization **(click to expand)** \n", - "\n", - "```python \n", - "# Note: this pulldown is for docs.cleanlab.ai, if running on local Jupyter or colab, please ignore it. \n", - "\n", - "def plot_data(\n", - " data_x, data_y, circles, title, alpha=0.6, color=\"#1f77b4\", colorbar=False, xlabel=\"\", ylabel=\"\"\n", - "):\n", - " plt.figure(figsize=(14, 5))\n", - " data_x = data_x.to_numpy()\n", - " data_y = data_y.to_numpy()\n", - " plt.scatter(data_x, data_y, c=color, s=30)\n", - " for i in circles:\n", - " plt.plot(\n", - " data_x[i],\n", - " data_y[i],\n", - " \"o\",\n", - " markerfacecolor=\"none\",\n", - " markeredgecolor=\"red\",\n", - " markersize=10,\n", - " markeredgewidth=2.5,\n", - " alpha=alpha,\n", - " )\n", - " plt.title(title, fontsize=20)\n", - " plt.xlabel(xlabel)\n", - " plt.ylabel(ylabel)\n", - "\n", - " if colorbar:\n", - " plt.colorbar(orientation=\"vertical\")\n", - "\n", - "```\n", - "
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def plot_data(\n", - " data_x, data_y, circles, title, alpha=0.6, color=\"#1f77b4\", colorbar=False, xlabel=\"\", ylabel=\"\"\n", - "):\n", - " plt.figure(figsize=(14, 5))\n", - " data_x = data_x.to_numpy()\n", - " data_y = data_y.to_numpy()\n", - " plt.scatter(data_x, data_y, c=color, s=30)\n", - " for i in circles:\n", - " plt.plot(\n", - " data_x[i],\n", - " data_y[i],\n", - " \"o\",\n", - " markerfacecolor=\"none\",\n", - " markeredgecolor=\"red\",\n", - " markersize=10,\n", - " markeredgewidth=2.5,\n", - " alpha=alpha,\n", - " )\n", - " plt.title(title, fontsize=20)\n", - " plt.xlabel(xlabel)\n", - " plt.ylabel(ylabel)\n", - "\n", - " if colorbar:\n", - " plt.colorbar(orientation=\"vertical\")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1. Install dependencies and import them \n", - "You can use `pip` to install all packages required for this tutorial as follows:\n", - "\n", - "`!pip install cleanlab xgboost`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!pip install cleanlab xgboost" - ] - }, - { - "cell_type": "code", - "execution_count": 94, - "metadata": { - "nbsphinx": "hidden" - }, - "outputs": [], - "source": [ - "# Package installation (hidden on docs website).\n", - "# Package versions we used: xgboost==1.7.2\n", - "\n", - "dependencies = [\"cleanlab\", \"xgboost\"]\n", - "\n", - "if \"google.colab\" in str(get_ipython()): # Check if it's running in Google Colab\n", - " %pip install cleanlab # for colab\n", - " cmd = \" \".join([dep for dep in dependencies if dep != \"cleanlab\"])\n", - " %pip install $cmd\n", - "else:\n", - " missing_dependencies = []\n", - " for dependency in dependencies:\n", - " try:\n", - " __import__(dependency)\n", - " except ImportError:\n", - " missing_dependencies.append(dependency)\n", 
- "\n", - " if len(missing_dependencies) > 0:\n", - " print(\"Missing required dependencies:\")\n", - " print(*missing_dependencies, sep=\", \")\n", - " print(\"\\nPlease install them before running the rest of this notebook.\")" - ] - }, - { - "cell_type": "code", - "execution_count": 95, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "from cleanlab.regression.rank import get_label_quality_scores\n", - "import xgboost as xgb\n", - "import matplotlib.pyplot as plt\n", - "\n", - "np.set_printoptions(suppress=True)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 2. Import dataset and Generate predictions" - ] - }, - { - "cell_type": "code", - "execution_count": 107, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
exam_1exam_2exam_3notestrue_labelslabels
0537793NaN76.276.2
1816480great participation +1085.585.5
2748897NaN87.487.4
3619478NaN77.777.7
4489091NaN77.877.8
\n", - "
" - ], - "text/plain": [ - " exam_1 exam_2 exam_3 notes true_labels labels\n", - "0 53 77 93 NaN 76.2 76.2\n", - "1 81 64 80 great participation +10 85.5 85.5\n", - "2 74 88 97 NaN 87.4 87.4\n", - "3 61 94 78 NaN 77.7 77.7\n", - "4 48 90 91 NaN 77.8 77.8" - ] - }, - "execution_count": 107, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "path = \"/Users/krmayank/Desktop/Work/cleanlab/experiments/student_score_regression.csv\"\n", - "data = pd.read_csv(path, index_col=0)\n", - "data.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 108, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# Generate true errors\n", - "true_errors = np.where(data.labels != data.true_labels)[0]\n", - "plot_data(\n", - " data_x=data[\"exam_3\"],\n", - " data_y=data[\"labels\"],\n", - " circles=true_errors,\n", - " title=\"Messy Regression dataset\",\n", - " xlabel=\"exam_3 feature\",\n", - " ylabel=\"label (Y value)\",\n", - ")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In the dataframe displayed above, `labels` represents the noisy labels and `true_labels` represents the ground truth. Please note that, ground truth are usually not available in real dataset, we have added it here for comparision and to demonstrate our method. `notes` also has text information, we will model this a categorical variable. \n", - "\n", - "We will use `xgboost` as regressor for this tutorial. xgboost provides easy to use interface to process categorical variable. This is demonstrated in the code below:" - ] - }, - { - "cell_type": "code", - "execution_count": 109, - "metadata": {}, - "outputs": [], - "source": [ - "# XGBOOST automatically factors categorical variable, you just need to mark the columns as category\n", - "data.notes = data.notes.astype(\"category\")\n", - "\n", - "# XGBOOST takes data and label seperately, so you will need to divide data accordingly.\n", - "X = data.drop([\"labels\", \"true_labels\"], axis=1)\n", - "y = data[\"labels\"]\n", - "\n", - "# convert data to format \"DMatrix\" to make it compatible with XGBOOST.\n", - "xgboost_data = xgb.DMatrix(data=X, label=y, enable_categorical=True)\n", - "\n", - "# declare parameters and train the model.\n", - "params = {\"booster\": \"gblinear\", \"objective\": \"reg:squarederror\"}\n", - "boost = xgb.train(params=params, dtrain=xgboost_data, num_boost_round=50)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 3. 
Using cleanlab to generate label quality scores" - ] - }, - { - "cell_type": "code", - "execution_count": 110, - "metadata": {}, - "outputs": [], - "source": [ - "# using trained xgboost model to get predictions\n", - "predictions = boost.predict(xgboost_data)\n", - "\n", - "# get label quality score for each example in the dataset using cleanlab\n", - "label_quality_scores = get_label_quality_scores(labels=np.array(y), predictions=predictions)" - ] - }, - { - "cell_type": "code", - "execution_count": 111, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "plot_data(\n", - " data_x=data[\"exam_3\"],\n", - " data_y=data[\"labels\"],\n", - " circles=true_errors,\n", - " color=label_quality_scores,\n", - " title=\"Messy Regression dataset with label quality scores\",\n", - " colorbar=True,\n", - " xlabel=\"exam_3 feature\",\n", - " ylabel=\"label (Y value)\",\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In the above plot, we have colored each datapoint considering its label quality score. \\\n", - "Datapoints in the plot are same as earlier plot in the notebook. \\\n", - "**Red circle** represents that these datapoint were incorrectly marked when compared to true_label. \n", - "\n", - "Low scores for datapoints marked in **Red circle** and High scores for other datapoints justifies that method can identify the errors in the dataset. " - ] - }, - { - "cell_type": "code", - "execution_count": 112, - "metadata": { - "nbsphinx": "hidden" - }, - "outputs": [], - "source": [ - "# Note: This cell is only for docs.cleanlab.ai, if running on local Jupyter or Colab, please ignore it.\n", - "from sklearn.metrics import roc_auc_score\n", - "\n", - "label_quality_scores = get_label_quality_scores(np.array(y), predictions=predictions)\n", - "\n", - "true_errors = (data.labels != data.true_labels).astype(int)\n", - "label_quality_scores_residual = get_label_quality_scores(\n", - " np.array(y), predictions=predictions, method=\"residual\"\n", - ")\n", - "\n", - "if roc_auc_score(true_errors, 1 - label_quality_scores) < 0.5:\n", - " raise ValueError(\"Label quality scores did not perform well enough\")\n", - "\n", - "if roc_auc_score(true_errors, 1 - label_quality_scores) <= roc_auc_score(\n", - " true_errors, 1 - label_quality_scores_residual\n", - "):\n", - " raise ValueError(\"Label quality scores did not outperform alternative scores\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - 
"metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3.10.8 ('ENV': venv)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.8" - }, - "orig_nbformat": 4, - "vscode": { - "interpreter": { - "hash": "1ed33b5e6ac3d9870092cd802185bba6fb7a8302b6022e7097221f18c33cb7b2" - } - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} From 569b2ff16377bfe8968fde74692a45e3bc161f6d Mon Sep 17 00:00:00 2001 From: krmayankb Date: Tue, 27 Dec 2022 22:45:55 -0800 Subject: [PATCH 056/258] support for array_like --- cleanlab/internal/regression_utils.py | 126 +++++++++++++------------- cleanlab/regression/rank.py | 15 +-- 2 files changed, 70 insertions(+), 71 deletions(-) diff --git a/cleanlab/internal/regression_utils.py b/cleanlab/internal/regression_utils.py index a7bfb4bbd1..4e396f2d55 100644 --- a/cleanlab/internal/regression_utils.py +++ b/cleanlab/internal/regression_utils.py @@ -3,84 +3,88 @@ """ import numpy as np -import pandas as pd -from pandas.api.types import is_numeric_dtype -from cleanlab.typing import LabelLike -from typing import Optional +from numpy.typing import ArrayLike +from typing import Tuple, Optional def assert_valid_inputs( - labels: Optional[LabelLike], - predictions: Optional[LabelLike], + labels: ArrayLike, + predictions: ArrayLike, method: str, -) -> None: +) -> Tuple[np.ndarray, np.ndarray]: """Checks that ``labels``, ``predictions``, ``method`` are correctly formatted.""" - supported_types = (list, np.ndarray, pd.Series, pd.DataFrame) + # Load array_like input as numpy array. If not raise error. 
+ try: + labels = np.asarray(labels) + except: + raise ValueError(f"labels must be array_like.") - # Check if labels and predictions are of supported types - if not isinstance(labels, supported_types) and not isinstance(predictions, supported_types): - raise TypeError( - f"Expected labels and predictions to be either of {supported_types}, Got labels of type {type(labels)}, and predictions of type {type(predictions)}", - ) + try: + predictions = np.asarray(predictions) + except: + raise ValueError(f"predictions must be array_like.") + + # Check if labels and predictions are 1-D and numeric + valid_labels = check_dimension_and_datatype(check_input=labels, text="labels") + valid_predictions = check_dimension_and_datatype(check_input=predictions, text="predictions") - # check if labels and predictions are 1-D and numeric - check_dimension_and_datatype(check_input=labels, text = "labels") - check_dimension_and_datatype(check_input=predictions, text = "predictions") + # Check if number of examples are same. + assert ( + valid_labels.shape == valid_predictions.shape + ), f"Number of examples in labels {labels.shape} and predictions {predictions.shape} are not same." - # check if number of examples are same. - assert len(labels) == len( - predictions - ), f"Length of labels {len(labels)} and predictions {len(predictions)} are not same." + # Check if inputs have missing values + check_missing_values(valid_labels, text="labels") + check_missing_values(valid_predictions, text="predictions") # Check if method is among allowed scoring method scoring_methods = ["residual", "outre"] if method not in scoring_methods: raise ValueError( - f"Passed method is not among allowed method. Expected either of {scoring_methods}, got {method}" + f"Passed method is not among allowed methods. Expected either of {scoring_methods}, got {method}." 
) + # return 1-D numpy array + return valid_labels, valid_predictions -def check_dimension_and_datatype(check_input: Optional[LabelLike], text : str): - # check if input is empty - if not len(check_input): - raise ValueError( - f"{text} is Empty, check input." - ) - if isinstance(check_input, list): - if isinstance(check_input[0], list): - raise ValueError(f"{text} must be 1-D. List of List is not supported.") - elif not all(isinstance(x, (int, float)) for x in check_input): - raise ValueError( - f"All element of {text} must be of type numeric i.e., integer or float" - ) - - elif isinstance(check_input, pd.DataFrame): - if check_input.shape[1] != 1: - raise ValueError( - f"{text} must be 1-D. For DataFrame, second dimension must be 1, got {check_input.shape}." - ) - elif check_input.shape[1] == 1: - if not is_numeric_dtype(check_input): - raise ValueError(f"{text} must be 1-D and numeric type. got {check_input.dtype}.") - elif isinstance(check_input, (np.ndarray, pd.Series)): - if len(check_input.shape) != 1: - raise ValueError(f"{text} must be 1-D {type(check_input)}, got {check_input.shape}") - elif len(check_input.shape) == 1: - if isinstance(check_input, pd.Series) and not is_numeric_dtype(check_input): - raise ValueError(f"{text} must be 1-D and numeric type. got {check_input.dtype}.") - elif isinstance(check_input, np.ndarray): - if not all(isinstance(x, (int, float)) for x in check_input.tolist()): - raise ValueError(f"{text} must be 1-d and numeric type i.e., integer or float.") - - -def check_dimensions(labels: np.ndarray, predictions: np.ndarray) -> None: - if labels.ndim != 1: - raise ValueError( - f"labels have dimensions {labels.ndim}, Expected 1-D array as input for labels" - ) - if predictions.ndim != 1: +def check_dimension_and_datatype(check_input: np.ndarray, text: str) -> np.ndarray: + """ + Raises errors related to: + 1. If input is empty + 2. If input is not 1-D + 3. 
If input is not numeric + + If all the checks are passed, it returns the squeezed 1-D array required by the main algorithm. + """ + + assert isinstance( + check_input, np.ndarray + ), f"{text} could not be converted to numpy array, check input." + + # Check if input is empty + if not check_input.size: + raise ValueError(f"{text} is Empty, check input.") + + # Remove axis with length one + check_input = np.squeeze(check_input) + + # Check if input is 1-D + if check_input.ndim != 1: raise ValueError( - f"predictions have dimensions {labels.ndim}, Expected 1-D array as input for predictions" + f"Expected 1-Dimensional inputs for {text}, got {check_input.ndim} dimensions." ) + + # Check if datatype is numeric + if not np.issubdtype(check_input.dtype, np.number): + raise ValueError(f"Expected {text} to be Numeric, got {check_input.dtype}.") + + return check_input + + +def check_missing_values(check_input: np.ndarray, text: str): + """Raise error if there are any missing values in Numpy array.""" + + if np.isnan(check_input).any(): + raise ValueError(f"{text} has missing values, check input.") diff --git a/cleanlab/regression/rank.py b/cleanlab/regression/rank.py index 6ebe33af64..cd6c474c93 100644 --- a/cleanlab/regression/rank.py +++ b/cleanlab/regression/rank.py @@ -1,9 +1,9 @@ import numpy as np from cleanlab.outlier import OutOfDistribution from sklearn.neighbors import NearestNeighbors -from cleanlab.internal.regression_utils import assert_valid_inputs, check_dimensions +from cleanlab.internal.regression_utils import assert_valid_inputs from typing import Dict, Callable, Optional -from cleanlab.typing import LabelLike +from numpy.typing import ArrayLike """ Generates label quality scores for every sample in regression dataset """ @@ -11,8 +11,8 @@ def get_label_quality_scores( - labels: Optional[LabelLike], - predictions: Optional[LabelLike], + labels: ArrayLike, + predictions: ArrayLike, *, method: str = "outre", ) -> np.ndarray: @@ -54,12 +54,7 @@ def 
get_label_quality_scores( """ # Check if inputs are valid - assert_valid_inputs(labels=labels, predictions=predictions, method=method) - - # Convert to numpy array and check if they are 1-D array. - labels = np.asarray(labels) - predictions = np.asarray(predictions) - check_dimensions(labels, predictions) + labels, predictions = assert_valid_inputs(labels=labels, predictions=predictions, method=method) scoring_funcs: Dict[str, Callable[[np.ndarray, np.ndarray], np.ndarray]] = { "residual": get_residual_score_for_each_label, From 86532b03c1c4cb46dfbf6e412fa4b474a9de4f6c Mon Sep 17 00:00:00 2001 From: krmayankb Date: Tue, 27 Dec 2022 22:47:13 -0800 Subject: [PATCH 057/258] unit tests to factor array_like --- tests/test_regression.py | 55 +++++++++++++++++++++++++++++++++------- 1 file changed, 46 insertions(+), 9 deletions(-) diff --git a/tests/test_regression.py b/tests/test_regression.py index dc29315f96..355752dee3 100644 --- a/tests/test_regression.py +++ b/tests/test_regression.py @@ -1,13 +1,25 @@ import numpy as np -import pandas as pd + +# import pandas as pd import pytest +from typing import Union, Sequence from cleanlab.regression import rank +ArrayLike = Union[np.ndarray, Sequence] + # To be used for all the tests labels = np.array([1, 2, 3, 4]) predictions = np.array([1, 3, 4, 5]) +# Inputs that are not array like +aConstant = 1 +aString = "predictions_non_array" +aDict = {"labels": [1, 2], "predictions": [2, 3]} +aSet = {1, 2, 3, 4} +aBool = True + + # test with deafault parameters def test_output_shape_type(): scores = rank.get_label_quality_scores(labels=labels, predictions=predictions) @@ -15,18 +27,43 @@ def test_output_shape_type(): assert isinstance(scores, np.ndarray) -# test for acceptable datatypes -@pytest.mark.parametrize("format", [pd.Series, pd.DataFrame, list]) -def test_type_error_for_input_types(format): - with pytest.raises(TypeError) as error: - _ = rank.get_label_quality_scores(labels=format(labels), predictions=format(predictions)) 
+@pytest.mark.parametrize( + "aInput", + [aConstant, aString, aDict, aSet, aBool], +) +def test_labels_are_arraylike(aInput): + with pytest.raises(ValueError) as error: + rank.get_label_quality_scores(labels=aInput, predictions=predictions) + assert error.type == ValueError + + +@pytest.mark.parametrize( + "aInput", + [aConstant, aString, aDict, aSet, aBool], +) +def test_predictionns_are_arraylike(aInput): + with pytest.raises(ValueError) as error: + rank.get_label_quality_scores(labels=labels, predictions=aInput) + assert error.type == ValueError # test for input shapes -def test_assertion_error_for_input_shape(): +def test_input_shape_labels(): + with pytest.raises(AssertionError) as error: + rank.get_label_quality_scores(labels=labels[:-1], predictions=predictions) + assert ( + str(error.value) + == f"Number of examples in labels {labels[:-1].shape} and predictions {predictions.shape} are not same." + ) + + +def test_input_shape_predictions(): with pytest.raises(AssertionError) as error: - _ = rank.get_label_quality_scores(labels=labels[:-1], predictions=predictions) - _ = rank.get_label_quality_scores(labels=labels, predictions=predictions[:-1]) + rank.get_label_quality_scores(labels=labels, predictions=predictions[:-1]) + assert ( + str(error.value) + == f"Number of examples in labels {labels.shape} and predictions {predictions[:-1].shape} are not same." 
+ ) # test individual scoring functions From cb596a94ece1e821877f4fc31ecfd99007947c93 Mon Sep 17 00:00:00 2001 From: Mayank Kumar Date: Tue, 27 Dec 2022 23:58:51 -0800 Subject: [PATCH 058/258] Update docs/source/tutorials/index.rst Co-authored-by: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> --- docs/source/tutorials/index.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/source/tutorials/index.rst b/docs/source/tutorials/index.rst index d040963629..e0d63a7e06 100644 --- a/docs/source/tutorials/index.rst +++ b/docs/source/tutorials/index.rst @@ -16,4 +16,3 @@ Tutorials token_classification pred_probs_cross_val faq - From 27ccc26301891f932cec2c511e941087b32e89ef Mon Sep 17 00:00:00 2001 From: krmayankb Date: Wed, 28 Dec 2022 17:15:06 -0800 Subject: [PATCH 059/258] merge master to regression --- docs/source/tutorials/student_grades.csv | 980 +++++++++++++++++++++++ 1 file changed, 980 insertions(+) create mode 100644 docs/source/tutorials/student_grades.csv diff --git a/docs/source/tutorials/student_grades.csv b/docs/source/tutorials/student_grades.csv new file mode 100644 index 0000000000..347454bb87 --- /dev/null +++ b/docs/source/tutorials/student_grades.csv @@ -0,0 +1,980 @@ +,exam_1,exam_2,exam_3,notes,true_grade,grade +0,53,77,93,,76.2,76.2 +1,81,64,80,great participation +10,85.5,85.5 +2,74,88,97,,87.4,87.4 +3,61,94,78,,77.7,77.7 +4,48,90,91,,77.8,77.8 +5,89,95,72,,84.0,84.0 +6,0,83,97,"cheated on exam, gets 0pts",63.7,63.7 +7,71,82,97,,84.7,84.7 +8,0,56,96,"cheated on exam, gets 0pts",55.2,55.2 +9,75,80,98,missed class frequently -10,75.7,75.7 +10,81,65,82,,76.6,76.6 +11,95,86,53,missed homework frequently -10,65.5,65.5 +12,89,72,98,missed homework frequently -10,77.5,77.5 +13,80,59,65,,67.7,77.7 +14,67,82,98,,83.9,83.9 +15,87,78,95,,87.5,87.5 +16,89,69,98,great participation +10,96.6,96.6 +17,53,72,46,missed class frequently -10,45.9,45.9 +18,46,84,89,missed homework frequently -10,64.6,64.6 +19,70,63,95,great participation 
+10,87.9,87.9 +20,63,93,94,,84.4,84.4 +21,90,75,90,missed homework frequently -10,75.5,75.5 +22,72,81,95,missed class frequently -10,73.9,73.9 +23,90,83,51,,72.3,72.3 +24,99,53,76,,76.0,76.0 +25,85,49,83,,73.4,73.4 +26,89,71,94,great final presentation +10,95.6,95.6 +27,86,80,89,,85.4,85.4 +28,75,94,79,,82.3,82.3 +29,61,95,94,,84.4,84.4 +30,74,64,91,,77.8,77.8 +31,98,92,96,,95.4,95.4 +32,95,83,78,,84.6,84.6 +33,92,63,91,missed homework frequently -10,72.9,72.9 +34,91,63,48,great participation +10,75.4,75.4 +35,69,94,95,great participation +10,96.9,96.9 +36,81,90,90,missed homework frequently -10,77.3,77.3 +37,98,69,90,,86.1,86.1 +38,82,94,99,missed class frequently -10,82.4,82.4 +39,73,57,96,,77.4,77.4 +40,98,73,86,great final presentation +10,95.7,95.7 +41,0,81,99,"cheated on exam, gets 0pts",63.9,63.9 +42,0,89,100,"cheated on exam, gets 0pts",66.7,66.7 +43,62,83,81,,75.9,75.9 +44,78,84,92,great final presentation +10,95.4,95.4 +45,79,84,92,,85.7,85.7 +46,65,69,92,missed homework frequently -10,67.0,67.0 +47,98,82,47,great final presentation +10,82.8,82.8 +48,92,0,89,"cheated on exam, gets 0pts",63.2,63.2 +49,84,46,95,,77.0,77.0 +50,95,85,98,,93.2,93.2 +51,86,88,90,great final presentation +10,98.2,98.2 +52,65,58,91,,73.3,73.3 +53,79,83,88,,83.8,83.8 +54,0,95,87,"cheated on exam, gets 0pts",63.3,63.3 +55,90,57,97,,82.9,82.9 +56,72,69,81,missed homework frequently -10,64.7,64.7 +57,70,85,96,,84.9,84.9 +58,89,59,79,missed homework frequently -10,66.0,56.0 +59,75,83,69,,75.0,75.0 +60,89,70,90,missed homework frequently -10,73.7,73.7 +61,81,85,95,,87.8,87.8 +62,76,70,73,great participation +10,83.0,83.0 +63,92,62,66,great participation +10,82.6,82.6 +64,94,98,99,,97.2,97.2 +65,96,99,96,great final presentation +10,100.0,100.0 +66,65,85,80,great participation +10,87.0,87.0 +67,90,79,85,great participation +10,94.7,94.7 +68,0,87,77,"cheated on exam, gets 0pts",56.9,56.9 +69,91,75,95,great participation +10,97.8,97.8 +70,88,77,92,,86.3,86.3 +71,86,64,96,,83.4,83.4 
+72,88,67,74,,76.1,76.1 +73,76,59,65,,66.5,66.5 +74,96,68,59,great participation +10,82.8,82.8 +75,84,98,51,,75.0,75.0 +76,97,85,75,,84.6,84.6 +77,97,98,80,great final presentation +10,100.0,100.0 +78,89,74,71,,77.3,77.3 +79,72,90,93,,85.8,85.8 +80,49,0,78,"cheated on exam, gets 0pts",45.9,45.9 +81,88,93,78,great participation +10,95.5,95.5 +82,88,80,82,great participation +10,93.2,103.2 +83,75,91,68,great final presentation +10,87.0,87.0 +84,68,99,91,,86.5,86.5 +85,92,87,71,,82.1,72.1 +86,83,97,75,great final presentation +10,94.0,94.0 +87,87,100,94,,93.7,93.7 +88,86,95,77,,85.1,85.1 +89,79,0,76,"cheated on exam, gets 0pts",54.1,54.1 +90,35,100,61,great participation +10,74.9,74.9 +91,99,83,44,great final presentation +10,82.2,82.2 +92,96,77,79,missed class frequently -10,73.5,83.5 +93,78,94,81,,84.0,84.0 +94,58,76,94,great participation +10,87.8,87.8 +95,70,0,63,"cheated on exam, gets 0pts",46.2,46.2 +96,75,46,48,great participation +10,65.5,65.5 +97,77,84,95,,86.3,86.3 +98,81,56,54,,62.7,62.7 +99,76,96,78,great final presentation +10,92.8,102.8 +100,99,53,73,missed homework frequently -10,64.8,64.8 +101,93,91,84,great final presentation +10,98.8,98.8 +102,73,87,86,missed class frequently -10,72.4,72.4 +103,85,99,71,,83.6,83.6 +104,82,79,64,,73.9,73.9 +105,100,100,81,missed homework frequently -10,82.4,82.4 +106,99,87,99,great final presentation +10,100.0,100.0 +107,0,84,93,"cheated on exam, gets 0pts",62.4,62.4 +108,90,81,79,,82.9,82.9 +109,78,0,76,"cheated on exam, gets 0pts",53.8,53.8 +110,92,73,89,,85.1,85.1 +111,74,77,79,missed homework frequently -10,66.9,66.9 +112,98,93,92,,94.1,94.1 +113,87,82,56,,73.1,73.1 +114,96,56,70,,73.6,73.6 +115,85,88,27,,62.7,62.7 +116,81,89,78,missed homework frequently -10,72.2,62.2 +117,77,81,88,great participation +10,92.6,92.6 +118,78,53,96,missed homework frequently -10,67.7,67.7 +119,71,57,96,great participation +10,86.8,86.8 +120,58,97,89,great participation +10,92.1,92.1 +121,53,99,69,missed class frequently 
-10,63.2,73.2 +122,96,82,86,great participation +10,97.8,97.8 +123,71,90,92,great final presentation +10,95.1,95.1 +124,0,81,97,"cheated on exam, gets 0pts",63.1,63.1 +125,74,94,81,missed homework frequently -10,72.8,72.8 +126,73,73,76,missed class frequently -10,64.2,64.2 +127,0,78,80,"cheated on exam, gets 0pts",55.4,55.4 +128,86,92,86,great participation +10,97.8,97.8 +129,97,94,87,great final presentation +10,100.0,100.0 +130,92,81,97,great final presentation +10,100.0,100.0 +131,93,0,60,"cheated on exam, gets 0pts",51.9,51.9 +132,82,0,79,"cheated on exam, gets 0pts",56.2,56.2 +133,99,62,89,,83.9,83.9 +134,75,0,55,"cheated on exam, gets 0pts",44.5,44.5 +135,77,70,84,,77.7,77.7 +136,84,66,49,great final presentation +10,74.6,74.6 +137,76,81,96,,85.5,85.5 +138,68,63,96,missed class frequently -10,67.7,67.7 +139,71,54,99,missed homework frequently -10,67.1,67.1 +140,84,92,75,missed homework frequently -10,72.8,72.8 +141,90,84,97,great participation +10,100.0,110.0 +142,61,100,94,,85.9,85.9 +143,73,56,91,missed class frequently -10,65.1,65.1 +144,83,88,57,missed homework frequently -10,64.1,74.1 +145,69,77,57,great participation +10,76.6,76.6 +146,79,77,95,great participation +10,94.8,94.8 +147,0,97,90,"cheated on exam, gets 0pts",65.1,65.1 +148,58,68,75,,67.8,67.8 +149,63,82,53,,64.7,64.7 +150,76,78,70,,74.2,74.2 +151,0,82,77,"cheated on exam, gets 0pts",55.4,55.4 +152,91,79,93,great final presentation +10,98.2,98.2 +153,71,65,81,great participation +10,83.2,83.2 +154,71,0,75,"cheated on exam, gets 0pts",51.3,51.3 +155,97,57,67,great participation +10,83.0,83.0 +156,98,77,86,,86.9,86.9 +157,56,80,79,,72.4,72.4 +158,99,90,90,,92.7,92.7 +159,80,83,87,,83.7,83.7 +160,79,75,74,,75.8,75.8 +161,95,81,76,,83.2,83.2 +162,72,87,90,missed class frequently -10,73.7,73.7 +163,72,0,86,"cheated on exam, gets 0pts",56.0,66.0 +164,91,0,94,"cheated on exam, gets 0pts",64.9,64.9 +165,73,84,68,great participation +10,84.3,74.3 +166,0,96,90,"cheated on exam, gets 0pts",64.8,64.8 
+167,98,89,71,missed class frequently -10,74.5,74.5 +168,93,65,96,missed class frequently -10,75.8,75.8 +169,79,91,89,missed homework frequently -10,76.6,76.6 +170,89,93,79,,86.2,86.2 +171,96,90,77,,86.6,86.6 +172,0,65,87,"cheated on exam, gets 0pts",54.3,54.3 +173,92,86,78,,84.6,84.6 +174,56,52,78,,63.6,63.6 +175,74,0,85,"cheated on exam, gets 0pts",56.2,66.2 +176,73,80,95,,83.9,83.9 +177,69,93,84,,82.2,82.2 +178,91,99,90,,93.0,93.0 +179,73,92,91,great final presentation +10,95.9,95.9 +180,94,41,91,great participation +10,86.9,86.9 +181,91,96,92,,92.9,92.9 +182,69,94,58,missed homework frequently -10,62.1,62.1 +183,94,79,89,,87.5,87.5 +184,0,92,98,"cheated on exam, gets 0pts",66.8,66.8 +185,89,92,79,,85.9,85.9 +186,92,86,81,,85.8,85.8 +187,82,85,89,,85.7,85.7 +188,82,77,92,missed class frequently -10,74.5,74.5 +189,99,65,87,,84.0,84.0 +190,73,95,67,,77.2,77.2 +191,98,94,90,great participation +10,100.0,100.0 +192,95,78,60,great participation +10,85.9,85.9 +193,90,67,77,missed class frequently -10,67.9,67.9 +194,96,0,90,"cheated on exam, gets 0pts",64.8,64.8 +195,86,80,89,,85.4,85.4 +196,51,78,84,,72.3,72.3 +197,99,75,88,,87.4,87.4 +198,93,58,70,great final presentation +10,83.3,83.3 +199,95,0,72,"cheated on exam, gets 0pts",57.3,57.3 +200,88,71,93,,84.9,84.9 +201,91,92,70,,82.9,82.9 +202,98,92,69,,84.6,74.6 +203,90,0,100,"cheated on exam, gets 0pts",67.0,57.0 +204,89,70,74,,77.3,77.3 +205,97,82,80,,85.7,85.7 +206,78,81,74,missed homework frequently -10,67.3,67.3 +207,77,87,83,,82.4,82.4 +208,86,99,97,,94.3,84.3 +209,92,92,85,great final presentation +10,99.2,99.2 +210,86,80,66,,76.2,76.2 +211,59,70,86,,73.1,73.1 +212,93,75,91,,86.8,86.8 +213,76,54,65,,65.0,65.0 +214,72,78,69,missed homework frequently -10,62.6,62.6 +215,58,90,80,,76.4,76.4 +216,77,84,86,,82.7,82.7 +217,92,66,65,missed homework frequently -10,63.400000000000000,63.400000000000000 +218,88,83,86,great final presentation +10,95.7,95.7 +219,65,0,88,"cheated on exam, gets 0pts",54.7,64.7 
+220,89,72,60,great final presentation +10,82.3,82.3 +221,84,79,97,great final presentation +10,97.7,97.7 +222,96,64,65,,74.0,74.0 +223,96,75,92,great final presentation +10,98.1,98.1 +224,62,92,76,,76.6,76.6 +225,94,62,66,,73.2,73.2 +226,85,74,92,,84.5,84.5 +227,73,90,68,missed homework frequently -10,66.1,66.1 +228,81,76,97,great participation +10,95.9,95.9 +229,96,87,80,missed class frequently -10,76.9,76.9 +230,76,65,59,great final presentation +10,75.9,85.9 +231,83,85,90,great participation +10,96.4,96.4 +232,85,100,79,missed class frequently -10,77.1,77.1 +233,75,77,73,missed homework frequently -10,64.8,64.8 +234,84,94,47,missed homework frequently -10,62.2,62.2 +235,84,96,73,great participation +10,93.2,93.2 +236,65,85,50,,65.0,65.0 +237,90,54,80,,75.2,75.2 +238,68,99,99,great final presentation +10,99.7,99.7 +239,95,88,69,,82.5,82.5 +240,0,97,85,"cheated on exam, gets 0pts",63.1,63.1 +241,89,93,70,missed homework frequently -10,72.6,72.6 +242,68,83,76,,75.7,75.7 +243,79,81,89,great final presentation +10,93.6,93.6 +244,73,95,93,,87.6,87.6 +245,87,93,91,great participation +10,100.0,100.0 +246,86,71,68,,74.3,74.3 +247,95,96,64,,82.9,82.9 +248,99,95,67,,85.0,85.0 +249,75,78,97,,84.7,84.7 +250,85,72,70,,75.1,75.1 +251,89,74,85,missed class frequently -10,72.9,72.9 +252,74,94,84,great participation +10,94.0,94.0 +253,95,75,97,great final presentation +10,99.8,99.8 +254,83,93,77,,83.6,73.6 +255,78,58,86,great final presentation +10,85.2,85.2 +256,88,56,80,,75.2,75.2 +257,94,94,72,great participation +10,95.2,95.2 +258,84,87,96,great final presentation +10,99.7,99.7 +259,99,76,95,great participation +10,100.0,100.0 +260,96,94,89,,92.6,92.6 +261,67,80,51,,64.5,64.5 +262,91,92,74,,84.5,84.5 +263,77,96,78,,83.1,83.1 +264,99,86,95,missed class frequently -10,83.5,73.5 +265,72,82,100,great participation +10,96.2,96.2 +266,89,99,73,great final presentation +10,95.6,95.6 +267,67,76,33,,56.1,56.1 +268,46,70,77,,65.6,65.6 +269,46,92,66,,67.8,57.8 +270,87,78,82,missed 
homework frequently -10,72.3,72.3 +271,80,98,84,great participation +10,97.0,97.0 +272,91,92,95,missed homework frequently -10,82.9,82.9 +273,80,60,80,,74.0,74.0 +274,88,75,59,missed class frequently -10,62.5,62.5 +275,93,73,97,great final presentation +10,98.6,98.6 +276,86,67,76,missed homework frequently -10,66.3,66.3 +277,59,94,80,missed homework frequently -10,67.9,67.9 +278,61,77,88,,76.6,86.6 +279,88,56,82,great participation +10,86.0,86.0 +280,86,89,85,great final presentation +10,96.5,96.5 +281,72,90,98,,87.8,87.8 +282,68,93,73,,77.5,77.5 +283,94,93,91,missed homework frequently -10,82.5,82.5 +284,91,98,97,great participation +10,100.0,100.0 +285,91,0,89,"cheated on exam, gets 0pts",62.9,62.9 +286,58,54,37,,48.4,48.4 +287,93,88,79,missed homework frequently -10,75.9,75.9 +288,0,72,88,"cheated on exam, gets 0pts",56.8,56.8 +289,61,76,87,,75.9,65.9 +290,84,97,99,,93.9,93.9 +291,86,98,95,,93.2,103.2 +292,80,76,66,,73.2,73.2 +293,73,86,40,great final presentation +10,73.7,73.7 +294,0,50,85,"cheated on exam, gets 0pts",49.0,49.0 +295,73,64,87,missed homework frequently -10,65.9,75.9 +296,95,76,41,,67.7,77.7 +297,0,69,90,"cheated on exam, gets 0pts",56.7,56.7 +298,86,45,86,great final presentation +10,83.7,83.7 +299,64,93,70,,75.1,75.1 +300,87,57,76,,73.6,63.600000000000000 +301,61,65,100,missed homework frequently -10,67.8,67.8 +302,87,56,85,,76.9,76.9 +303,83,77,72,missed homework frequently -10,66.8,66.8 +304,78,89,81,,82.5,82.5 +305,95,94,89,,92.3,92.3 +306,84,96,91,great final presentation +10,100.0,90.0 +307,67,93,93,,85.2,95.2 +308,77,79,94,,84.4,84.4 +309,86,74,61,,72.4,72.4 +310,0,71,80,"cheated on exam, gets 0pts",53.3,53.3 +311,79,62,82,,75.1,75.1 +312,66,72,83,missed homework frequently -10,64.6,64.6 +313,79,89,86,great final presentation +10,94.8,94.8 +314,100,98,96,missed homework frequently -10,87.8,87.8 +315,73,82,69,missed class frequently -10,64.1,64.1 +316,76,84,67,,74.8,74.8 +317,0,84,72,"cheated on exam, gets 0pts",54.0,54.0 
+318,0,92,56,"cheated on exam, gets 0pts",50.0,50.0 +319,99,83,88,great participation +10,99.8,99.8 +320,91,61,100,,85.6,85.6 +321,83,52,88,missed homework frequently -10,65.7,65.7 +322,83,93,54,,74.4,74.4 +323,0,93,88,"cheated on exam, gets 0pts",63.1,63.1 +324,95,82,54,,74.7,74.7 +325,86,67,98,,85.1,85.1 +326,91,89,83,missed class frequently -10,77.2,77.2 +327,100,73,84,missed homework frequently -10,75.5,75.5 +328,52,95,72,great participation +10,82.9,82.9 +329,64,73,79,great final presentation +10,82.7,82.7 +330,98,51,74,,74.3,74.3 +331,86,91,86,missed class frequently -10,77.5,77.5 +332,97,97,92,missed homework frequently -10,85.0,85.0 +333,88,89,80,missed class frequently -10,75.1,75.1 +334,54,80,81,,72.6,72.6 +335,90,70,68,,75.2,65.2 +336,78,63,75,great final presentation +10,82.3,82.3 +337,93,90,82,,87.7,87.7 +338,66,73,89,,77.3,77.3 +339,82,88,78,great final presentation +10,92.2,92.2 +340,83,87,88,missed homework frequently -10,76.2,86.2 +341,53,100,78,great final presentation +10,87.1,77.1 +342,93,70,97,,87.7,87.7 +343,93,73,82,,82.6,92.6 +344,93,79,97,great participation +10,100.0,100.0 +345,99,86,54,,77.1,77.1 +346,85,80,60,,73.5,73.5 +347,64,86,78,,76.2,76.2 +348,89,97,100,missed class frequently -10,85.8,85.8 +349,95,88,77,,85.7,85.7 +350,96,62,92,great participation +10,94.2,94.2 +351,84,0,80,"cheated on exam, gets 0pts",57.2,57.2 +352,88,71,94,great participation +10,95.3,95.3 +353,92,93,81,,87.9,87.9 +354,93,94,94,,93.7,93.7 +355,86,87,86,missed class frequently -10,76.3,76.3 +356,80,85,84,missed homework frequently -10,73.1,73.1 +357,0,73,89,"cheated on exam, gets 0pts",57.5,57.5 +358,89,78,88,great participation +10,95.3,95.3 +359,66,90,50,,66.8,66.8 +360,81,0,75,"cheated on exam, gets 0pts",54.3,54.3 +361,87,93,76,missed class frequently -10,74.4,74.4 +362,71,76,56,,66.5,66.5 +363,94,70,85,great participation +10,93.2,93.2 +364,90,60,78,,76.2,76.2 +365,66,60,86,,72.2,72.2 +366,73,96,92,,87.5,87.5 +367,0,49,93,"cheated on exam, gets 
0pts",51.9,51.9 +368,97,81,100,great participation +10,100.0,100.0 +369,0,54,74,"cheated on exam, gets 0pts",45.8,45.8 +370,86,75,62,,73.1,83.1 +371,89,90,93,great final presentation +10,100.0,100.0 +372,65,94,94,,85.3,85.3 +373,91,99,79,great participation +10,98.6,98.6 +374,69,55,95,,75.2,75.2 +375,77,71,78,,75.6,75.6 +376,99,59,70,missed class frequently -10,65.4,65.4 +377,88,93,92,great final presentation +10,100.0,100.0 +378,81,92,77,,82.7,82.7 +379,84,74,91,great participation +10,93.8,93.8 +380,85,81,87,,84.6,84.6 +381,88,67,92,,83.3,83.3 +382,97,0,94,"cheated on exam, gets 0pts",66.7,66.7 +383,99,67,89,great final presentation +10,95.4,95.4 +384,95,77,87,,86.4,86.4 +385,81,95,77,,83.6,83.6 +386,72,80,69,,73.2,73.2 +387,95,59,90,great participation +10,92.2,92.2 +388,89,95,87,great participation +10,100.0,100.0 +389,81,56,90,great final presentation +10,87.1,87.1 +390,50,67,55,,57.1,57.1 +391,87,0,91,"cheated on exam, gets 0pts",62.5,62.5 +392,79,0,59,"cheated on exam, gets 0pts",47.3,57.3 +393,85,62,75,,74.1,74.1 +394,64,75,90,,77.7,67.7 +395,69,87,94,great participation +10,94.4,94.4 +396,87,77,96,,87.6,87.6 +397,80,92,84,,85.2,85.2 +398,52,80,50,missed class frequently -10,49.6,49.6 +399,83,70,71,,74.3,74.3 +400,77,95,85,,85.6,75.6 +401,72,80,93,,82.8,82.8 +402,89,67,95,great final presentation +10,94.8,84.8 +403,90,87,99,,92.7,92.7 +404,85,79,87,,84.0,84.0 +405,69,95,62,great participation +10,84.0,74.0 +406,69,92,99,missed homework frequently -10,77.9,77.9 +407,97,61,76,missed homework frequently -10,67.8,67.8 +408,0,76,75,"cheated on exam, gets 0pts",52.8,52.8 +409,91,74,88,great participation +10,94.7,94.7 +410,83,92,80,missed class frequently -10,74.5,74.5 +411,82,0,58,"cheated on exam, gets 0pts",47.8,47.8 +412,92,96,90,,92.4,92.4 +413,59,82,61,,66.7,66.7 +414,65,91,72,,75.6,75.6 +415,65,81,75,,73.8,73.8 +416,97,0,92,"cheated on exam, gets 0pts",65.9,65.9 +417,71,86,89,missed class frequently -10,72.7,82.7 +418,94,0,95,"cheated on exam, gets 
0pts",66.2,66.2 +419,65,95,68,,75.2,75.2 +420,84,83,60,,74.1,74.1 +421,67,87,95,missed homework frequently -10,74.2,74.2 +422,88,69,100,,87.1,87.1 +423,65,98,93,,86.1,86.1 +424,78,57,85,,74.5,74.5 +425,92,97,44,,74.3,74.3 +426,53,49,88,,65.8,65.8 +427,83,88,88,,86.5,86.5 +428,83,79,93,great participation +10,95.8,95.8 +429,96,91,90,great final presentation +10,100.0,100.0 +430,76,91,100,great participation +10,100.0,100.0 +431,85,92,84,missed homework frequently -10,76.7,76.7 +432,92,90,77,great participation +10,95.4,95.4 +433,83,78,74,,77.9,77.9 +434,97,98,92,,95.3,95.3 +435,88,78,91,great final presentation +10,96.2,96.2 +436,95,41,56,,63.2,53.2 +437,97,86,54,missed homework frequently -10,66.5,66.5 +438,68,73,81,missed class frequently -10,64.7,64.7 +439,94,92,75,great participation +10,95.8,95.8 +440,96,84,74,,83.6,73.6 +441,86,85,89,missed homework frequently -10,76.9,76.9 +442,87,81,86,,84.8,84.8 +443,77,0,87,"cheated on exam, gets 0pts",57.9,57.9 +444,84,83,85,missed class frequently -10,74.1,74.1 +445,0,69,93,"cheated on exam, gets 0pts",57.9,57.9 +446,94,95,90,missed class frequently -10,82.7,82.7 +447,57,78,83,,73.7,73.7 +448,92,74,88,great final presentation +10,95.0,95.0 +449,72,91,92,,85.7,85.7 +450,0,77,81,"cheated on exam, gets 0pts",55.5,45.5 +451,87,83,78,,82.2,82.2 +452,69,85,79,,77.8,77.8 +453,73,74,96,,82.5,82.5 +454,51,65,97,,73.6,73.6 +455,90,99,90,,92.7,92.7 +456,85,0,96,"cheated on exam, gets 0pts",63.9,63.9 +457,71,81,80,,77.6,77.6 +458,88,80,91,missed class frequently -10,76.8,76.8 +459,80,89,88,,85.9,85.9 +460,74,86,93,great participation +10,95.2,95.2 +461,0,72,67,"cheated on exam, gets 0pts",48.4,48.4 +462,89,98,80,great participation +10,98.1,98.1 +463,92,92,76,,85.6,95.6 +464,62,95,52,missed homework frequently -10,57.900000000000000,57.900000000000000 +465,77,78,95,missed homework frequently -10,74.5,74.5 +466,93,91,52,,76.0,76.0 +467,84,97,89,great participation +10,99.9,89.9 +468,94,62,50,missed homework frequently -10,56.8,56.8 
+469,56,76,60,,63.6,63.6 +470,80,95,61,,76.9,76.9 +471,90,95,75,great final presentation +10,95.5,95.5 +472,91,92,96,missed class frequently -10,83.3,83.3 +473,66,91,70,missed class frequently -10,65.1,65.1 +474,58,92,93,,82.2,82.2 +475,72,93,98,great participation +10,98.7,98.7 +476,80,96,83,missed homework frequently -10,76.0,76.0 +477,100,69,41,,67.1,67.1 +478,91,98,77,,87.5,87.5 +479,71,77,73,great final presentation +10,83.6,83.6 +480,90,73,91,,85.3,85.3 +481,66,95,86,,82.7,72.7 +482,67,88,89,,82.1,72.1 +483,50,67,94,great final presentation +10,82.7,82.7 +484,84,99,71,,83.3,83.3 +485,80,65,55,,65.5,65.5 +486,67,83,77,,75.8,75.8 +487,94,78,78,,82.8,82.8 +488,72,63,91,great final presentation +10,86.9,86.9 +489,90,94,79,missed homework frequently -10,76.8,76.8 +490,83,90,83,,85.1,85.1 +491,74,79,92,missed homework frequently -10,72.7,72.7 +492,93,75,93,,87.6,87.6 +493,72,88,96,,86.4,76.4 +494,90,100,89,great participation +10,100.0,100.0 +495,72,92,85,great participation +10,93.2,93.2 +496,60,92,55,,67.6,67.6 +497,83,79,67,,75.4,75.4 +498,93,96,94,great final presentation +10,100.0,100.0 +499,63,68,84,,72.9,72.9 +500,77,88,82,missed homework frequently -10,72.3,72.3 +501,94,82,97,great final presentation +10,100.0,100.0 +502,69,86,97,missed homework frequently -10,75.3,75.3 +503,90,77,99,great participation +10,99.7,99.7 +504,0,99,92,"cheated on exam, gets 0pts",66.5,66.5 +505,93,82,88,,87.7,87.7 +506,86,86,85,missed class frequently -10,75.6,75.6 +507,76,92,64,great participation +10,86.0,86.0 +508,94,97,93,great participation +10,100.0,100.0 +509,89,71,94,,85.6,85.6 +510,89,57,47,,62.6,62.6 +511,96,69,92,,86.3,86.3 +512,91,74,88,,84.7,84.7 +513,85,100,78,,86.7,86.7 +514,85,0,59,"cheated on exam, gets 0pts",49.1,49.1 +515,81,94,82,,85.3,85.3 +516,87,64,94,,82.9,82.9 +517,68,56,98,great final presentation +10,86.4,86.4 +518,56,79,85,,74.5,74.5 +519,81,63,59,missed class frequently -10,56.8,56.8 +520,88,83,83,,84.5,84.5 +521,85,93,84,,87.0,87.0 
+522,87,79,82,great final presentation +10,92.6,92.6 +523,92,87,60,great participation +10,87.7,87.7 +524,81,97,100,,93.4,93.4 +525,78,85,84,,82.5,92.5 +526,94,83,88,great final presentation +10,98.3,98.3 +527,0,97,88,"cheated on exam, gets 0pts",64.3,64.3 +528,96,64,93,great final presentation +10,95.2,85.2 +529,69,87,51,,67.2,67.2 +530,81,67,100,great participation +10,94.4,94.4 +531,81,87,60,missed class frequently -10,64.4,64.4 +532,66,83,77,,75.5,75.5 +533,76,72,100,great final presentation +10,94.4,94.4 +534,92,73,85,great final presentation +10,93.5,93.5 +535,80,85,93,great participation +10,96.7,96.7 +536,86,81,88,great final presentation +10,95.3,95.3 +537,56,75,71,,67.7,67.7 +538,72,0,76,"cheated on exam, gets 0pts",52.0,52.0 +539,62,64,63,,63.0,63.0 +540,75,0,65,"cheated on exam, gets 0pts",48.5,48.5 +541,89,65,74,missed homework frequently -10,65.8,65.8 +542,91,0,68,"cheated on exam, gets 0pts",54.5,54.5 +543,72,81,80,,77.9,77.9 +544,87,72,50,great final presentation +10,77.7,77.7 +545,73,96,100,great participation +10,100.0,100.0 +546,78,94,83,,84.8,84.8 +547,94,82,83,,86.0,86.0 +548,95,83,100,,93.4,93.4 +549,76,83,86,,82.1,82.1 +550,93,56,74,,74.3,74.3 +551,91,75,100,great participation +10,99.8,99.8 +552,82,94,85,,86.8,86.8 +553,83,79,92,missed homework frequently -10,75.4,75.4 +554,92,44,66,,67.2,67.2 +555,78,0,86,"cheated on exam, gets 0pts",57.8,57.8 +556,80,96,80,,84.8,84.8 +557,87,48,91,,76.9,76.9 +558,78,89,68,,77.3,77.3 +559,100,100,99,,99.6,99.6 +560,43,0,85,"cheated on exam, gets 0pts",46.9,36.9 +561,0,89,90,"cheated on exam, gets 0pts",62.7,62.7 +562,88,74,96,great final presentation +10,97.0,97.0 +563,81,98,96,,92.1,92.1 +564,84,92,86,,87.2,87.2 +565,83,86,85,missed homework frequently -10,74.7,74.7 +566,67,98,65,missed homework frequently -10,65.5,65.5 +567,0,95,94,"cheated on exam, gets 0pts",66.1,66.1 +568,84,99,47,great final presentation +10,83.7,83.7 +569,91,98,75,missed class frequently -10,76.7,76.7 +570,85,91,83,great 
participation +10,96.0,96.0 +571,92,50,87,,77.4,77.4 +572,99,84,82,missed class frequently -10,77.7,77.7 +573,87,74,86,missed homework frequently -10,72.7,62.7 +574,71,64,62,great participation +10,75.3,75.3 +575,90,96,71,missed class frequently -10,74.2,84.2 +576,61,54,100,,74.5,74.5 +577,74,48,74,,66.2,66.2 +578,98,68,58,,73.0,73.0 +579,92,87,49,,73.3,73.3 +580,70,75,99,,83.1,83.1 +581,80,0,99,"cheated on exam, gets 0pts",63.6,63.6 +582,66,83,96,,83.1,83.1 +583,69,49,75,,65.4,65.4 +584,93,95,67,,83.2,83.2 +585,87,85,77,,82.4,82.4 +586,82,68,48,,64.2,64.2 +587,75,90,93,,86.7,76.7 +588,67,92,64,,73.3,73.3 +589,98,97,93,,95.7,95.7 +590,73,93,86,,84.2,74.2 +591,58,0,71,"cheated on exam, gets 0pts",45.8,45.8 +592,97,94,76,,87.7,87.7 +593,92,99,87,missed homework frequently -10,82.1,82.1 +594,73,87,92,,84.8,84.8 +595,79,60,84,,75.3,75.3 +596,62,54,71,,63.2,63.2 +597,98,97,68,missed homework frequently -10,75.7,75.7 +598,41,80,93,,73.5,73.5 +599,91,90,81,great participation +10,96.7,96.7 +600,96,83,73,great final presentation +10,92.9,92.9 +601,98,88,95,,93.8,93.8 +602,83,80,86,,83.3,83.3 +603,86,90,80,great final presentation +10,94.8,94.8 +604,90,67,77,missed class frequently -10,67.9,67.9 +605,59,68,65,,64.1,64.1 +606,76,85,74,,77.9,77.9 +607,71,89,96,great final presentation +10,96.4,96.4 +608,82,68,80,,77.0,77.0 +609,57,74,86,,73.7,73.7 +610,91,97,96,,94.8,94.8 +611,89,75,64,great final presentation +10,84.8,84.8 +612,69,87,70,missed homework frequently -10,64.8,64.8 +613,90,66,96,great final presentation +10,95.2,95.2 +614,92,0,71,"cheated on exam, gets 0pts",56.0,56.0 +615,71,89,90,,84.0,84.0 +616,53,0,76,"cheated on exam, gets 0pts",46.3,46.3 +617,90,79,89,,86.3,86.3 +618,92,95,92,,92.9,92.9 +619,81,100,74,,83.9,83.9 +620,92,71,93,great participation +10,96.1,96.1 +621,0,87,94,"cheated on exam, gets 0pts",63.7,63.7 +622,54,76,96,,77.4,67.4 +623,86,67,80,,77.9,77.9 +624,94,79,58,great final presentation +10,85.1,95.1 +625,57,89,55,,65.8,65.8 
+626,81,81,91,,85.0,85.0 +627,71,94,89,,85.1,85.1 +628,95,83,83,,86.6,86.6 +629,99,87,71,,84.2,84.2 +630,81,86,95,great participation +10,98.1,98.1 +631,94,78,89,,87.2,87.2 +632,99,73,84,missed homework frequently -10,75.2,75.2 +633,56,90,71,missed class frequently -10,62.2,62.2 +634,79,83,92,,85.4,85.4 +635,94,77,82,great participation +10,94.1,94.1 +636,100,62,61,missed class frequently -10,63.0,63.0 +637,88,92,83,,87.2,87.2 +638,89,86,90,great final presentation +10,98.5,98.5 +639,81,71,54,missed homework frequently -10,57.2,67.2 +640,93,53,77,,74.6,74.6 +641,92,0,61,"cheated on exam, gets 0pts",52.0,52.0 +642,89,77,68,,77.0,77.0 +643,96,98,92,great final presentation +10,100.0,90.0 +644,79,98,84,,86.7,86.7 +645,93,47,79,great participation +10,83.6,83.6 +646,92,93,73,missed homework frequently -10,74.7,74.7 +647,67,69,84,missed class frequently -10,64.4,64.4 +648,94,79,79,great participation +10,93.5,93.5 +649,95,73,57,great final presentation +10,83.2,83.2 +650,91,91,78,great final presentation +10,95.8,95.8 +651,81,69,95,,83.0,73.0 +652,80,76,96,missed class frequently -10,75.2,75.2 +653,76,82,91,great final presentation +10,93.8,93.8 +654,78,95,54,missed homework frequently -10,63.5,63.5 +655,86,96,100,,94.6,104.6 +656,97,98,63,missed class frequently -10,73.7,73.7 +657,98,75,89,great final presentation +10,97.5,97.5 +658,79,73,72,,74.4,74.4 +659,38,86,75,great final presentation +10,77.2,77.2 +660,83,74,97,,85.9,95.9 +661,78,95,87,,86.7,76.7 +662,0,79,65,"cheated on exam, gets 0pts",49.7,49.7 +663,89,66,90,,82.5,82.5 +664,81,96,96,great participation +10,100.0,100.0 +665,68,65,89,,75.5,75.5 +666,86,85,89,,86.9,86.9 +667,99,99,87,,94.2,94.2 +668,60,66,64,great final presentation +10,73.4,73.4 +669,68,95,90,,84.9,84.9 +670,79,81,69,,75.6,75.6 +671,95,69,83,missed homework frequently -10,72.4,72.4 +672,68,0,38,"cheated on exam, gets 0pts",35.6,45.6 +673,93,82,85,missed homework frequently -10,76.5,76.5 +674,80,85,86,missed homework frequently -10,73.9,73.9 
+675,0,87,99,"cheated on exam, gets 0pts",65.7,65.7 +676,96,79,92,great participation +10,99.3,99.3 +677,95,82,76,,83.5,93.5 +678,89,81,64,missed class frequently -10,66.6,66.6 +679,70,50,75,missed class frequently -10,56.0,56.0 +680,91,97,53,great participation +10,87.6,87.6 +681,65,94,92,,84.5,84.5 +682,64,91,97,,85.3,85.3 +683,94,90,90,great participation +10,100.0,100.0 +684,85,82,55,great participation +10,82.1,82.1 +685,88,91,80,,85.7,85.7 +686,99,95,99,great participation +10,100.0,100.0 +687,66,0,78,"cheated on exam, gets 0pts",51.0,51.0 +688,49,77,89,great participation +10,83.4,83.4 +689,88,69,89,,82.7,82.7 +690,77,54,70,,67.3,67.3 +691,74,80,90,,82.2,82.2 +692,86,97,95,,92.9,92.9 +693,97,88,74,great participation +10,95.1,95.1 +694,84,55,53,missed class frequently -10,52.9,52.9 +695,0,96,87,"cheated on exam, gets 0pts",63.6,63.6 +696,89,74,85,,82.9,72.9 +697,55,71,98,missed homework frequently -10,67.0,67.0 +698,78,76,72,,75.0,75.0 +699,99,77,81,great final presentation +10,95.2,95.2 +700,78,63,88,,77.5,77.5 +701,96,76,85,,85.6,85.6 +702,88,88,83,missed class frequently -10,76.0,76.0 +703,89,85,87,great final presentation +10,97.0,97.0 +704,89,78,94,,87.7,87.7 +705,93,81,77,great participation +10,93.0,93.0 +706,71,96,65,missed class frequently -10,66.1,66.1 +707,70,96,32,missed homework frequently -10,52.6,52.6 +708,95,87,82,missed class frequently -10,77.4,77.4 +709,81,56,58,,64.3,64.3 +710,91,69,90,,84.0,94.0 +711,97,84,87,great participation +10,99.1,89.1 +712,61,0,97,"cheated on exam, gets 0pts",57.1,57.1 +713,70,99,90,,86.7,86.7 +714,72,91,91,,85.3,85.3 +715,73,77,73,,74.2,74.2 +716,58,90,100,great participation +10,94.4,94.4 +717,77,51,70,,66.4,66.4 +718,78,58,86,great final presentation +10,85.2,85.2 +719,95,94,89,,92.3,92.3 +720,65,0,95,"cheated on exam, gets 0pts",57.5,67.5 +721,80,0,98,"cheated on exam, gets 0pts",63.2,63.2 +722,65,0,92,"cheated on exam, gets 0pts",56.3,56.3 +723,89,91,57,,76.8,76.8 +724,65,72,86,great participation 
+10,85.5,85.5 +725,74,81,74,,76.1,76.1 +726,73,68,61,missed homework frequently -10,56.7,56.7 +727,94,91,93,great participation +10,100.0,100.0 +728,66,82,82,missed homework frequently -10,67.2,67.2 +729,92,81,54,,73.5,73.5 +730,78,75,96,,84.3,84.3 +731,81,79,86,,82.4,72.4 +732,78,84,71,,77.0,77.0 +733,84,81,88,,84.7,84.7 +734,97,0,90,"cheated on exam, gets 0pts",65.1,65.1 +735,0,61,84,"cheated on exam, gets 0pts",51.9,51.9 +736,85,94,84,,87.3,87.3 +737,100,82,72,great participation +10,93.4,93.4 +738,64,70,86,,74.6,74.6 +739,68,74,40,missed homework frequently -10,48.6,48.6 +740,90,82,60,missed class frequently -10,65.6,65.6 +741,87,97,99,,94.8,94.8 +742,0,75,78,"cheated on exam, gets 0pts",53.7,53.7 +743,76,74,97,,83.8,83.8 +744,90,88,84,missed homework frequently -10,77.0,77.0 +745,92,72,94,,86.8,86.8 +746,87,83,87,missed class frequently -10,75.8,75.8 +747,87,82,57,,73.5,73.5 +748,70,92,96,missed homework frequently -10,77.0,77.0 +749,94,77,87,,86.1,86.1 +750,84,65,95,missed class frequently -10,72.7,72.7 +751,92,79,98,great final presentation +10,100.0,100.0 +752,53,89,78,,73.8,73.8 +753,0,83,76,"cheated on exam, gets 0pts",55.3,55.3 +754,82,69,75,great final presentation +10,85.3,85.3 +755,88,90,81,missed homework frequently -10,75.8,85.8 +756,52,75,62,great final presentation +10,72.9,72.9 +757,78,89,58,,73.3,73.3 +758,41,88,98,missed class frequently -10,67.9,67.9 +759,93,68,86,missed homework frequently -10,72.7,72.7 +760,92,75,93,missed homework frequently -10,77.3,67.3 +761,53,0,94,"cheated on exam, gets 0pts",53.5,53.5 +762,90,95,68,,82.7,82.7 +763,43,76,91,missed homework frequently -10,62.1,62.1 +764,96,73,89,missed homework frequently -10,76.3,76.3 +765,79,76,98,,85.7,85.7 +766,82,87,82,,83.5,73.5 +767,82,80,98,great final presentation +10,97.8,97.8 +768,93,96,90,,92.7,92.7 +769,88,96,79,great final presentation +10,96.8,86.8 +770,58,66,74,,66.8,66.8 +771,88,81,85,,84.7,84.7 +772,58,80,89,,77.0,77.0 +773,97,76,90,,87.9,87.9 +774,90,74,95,great 
participation +10,97.2,107.2 +775,79,86,99,great final presentation +10,99.1,99.1 +776,93,36,62,,63.5,63.5 +777,34,91,92,missed homework frequently -10,64.3,64.3 +778,83,62,55,,65.5,65.5 +779,81,75,77,,77.6,77.6 +780,80,77,98,missed class frequently -10,76.3,66.3 +781,99,97,70,,86.8,96.8 +782,83,99,80,,86.6,86.6 +783,80,72,97,,84.4,84.4 +784,0,68,87,"cheated on exam, gets 0pts",55.2,55.2 +785,71,95,45,,67.8,67.8 +786,92,89,82,great final presentation +10,97.1,97.1 +787,89,62,93,,82.5,82.5 +788,67,72,90,missed class frequently -10,67.7,67.7 +789,64,89,91,great final presentation +10,92.3,92.3 +790,0,79,68,"cheated on exam, gets 0pts",50.9,50.9 +791,80,91,60,,75.3,75.3 +792,96,0,62,"cheated on exam, gets 0pts",53.6,53.6 +793,70,92,93,missed homework frequently -10,75.8,75.8 +794,48,50,65,,55.4,55.4 +795,0,89,91,"cheated on exam, gets 0pts",63.1,63.1 +796,91,89,81,,86.4,86.4 +797,99,84,94,,92.5,92.5 +798,94,97,91,,93.7,93.7 +799,84,93,62,missed class frequently -10,67.9,67.9 +800,82,64,80,great final presentation +10,85.8,85.8 +801,78,60,88,great final presentation +10,86.6,86.6 +802,69,55,72,missed homework frequently -10,56.0,56.0 +803,71,92,94,,86.5,86.5 +804,59,84,83,missed class frequently -10,66.1,66.1 +805,74,96,92,missed class frequently -10,77.8,77.8 +806,80,95,58,,75.7,75.7 +807,96,64,92,great final presentation +10,94.8,84.8 +808,0,88,95,"cheated on exam, gets 0pts",64.4,64.4 +809,97,65,85,missed class frequently -10,72.6,62.600000000000000 +810,64,85,97,,83.5,83.5 +811,85,97,86,great participation +10,99.0,109.0 +812,85,96,48,,73.5,73.5 +813,87,58,34,missed homework frequently -10,47.1,47.1 +814,85,82,85,great participation +10,94.1,94.1 +815,93,74,91,,86.5,86.5 +816,58,75,91,missed class frequently -10,66.3,66.3 +817,98,73,83,,84.5,84.5 +818,92,84,92,great participation +10,99.6,99.6 +819,93,88,84,great participation +10,97.9,107.9 +820,88,80,93,,87.6,77.6 +821,72,94,88,,85.0,85.0 +822,58,90,97,,83.2,83.2 +823,70,97,42,,66.9,66.9 +824,73,86,64,,73.3,73.3 
+825,76,88,94,great final presentation +10,96.8,96.8 +826,71,96,83,great final presentation +10,93.3,93.3 +827,74,58,92,great final presentation +10,86.4,86.4 +828,78,95,78,,83.1,83.1 +829,49,95,77,,74.0,74.0 +830,85,93,58,,76.6,76.6 +831,91,90,74,,83.9,83.9 +832,98,97,87,,93.3,93.3 +833,74,98,86,,86.0,86.0 +834,94,0,72,"cheated on exam, gets 0pts",57.0,57.0 +835,83,85,92,great final presentation +10,97.2,97.2 +836,85,87,90,,87.6,97.6 +837,65,0,57,"cheated on exam, gets 0pts",42.3,42.3 +838,72,0,80,"cheated on exam, gets 0pts",53.6,53.6 +839,90,76,81,,82.2,82.2 +840,0,65,88,"cheated on exam, gets 0pts",54.7,54.7 +841,89,97,96,missed homework frequently -10,84.2,84.2 +842,84,94,100,great final presentation +10,100.0,100.0 +843,95,80,86,,86.9,86.9 +844,69,90,86,missed homework frequently -10,72.1,72.1 +845,95,90,69,,83.1,83.1 +846,93,98,75,,87.3,87.3 +847,72,85,91,missed homework frequently -10,73.5,73.5 +848,85,74,87,missed homework frequently -10,72.5,72.5 +849,90,82,86,,86.0,86.0 +850,69,86,75,,76.5,76.5 +851,99,75,78,,83.4,83.4 +852,96,95,89,,92.9,102.9 +853,99,93,90,,93.6,93.6 +854,0,92,60,"cheated on exam, gets 0pts",51.6,51.6 +855,85,91,84,,86.4,86.4 +856,99,62,90,,84.3,84.3 +857,62,92,95,,84.2,84.2 +858,60,37,62,,53.9,53.9 +859,99,86,74,,85.1,85.1 +860,48,47,84,missed homework frequently -10,52.1,52.1 +861,70,84,76,,76.6,76.6 +862,88,91,53,,74.9,74.9 +863,100,83,74,,84.5,94.5 +864,74,61,87,,75.3,75.3 +865,98,71,93,,87.9,87.9 +866,51,75,87,,72.6,82.6 +867,98,85,82,great final presentation +10,97.7,87.7 +868,97,0,42,"cheated on exam, gets 0pts",45.9,45.9 +869,83,91,63,,77.4,77.4 +870,82,61,87,great participation +10,87.7,87.7 +871,92,84,62,,77.6,77.6 +872,86,89,82,,85.3,85.3 +873,73,74,76,missed class frequently -10,64.5,64.5 +874,87,0,96,"cheated on exam, gets 0pts",64.5,64.5 +875,52,86,87,,76.2,76.2 +876,71,94,95,great final presentation +10,97.5,97.5 +877,82,90,85,,85.6,85.6 +878,96,96,88,,92.8,92.8 +879,66,95,87,great final presentation +10,93.1,93.1 
+880,68,80,74,,74.0,84.0 +881,53,89,88,great final presentation +10,87.8,87.8 +882,80,96,81,,85.2,85.2 +883,65,69,69,,67.8,77.8 +884,70,96,81,great final presentation +10,92.2,92.2 +885,86,85,84,missed class frequently -10,74.9,74.9 +886,64,85,97,missed class frequently -10,73.5,73.5 +887,87,0,70,"cheated on exam, gets 0pts",54.1,44.1 +888,94,90,99,missed class frequently -10,84.8,74.8 +889,66,74,80,missed class frequently -10,64.0,64.0 +890,62,98,93,great participation +10,95.2,95.2 +891,0,63,88,"cheated on exam, gets 0pts",54.1,54.1 +892,82,99,82,,87.1,87.1 +893,85,90,80,missed homework frequently -10,74.5,74.5 +894,78,73,98,,84.5,84.5 +895,91,68,88,,82.9,82.9 +896,65,84,58,great participation +10,77.9,87.9 +897,84,58,88,missed class frequently -10,67.8,67.8 +898,70,71,88,great final presentation +10,87.5,77.5 +899,71,78,80,great final presentation +10,86.7,86.7 +900,85,68,76,great participation +10,86.3,76.3 +901,94,78,96,great participation +10,100.0,100.0 +902,100,100,87,missed homework frequently -10,84.8,84.8 +903,96,73,94,great participation +10,98.3,98.3 +904,67,88,91,,82.9,82.9 +905,79,84,64,great final presentation +10,84.5,84.5 +906,86,87,77,,82.7,82.7 +907,83,94,78,missed homework frequently -10,74.3,64.3 +908,97,90,72,,84.9,84.9 +909,75,70,75,,73.5,73.5 +910,83,71,70,great participation +10,84.2,84.2 +911,87,76,72,,77.7,77.7 +912,73,58,57,great participation +10,72.1,72.1 +913,65,82,83,missed homework frequently -10,67.3,67.3 +914,69,72,89,,77.9,77.9 +915,100,91,83,great participation +10,100.0,100.0 +916,78,71,94,,82.3,82.3 +917,61,78,64,,67.3,67.3 +918,100,93,70,,85.9,85.9 +919,85,88,81,,84.3,84.3 +920,92,67,65,missed class frequently -10,63.7,63.7 +921,97,86,68,missed homework frequently -10,72.1,62.100000000000000 +922,89,95,73,,84.4,84.4 +923,87,80,65,great final presentation +10,86.1,86.1 +924,92,0,94,"cheated on exam, gets 0pts",65.2,65.2 +925,81,0,95,"cheated on exam, gets 0pts",62.3,62.3 +926,82,69,92,,82.1,82.1 +927,74,73,77,great final 
presentation +10,84.9,84.9 +928,75,67,78,great final presentation +10,83.8,83.8 +929,80,59,86,,76.1,76.1 +930,79,92,90,missed class frequently -10,77.3,77.3 +931,67,63,83,great participation +10,82.2,82.2 +932,88,88,79,,84.4,84.4 +933,76,87,60,missed homework frequently -10,62.900000000000000,62.900000000000000 +934,0,79,65,"cheated on exam, gets 0pts",49.7,49.7 +935,65,93,88,great final presentation +10,92.6,92.6 +936,73,71,83,,76.4,76.4 +937,73,64,88,,76.3,76.3 +938,94,88,79,great participation +10,96.2,106.2 +939,93,71,89,,84.8,84.8 +940,67,0,76,"cheated on exam, gets 0pts",50.5,50.5 +941,79,73,78,,76.8,76.8 +942,84,76,65,missed class frequently -10,64.0,64.0 +943,47,77,88,,72.4,72.4 +944,74,65,85,,75.7,75.7 +945,84,90,100,,92.2,92.2 +946,84,87,77,,82.1,82.1 +947,90,0,88,"cheated on exam, gets 0pts",62.2,72.2 +948,84,85,61,,75.1,75.1 +949,93,82,74,,82.1,72.1 +950,57,85,86,,77.0,77.0 +951,94,0,89,"cheated on exam, gets 0pts",63.8,63.8 +952,73,80,95,,83.9,83.9 +953,87,80,91,,86.5,76.5 +954,100,80,78,missed class frequently -10,75.2,75.2 +955,56,81,90,,77.1,77.1 +956,67,65,65,,65.6,65.6 +957,56,97,92,great final presentation +10,92.7,92.7 +958,79,0,75,"cheated on exam, gets 0pts",53.7,53.7 +959,66,95,95,,86.3,96.3 +960,0,79,96,"cheated on exam, gets 0pts",62.1,62.1 +961,88,80,89,,86.0,86.0 +962,87,74,95,missed class frequently -10,76.3,76.3 +963,71,92,83,,82.1,82.1 +964,68,82,97,,83.8,83.8 +965,74,72,97,missed class frequently -10,72.6,72.6 +966,90,92,88,great participation +10,99.8,109.8 +967,79,80,91,,84.1,84.1 +968,42,87,63,great participation +10,73.9,73.9 +969,71,0,82,"cheated on exam, gets 0pts",54.1,54.1 +970,78,48,96,,76.2,76.2 +971,74,85,88,great participation +10,92.9,82.9 +972,85,94,75,missed class frequently -10,73.7,63.7 +973,88,79,98,great final presentation +10,99.3,99.3 +974,37,83,69,missed homework frequently -10,53.6,53.6 +975,59,37,75,missed homework frequently -10,48.8,38.8 +976,0,97,60,"cheated on exam, gets 0pts",53.1,53.1 
+977,74,80,91,,82.6,82.6 +978,88,66,74,,75.8,75.8 \ No newline at end of file From 6c16a1a920eed6a3ea9673b72cd34fd931ff27a6 Mon Sep 17 00:00:00 2001 From: krmayankb Date: Wed, 28 Dec 2022 17:43:08 -0800 Subject: [PATCH 060/258] unused imports removed --- cleanlab/internal/regression_utils.py | 2 +- cleanlab/regression/rank.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cleanlab/internal/regression_utils.py b/cleanlab/internal/regression_utils.py index 4e396f2d55..468edd30f5 100644 --- a/cleanlab/internal/regression_utils.py +++ b/cleanlab/internal/regression_utils.py @@ -4,7 +4,7 @@ import numpy as np from numpy.typing import ArrayLike -from typing import Tuple, Optional +from typing import Tuple def assert_valid_inputs( diff --git a/cleanlab/regression/rank.py b/cleanlab/regression/rank.py index cd6c474c93..b959e6e423 100644 --- a/cleanlab/regression/rank.py +++ b/cleanlab/regression/rank.py @@ -2,7 +2,7 @@ from cleanlab.outlier import OutOfDistribution from sklearn.neighbors import NearestNeighbors from cleanlab.internal.regression_utils import assert_valid_inputs -from typing import Dict, Callable, Optional +from typing import Dict, Callable from numpy.typing import ArrayLike """ Generates label quality scores for every sample in regression dataset """ From 54ae993ffa106a3d8eb092dce20e17571e60ca2b Mon Sep 17 00:00:00 2001 From: krmayankb Date: Thu, 29 Dec 2022 19:07:54 -0800 Subject: [PATCH 061/258] tutorial added --- docs/source/tutorials/index.rst | 1 + docs/source/tutorials/regression.ipynb | 667 +++++++++++++++ docs/source/tutorials/student_grades.csv | 980 ----------------------- 3 files changed, 668 insertions(+), 980 deletions(-) create mode 100644 docs/source/tutorials/regression.ipynb delete mode 100644 docs/source/tutorials/student_grades.csv diff --git a/docs/source/tutorials/index.rst b/docs/source/tutorials/index.rst index e0d63a7e06..d87c7df96c 100644 --- a/docs/source/tutorials/index.rst +++ 
b/docs/source/tutorials/index.rst @@ -14,5 +14,6 @@ Tutorials multiannotator multilabel_classification token_classification + regression pred_probs_cross_val faq diff --git a/docs/source/tutorials/regression.ipynb b/docs/source/tutorials/regression.ipynb new file mode 100644 index 0000000000..ace4d8aec7 --- /dev/null +++ b/docs/source/tutorials/regression.ipynb @@ -0,0 +1,667 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Label Quality Scores for Regression with Noisy Labels " + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This quickstart tutorial shows how to use cleanlab for finding label quality scores in regression datasets. Using the approach mentioned here, you can find label quality scores in any regression dataset irrespective of modality i.e. tabular, text, image, etc. \n", + "\n", + "**This example will take you through the following:**\n", + "- Generate label quality scores for each example in the dataset. \n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Quickstart \n", + "\n", + "Cleanlab uses two inputs to generate scores for labels in the dataset:\n", + "- `labels`: NumPy array of given labels in the dataset. labels[i] should contain label for `i`-th example. \n", + "- `predictions`: NumPy array of predictions generated through your favorite regressor. predictions[i] should contain predicted value for `i`-th example. \n", + "\n", + "If you already have predictions from your regressor, you can generate label quality scores for each example using the code below: \n", + "\n", + "
\n", + "\n", + "```python \n", + "\n", + "from cleanlab.regression.rank import get_label_quality_scores\n", + "label_quality_scores = get_label_quality_scores(labels, predictions)\n", + "\n", + "```\n", + "
\n", + "" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Install dependencies and import them \n", + "You can use `pip` to install all the packages required for this tutorial as follows:\n", + "\n", + "`!pip install cleanlab xgboost`" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: cleanlab in /Users/krmayank/Desktop/Work/cleanlab/fork/cleanlab/ENV/lib/python3.10/site-packages (2.1.1)\n", + "Requirement already satisfied: xgboost in /Users/krmayank/Desktop/Work/cleanlab/fork/cleanlab/ENV/lib/python3.10/site-packages (1.7.2)\n", + "Requirement already satisfied: scikit-learn>=0.18 in /Users/krmayank/Desktop/Work/cleanlab/fork/cleanlab/ENV/lib/python3.10/site-packages (from cleanlab) (1.1.3)\n", + "Requirement already satisfied: pandas>=1.0.0 in /Users/krmayank/Desktop/Work/cleanlab/fork/cleanlab/ENV/lib/python3.10/site-packages (from cleanlab) (1.5.1)\n", + "Requirement already satisfied: numpy>=1.11.3 in /Users/krmayank/Desktop/Work/cleanlab/fork/cleanlab/ENV/lib/python3.10/site-packages (from cleanlab) (1.23.4)\n", + "Requirement already satisfied: termcolor>=1.1.0 in /Users/krmayank/Desktop/Work/cleanlab/fork/cleanlab/ENV/lib/python3.10/site-packages (from cleanlab) (2.1.0)\n", + "Requirement already satisfied: tqdm>=4.53.0 in /Users/krmayank/Desktop/Work/cleanlab/fork/cleanlab/ENV/lib/python3.10/site-packages (from cleanlab) (4.64.1)\n", + "Requirement already satisfied: scipy in /Users/krmayank/Desktop/Work/cleanlab/fork/cleanlab/ENV/lib/python3.10/site-packages (from xgboost) (1.9.3)\n", + "Requirement already satisfied: pytz>=2020.1 in /Users/krmayank/Desktop/Work/cleanlab/fork/cleanlab/ENV/lib/python3.10/site-packages (from pandas>=1.0.0->cleanlab) (2022.6)\n", + "Requirement already satisfied: python-dateutil>=2.8.1 in 
/Users/krmayank/Desktop/Work/cleanlab/fork/cleanlab/ENV/lib/python3.10/site-packages (from pandas>=1.0.0->cleanlab) (2.8.2)\n", + "Requirement already satisfied: threadpoolctl>=2.0.0 in /Users/krmayank/Desktop/Work/cleanlab/fork/cleanlab/ENV/lib/python3.10/site-packages (from scikit-learn>=0.18->cleanlab) (3.1.0)\n", + "Requirement already satisfied: joblib>=1.0.0 in /Users/krmayank/Desktop/Work/cleanlab/fork/cleanlab/ENV/lib/python3.10/site-packages (from scikit-learn>=0.18->cleanlab) (1.2.0)\n", + "Requirement already satisfied: six>=1.5 in /Users/krmayank/Desktop/Work/cleanlab/fork/cleanlab/ENV/lib/python3.10/site-packages (from python-dateutil>=2.8.1->pandas>=1.0.0->cleanlab) (1.16.0)\n" + ] + } + ], + "source": [ + "!pip install cleanlab xgboost" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "nbsphinx": "hidden" + }, + "outputs": [], + "source": [ + "# Package installation (hidden on docs website).\n", + "# Package versions we used: xgboost==1.7.2\n", + "\n", + "dependencies = [\"cleanlab\", \"xgboost\"]\n", + "\n", + "if \"google.colab\" in str(get_ipython()): # Check if it's running in Google Colab\n", + " %pip install cleanlab # for colab\n", + " cmd = \" \".join([dep for dep in dependencies if dep != \"cleanlab\"])\n", + " %pip install $cmd\n", + "else:\n", + " missing_dependencies = []\n", + " for dependency in dependencies:\n", + " try:\n", + " __import__(dependency)\n", + " except ImportError:\n", + " missing_dependencies.append(dependency)\n", + "\n", + " if len(missing_dependencies) > 0:\n", + " print(\"Missing required dependencies:\")\n", + " print(*missing_dependencies, sep=\", \")\n", + " print(\"\\nPlease install them before running the rest of this notebook.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "from cleanlab.regression.rank import 
get_label_quality_scores\n", + "\n", + "np.set_printoptions(suppress=True)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this part, we have added a support function to plot the dataset for a quick demonstration. You can use it to highlight the examples based on label_quality_scores. You can skip this part and move to the next section. " + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
See the code for visualization **(click to expand)** \n", + "\n", + "```python \n", + "# Note: this pulldown is for docs.cleanlab.ai, if running on local Jupyter or colab, please ignore it. \n", + "\n", + "def plot_data(\n", + " data_x, data_y, circles, title, alpha=0.6, color=\"#1f77b4\", colorbar=False, xlabel=\"\", ylabel=\"\"\n", + "):\n", + " plt.figure(figsize=(14, 5))\n", + " data_x = data_x.to_numpy()\n", + " data_y = data_y.to_numpy()\n", + " plt.scatter(data_x, data_y, c=color, s=30)\n", + " for i in circles:\n", + " plt.plot(\n", + " data_x[i],\n", + " data_y[i],\n", + " \"o\",\n", + " markerfacecolor=\"none\",\n", + " markeredgecolor=\"red\",\n", + " markersize=10,\n", + " markeredgewidth=2.5,\n", + " alpha=alpha,\n", + " )\n", + " plt.title(title, fontsize=20)\n", + " plt.xlabel(xlabel)\n", + " plt.ylabel(ylabel)\n", + "\n", + " if colorbar:\n", + " plt.colorbar(orientation=\"vertical\")\n", + "\n", + "```\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "def plot_data(\n", + " data_x, data_y, circles, title, alpha=0.6, color=\"#1f77b4\", colorbar=False, xlabel=\"\", ylabel=\"\"\n", + "):\n", + " plt.figure(figsize=(14, 5))\n", + " data_x = data_x.to_numpy()\n", + " data_y = data_y.to_numpy()\n", + " plt.scatter(data_x, data_y, c=color, s=30)\n", + " for i in circles:\n", + " plt.plot(\n", + " data_x[i],\n", + " data_y[i],\n", + " \"o\",\n", + " markerfacecolor=\"none\",\n", + " markeredgecolor=\"red\",\n", + " markersize=10,\n", + " markeredgewidth=2.5,\n", + " alpha=alpha,\n", + " )\n", + " plt.title(title, fontsize=20)\n", + " plt.xlabel(xlabel)\n", + " plt.ylabel(ylabel)\n", + "\n", + " if colorbar:\n", + " plt.colorbar(orientation=\"vertical\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 2. Import dataset and Generate predictions" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "File 'student_grades.csv' already there; not retrieving.\n", + "\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
exam_1exam_2exam_3notestrue_gradegrade
0537793NaN76.276.2
1816480great participation +1085.585.5
2748897NaN87.487.4
3619478NaN77.777.7
4489091NaN77.877.8
\n", + "
" + ], + "text/plain": [ + " exam_1 exam_2 exam_3 notes true_grade grade\n", + "0 53 77 93 NaN 76.2 76.2\n", + "1 81 64 80 great participation +10 85.5 85.5\n", + "2 74 88 97 NaN 87.4 87.4\n", + "3 61 94 78 NaN 77.7 77.7\n", + "4 48 90 91 NaN 77.8 77.8" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "!wget -nc https://cleanlab-public.s3.amazonaws.com/Datasets/student_grades.csv\n", + "data = pd.read_csv(\"./student_grades.csv\", index_col=0)\n", + "data.head()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the data frame displayed above, `grade` represents the noisy grades and `true_grade` represents the ground truth. Please note that ground truth is usually not available in a real dataset. We have added it here for comparison and to demonstrate our method. Also, note that column `notes` have categorical information. " + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Generate true errors\n", + "true_errors_index = np.where(data[\"grade\"] != data[\"true_grade\"])[0]\n", + "plot_data(\n", + " data_x=data[\"exam_3\"],\n", + " data_y=data[\"grade\"],\n", + " circles=true_errors_index,\n", + " title=\"Messy Regression dataset\",\n", + " xlabel=\"exam_3 feature\",\n", + " ylabel=\"grade (Y value)\",\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the above plot, `grade (Y value)` is plotted against one of the features in the dataset (`exam_3`). We have circled the examples that were considered as `true_error` in **Red** . " + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will use sklearn API `XGBRegressor` from `xgboost` as the regressor for this tutorial. `xgboost` provides easy to use interface to process categorical variables. In order to make inputs compatible with `xgboost`, we need to divide data in `X` and labels `y`. This is demonstrated in the code below: " + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# xgboost automatically factors categorical variable, you just need to mark the columns as category\n", + "data.notes = data.notes.astype(\"category\")\n", + "\n", + "# xgboost takes data and label seperately, so you will need to divide data accordingly.\n", + "X = data.drop([\"grade\", \"true_grade\"], axis=1)\n", + "y = data[\"grade\"]" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's start with initializing the model with relevant parameters. As mentioned earlier we are using `xgboost` for this tutorial. To handle categorical variables, we specifically need to set `enable_categorical` flag to `True`. 
Note that, support for the categorical variable is in the experimental stage and doesn't support the auto-selection of `tree_method`. Therefore, you will need to specify `tree_method` from supported types. More details can be found [here](https://xgboost.readthedocs.io/en/stable/tutorials/categorical.html)." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# using default xgboost cv " + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
train-rmse-meantrain-rmse-stdtest-rmse-meantest-rmse-std
014.2264951.36156314.1786821.087675
111.8619650.85277512.1019180.589881
210.6461010.62194310.7044870.317186
\n", + "
" + ], + "text/plain": [ + " train-rmse-mean train-rmse-std test-rmse-mean test-rmse-std\n", + "0 14.226495 1.361563 14.178682 1.087675\n", + "1 11.861965 0.852775 12.101918 0.589881\n", + "2 10.646101 0.621943 10.704487 0.317186" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import xgboost \n", + "from xgboost import DMatrix, XGBRegressor\n", + "from xgboost import XGBRegressor\n", + "from sklearn.model_selection import cross_val_predict\n", + "\n", + "SEED = 1\n", + "num_boost_round = 1000\n", + "num_crossval_folds = 5\n", + "\n", + "training_data = DMatrix(X, label=y, enable_categorical=True)\n", + "params = {\"booster\": \"gblinear\", \"objective\": \"reg:squarederror\"}\n", + "\n", + "cross_validation_results = xgboost.cv(params, \n", + " training_data, \n", + " num_boost_round=num_boost_round,\n", + " nfold=num_crossval_folds, \n", + " seed=SEED,\n", + " early_stopping_rounds=5) \n", + "\n", + "display(cross_validation_results.head(3))" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "best_iteration = cross_validation_results['test-rmse-mean'].argmin()\n", + "\n", + "model = XGBRegressor(\n", + " tree_method= \"hist\",\n", + " n_estimators = best_iteration, \n", + " enable_categorical = True, \n", + " random_state = SEED)\n", + "\n", + "# get predictions\n", + "predictions = cross_val_predict(\n", + " estimator=model, X=X, y=y, cv=num_crossval_folds, method = \"predict\"\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Using cleanlab to generate label quality scores\n", + "\n", + "Once you have the predictions from the cross-validation. You can generate label quality scores using cleanlab by running just one line of code. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "# get label quality score for each example in the dataset using cleanlab\n", + "label_quality_scores = get_label_quality_scores(labels=y, predictions=predictions)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plot_data(\n", + " data_x=data[\"exam_3\"],\n", + " data_y=data[\"grade\"],\n", + " circles=true_errors_index,\n", + " color=label_quality_scores,\n", + " title=\"Messy Regression dataset with label quality scores\",\n", + " colorbar=True,\n", + " xlabel=\"exam_3 feature\",\n", + " ylabel=\"grade (Y value)\",\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the above plot, we have colored each examples with their label quality scores generated using cleanlab. Examples are same as earlier plot displayed in the notebook. `grade (Y value)` is plotted against one of the features in the dataset (`exam_3`)\n", + "\n", + "**Red circle** represents the errors in `grade` with respect to the ground truth `true_grade`. You can observe that our method assign low scores to examples that were considered as `true_error`" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "nbsphinx": "hidden" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "OUTRE: 0.9762667538411246\n", + "RESIDUAL: 0.9781736951073335\n" + ] + }, + { + "ename": "ValueError", + "evalue": "Label quality scores did not outperform alternative scores", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn [13], line 20\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\u001b[39m\"\u001b[39m\u001b[39mLabel quality scores did not perform well enough\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[1;32m 17\u001b[0m \u001b[39mif\u001b[39;00m roc_auc_score(true_errors, \u001b[39m1\u001b[39m \u001b[39m-\u001b[39m label_quality_scores) \u001b[39m<\u001b[39m\u001b[39m=\u001b[39m 
roc_auc_score(\n\u001b[1;32m 18\u001b[0m true_errors, \u001b[39m1\u001b[39m \u001b[39m-\u001b[39m label_quality_scores_residual\n\u001b[1;32m 19\u001b[0m ):\n\u001b[0;32m---> 20\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\u001b[39m\"\u001b[39m\u001b[39mLabel quality scores did not outperform alternative scores\u001b[39m\u001b[39m\"\u001b[39m)\n", + "\u001b[0;31mValueError\u001b[0m: Label quality scores did not outperform alternative scores" + ] + } + ], + "source": [ + "# Note: This cell is only for docs.cleanlab.ai, if running on local Jupyter or Colab, please ignore it.\n", + "from sklearn.metrics import roc_auc_score\n", + "\n", + "true_errors = (data[\"grade\"] != data[\"true_grade\"]).astype(int)\n", + "\n", + "# label_quality_scores = get_label_quality_scores(labels= y, predictions=predictions)\n", + "from cleanlab.regression.rank import get_outre_score_for_each_label\n", + "label_quality_scores = get_outre_score_for_each_label(labels=np.array(y), predictions=predictions, frac_neighbors=0.5)\n", + "label_quality_scores_residual = get_label_quality_scores(labels = y, predictions=predictions, method=\"residual\")\n", + "\n", + "if roc_auc_score(true_errors, 1 - label_quality_scores) < 0.5:\n", + " raise ValueError(\"Label quality scores did not perform well enough\")\n", + "\n", + "if roc_auc_score(true_errors, 1 - label_quality_scores) <= roc_auc_score(\n", + " true_errors, 1 - label_quality_scores_residual\n", + "):\n", + " raise ValueError(\"Label quality scores did not outperform alternative scores\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.10.8 ('ENV': venv)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + 
"nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.8" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "1ed33b5e6ac3d9870092cd802185bba6fb7a8302b6022e7097221f18c33cb7b2" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/source/tutorials/student_grades.csv b/docs/source/tutorials/student_grades.csv deleted file mode 100644 index 347454bb87..0000000000 --- a/docs/source/tutorials/student_grades.csv +++ /dev/null @@ -1,980 +0,0 @@ -,exam_1,exam_2,exam_3,notes,true_grade,grade -0,53,77,93,,76.2,76.2 -1,81,64,80,great participation +10,85.5,85.5 -2,74,88,97,,87.4,87.4 -3,61,94,78,,77.7,77.7 -4,48,90,91,,77.8,77.8 -5,89,95,72,,84.0,84.0 -6,0,83,97,"cheated on exam, gets 0pts",63.7,63.7 -7,71,82,97,,84.7,84.7 -8,0,56,96,"cheated on exam, gets 0pts",55.2,55.2 -9,75,80,98,missed class frequently -10,75.7,75.7 -10,81,65,82,,76.6,76.6 -11,95,86,53,missed homework frequently -10,65.5,65.5 -12,89,72,98,missed homework frequently -10,77.5,77.5 -13,80,59,65,,67.7,77.7 -14,67,82,98,,83.9,83.9 -15,87,78,95,,87.5,87.5 -16,89,69,98,great participation +10,96.6,96.6 -17,53,72,46,missed class frequently -10,45.9,45.9 -18,46,84,89,missed homework frequently -10,64.6,64.6 -19,70,63,95,great participation +10,87.9,87.9 -20,63,93,94,,84.4,84.4 -21,90,75,90,missed homework frequently -10,75.5,75.5 -22,72,81,95,missed class frequently -10,73.9,73.9 -23,90,83,51,,72.3,72.3 -24,99,53,76,,76.0,76.0 -25,85,49,83,,73.4,73.4 -26,89,71,94,great final presentation +10,95.6,95.6 -27,86,80,89,,85.4,85.4 -28,75,94,79,,82.3,82.3 -29,61,95,94,,84.4,84.4 -30,74,64,91,,77.8,77.8 -31,98,92,96,,95.4,95.4 -32,95,83,78,,84.6,84.6 -33,92,63,91,missed homework frequently -10,72.9,72.9 -34,91,63,48,great participation +10,75.4,75.4 -35,69,94,95,great participation +10,96.9,96.9 -36,81,90,90,missed homework frequently -10,77.3,77.3 -37,98,69,90,,86.1,86.1 -38,82,94,99,missed class frequently -10,82.4,82.4 -39,73,57,96,,77.4,77.4 
-40,98,73,86,great final presentation +10,95.7,95.7 -41,0,81,99,"cheated on exam, gets 0pts",63.9,63.9 -42,0,89,100,"cheated on exam, gets 0pts",66.7,66.7 -43,62,83,81,,75.9,75.9 -44,78,84,92,great final presentation +10,95.4,95.4 -45,79,84,92,,85.7,85.7 -46,65,69,92,missed homework frequently -10,67.0,67.0 -47,98,82,47,great final presentation +10,82.8,82.8 -48,92,0,89,"cheated on exam, gets 0pts",63.2,63.2 -49,84,46,95,,77.0,77.0 -50,95,85,98,,93.2,93.2 -51,86,88,90,great final presentation +10,98.2,98.2 -52,65,58,91,,73.3,73.3 -53,79,83,88,,83.8,83.8 -54,0,95,87,"cheated on exam, gets 0pts",63.3,63.3 -55,90,57,97,,82.9,82.9 -56,72,69,81,missed homework frequently -10,64.7,64.7 -57,70,85,96,,84.9,84.9 -58,89,59,79,missed homework frequently -10,66.0,56.0 -59,75,83,69,,75.0,75.0 -60,89,70,90,missed homework frequently -10,73.7,73.7 -61,81,85,95,,87.8,87.8 -62,76,70,73,great participation +10,83.0,83.0 -63,92,62,66,great participation +10,82.6,82.6 -64,94,98,99,,97.2,97.2 -65,96,99,96,great final presentation +10,100.0,100.0 -66,65,85,80,great participation +10,87.0,87.0 -67,90,79,85,great participation +10,94.7,94.7 -68,0,87,77,"cheated on exam, gets 0pts",56.9,56.9 -69,91,75,95,great participation +10,97.8,97.8 -70,88,77,92,,86.3,86.3 -71,86,64,96,,83.4,83.4 -72,88,67,74,,76.1,76.1 -73,76,59,65,,66.5,66.5 -74,96,68,59,great participation +10,82.8,82.8 -75,84,98,51,,75.0,75.0 -76,97,85,75,,84.6,84.6 -77,97,98,80,great final presentation +10,100.0,100.0 -78,89,74,71,,77.3,77.3 -79,72,90,93,,85.8,85.8 -80,49,0,78,"cheated on exam, gets 0pts",45.9,45.9 -81,88,93,78,great participation +10,95.5,95.5 -82,88,80,82,great participation +10,93.2,103.2 -83,75,91,68,great final presentation +10,87.0,87.0 -84,68,99,91,,86.5,86.5 -85,92,87,71,,82.1,72.1 -86,83,97,75,great final presentation +10,94.0,94.0 -87,87,100,94,,93.7,93.7 -88,86,95,77,,85.1,85.1 -89,79,0,76,"cheated on exam, gets 0pts",54.1,54.1 -90,35,100,61,great participation +10,74.9,74.9 -91,99,83,44,great final 
presentation +10,82.2,82.2 -92,96,77,79,missed class frequently -10,73.5,83.5 -93,78,94,81,,84.0,84.0 -94,58,76,94,great participation +10,87.8,87.8 -95,70,0,63,"cheated on exam, gets 0pts",46.2,46.2 -96,75,46,48,great participation +10,65.5,65.5 -97,77,84,95,,86.3,86.3 -98,81,56,54,,62.7,62.7 -99,76,96,78,great final presentation +10,92.8,102.8 -100,99,53,73,missed homework frequently -10,64.8,64.8 -101,93,91,84,great final presentation +10,98.8,98.8 -102,73,87,86,missed class frequently -10,72.4,72.4 -103,85,99,71,,83.6,83.6 -104,82,79,64,,73.9,73.9 -105,100,100,81,missed homework frequently -10,82.4,82.4 -106,99,87,99,great final presentation +10,100.0,100.0 -107,0,84,93,"cheated on exam, gets 0pts",62.4,62.4 -108,90,81,79,,82.9,82.9 -109,78,0,76,"cheated on exam, gets 0pts",53.8,53.8 -110,92,73,89,,85.1,85.1 -111,74,77,79,missed homework frequently -10,66.9,66.9 -112,98,93,92,,94.1,94.1 -113,87,82,56,,73.1,73.1 -114,96,56,70,,73.6,73.6 -115,85,88,27,,62.7,62.7 -116,81,89,78,missed homework frequently -10,72.2,62.2 -117,77,81,88,great participation +10,92.6,92.6 -118,78,53,96,missed homework frequently -10,67.7,67.7 -119,71,57,96,great participation +10,86.8,86.8 -120,58,97,89,great participation +10,92.1,92.1 -121,53,99,69,missed class frequently -10,63.2,73.2 -122,96,82,86,great participation +10,97.8,97.8 -123,71,90,92,great final presentation +10,95.1,95.1 -124,0,81,97,"cheated on exam, gets 0pts",63.1,63.1 -125,74,94,81,missed homework frequently -10,72.8,72.8 -126,73,73,76,missed class frequently -10,64.2,64.2 -127,0,78,80,"cheated on exam, gets 0pts",55.4,55.4 -128,86,92,86,great participation +10,97.8,97.8 -129,97,94,87,great final presentation +10,100.0,100.0 -130,92,81,97,great final presentation +10,100.0,100.0 -131,93,0,60,"cheated on exam, gets 0pts",51.9,51.9 -132,82,0,79,"cheated on exam, gets 0pts",56.2,56.2 -133,99,62,89,,83.9,83.9 -134,75,0,55,"cheated on exam, gets 0pts",44.5,44.5 -135,77,70,84,,77.7,77.7 -136,84,66,49,great final presentation 
+10,74.6,74.6 -137,76,81,96,,85.5,85.5 -138,68,63,96,missed class frequently -10,67.7,67.7 -139,71,54,99,missed homework frequently -10,67.1,67.1 -140,84,92,75,missed homework frequently -10,72.8,72.8 -141,90,84,97,great participation +10,100.0,110.0 -142,61,100,94,,85.9,85.9 -143,73,56,91,missed class frequently -10,65.1,65.1 -144,83,88,57,missed homework frequently -10,64.1,74.1 -145,69,77,57,great participation +10,76.6,76.6 -146,79,77,95,great participation +10,94.8,94.8 -147,0,97,90,"cheated on exam, gets 0pts",65.1,65.1 -148,58,68,75,,67.8,67.8 -149,63,82,53,,64.7,64.7 -150,76,78,70,,74.2,74.2 -151,0,82,77,"cheated on exam, gets 0pts",55.4,55.4 -152,91,79,93,great final presentation +10,98.2,98.2 -153,71,65,81,great participation +10,83.2,83.2 -154,71,0,75,"cheated on exam, gets 0pts",51.3,51.3 -155,97,57,67,great participation +10,83.0,83.0 -156,98,77,86,,86.9,86.9 -157,56,80,79,,72.4,72.4 -158,99,90,90,,92.7,92.7 -159,80,83,87,,83.7,83.7 -160,79,75,74,,75.8,75.8 -161,95,81,76,,83.2,83.2 -162,72,87,90,missed class frequently -10,73.7,73.7 -163,72,0,86,"cheated on exam, gets 0pts",56.0,66.0 -164,91,0,94,"cheated on exam, gets 0pts",64.9,64.9 -165,73,84,68,great participation +10,84.3,74.3 -166,0,96,90,"cheated on exam, gets 0pts",64.8,64.8 -167,98,89,71,missed class frequently -10,74.5,74.5 -168,93,65,96,missed class frequently -10,75.8,75.8 -169,79,91,89,missed homework frequently -10,76.6,76.6 -170,89,93,79,,86.2,86.2 -171,96,90,77,,86.6,86.6 -172,0,65,87,"cheated on exam, gets 0pts",54.3,54.3 -173,92,86,78,,84.6,84.6 -174,56,52,78,,63.6,63.6 -175,74,0,85,"cheated on exam, gets 0pts",56.2,66.2 -176,73,80,95,,83.9,83.9 -177,69,93,84,,82.2,82.2 -178,91,99,90,,93.0,93.0 -179,73,92,91,great final presentation +10,95.9,95.9 -180,94,41,91,great participation +10,86.9,86.9 -181,91,96,92,,92.9,92.9 -182,69,94,58,missed homework frequently -10,62.1,62.1 -183,94,79,89,,87.5,87.5 -184,0,92,98,"cheated on exam, gets 0pts",66.8,66.8 -185,89,92,79,,85.9,85.9 
-186,92,86,81,,85.8,85.8 -187,82,85,89,,85.7,85.7 -188,82,77,92,missed class frequently -10,74.5,74.5 -189,99,65,87,,84.0,84.0 -190,73,95,67,,77.2,77.2 -191,98,94,90,great participation +10,100.0,100.0 -192,95,78,60,great participation +10,85.9,85.9 -193,90,67,77,missed class frequently -10,67.9,67.9 -194,96,0,90,"cheated on exam, gets 0pts",64.8,64.8 -195,86,80,89,,85.4,85.4 -196,51,78,84,,72.3,72.3 -197,99,75,88,,87.4,87.4 -198,93,58,70,great final presentation +10,83.3,83.3 -199,95,0,72,"cheated on exam, gets 0pts",57.3,57.3 -200,88,71,93,,84.9,84.9 -201,91,92,70,,82.9,82.9 -202,98,92,69,,84.6,74.6 -203,90,0,100,"cheated on exam, gets 0pts",67.0,57.0 -204,89,70,74,,77.3,77.3 -205,97,82,80,,85.7,85.7 -206,78,81,74,missed homework frequently -10,67.3,67.3 -207,77,87,83,,82.4,82.4 -208,86,99,97,,94.3,84.3 -209,92,92,85,great final presentation +10,99.2,99.2 -210,86,80,66,,76.2,76.2 -211,59,70,86,,73.1,73.1 -212,93,75,91,,86.8,86.8 -213,76,54,65,,65.0,65.0 -214,72,78,69,missed homework frequently -10,62.6,62.6 -215,58,90,80,,76.4,76.4 -216,77,84,86,,82.7,82.7 -217,92,66,65,missed homework frequently -10,63.400000000000000,63.400000000000000 -218,88,83,86,great final presentation +10,95.7,95.7 -219,65,0,88,"cheated on exam, gets 0pts",54.7,64.7 -220,89,72,60,great final presentation +10,82.3,82.3 -221,84,79,97,great final presentation +10,97.7,97.7 -222,96,64,65,,74.0,74.0 -223,96,75,92,great final presentation +10,98.1,98.1 -224,62,92,76,,76.6,76.6 -225,94,62,66,,73.2,73.2 -226,85,74,92,,84.5,84.5 -227,73,90,68,missed homework frequently -10,66.1,66.1 -228,81,76,97,great participation +10,95.9,95.9 -229,96,87,80,missed class frequently -10,76.9,76.9 -230,76,65,59,great final presentation +10,75.9,85.9 -231,83,85,90,great participation +10,96.4,96.4 -232,85,100,79,missed class frequently -10,77.1,77.1 -233,75,77,73,missed homework frequently -10,64.8,64.8 -234,84,94,47,missed homework frequently -10,62.2,62.2 -235,84,96,73,great participation +10,93.2,93.2 
-236,65,85,50,,65.0,65.0 -237,90,54,80,,75.2,75.2 -238,68,99,99,great final presentation +10,99.7,99.7 -239,95,88,69,,82.5,82.5 -240,0,97,85,"cheated on exam, gets 0pts",63.1,63.1 -241,89,93,70,missed homework frequently -10,72.6,72.6 -242,68,83,76,,75.7,75.7 -243,79,81,89,great final presentation +10,93.6,93.6 -244,73,95,93,,87.6,87.6 -245,87,93,91,great participation +10,100.0,100.0 -246,86,71,68,,74.3,74.3 -247,95,96,64,,82.9,82.9 -248,99,95,67,,85.0,85.0 -249,75,78,97,,84.7,84.7 -250,85,72,70,,75.1,75.1 -251,89,74,85,missed class frequently -10,72.9,72.9 -252,74,94,84,great participation +10,94.0,94.0 -253,95,75,97,great final presentation +10,99.8,99.8 -254,83,93,77,,83.6,73.6 -255,78,58,86,great final presentation +10,85.2,85.2 -256,88,56,80,,75.2,75.2 -257,94,94,72,great participation +10,95.2,95.2 -258,84,87,96,great final presentation +10,99.7,99.7 -259,99,76,95,great participation +10,100.0,100.0 -260,96,94,89,,92.6,92.6 -261,67,80,51,,64.5,64.5 -262,91,92,74,,84.5,84.5 -263,77,96,78,,83.1,83.1 -264,99,86,95,missed class frequently -10,83.5,73.5 -265,72,82,100,great participation +10,96.2,96.2 -266,89,99,73,great final presentation +10,95.6,95.6 -267,67,76,33,,56.1,56.1 -268,46,70,77,,65.6,65.6 -269,46,92,66,,67.8,57.8 -270,87,78,82,missed homework frequently -10,72.3,72.3 -271,80,98,84,great participation +10,97.0,97.0 -272,91,92,95,missed homework frequently -10,82.9,82.9 -273,80,60,80,,74.0,74.0 -274,88,75,59,missed class frequently -10,62.5,62.5 -275,93,73,97,great final presentation +10,98.6,98.6 -276,86,67,76,missed homework frequently -10,66.3,66.3 -277,59,94,80,missed homework frequently -10,67.9,67.9 -278,61,77,88,,76.6,86.6 -279,88,56,82,great participation +10,86.0,86.0 -280,86,89,85,great final presentation +10,96.5,96.5 -281,72,90,98,,87.8,87.8 -282,68,93,73,,77.5,77.5 -283,94,93,91,missed homework frequently -10,82.5,82.5 -284,91,98,97,great participation +10,100.0,100.0 -285,91,0,89,"cheated on exam, gets 0pts",62.9,62.9 
-286,58,54,37,,48.4,48.4 -287,93,88,79,missed homework frequently -10,75.9,75.9 -288,0,72,88,"cheated on exam, gets 0pts",56.8,56.8 -289,61,76,87,,75.9,65.9 -290,84,97,99,,93.9,93.9 -291,86,98,95,,93.2,103.2 -292,80,76,66,,73.2,73.2 -293,73,86,40,great final presentation +10,73.7,73.7 -294,0,50,85,"cheated on exam, gets 0pts",49.0,49.0 -295,73,64,87,missed homework frequently -10,65.9,75.9 -296,95,76,41,,67.7,77.7 -297,0,69,90,"cheated on exam, gets 0pts",56.7,56.7 -298,86,45,86,great final presentation +10,83.7,83.7 -299,64,93,70,,75.1,75.1 -300,87,57,76,,73.6,63.600000000000000 -301,61,65,100,missed homework frequently -10,67.8,67.8 -302,87,56,85,,76.9,76.9 -303,83,77,72,missed homework frequently -10,66.8,66.8 -304,78,89,81,,82.5,82.5 -305,95,94,89,,92.3,92.3 -306,84,96,91,great final presentation +10,100.0,90.0 -307,67,93,93,,85.2,95.2 -308,77,79,94,,84.4,84.4 -309,86,74,61,,72.4,72.4 -310,0,71,80,"cheated on exam, gets 0pts",53.3,53.3 -311,79,62,82,,75.1,75.1 -312,66,72,83,missed homework frequently -10,64.6,64.6 -313,79,89,86,great final presentation +10,94.8,94.8 -314,100,98,96,missed homework frequently -10,87.8,87.8 -315,73,82,69,missed class frequently -10,64.1,64.1 -316,76,84,67,,74.8,74.8 -317,0,84,72,"cheated on exam, gets 0pts",54.0,54.0 -318,0,92,56,"cheated on exam, gets 0pts",50.0,50.0 -319,99,83,88,great participation +10,99.8,99.8 -320,91,61,100,,85.6,85.6 -321,83,52,88,missed homework frequently -10,65.7,65.7 -322,83,93,54,,74.4,74.4 -323,0,93,88,"cheated on exam, gets 0pts",63.1,63.1 -324,95,82,54,,74.7,74.7 -325,86,67,98,,85.1,85.1 -326,91,89,83,missed class frequently -10,77.2,77.2 -327,100,73,84,missed homework frequently -10,75.5,75.5 -328,52,95,72,great participation +10,82.9,82.9 -329,64,73,79,great final presentation +10,82.7,82.7 -330,98,51,74,,74.3,74.3 -331,86,91,86,missed class frequently -10,77.5,77.5 -332,97,97,92,missed homework frequently -10,85.0,85.0 -333,88,89,80,missed class frequently -10,75.1,75.1 -334,54,80,81,,72.6,72.6 
-335,90,70,68,,75.2,65.2 -336,78,63,75,great final presentation +10,82.3,82.3 -337,93,90,82,,87.7,87.7 -338,66,73,89,,77.3,77.3 -339,82,88,78,great final presentation +10,92.2,92.2 -340,83,87,88,missed homework frequently -10,76.2,86.2 -341,53,100,78,great final presentation +10,87.1,77.1 -342,93,70,97,,87.7,87.7 -343,93,73,82,,82.6,92.6 -344,93,79,97,great participation +10,100.0,100.0 -345,99,86,54,,77.1,77.1 -346,85,80,60,,73.5,73.5 -347,64,86,78,,76.2,76.2 -348,89,97,100,missed class frequently -10,85.8,85.8 -349,95,88,77,,85.7,85.7 -350,96,62,92,great participation +10,94.2,94.2 -351,84,0,80,"cheated on exam, gets 0pts",57.2,57.2 -352,88,71,94,great participation +10,95.3,95.3 -353,92,93,81,,87.9,87.9 -354,93,94,94,,93.7,93.7 -355,86,87,86,missed class frequently -10,76.3,76.3 -356,80,85,84,missed homework frequently -10,73.1,73.1 -357,0,73,89,"cheated on exam, gets 0pts",57.5,57.5 -358,89,78,88,great participation +10,95.3,95.3 -359,66,90,50,,66.8,66.8 -360,81,0,75,"cheated on exam, gets 0pts",54.3,54.3 -361,87,93,76,missed class frequently -10,74.4,74.4 -362,71,76,56,,66.5,66.5 -363,94,70,85,great participation +10,93.2,93.2 -364,90,60,78,,76.2,76.2 -365,66,60,86,,72.2,72.2 -366,73,96,92,,87.5,87.5 -367,0,49,93,"cheated on exam, gets 0pts",51.9,51.9 -368,97,81,100,great participation +10,100.0,100.0 -369,0,54,74,"cheated on exam, gets 0pts",45.8,45.8 -370,86,75,62,,73.1,83.1 -371,89,90,93,great final presentation +10,100.0,100.0 -372,65,94,94,,85.3,85.3 -373,91,99,79,great participation +10,98.6,98.6 -374,69,55,95,,75.2,75.2 -375,77,71,78,,75.6,75.6 -376,99,59,70,missed class frequently -10,65.4,65.4 -377,88,93,92,great final presentation +10,100.0,100.0 -378,81,92,77,,82.7,82.7 -379,84,74,91,great participation +10,93.8,93.8 -380,85,81,87,,84.6,84.6 -381,88,67,92,,83.3,83.3 -382,97,0,94,"cheated on exam, gets 0pts",66.7,66.7 -383,99,67,89,great final presentation +10,95.4,95.4 -384,95,77,87,,86.4,86.4 -385,81,95,77,,83.6,83.6 -386,72,80,69,,73.2,73.2 
-387,95,59,90,great participation +10,92.2,92.2 -388,89,95,87,great participation +10,100.0,100.0 -389,81,56,90,great final presentation +10,87.1,87.1 -390,50,67,55,,57.1,57.1 -391,87,0,91,"cheated on exam, gets 0pts",62.5,62.5 -392,79,0,59,"cheated on exam, gets 0pts",47.3,57.3 -393,85,62,75,,74.1,74.1 -394,64,75,90,,77.7,67.7 -395,69,87,94,great participation +10,94.4,94.4 -396,87,77,96,,87.6,87.6 -397,80,92,84,,85.2,85.2 -398,52,80,50,missed class frequently -10,49.6,49.6 -399,83,70,71,,74.3,74.3 -400,77,95,85,,85.6,75.6 -401,72,80,93,,82.8,82.8 -402,89,67,95,great final presentation +10,94.8,84.8 -403,90,87,99,,92.7,92.7 -404,85,79,87,,84.0,84.0 -405,69,95,62,great participation +10,84.0,74.0 -406,69,92,99,missed homework frequently -10,77.9,77.9 -407,97,61,76,missed homework frequently -10,67.8,67.8 -408,0,76,75,"cheated on exam, gets 0pts",52.8,52.8 -409,91,74,88,great participation +10,94.7,94.7 -410,83,92,80,missed class frequently -10,74.5,74.5 -411,82,0,58,"cheated on exam, gets 0pts",47.8,47.8 -412,92,96,90,,92.4,92.4 -413,59,82,61,,66.7,66.7 -414,65,91,72,,75.6,75.6 -415,65,81,75,,73.8,73.8 -416,97,0,92,"cheated on exam, gets 0pts",65.9,65.9 -417,71,86,89,missed class frequently -10,72.7,82.7 -418,94,0,95,"cheated on exam, gets 0pts",66.2,66.2 -419,65,95,68,,75.2,75.2 -420,84,83,60,,74.1,74.1 -421,67,87,95,missed homework frequently -10,74.2,74.2 -422,88,69,100,,87.1,87.1 -423,65,98,93,,86.1,86.1 -424,78,57,85,,74.5,74.5 -425,92,97,44,,74.3,74.3 -426,53,49,88,,65.8,65.8 -427,83,88,88,,86.5,86.5 -428,83,79,93,great participation +10,95.8,95.8 -429,96,91,90,great final presentation +10,100.0,100.0 -430,76,91,100,great participation +10,100.0,100.0 -431,85,92,84,missed homework frequently -10,76.7,76.7 -432,92,90,77,great participation +10,95.4,95.4 -433,83,78,74,,77.9,77.9 -434,97,98,92,,95.3,95.3 -435,88,78,91,great final presentation +10,96.2,96.2 -436,95,41,56,,63.2,53.2 -437,97,86,54,missed homework frequently -10,66.5,66.5 -438,68,73,81,missed class 
frequently -10,64.7,64.7 -439,94,92,75,great participation +10,95.8,95.8 -440,96,84,74,,83.6,73.6 -441,86,85,89,missed homework frequently -10,76.9,76.9 -442,87,81,86,,84.8,84.8 -443,77,0,87,"cheated on exam, gets 0pts",57.9,57.9 -444,84,83,85,missed class frequently -10,74.1,74.1 -445,0,69,93,"cheated on exam, gets 0pts",57.9,57.9 -446,94,95,90,missed class frequently -10,82.7,82.7 -447,57,78,83,,73.7,73.7 -448,92,74,88,great final presentation +10,95.0,95.0 -449,72,91,92,,85.7,85.7 -450,0,77,81,"cheated on exam, gets 0pts",55.5,45.5 -451,87,83,78,,82.2,82.2 -452,69,85,79,,77.8,77.8 -453,73,74,96,,82.5,82.5 -454,51,65,97,,73.6,73.6 -455,90,99,90,,92.7,92.7 -456,85,0,96,"cheated on exam, gets 0pts",63.9,63.9 -457,71,81,80,,77.6,77.6 -458,88,80,91,missed class frequently -10,76.8,76.8 -459,80,89,88,,85.9,85.9 -460,74,86,93,great participation +10,95.2,95.2 -461,0,72,67,"cheated on exam, gets 0pts",48.4,48.4 -462,89,98,80,great participation +10,98.1,98.1 -463,92,92,76,,85.6,95.6 -464,62,95,52,missed homework frequently -10,57.900000000000000,57.900000000000000 -465,77,78,95,missed homework frequently -10,74.5,74.5 -466,93,91,52,,76.0,76.0 -467,84,97,89,great participation +10,99.9,89.9 -468,94,62,50,missed homework frequently -10,56.8,56.8 -469,56,76,60,,63.6,63.6 -470,80,95,61,,76.9,76.9 -471,90,95,75,great final presentation +10,95.5,95.5 -472,91,92,96,missed class frequently -10,83.3,83.3 -473,66,91,70,missed class frequently -10,65.1,65.1 -474,58,92,93,,82.2,82.2 -475,72,93,98,great participation +10,98.7,98.7 -476,80,96,83,missed homework frequently -10,76.0,76.0 -477,100,69,41,,67.1,67.1 -478,91,98,77,,87.5,87.5 -479,71,77,73,great final presentation +10,83.6,83.6 -480,90,73,91,,85.3,85.3 -481,66,95,86,,82.7,72.7 -482,67,88,89,,82.1,72.1 -483,50,67,94,great final presentation +10,82.7,82.7 -484,84,99,71,,83.3,83.3 -485,80,65,55,,65.5,65.5 -486,67,83,77,,75.8,75.8 -487,94,78,78,,82.8,82.8 -488,72,63,91,great final presentation +10,86.9,86.9 -489,90,94,79,missed 
homework frequently -10,76.8,76.8 -490,83,90,83,,85.1,85.1 -491,74,79,92,missed homework frequently -10,72.7,72.7 -492,93,75,93,,87.6,87.6 -493,72,88,96,,86.4,76.4 -494,90,100,89,great participation +10,100.0,100.0 -495,72,92,85,great participation +10,93.2,93.2 -496,60,92,55,,67.6,67.6 -497,83,79,67,,75.4,75.4 -498,93,96,94,great final presentation +10,100.0,100.0 -499,63,68,84,,72.9,72.9 -500,77,88,82,missed homework frequently -10,72.3,72.3 -501,94,82,97,great final presentation +10,100.0,100.0 -502,69,86,97,missed homework frequently -10,75.3,75.3 -503,90,77,99,great participation +10,99.7,99.7 -504,0,99,92,"cheated on exam, gets 0pts",66.5,66.5 -505,93,82,88,,87.7,87.7 -506,86,86,85,missed class frequently -10,75.6,75.6 -507,76,92,64,great participation +10,86.0,86.0 -508,94,97,93,great participation +10,100.0,100.0 -509,89,71,94,,85.6,85.6 -510,89,57,47,,62.6,62.6 -511,96,69,92,,86.3,86.3 -512,91,74,88,,84.7,84.7 -513,85,100,78,,86.7,86.7 -514,85,0,59,"cheated on exam, gets 0pts",49.1,49.1 -515,81,94,82,,85.3,85.3 -516,87,64,94,,82.9,82.9 -517,68,56,98,great final presentation +10,86.4,86.4 -518,56,79,85,,74.5,74.5 -519,81,63,59,missed class frequently -10,56.8,56.8 -520,88,83,83,,84.5,84.5 -521,85,93,84,,87.0,87.0 -522,87,79,82,great final presentation +10,92.6,92.6 -523,92,87,60,great participation +10,87.7,87.7 -524,81,97,100,,93.4,93.4 -525,78,85,84,,82.5,92.5 -526,94,83,88,great final presentation +10,98.3,98.3 -527,0,97,88,"cheated on exam, gets 0pts",64.3,64.3 -528,96,64,93,great final presentation +10,95.2,85.2 -529,69,87,51,,67.2,67.2 -530,81,67,100,great participation +10,94.4,94.4 -531,81,87,60,missed class frequently -10,64.4,64.4 -532,66,83,77,,75.5,75.5 -533,76,72,100,great final presentation +10,94.4,94.4 -534,92,73,85,great final presentation +10,93.5,93.5 -535,80,85,93,great participation +10,96.7,96.7 -536,86,81,88,great final presentation +10,95.3,95.3 -537,56,75,71,,67.7,67.7 -538,72,0,76,"cheated on exam, gets 0pts",52.0,52.0 
-539,62,64,63,,63.0,63.0 -540,75,0,65,"cheated on exam, gets 0pts",48.5,48.5 -541,89,65,74,missed homework frequently -10,65.8,65.8 -542,91,0,68,"cheated on exam, gets 0pts",54.5,54.5 -543,72,81,80,,77.9,77.9 -544,87,72,50,great final presentation +10,77.7,77.7 -545,73,96,100,great participation +10,100.0,100.0 -546,78,94,83,,84.8,84.8 -547,94,82,83,,86.0,86.0 -548,95,83,100,,93.4,93.4 -549,76,83,86,,82.1,82.1 -550,93,56,74,,74.3,74.3 -551,91,75,100,great participation +10,99.8,99.8 -552,82,94,85,,86.8,86.8 -553,83,79,92,missed homework frequently -10,75.4,75.4 -554,92,44,66,,67.2,67.2 -555,78,0,86,"cheated on exam, gets 0pts",57.8,57.8 -556,80,96,80,,84.8,84.8 -557,87,48,91,,76.9,76.9 -558,78,89,68,,77.3,77.3 -559,100,100,99,,99.6,99.6 -560,43,0,85,"cheated on exam, gets 0pts",46.9,36.9 -561,0,89,90,"cheated on exam, gets 0pts",62.7,62.7 -562,88,74,96,great final presentation +10,97.0,97.0 -563,81,98,96,,92.1,92.1 -564,84,92,86,,87.2,87.2 -565,83,86,85,missed homework frequently -10,74.7,74.7 -566,67,98,65,missed homework frequently -10,65.5,65.5 -567,0,95,94,"cheated on exam, gets 0pts",66.1,66.1 -568,84,99,47,great final presentation +10,83.7,83.7 -569,91,98,75,missed class frequently -10,76.7,76.7 -570,85,91,83,great participation +10,96.0,96.0 -571,92,50,87,,77.4,77.4 -572,99,84,82,missed class frequently -10,77.7,77.7 -573,87,74,86,missed homework frequently -10,72.7,62.7 -574,71,64,62,great participation +10,75.3,75.3 -575,90,96,71,missed class frequently -10,74.2,84.2 -576,61,54,100,,74.5,74.5 -577,74,48,74,,66.2,66.2 -578,98,68,58,,73.0,73.0 -579,92,87,49,,73.3,73.3 -580,70,75,99,,83.1,83.1 -581,80,0,99,"cheated on exam, gets 0pts",63.6,63.6 -582,66,83,96,,83.1,83.1 -583,69,49,75,,65.4,65.4 -584,93,95,67,,83.2,83.2 -585,87,85,77,,82.4,82.4 -586,82,68,48,,64.2,64.2 -587,75,90,93,,86.7,76.7 -588,67,92,64,,73.3,73.3 -589,98,97,93,,95.7,95.7 -590,73,93,86,,84.2,74.2 -591,58,0,71,"cheated on exam, gets 0pts",45.8,45.8 -592,97,94,76,,87.7,87.7 
-593,92,99,87,missed homework frequently -10,82.1,82.1 -594,73,87,92,,84.8,84.8 -595,79,60,84,,75.3,75.3 -596,62,54,71,,63.2,63.2 -597,98,97,68,missed homework frequently -10,75.7,75.7 -598,41,80,93,,73.5,73.5 -599,91,90,81,great participation +10,96.7,96.7 -600,96,83,73,great final presentation +10,92.9,92.9 -601,98,88,95,,93.8,93.8 -602,83,80,86,,83.3,83.3 -603,86,90,80,great final presentation +10,94.8,94.8 -604,90,67,77,missed class frequently -10,67.9,67.9 -605,59,68,65,,64.1,64.1 -606,76,85,74,,77.9,77.9 -607,71,89,96,great final presentation +10,96.4,96.4 -608,82,68,80,,77.0,77.0 -609,57,74,86,,73.7,73.7 -610,91,97,96,,94.8,94.8 -611,89,75,64,great final presentation +10,84.8,84.8 -612,69,87,70,missed homework frequently -10,64.8,64.8 -613,90,66,96,great final presentation +10,95.2,95.2 -614,92,0,71,"cheated on exam, gets 0pts",56.0,56.0 -615,71,89,90,,84.0,84.0 -616,53,0,76,"cheated on exam, gets 0pts",46.3,46.3 -617,90,79,89,,86.3,86.3 -618,92,95,92,,92.9,92.9 -619,81,100,74,,83.9,83.9 -620,92,71,93,great participation +10,96.1,96.1 -621,0,87,94,"cheated on exam, gets 0pts",63.7,63.7 -622,54,76,96,,77.4,67.4 -623,86,67,80,,77.9,77.9 -624,94,79,58,great final presentation +10,85.1,95.1 -625,57,89,55,,65.8,65.8 -626,81,81,91,,85.0,85.0 -627,71,94,89,,85.1,85.1 -628,95,83,83,,86.6,86.6 -629,99,87,71,,84.2,84.2 -630,81,86,95,great participation +10,98.1,98.1 -631,94,78,89,,87.2,87.2 -632,99,73,84,missed homework frequently -10,75.2,75.2 -633,56,90,71,missed class frequently -10,62.2,62.2 -634,79,83,92,,85.4,85.4 -635,94,77,82,great participation +10,94.1,94.1 -636,100,62,61,missed class frequently -10,63.0,63.0 -637,88,92,83,,87.2,87.2 -638,89,86,90,great final presentation +10,98.5,98.5 -639,81,71,54,missed homework frequently -10,57.2,67.2 -640,93,53,77,,74.6,74.6 -641,92,0,61,"cheated on exam, gets 0pts",52.0,52.0 -642,89,77,68,,77.0,77.0 -643,96,98,92,great final presentation +10,100.0,90.0 -644,79,98,84,,86.7,86.7 -645,93,47,79,great participation 
+10,83.6,83.6 -646,92,93,73,missed homework frequently -10,74.7,74.7 -647,67,69,84,missed class frequently -10,64.4,64.4 -648,94,79,79,great participation +10,93.5,93.5 -649,95,73,57,great final presentation +10,83.2,83.2 -650,91,91,78,great final presentation +10,95.8,95.8 -651,81,69,95,,83.0,73.0 -652,80,76,96,missed class frequently -10,75.2,75.2 -653,76,82,91,great final presentation +10,93.8,93.8 -654,78,95,54,missed homework frequently -10,63.5,63.5 -655,86,96,100,,94.6,104.6 -656,97,98,63,missed class frequently -10,73.7,73.7 -657,98,75,89,great final presentation +10,97.5,97.5 -658,79,73,72,,74.4,74.4 -659,38,86,75,great final presentation +10,77.2,77.2 -660,83,74,97,,85.9,95.9 -661,78,95,87,,86.7,76.7 -662,0,79,65,"cheated on exam, gets 0pts",49.7,49.7 -663,89,66,90,,82.5,82.5 -664,81,96,96,great participation +10,100.0,100.0 -665,68,65,89,,75.5,75.5 -666,86,85,89,,86.9,86.9 -667,99,99,87,,94.2,94.2 -668,60,66,64,great final presentation +10,73.4,73.4 -669,68,95,90,,84.9,84.9 -670,79,81,69,,75.6,75.6 -671,95,69,83,missed homework frequently -10,72.4,72.4 -672,68,0,38,"cheated on exam, gets 0pts",35.6,45.6 -673,93,82,85,missed homework frequently -10,76.5,76.5 -674,80,85,86,missed homework frequently -10,73.9,73.9 -675,0,87,99,"cheated on exam, gets 0pts",65.7,65.7 -676,96,79,92,great participation +10,99.3,99.3 -677,95,82,76,,83.5,93.5 -678,89,81,64,missed class frequently -10,66.6,66.6 -679,70,50,75,missed class frequently -10,56.0,56.0 -680,91,97,53,great participation +10,87.6,87.6 -681,65,94,92,,84.5,84.5 -682,64,91,97,,85.3,85.3 -683,94,90,90,great participation +10,100.0,100.0 -684,85,82,55,great participation +10,82.1,82.1 -685,88,91,80,,85.7,85.7 -686,99,95,99,great participation +10,100.0,100.0 -687,66,0,78,"cheated on exam, gets 0pts",51.0,51.0 -688,49,77,89,great participation +10,83.4,83.4 -689,88,69,89,,82.7,82.7 -690,77,54,70,,67.3,67.3 -691,74,80,90,,82.2,82.2 -692,86,97,95,,92.9,92.9 -693,97,88,74,great participation +10,95.1,95.1 
-694,84,55,53,missed class frequently -10,52.9,52.9 -695,0,96,87,"cheated on exam, gets 0pts",63.6,63.6 -696,89,74,85,,82.9,72.9 -697,55,71,98,missed homework frequently -10,67.0,67.0 -698,78,76,72,,75.0,75.0 -699,99,77,81,great final presentation +10,95.2,95.2 -700,78,63,88,,77.5,77.5 -701,96,76,85,,85.6,85.6 -702,88,88,83,missed class frequently -10,76.0,76.0 -703,89,85,87,great final presentation +10,97.0,97.0 -704,89,78,94,,87.7,87.7 -705,93,81,77,great participation +10,93.0,93.0 -706,71,96,65,missed class frequently -10,66.1,66.1 -707,70,96,32,missed homework frequently -10,52.6,52.6 -708,95,87,82,missed class frequently -10,77.4,77.4 -709,81,56,58,,64.3,64.3 -710,91,69,90,,84.0,94.0 -711,97,84,87,great participation +10,99.1,89.1 -712,61,0,97,"cheated on exam, gets 0pts",57.1,57.1 -713,70,99,90,,86.7,86.7 -714,72,91,91,,85.3,85.3 -715,73,77,73,,74.2,74.2 -716,58,90,100,great participation +10,94.4,94.4 -717,77,51,70,,66.4,66.4 -718,78,58,86,great final presentation +10,85.2,85.2 -719,95,94,89,,92.3,92.3 -720,65,0,95,"cheated on exam, gets 0pts",57.5,67.5 -721,80,0,98,"cheated on exam, gets 0pts",63.2,63.2 -722,65,0,92,"cheated on exam, gets 0pts",56.3,56.3 -723,89,91,57,,76.8,76.8 -724,65,72,86,great participation +10,85.5,85.5 -725,74,81,74,,76.1,76.1 -726,73,68,61,missed homework frequently -10,56.7,56.7 -727,94,91,93,great participation +10,100.0,100.0 -728,66,82,82,missed homework frequently -10,67.2,67.2 -729,92,81,54,,73.5,73.5 -730,78,75,96,,84.3,84.3 -731,81,79,86,,82.4,72.4 -732,78,84,71,,77.0,77.0 -733,84,81,88,,84.7,84.7 -734,97,0,90,"cheated on exam, gets 0pts",65.1,65.1 -735,0,61,84,"cheated on exam, gets 0pts",51.9,51.9 -736,85,94,84,,87.3,87.3 -737,100,82,72,great participation +10,93.4,93.4 -738,64,70,86,,74.6,74.6 -739,68,74,40,missed homework frequently -10,48.6,48.6 -740,90,82,60,missed class frequently -10,65.6,65.6 -741,87,97,99,,94.8,94.8 -742,0,75,78,"cheated on exam, gets 0pts",53.7,53.7 -743,76,74,97,,83.8,83.8 -744,90,88,84,missed 
homework frequently -10,77.0,77.0 -745,92,72,94,,86.8,86.8 -746,87,83,87,missed class frequently -10,75.8,75.8 -747,87,82,57,,73.5,73.5 -748,70,92,96,missed homework frequently -10,77.0,77.0 -749,94,77,87,,86.1,86.1 -750,84,65,95,missed class frequently -10,72.7,72.7 -751,92,79,98,great final presentation +10,100.0,100.0 -752,53,89,78,,73.8,73.8 -753,0,83,76,"cheated on exam, gets 0pts",55.3,55.3 -754,82,69,75,great final presentation +10,85.3,85.3 -755,88,90,81,missed homework frequently -10,75.8,85.8 -756,52,75,62,great final presentation +10,72.9,72.9 -757,78,89,58,,73.3,73.3 -758,41,88,98,missed class frequently -10,67.9,67.9 -759,93,68,86,missed homework frequently -10,72.7,72.7 -760,92,75,93,missed homework frequently -10,77.3,67.3 -761,53,0,94,"cheated on exam, gets 0pts",53.5,53.5 -762,90,95,68,,82.7,82.7 -763,43,76,91,missed homework frequently -10,62.1,62.1 -764,96,73,89,missed homework frequently -10,76.3,76.3 -765,79,76,98,,85.7,85.7 -766,82,87,82,,83.5,73.5 -767,82,80,98,great final presentation +10,97.8,97.8 -768,93,96,90,,92.7,92.7 -769,88,96,79,great final presentation +10,96.8,86.8 -770,58,66,74,,66.8,66.8 -771,88,81,85,,84.7,84.7 -772,58,80,89,,77.0,77.0 -773,97,76,90,,87.9,87.9 -774,90,74,95,great participation +10,97.2,107.2 -775,79,86,99,great final presentation +10,99.1,99.1 -776,93,36,62,,63.5,63.5 -777,34,91,92,missed homework frequently -10,64.3,64.3 -778,83,62,55,,65.5,65.5 -779,81,75,77,,77.6,77.6 -780,80,77,98,missed class frequently -10,76.3,66.3 -781,99,97,70,,86.8,96.8 -782,83,99,80,,86.6,86.6 -783,80,72,97,,84.4,84.4 -784,0,68,87,"cheated on exam, gets 0pts",55.2,55.2 -785,71,95,45,,67.8,67.8 -786,92,89,82,great final presentation +10,97.1,97.1 -787,89,62,93,,82.5,82.5 -788,67,72,90,missed class frequently -10,67.7,67.7 -789,64,89,91,great final presentation +10,92.3,92.3 -790,0,79,68,"cheated on exam, gets 0pts",50.9,50.9 -791,80,91,60,,75.3,75.3 -792,96,0,62,"cheated on exam, gets 0pts",53.6,53.6 -793,70,92,93,missed homework 
frequently -10,75.8,75.8 -794,48,50,65,,55.4,55.4 -795,0,89,91,"cheated on exam, gets 0pts",63.1,63.1 -796,91,89,81,,86.4,86.4 -797,99,84,94,,92.5,92.5 -798,94,97,91,,93.7,93.7 -799,84,93,62,missed class frequently -10,67.9,67.9 -800,82,64,80,great final presentation +10,85.8,85.8 -801,78,60,88,great final presentation +10,86.6,86.6 -802,69,55,72,missed homework frequently -10,56.0,56.0 -803,71,92,94,,86.5,86.5 -804,59,84,83,missed class frequently -10,66.1,66.1 -805,74,96,92,missed class frequently -10,77.8,77.8 -806,80,95,58,,75.7,75.7 -807,96,64,92,great final presentation +10,94.8,84.8 -808,0,88,95,"cheated on exam, gets 0pts",64.4,64.4 -809,97,65,85,missed class frequently -10,72.6,62.600000000000000 -810,64,85,97,,83.5,83.5 -811,85,97,86,great participation +10,99.0,109.0 -812,85,96,48,,73.5,73.5 -813,87,58,34,missed homework frequently -10,47.1,47.1 -814,85,82,85,great participation +10,94.1,94.1 -815,93,74,91,,86.5,86.5 -816,58,75,91,missed class frequently -10,66.3,66.3 -817,98,73,83,,84.5,84.5 -818,92,84,92,great participation +10,99.6,99.6 -819,93,88,84,great participation +10,97.9,107.9 -820,88,80,93,,87.6,77.6 -821,72,94,88,,85.0,85.0 -822,58,90,97,,83.2,83.2 -823,70,97,42,,66.9,66.9 -824,73,86,64,,73.3,73.3 -825,76,88,94,great final presentation +10,96.8,96.8 -826,71,96,83,great final presentation +10,93.3,93.3 -827,74,58,92,great final presentation +10,86.4,86.4 -828,78,95,78,,83.1,83.1 -829,49,95,77,,74.0,74.0 -830,85,93,58,,76.6,76.6 -831,91,90,74,,83.9,83.9 -832,98,97,87,,93.3,93.3 -833,74,98,86,,86.0,86.0 -834,94,0,72,"cheated on exam, gets 0pts",57.0,57.0 -835,83,85,92,great final presentation +10,97.2,97.2 -836,85,87,90,,87.6,97.6 -837,65,0,57,"cheated on exam, gets 0pts",42.3,42.3 -838,72,0,80,"cheated on exam, gets 0pts",53.6,53.6 -839,90,76,81,,82.2,82.2 -840,0,65,88,"cheated on exam, gets 0pts",54.7,54.7 -841,89,97,96,missed homework frequently -10,84.2,84.2 -842,84,94,100,great final presentation +10,100.0,100.0 -843,95,80,86,,86.9,86.9 
-844,69,90,86,missed homework frequently -10,72.1,72.1 -845,95,90,69,,83.1,83.1 -846,93,98,75,,87.3,87.3 -847,72,85,91,missed homework frequently -10,73.5,73.5 -848,85,74,87,missed homework frequently -10,72.5,72.5 -849,90,82,86,,86.0,86.0 -850,69,86,75,,76.5,76.5 -851,99,75,78,,83.4,83.4 -852,96,95,89,,92.9,102.9 -853,99,93,90,,93.6,93.6 -854,0,92,60,"cheated on exam, gets 0pts",51.6,51.6 -855,85,91,84,,86.4,86.4 -856,99,62,90,,84.3,84.3 -857,62,92,95,,84.2,84.2 -858,60,37,62,,53.9,53.9 -859,99,86,74,,85.1,85.1 -860,48,47,84,missed homework frequently -10,52.1,52.1 -861,70,84,76,,76.6,76.6 -862,88,91,53,,74.9,74.9 -863,100,83,74,,84.5,94.5 -864,74,61,87,,75.3,75.3 -865,98,71,93,,87.9,87.9 -866,51,75,87,,72.6,82.6 -867,98,85,82,great final presentation +10,97.7,87.7 -868,97,0,42,"cheated on exam, gets 0pts",45.9,45.9 -869,83,91,63,,77.4,77.4 -870,82,61,87,great participation +10,87.7,87.7 -871,92,84,62,,77.6,77.6 -872,86,89,82,,85.3,85.3 -873,73,74,76,missed class frequently -10,64.5,64.5 -874,87,0,96,"cheated on exam, gets 0pts",64.5,64.5 -875,52,86,87,,76.2,76.2 -876,71,94,95,great final presentation +10,97.5,97.5 -877,82,90,85,,85.6,85.6 -878,96,96,88,,92.8,92.8 -879,66,95,87,great final presentation +10,93.1,93.1 -880,68,80,74,,74.0,84.0 -881,53,89,88,great final presentation +10,87.8,87.8 -882,80,96,81,,85.2,85.2 -883,65,69,69,,67.8,77.8 -884,70,96,81,great final presentation +10,92.2,92.2 -885,86,85,84,missed class frequently -10,74.9,74.9 -886,64,85,97,missed class frequently -10,73.5,73.5 -887,87,0,70,"cheated on exam, gets 0pts",54.1,44.1 -888,94,90,99,missed class frequently -10,84.8,74.8 -889,66,74,80,missed class frequently -10,64.0,64.0 -890,62,98,93,great participation +10,95.2,95.2 -891,0,63,88,"cheated on exam, gets 0pts",54.1,54.1 -892,82,99,82,,87.1,87.1 -893,85,90,80,missed homework frequently -10,74.5,74.5 -894,78,73,98,,84.5,84.5 -895,91,68,88,,82.9,82.9 -896,65,84,58,great participation +10,77.9,87.9 -897,84,58,88,missed class frequently 
-10,67.8,67.8 -898,70,71,88,great final presentation +10,87.5,77.5 -899,71,78,80,great final presentation +10,86.7,86.7 -900,85,68,76,great participation +10,86.3,76.3 -901,94,78,96,great participation +10,100.0,100.0 -902,100,100,87,missed homework frequently -10,84.8,84.8 -903,96,73,94,great participation +10,98.3,98.3 -904,67,88,91,,82.9,82.9 -905,79,84,64,great final presentation +10,84.5,84.5 -906,86,87,77,,82.7,82.7 -907,83,94,78,missed homework frequently -10,74.3,64.3 -908,97,90,72,,84.9,84.9 -909,75,70,75,,73.5,73.5 -910,83,71,70,great participation +10,84.2,84.2 -911,87,76,72,,77.7,77.7 -912,73,58,57,great participation +10,72.1,72.1 -913,65,82,83,missed homework frequently -10,67.3,67.3 -914,69,72,89,,77.9,77.9 -915,100,91,83,great participation +10,100.0,100.0 -916,78,71,94,,82.3,82.3 -917,61,78,64,,67.3,67.3 -918,100,93,70,,85.9,85.9 -919,85,88,81,,84.3,84.3 -920,92,67,65,missed class frequently -10,63.7,63.7 -921,97,86,68,missed homework frequently -10,72.1,62.100000000000000 -922,89,95,73,,84.4,84.4 -923,87,80,65,great final presentation +10,86.1,86.1 -924,92,0,94,"cheated on exam, gets 0pts",65.2,65.2 -925,81,0,95,"cheated on exam, gets 0pts",62.3,62.3 -926,82,69,92,,82.1,82.1 -927,74,73,77,great final presentation +10,84.9,84.9 -928,75,67,78,great final presentation +10,83.8,83.8 -929,80,59,86,,76.1,76.1 -930,79,92,90,missed class frequently -10,77.3,77.3 -931,67,63,83,great participation +10,82.2,82.2 -932,88,88,79,,84.4,84.4 -933,76,87,60,missed homework frequently -10,62.900000000000000,62.900000000000000 -934,0,79,65,"cheated on exam, gets 0pts",49.7,49.7 -935,65,93,88,great final presentation +10,92.6,92.6 -936,73,71,83,,76.4,76.4 -937,73,64,88,,76.3,76.3 -938,94,88,79,great participation +10,96.2,106.2 -939,93,71,89,,84.8,84.8 -940,67,0,76,"cheated on exam, gets 0pts",50.5,50.5 -941,79,73,78,,76.8,76.8 -942,84,76,65,missed class frequently -10,64.0,64.0 -943,47,77,88,,72.4,72.4 -944,74,65,85,,75.7,75.7 -945,84,90,100,,92.2,92.2 
-946,84,87,77,,82.1,82.1 -947,90,0,88,"cheated on exam, gets 0pts",62.2,72.2 -948,84,85,61,,75.1,75.1 -949,93,82,74,,82.1,72.1 -950,57,85,86,,77.0,77.0 -951,94,0,89,"cheated on exam, gets 0pts",63.8,63.8 -952,73,80,95,,83.9,83.9 -953,87,80,91,,86.5,76.5 -954,100,80,78,missed class frequently -10,75.2,75.2 -955,56,81,90,,77.1,77.1 -956,67,65,65,,65.6,65.6 -957,56,97,92,great final presentation +10,92.7,92.7 -958,79,0,75,"cheated on exam, gets 0pts",53.7,53.7 -959,66,95,95,,86.3,96.3 -960,0,79,96,"cheated on exam, gets 0pts",62.1,62.1 -961,88,80,89,,86.0,86.0 -962,87,74,95,missed class frequently -10,76.3,76.3 -963,71,92,83,,82.1,82.1 -964,68,82,97,,83.8,83.8 -965,74,72,97,missed class frequently -10,72.6,72.6 -966,90,92,88,great participation +10,99.8,109.8 -967,79,80,91,,84.1,84.1 -968,42,87,63,great participation +10,73.9,73.9 -969,71,0,82,"cheated on exam, gets 0pts",54.1,54.1 -970,78,48,96,,76.2,76.2 -971,74,85,88,great participation +10,92.9,82.9 -972,85,94,75,missed class frequently -10,73.7,63.7 -973,88,79,98,great final presentation +10,99.3,99.3 -974,37,83,69,missed homework frequently -10,53.6,53.6 -975,59,37,75,missed homework frequently -10,48.8,38.8 -976,0,97,60,"cheated on exam, gets 0pts",53.1,53.1 -977,74,80,91,,82.6,82.6 -978,88,66,74,,75.8,75.8 \ No newline at end of file From a25b2366d6213554522101ee1311419bac00d06e Mon Sep 17 00:00:00 2001 From: krmayankb Date: Thu, 29 Dec 2022 19:24:15 -0800 Subject: [PATCH 062/258] default, frac_neighbors 0.1 -> 0.5 --- cleanlab/regression/rank.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cleanlab/regression/rank.py b/cleanlab/regression/rank.py index b959e6e423..6a5f6a3a0e 100644 --- a/cleanlab/regression/rank.py +++ b/cleanlab/regression/rank.py @@ -112,7 +112,7 @@ def get_outre_score_for_each_label( predictions: np.ndarray, *, residual_scale: float = 10, - frac_neighbors: float = 0.1, + frac_neighbors: float = 0.5, neighbor_metric: str = "euclidean", ) -> np.ndarray: """Returns OUTRE 
based label-quality scores. From 6bf61c6c25ef2f8687dbe2583b221795a902f6c2 Mon Sep 17 00:00:00 2001 From: krmayankb Date: Thu, 29 Dec 2022 19:34:54 -0800 Subject: [PATCH 063/258] updated tutorial notebook --- docs/source/tutorials/regression.ipynb | 194 +++---------------------- 1 file changed, 24 insertions(+), 170 deletions(-) diff --git a/docs/source/tutorials/regression.ipynb b/docs/source/tutorials/regression.ipynb index ace4d8aec7..03f0e27925 100644 --- a/docs/source/tutorials/regression.ipynb +++ b/docs/source/tutorials/regression.ipynb @@ -58,35 +58,6 @@ { "cell_type": "code", "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: cleanlab in /Users/krmayank/Desktop/Work/cleanlab/fork/cleanlab/ENV/lib/python3.10/site-packages (2.1.1)\n", - "Requirement already satisfied: xgboost in /Users/krmayank/Desktop/Work/cleanlab/fork/cleanlab/ENV/lib/python3.10/site-packages (1.7.2)\n", - "Requirement already satisfied: scikit-learn>=0.18 in /Users/krmayank/Desktop/Work/cleanlab/fork/cleanlab/ENV/lib/python3.10/site-packages (from cleanlab) (1.1.3)\n", - "Requirement already satisfied: pandas>=1.0.0 in /Users/krmayank/Desktop/Work/cleanlab/fork/cleanlab/ENV/lib/python3.10/site-packages (from cleanlab) (1.5.1)\n", - "Requirement already satisfied: numpy>=1.11.3 in /Users/krmayank/Desktop/Work/cleanlab/fork/cleanlab/ENV/lib/python3.10/site-packages (from cleanlab) (1.23.4)\n", - "Requirement already satisfied: termcolor>=1.1.0 in /Users/krmayank/Desktop/Work/cleanlab/fork/cleanlab/ENV/lib/python3.10/site-packages (from cleanlab) (2.1.0)\n", - "Requirement already satisfied: tqdm>=4.53.0 in /Users/krmayank/Desktop/Work/cleanlab/fork/cleanlab/ENV/lib/python3.10/site-packages (from cleanlab) (4.64.1)\n", - "Requirement already satisfied: scipy in /Users/krmayank/Desktop/Work/cleanlab/fork/cleanlab/ENV/lib/python3.10/site-packages (from xgboost) (1.9.3)\n", - "Requirement 
already satisfied: pytz>=2020.1 in /Users/krmayank/Desktop/Work/cleanlab/fork/cleanlab/ENV/lib/python3.10/site-packages (from pandas>=1.0.0->cleanlab) (2022.6)\n", - "Requirement already satisfied: python-dateutil>=2.8.1 in /Users/krmayank/Desktop/Work/cleanlab/fork/cleanlab/ENV/lib/python3.10/site-packages (from pandas>=1.0.0->cleanlab) (2.8.2)\n", - "Requirement already satisfied: threadpoolctl>=2.0.0 in /Users/krmayank/Desktop/Work/cleanlab/fork/cleanlab/ENV/lib/python3.10/site-packages (from scikit-learn>=0.18->cleanlab) (3.1.0)\n", - "Requirement already satisfied: joblib>=1.0.0 in /Users/krmayank/Desktop/Work/cleanlab/fork/cleanlab/ENV/lib/python3.10/site-packages (from scikit-learn>=0.18->cleanlab) (1.2.0)\n", - "Requirement already satisfied: six>=1.5 in /Users/krmayank/Desktop/Work/cleanlab/fork/cleanlab/ENV/lib/python3.10/site-packages (from python-dateutil>=2.8.1->pandas>=1.0.0->cleanlab) (1.16.0)\n" - ] - } - ], - "source": [ - "!pip install cleanlab xgboost" - ] - }, - { - "cell_type": "code", - "execution_count": 2, "metadata": { "nbsphinx": "hidden" }, @@ -117,13 +88,15 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", + "from xgboost import XGBRegressor\n", + "from sklearn.model_selection import cross_val_predict\n", "from cleanlab.regression.rank import get_label_quality_scores\n", "\n", "np.set_printoptions(suppress=True)" @@ -178,7 +151,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -216,19 +189,21 @@ "# 2. 
Import dataset and Generate predictions" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# fetch the data\n", + "!wget -nc https://cleanlab-public.s3.amazonaws.com/Datasets/student_grades.csv" + ] + }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "File 'student_grades.csv' already there; not retrieving.\n", - "\n" - ] - }, { "data": { "text/html": [ @@ -323,7 +298,7 @@ } ], "source": [ - "!wget -nc https://cleanlab-public.s3.amazonaws.com/Datasets/student_grades.csv\n", + "# Load data\n", "data = pd.read_csv(\"./student_grades.csv\", index_col=0)\n", "data.head()" ] @@ -403,121 +378,22 @@ "Let's start with initializing the model with relevant parameters. As mentioned earlier we are using `xgboost` for this tutorial. To handle categorical variables, we specifically need to set `enable_categorical` flag to `True`. Note that, support for the categorical variable is in the experimental stage and doesn't support the auto-selection of `tree_method`. Therefore, you will need to specify `tree_method` from supported types. More details can be found [here](https://xgboost.readthedocs.io/en/stable/tutorials/categorical.html)." ] }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# using default xgboost cv " - ] - }, { "cell_type": "code", "execution_count": 8, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
train-rmse-meantrain-rmse-stdtest-rmse-meantest-rmse-std
014.2264951.36156314.1786821.087675
111.8619650.85277512.1019180.589881
210.6461010.62194310.7044870.317186
\n", - "
" - ], - "text/plain": [ - " train-rmse-mean train-rmse-std test-rmse-mean test-rmse-std\n", - "0 14.226495 1.361563 14.178682 1.087675\n", - "1 11.861965 0.852775 12.101918 0.589881\n", - "2 10.646101 0.621943 10.704487 0.317186" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ - "import xgboost \n", - "from xgboost import DMatrix, XGBRegressor\n", - "from xgboost import XGBRegressor\n", - "from sklearn.model_selection import cross_val_predict\n", - "\n", "SEED = 1\n", - "num_boost_round = 1000\n", "num_crossval_folds = 5\n", "\n", - "training_data = DMatrix(X, label=y, enable_categorical=True)\n", - "params = {\"booster\": \"gblinear\", \"objective\": \"reg:squarederror\"}\n", - "\n", - "cross_validation_results = xgboost.cv(params, \n", - " training_data, \n", - " num_boost_round=num_boost_round,\n", - " nfold=num_crossval_folds, \n", - " seed=SEED,\n", - " early_stopping_rounds=5) \n", - "\n", - "display(cross_validation_results.head(3))" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "best_iteration = cross_validation_results['test-rmse-mean'].argmin()\n", - "\n", "model = XGBRegressor(\n", " tree_method= \"hist\",\n", - " n_estimators = best_iteration, \n", + " n_estimators = 100, \n", " enable_categorical = True, \n", " random_state = SEED)\n", "\n", - "# get predictions\n", + "# get predictions using cross-validation\n", "predictions = cross_val_predict(\n", " estimator=model, X=X, y=y, cv=num_crossval_folds, method = \"predict\"\n", ")" @@ -535,7 +411,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -545,12 +421,12 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": {}, "outputs": [ { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] @@ -584,40 +460,18 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 11, "metadata": { "nbsphinx": "hidden" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "OUTRE: 0.9762667538411246\n", - "RESIDUAL: 0.9781736951073335\n" - ] - }, - { - "ename": "ValueError", - "evalue": "Label quality scores did not outperform alternative scores", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn [13], line 20\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\u001b[39m\"\u001b[39m\u001b[39mLabel quality scores did not perform well enough\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[1;32m 17\u001b[0m \u001b[39mif\u001b[39;00m roc_auc_score(true_errors, \u001b[39m1\u001b[39m \u001b[39m-\u001b[39m label_quality_scores) \u001b[39m<\u001b[39m\u001b[39m=\u001b[39m roc_auc_score(\n\u001b[1;32m 18\u001b[0m true_errors, \u001b[39m1\u001b[39m \u001b[39m-\u001b[39m label_quality_scores_residual\n\u001b[1;32m 19\u001b[0m ):\n\u001b[0;32m---> 20\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\u001b[39m\"\u001b[39m\u001b[39mLabel quality scores did not outperform alternative scores\u001b[39m\u001b[39m\"\u001b[39m)\n", - "\u001b[0;31mValueError\u001b[0m: Label quality scores did not outperform alternative scores" - ] - } - ], + "outputs": [], "source": [ "# Note: This cell is only for docs.cleanlab.ai, if running on local Jupyter or Colab, please ignore it.\n", "from sklearn.metrics import roc_auc_score\n", "\n", "true_errors = (data[\"grade\"] != data[\"true_grade\"]).astype(int)\n", "\n", - "# label_quality_scores = get_label_quality_scores(labels= y, predictions=predictions)\n", - "from cleanlab.regression.rank import get_outre_score_for_each_label\n", - "label_quality_scores 
= get_outre_score_for_each_label(labels=np.array(y), predictions=predictions, frac_neighbors=0.5)\n", + "label_quality_scores = get_label_quality_scores(labels=y, predictions=predictions)\n", "label_quality_scores_residual = get_label_quality_scores(labels = y, predictions=predictions, method=\"residual\")\n", "\n", "if roc_auc_score(true_errors, 1 - label_quality_scores) < 0.5:\n", From 15bfa43c2a34f4ad1333e3f3f13e44db1883fcf1 Mon Sep 17 00:00:00 2001 From: krmayankb Date: Thu, 29 Dec 2022 22:00:38 -0800 Subject: [PATCH 064/258] review suggestion updated --- docs/source/tutorials/regression.ipynb | 223 +++++++++++++++++++++---- 1 file changed, 189 insertions(+), 34 deletions(-) diff --git a/docs/source/tutorials/regression.ipynb b/docs/source/tutorials/regression.ipynb index 03f0e27925..7903e9ce8d 100644 --- a/docs/source/tutorials/regression.ipynb +++ b/docs/source/tutorials/regression.ipynb @@ -95,11 +95,24 @@ "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", - "from xgboost import XGBRegressor\n", + "from xgboost import XGBRegressor" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "nbsphinx": "hidden" + }, + "outputs": [], + "source": [ + "# This cell is hidden on docs.cleanlab.ai\n", + "from sklearn.metrics import r2_score\n", "from sklearn.model_selection import cross_val_predict\n", "from cleanlab.regression.rank import get_label_quality_scores\n", "\n", - "np.set_printoptions(suppress=True)" + "np.set_printoptions(suppress=True)\n", + "SEED = np.random.RandomState(10203)" ] }, { @@ -151,7 +164,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -191,9 +204,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "File 'student_grades.csv' already there; not retrieving.\n", + "\n" + ] 
+ } + ], "source": [ "# fetch the data\n", "!wget -nc https://cleanlab-public.s3.amazonaws.com/Datasets/student_grades.csv" @@ -201,7 +223,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -292,7 +314,7 @@ "4 48 90 91 NaN 77.8 77.8" ] }, - "execution_count": 5, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -313,12 +335,12 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [ { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] @@ -329,12 +351,13 @@ ], "source": [ "# Generate true errors\n", - "true_errors_index = np.where(data[\"grade\"] != data[\"true_grade\"])[0]\n", + "true_errors = (data[\"grade\"] != data[\"true_grade\"]).astype(int)\n", + "true_errors_index = np.where(true_errors == 1)[0]\n", "plot_data(\n", " data_x=data[\"exam_3\"],\n", " data_y=data[\"grade\"],\n", " circles=true_errors_index,\n", - " title=\"Messy Regression dataset\",\n", + " title=\"Noisy regression dataset\",\n", " xlabel=\"exam_3 feature\",\n", " ylabel=\"grade (Y value)\",\n", ")" @@ -345,7 +368,123 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "In the above plot, `grade (Y value)` is plotted against one of the features in the dataset (`exam_3`). We have circled the examples that were considered as `true_error` in **Red** . " + "In the above plot, `grade (Y value)` is plotted against one of the features in the dataset (`exam_3`). We have circled the examples that were considered as `true_error` in **Red**. \n", + "\n", + "Let's check some of the errors in next cell. " + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Errors in dataset:'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
exam_1exam_2exam_3notestrue_gradegrade
13805965NaN67.777.7
58895979missed homework frequently -1066.056.0
82888082great participation +1093.2103.2
85928771NaN82.172.1
92967779missed class frequently -1073.583.5
\n", + "
" + ], + "text/plain": [ + " exam_1 exam_2 exam_3 notes true_grade grade\n", + "13 80 59 65 NaN 67.7 77.7\n", + "58 89 59 79 missed homework frequently -10 66.0 56.0\n", + "82 88 80 82 great participation +10 93.2 103.2\n", + "85 92 87 71 NaN 82.1 72.1\n", + "92 96 77 79 missed class frequently -10 73.5 83.5" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Some of the error in the dataset\n", + "display(\"Errors in dataset:\", data.loc[true_errors_index].head())\n", + "\n", + "# Dropping ground truth i.e. true_grade \n", + "data = data.drop(columns=[\"true_grade\"])" ] }, { @@ -358,7 +497,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -366,7 +505,7 @@ "data.notes = data.notes.astype(\"category\")\n", "\n", "# xgboost takes data and label seperately, so you will need to divide data accordingly.\n", - "X = data.drop([\"grade\", \"true_grade\"], axis=1)\n", + "X = data.drop(columns=[\"grade\"])\n", "y = data[\"grade\"]" ] }, @@ -380,11 +519,10 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ - "SEED = 1\n", "num_crossval_folds = 5\n", "\n", "model = XGBRegressor(\n", @@ -399,6 +537,32 @@ ")" ] }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "An additional benefit of cross-validation is that it facilitates more reliable evaluation of our model than a single training/validation split." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "R-squared on predictions from cross-validation: 0.902\n" + ] + } + ], + "source": [ + "roc = r2_score(y, predictions)\n", + "print(f\"R-squared on predictions from cross-validation: {roc:.3f}\")" + ] + }, { "attachments": {}, "cell_type": "markdown", @@ -411,7 +575,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -421,12 +585,12 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 13, "metadata": {}, "outputs": [ { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] @@ -441,7 +605,7 @@ " data_y=data[\"grade\"],\n", " circles=true_errors_index,\n", " color=label_quality_scores,\n", - " title=\"Messy Regression dataset with label quality scores\",\n", + " title=\"Noisy regression dataset colored by label quality scores\",\n", " colorbar=True,\n", " xlabel=\"exam_3 feature\",\n", " ylabel=\"grade (Y value)\",\n", @@ -460,7 +624,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 14, "metadata": { "nbsphinx": "hidden" }, @@ -469,26 +633,17 @@ "# Note: This cell is only for docs.cleanlab.ai, if running on local Jupyter or Colab, please ignore it.\n", "from sklearn.metrics import roc_auc_score\n", "\n", - "true_errors = (data[\"grade\"] != data[\"true_grade\"]).astype(int)\n", - "\n", "label_quality_scores = get_label_quality_scores(labels=y, predictions=predictions)\n", "label_quality_scores_residual = get_label_quality_scores(labels = y, predictions=predictions, method=\"residual\")\n", "\n", - "if roc_auc_score(true_errors, 1 - label_quality_scores) < 0.5:\n", + "auc = roc_auc_score(true_errors, 1 - label_quality_scores)\n", + "\n", + "if auc <= 0.5:\n", " raise ValueError(\"Label quality scores did not perform well enough\")\n", "\n", - "if roc_auc_score(true_errors, 1 - label_quality_scores) <= roc_auc_score(\n", - " true_errors, 1 - label_quality_scores_residual\n", - "):\n", + "if auc <= roc_auc_score(true_errors, 1 - label_quality_scores_residual):\n", " raise ValueError(\"Label quality scores did not outperform alternative scores\")" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { From feb5797a66e7c5fe6128c63ff20d2ecf98474333 Mon Sep 17 00:00:00 2001 From: krmayankb Date: Thu, 29 Dec 2022 22:48:02 -0800 Subject: [PATCH 065/258] suggestion in test corrected --- tests/test_regression.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/tests/test_regression.py b/tests/test_regression.py index 
355752dee3..12c70e5aed 100644 --- a/tests/test_regression.py +++ b/tests/test_regression.py @@ -1,13 +1,7 @@ -import numpy as np - -# import pandas as pd import pytest -from typing import Union, Sequence - +import numpy as np from cleanlab.regression import rank -ArrayLike = Union[np.ndarray, Sequence] - # To be used for all the tests labels = np.array([1, 2, 3, 4]) predictions = np.array([1, 3, 4, 5]) From 3a49cfd0c2de9dbefc893cb4ce6fadad75d5115b Mon Sep 17 00:00:00 2001 From: Mayank Kumar Date: Thu, 29 Dec 2022 22:50:45 -0800 Subject: [PATCH 066/258] copyright updated file regression_utils.py Co-authored-by: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> --- cleanlab/internal/regression_utils.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/cleanlab/internal/regression_utils.py b/cleanlab/internal/regression_utils.py index 468edd30f5..3f878615a4 100644 --- a/cleanlab/internal/regression_utils.py +++ b/cleanlab/internal/regression_utils.py @@ -1,5 +1,22 @@ +# Copyright (C) 2017-2022 Cleanlab Inc. +# This file is part of cleanlab. +# +# cleanlab is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published +# by the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# cleanlab is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with cleanlab. If not, see . + + """ -Helper function internally used in cleanlab.regression +Helper functions internally used in cleanlab.regression. 
""" import numpy as np From cdfa82d05a447de283d77b52adba3f71c7c9dc6e Mon Sep 17 00:00:00 2001 From: Mayank Kumar Date: Thu, 29 Dec 2022 22:54:27 -0800 Subject: [PATCH 067/258] Error message suggestions updated file regression_utils.py Co-authored-by: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> --- cleanlab/internal/regression_utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cleanlab/internal/regression_utils.py b/cleanlab/internal/regression_utils.py index 3f878615a4..d2c353fe57 100644 --- a/cleanlab/internal/regression_utils.py +++ b/cleanlab/internal/regression_utils.py @@ -59,7 +59,7 @@ def assert_valid_inputs( scoring_methods = ["residual", "outre"] if method not in scoring_methods: raise ValueError( - f"Passed method is not among allowed methods. Expected either of {scoring_methods}, got {method}." + f"Specified method '{method}' must be one of: {scoring_methods}." ) # return 1-D numpy array @@ -82,7 +82,7 @@ def check_dimension_and_datatype(check_input: np.ndarray, text: str) -> np.ndarr # Check if input is empty if not check_input.size: - raise ValueError(f"{text} is Empty, check input.") + raise ValueError(f"{text} cannot be empty array.") # Remove axis with length one check_input = np.squeeze(check_input) @@ -95,7 +95,7 @@ def check_dimension_and_datatype(check_input: np.ndarray, text: str) -> np.ndarr # Check if datatype is numeric if not np.issubdtype(check_input.dtype, np.number): - raise ValueError(f"Expected {text} to be Numeric, got {check_input.dtype}.") + raise ValueError(f"Expected {text} to contain numeric values, got values of type {check_input.dtype}.") return check_input @@ -104,4 +104,4 @@ def check_missing_values(check_input: np.ndarray, text: str): """Raise error if there are any missing values in Numpy array.""" if np.isnan(check_input).any(): - raise ValueError(f"{text} has missing values, check input.") + raise ValueError(f"{text} cannot contain missing values.") From 
7a25675febac4d29c141140e148f2221eca2a723 Mon Sep 17 00:00:00 2001 From: Mayank Kumar Date: Thu, 29 Dec 2022 23:09:11 -0800 Subject: [PATCH 068/258] Copyright update Update cleanlab/regression/rank.py Co-authored-by: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> --- cleanlab/regression/rank.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/cleanlab/regression/rank.py b/cleanlab/regression/rank.py index 6a5f6a3a0e..a3a49c4872 100644 --- a/cleanlab/regression/rank.py +++ b/cleanlab/regression/rank.py @@ -5,7 +5,29 @@ from typing import Dict, Callable from numpy.typing import ArrayLike -""" Generates label quality scores for every sample in regression dataset """ +# Copyright (C) 2017-2022 Cleanlab Inc. +# This file is part of cleanlab. +# +# cleanlab is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published +# by the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# cleanlab is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with cleanlab. If not, see . + + +""" +Methods to score the quality of each label in a regression dataset. These can be used to rank the examples whose Y-value most likely has an error. + +Note: Label quality scores are most accurate when they are computed based on out-of-sample `predictions` from your regression model. +To obtain out-of-sample predictions for every datapoint in your dataset, you can use :ref:`cross-validation `. This is encouraged to get better results. 
+""" EPS = 1e-30 From f4571b9a14b97c5c589bcd9371faf14adc004cd6 Mon Sep 17 00:00:00 2001 From: krmayankb Date: Thu, 29 Dec 2022 23:22:59 -0800 Subject: [PATCH 069/258] Suggestions from code review 1. dependency restructuring 2. comments added --- cleanlab/regression/rank.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/cleanlab/regression/rank.py b/cleanlab/regression/rank.py index a3a49c4872..ea14f536a2 100644 --- a/cleanlab/regression/rank.py +++ b/cleanlab/regression/rank.py @@ -1,10 +1,3 @@ -import numpy as np -from cleanlab.outlier import OutOfDistribution -from sklearn.neighbors import NearestNeighbors -from cleanlab.internal.regression_utils import assert_valid_inputs -from typing import Dict, Callable -from numpy.typing import ArrayLike - # Copyright (C) 2017-2022 Cleanlab Inc. # This file is part of cleanlab. # @@ -29,6 +22,15 @@ To obtain out-of-sample predictions for every datapoint in your dataset, you can use :ref:`cross-validation `. This is encouraged to get better results. 
""" +from typing import Dict, Callable +import numpy as np +from numpy.typing import ArrayLike +from sklearn.neighbors import NearestNeighbors + +from cleanlab.outlier import OutOfDistribution +from cleanlab.internal.regression_utils import assert_valid_inputs + +# Small value used to prevent division by zero EPS = 1e-30 From 22728d8be16e35acfba0bdc5bbf57ddca666c40a Mon Sep 17 00:00:00 2001 From: krmayankb Date: Thu, 29 Dec 2022 23:27:12 -0800 Subject: [PATCH 070/258] black formatting --- cleanlab/internal/regression_utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cleanlab/internal/regression_utils.py b/cleanlab/internal/regression_utils.py index d2c353fe57..dbf1c22384 100644 --- a/cleanlab/internal/regression_utils.py +++ b/cleanlab/internal/regression_utils.py @@ -58,9 +58,7 @@ def assert_valid_inputs( # Check if method is among allowed scoring method scoring_methods = ["residual", "outre"] if method not in scoring_methods: - raise ValueError( - f"Specified method '{method}' must be one of: {scoring_methods}." - ) + raise ValueError(f"Specified method '{method}' must be one of: {scoring_methods}.") # return 1-D numpy array return valid_labels, valid_predictions @@ -95,7 +93,9 @@ def check_dimension_and_datatype(check_input: np.ndarray, text: str) -> np.ndarr # Check if datatype is numeric if not np.issubdtype(check_input.dtype, np.number): - raise ValueError(f"Expected {text} to contain numeric values, got values of type {check_input.dtype}.") + raise ValueError( + f"Expected {text} to contain numeric values, got values of type {check_input.dtype}." 
+ ) return check_input From 79a509f0d0bcdab86d84c77243d2abc61f38966a Mon Sep 17 00:00:00 2001 From: krmayankb Date: Thu, 29 Dec 2022 23:54:41 -0800 Subject: [PATCH 071/258] import cell correction --- docs/source/tutorials/regression.ipynb | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/docs/source/tutorials/regression.ipynb b/docs/source/tutorials/regression.ipynb index 7903e9ce8d..a37413bead 100644 --- a/docs/source/tutorials/regression.ipynb +++ b/docs/source/tutorials/regression.ipynb @@ -88,14 +88,17 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", - "from xgboost import XGBRegressor" + "from xgboost import XGBRegressor\n", + "from sklearn.metrics import r2_score\n", + "from sklearn.model_selection import cross_val_predict\n", + "from cleanlab.regression.rank import get_label_quality_scores" ] }, { @@ -107,10 +110,6 @@ "outputs": [], "source": [ "# This cell is hidden on docs.cleanlab.ai\n", - "from sklearn.metrics import r2_score\n", - "from sklearn.model_selection import cross_val_predict\n", - "from cleanlab.regression.rank import get_label_quality_scores\n", - "\n", "np.set_printoptions(suppress=True)\n", "SEED = np.random.RandomState(10203)" ] From 96d4ae0250f4ad65dc4448dcdd168ea28bd5d4bc Mon Sep 17 00:00:00 2001 From: krmayankb Date: Fri, 30 Dec 2022 09:28:12 -0800 Subject: [PATCH 072/258] estimator update to 10 --- docs/source/tutorials/regression.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/tutorials/regression.ipynb b/docs/source/tutorials/regression.ipynb index a37413bead..a110f589da 100644 --- a/docs/source/tutorials/regression.ipynb +++ b/docs/source/tutorials/regression.ipynb @@ -526,7 +526,7 @@ "\n", "model = XGBRegressor(\n", " tree_method= \"hist\",\n", - " n_estimators = 100, \n", + " n_estimators = 10, \n", " 
enable_categorical = True, \n", " random_state = SEED)\n", "\n", @@ -661,7 +661,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.8" + "version": "3.10.8 (main, Oct 13 2022, 09:48:40) [Clang 14.0.0 (clang-1400.0.29.102)]" }, "orig_nbformat": 4, "vscode": { From 1fe5b379de545c0c251a687aae0217e23b13a40b Mon Sep 17 00:00:00 2001 From: krmayankb Date: Fri, 30 Dec 2022 09:51:29 -0800 Subject: [PATCH 073/258] example in docstring updated --- cleanlab/regression/rank.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cleanlab/regression/rank.py b/cleanlab/regression/rank.py index ea14f536a2..d6eb2b468b 100644 --- a/cleanlab/regression/rank.py +++ b/cleanlab/regression/rank.py @@ -74,7 +74,7 @@ def get_label_quality_scores( >>> predictions = np.array([2,2,5,4.1]) >>> label_quality_scores = get_label_quality_scores(labels, predictions) >>> label_quality_scores - array([0.36787944, 1. , 0.13533528, 0.90483742]) + array([0.00323821, 0.33692597, 0.00191686, 0.33692597]) """ # Check if inputs are valid From 56bc77169a88041173f73dd5c15acff6a45a8417 Mon Sep 17 00:00:00 2001 From: krmayankb Date: Fri, 30 Dec 2022 10:27:16 -0800 Subject: [PATCH 074/258] chracterization test added --- tests/test_regression.py | 50 ++++++++++++++++++++++++++-------------- 1 file changed, 33 insertions(+), 17 deletions(-) diff --git a/tests/test_regression.py b/tests/test_regression.py index 12c70e5aed..61741c09be 100644 --- a/tests/test_regression.py +++ b/tests/test_regression.py @@ -4,7 +4,12 @@ # To be used for all the tests labels = np.array([1, 2, 3, 4]) -predictions = np.array([1, 3, 4, 5]) +predictions = np.array([2, 2, 5, 4.1]) + +# Used for characterization tests +expected_score_outre = np.array([0.00323821, 0.33692597, 0.00191686, 0.33692597]) +expected_score_residual = np.array([0.36787944, 1.0, 0.13533528, 0.90483742]) +expected_scores = {"outre": expected_score_outre, "residual": expected_score_residual} # Inputs 
that are not array like aConstant = 1 @@ -14,6 +19,11 @@ aBool = True +@pytest.fixture +def non_array_input(): + return [aConstant, aString, aDict, aSet, aBool] + + # test with deafault parameters def test_output_shape_type(): scores = rank.get_label_quality_scores(labels=labels, predictions=predictions) @@ -21,24 +31,18 @@ def test_output_shape_type(): assert isinstance(scores, np.ndarray) -@pytest.mark.parametrize( - "aInput", - [aConstant, aString, aDict, aSet, aBool], -) -def test_labels_are_arraylike(aInput): - with pytest.raises(ValueError) as error: - rank.get_label_quality_scores(labels=aInput, predictions=predictions) - assert error.type == ValueError +def test_labels_are_arraylike(non_array_input): + for new_input in non_array_input: + with pytest.raises(ValueError) as error: + rank.get_label_quality_scores(labels=new_input, predictions=predictions) + assert error.type == ValueError -@pytest.mark.parametrize( - "aInput", - [aConstant, aString, aDict, aSet, aBool], -) -def test_predictionns_are_arraylike(aInput): - with pytest.raises(ValueError) as error: - rank.get_label_quality_scores(labels=labels, predictions=aInput) - assert error.type == ValueError +def test_predictionns_are_arraylike(non_array_input): + for new_input in non_array_input: + with pytest.raises(ValueError) as error: + rank.get_label_quality_scores(labels=labels, predictions=new_input) + assert error.type == ValueError # test for input shapes @@ -83,3 +87,15 @@ def test_method_pass_get_label_quality_scores(method): scores = rank.get_label_quality_scores(labels=labels, predictions=predictions, method=method) assert labels.shape == scores.shape assert isinstance(scores, np.ndarray) + + +@pytest.mark.parametrize( + "method", + [ + "residual", + "outre", + ], +) +def test_expected_scores(method): + scores = rank.get_label_quality_scores(labels=labels, predictions=predictions, method=method) + assert np.allclose(scores, expected_scores[method], atol=1e-08) From 
3d71ae1ddc923875a769de208a00454bc949e0d1 Mon Sep 17 00:00:00 2001 From: krmayankb Date: Fri, 30 Dec 2022 14:26:19 -0800 Subject: [PATCH 075/258] notebook output cleared --- docs/source/tutorials/regression.ipynb | 273 ++----------------------- 1 file changed, 20 insertions(+), 253 deletions(-) diff --git a/docs/source/tutorials/regression.ipynb b/docs/source/tutorials/regression.ipynb index a110f589da..67b619f056 100644 --- a/docs/source/tutorials/regression.ipynb +++ b/docs/source/tutorials/regression.ipynb @@ -57,7 +57,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": { "nbsphinx": "hidden" }, @@ -88,7 +88,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -103,7 +103,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": { "nbsphinx": "hidden" }, @@ -163,7 +163,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -203,18 +203,9 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "File 'student_grades.csv' already there; not retrieving.\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "# fetch the data\n", "!wget -nc https://cleanlab-public.s3.amazonaws.com/Datasets/student_grades.csv" @@ -222,102 +213,9 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
exam_1exam_2exam_3notestrue_gradegrade
0537793NaN76.276.2
1816480great participation +1085.585.5
2748897NaN87.487.4
3619478NaN77.777.7
4489091NaN77.877.8
\n", - "
" - ], - "text/plain": [ - " exam_1 exam_2 exam_3 notes true_grade grade\n", - "0 53 77 93 NaN 76.2 76.2\n", - "1 81 64 80 great participation +10 85.5 85.5\n", - "2 74 88 97 NaN 87.4 87.4\n", - "3 61 94 78 NaN 77.7 77.7\n", - "4 48 90 91 NaN 77.8 77.8" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Load data\n", "data = pd.read_csv(\"./student_grades.csv\", index_col=0)\n", @@ -334,20 +232,9 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "# Generate true errors\n", "true_errors = (data[\"grade\"] != data[\"true_grade\"]).astype(int)\n", @@ -374,110 +261,9 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'Errors in dataset:'" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
exam_1exam_2exam_3notestrue_gradegrade
13805965NaN67.777.7
58895979missed homework frequently -1066.056.0
82888082great participation +1093.2103.2
85928771NaN82.172.1
92967779missed class frequently -1073.583.5
\n", - "
" - ], - "text/plain": [ - " exam_1 exam_2 exam_3 notes true_grade grade\n", - "13 80 59 65 NaN 67.7 77.7\n", - "58 89 59 79 missed homework frequently -10 66.0 56.0\n", - "82 88 80 82 great participation +10 93.2 103.2\n", - "85 92 87 71 NaN 82.1 72.1\n", - "92 96 77 79 missed class frequently -10 73.5 83.5" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "# Some of the error in the dataset\n", "display(\"Errors in dataset:\", data.loc[true_errors_index].head())\n", @@ -496,7 +282,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -518,7 +304,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -546,17 +332,9 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "R-squared on predictions from cross-validation: 0.902\n" - ] - } - ], + "outputs": [], "source": [ "roc = r2_score(y, predictions)\n", "print(f\"R-squared on predictions from cross-validation: {roc:.3f}\")" @@ -574,7 +352,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -584,20 +362,9 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "plot_data(\n", " data_x=data[\"exam_3\"],\n", @@ -623,7 +390,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "metadata": { "nbsphinx": "hidden" }, From 9bca4151300c83adac674fa98d3dffb8c6029dd6 Mon Sep 17 00:00:00 2001 From: Anish Athalye Date: Sat, 31 Dec 2022 10:36:32 -0500 Subject: [PATCH 076/258] Fix broken link --- docs/source/tutorials/image.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/tutorials/image.ipynb b/docs/source/tutorials/image.ipynb index 7f00bae63c..3d065f5d01 100644 --- a/docs/source/tutorials/image.ipynb +++ b/docs/source/tutorials/image.ipynb @@ -240,7 +240,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "As some cleanlab features require scikit-learn compatibility, we adapt the above PyTorch neural net accordingly. [skorch](https://skorch.readthedocs.io) is a convenient package that helps with this. Alternatively, you can also easily wrap an arbitrary model to be scikit-learn compatible as demonstrated [here](https://github.com/cleanlab/cleanlab#use-cleanlab-with-any-model-tensorflow-pytorch-sklearn-xgboost-etc)." + "As some cleanlab features require scikit-learn compatibility, we adapt the above PyTorch neural net accordingly. [skorch](https://skorch.readthedocs.io) is a convenient package that helps with this. Alternatively, you can also easily wrap an arbitrary model to be scikit-learn compatible as demonstrated [here](https://github.com/cleanlab/cleanlab#use-cleanlab-with-any-model-for-most-ml-tasks)." 
] }, { From 36b46c195fa20635bd526582fd1ec9124df0a1c2 Mon Sep 17 00:00:00 2001 From: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> Date: Sun, 1 Jan 2023 14:24:53 -0800 Subject: [PATCH 077/258] mention applications beyond label error detection in readme (#580) --- README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 429c8f5393..80dc4a1f35 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ ![](https://raw.githubusercontent.com/cleanlab/assets/master/cleanlab/cleanlab_logo_open_source_transparent_optimized_size.png) -cleanlab automatically finds and fixes errors in any ML dataset. This data-centric AI package facilitates **machine learning with messy, real-world data** by providing **clean lab**els during training. +cleanlab automatically detects problems in a ML dataset. This data-centric AI package facilitates **machine learning with messy, real-world data** by providing **clean lab**els for robust training and flagging errors in your data. ```python @@ -23,6 +23,7 @@ cleanlab.dataset.health_summary(labels, confident_joint=cl.confident_joint) Get started with: [documentation](https://docs.cleanlab.ai/), [tutorials](https://docs.cleanlab.ai/stable/tutorials/image.html), [examples](https://github.com/cleanlab/examples), and [blogs](https://cleanlab.ai/blog/). - Learn to run cleanlab on your data in 5 minutes for classification with: [image](https://docs.cleanlab.ai/stable/tutorials/image.html), [text](https://docs.cleanlab.ai/stable/tutorials/text.html), [audio](https://docs.cleanlab.ai/stable/tutorials/audio.html), or [tabular](https://docs.cleanlab.ai/stable/tutorials/tabular.html) data. 
+- Use cleanlab to automatically: [find mislabeled data + train robust models](https://docs.cleanlab.ai/stable/tutorials/indepth_overview.html), [detect outliers](https://docs.cleanlab.ai/stable/tutorials/outliers.html), [estimate consensus + annotator-quality for multi-annotator datasets](https://docs.cleanlab.ai/stable/tutorials/multiannotator.html), [decide what data is best to (re)label next](https://github.com/cleanlab/examples/blob/master/active_learning_multiannotator/active_learning.ipynb). [![pypi](https://img.shields.io/pypi/v/cleanlab.svg)](https://pypi.org/pypi/cleanlab/) @@ -160,7 +161,8 @@ cleanlab is useful across a wide variety of Machine Learning tasks. Specific tas 2. [Multi-label classification](https://docs.cleanlab.ai/stable/tutorials/multilabel_classification.html) (e.g. image/document tagging) 3. [Token classification](https://docs.cleanlab.ai/stable/tutorials/token_classification.html) (e.g. entity recognition in text) 4. [Classification with data labeled by multiple annotators](https://docs.cleanlab.ai/stable/tutorials/multiannotator.html) -5. [Out of distribution detection](https://docs.cleanlab.ai/stable/tutorials/outliers.html) +5. [Active learning with multiple annotators](https://github.com/cleanlab/examples/blob/master/active_learning_multiannotator/active_learning.ipynb) (decide which data to label or re-label to improve model most) +6. [Out of distribution detection](https://docs.cleanlab.ai/stable/tutorials/outliers.html) For many other ML tasks, cleanlab can still help you improve your dataset if appropriately applied. 
From aa1542bd714cc10b45cf7a4e66e7cde9fb38db52 Mon Sep 17 00:00:00 2001 From: Sanjana Date: Mon, 2 Jan 2023 11:33:19 +0530 Subject: [PATCH 078/258] Drop python 3.6 support from dependencies in setup.py (#579) --- .gitignore | 3 ++- setup.py | 9 ++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.gitignore b/.gitignore index 16915dde28..82fd660bf0 100644 --- a/.gitignore +++ b/.gitignore @@ -119,5 +119,6 @@ venv.bak/ /docs/source/notebooks/spoken_digits /docs/source/notebooks/pretrained_models -# VS Code +# Editor files .vscode/ +.idea/ diff --git a/setup.py b/setup.py index 71b685da50..e4b03739c8 100644 --- a/setup.py +++ b/setup.py @@ -28,7 +28,6 @@ def run(self): # Get version number and store it in __version__ exec(open("cleanlab/version.py").read()) - setup( name="cleanlab", version=__version__, @@ -89,10 +88,10 @@ def run(self): # requirements files see: # https://packaging.python.org/en/latest/discussions/install-requires-vs-requirements/ install_requires=[ - "numpy>=1.11.3", - "scikit-learn>=0.18", + "numpy>=1.20.0", + "scikit-learn>=1.0", "tqdm>=4.53.0", - "pandas>=1.0.0", - "termcolor>=1.1.0", + "pandas>=1.1.5", + "termcolor>=2.0.0", ], ) From 1e078e0e02e2fd5060f266fa07995ae77cfafe14 Mon Sep 17 00:00:00 2001 From: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> Date: Tue, 3 Jan 2023 20:10:10 -0800 Subject: [PATCH 079/258] specify better default values --- cleanlab/regression/rank.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cleanlab/regression/rank.py b/cleanlab/regression/rank.py index d6eb2b468b..c9e9189032 100644 --- a/cleanlab/regression/rank.py +++ b/cleanlab/regression/rank.py @@ -135,7 +135,7 @@ def get_outre_score_for_each_label( labels: np.ndarray, predictions: np.ndarray, *, - residual_scale: float = 10, + residual_scale: float = 5, frac_neighbors: float = 0.5, neighbor_metric: str = "euclidean", ) -> np.ndarray: @@ -152,11 +152,11 @@ def get_outre_score_for_each_label( 
predictions: np.ndarray Predicted labels in the same format as expected by the :py:func:`get_label_quality_scores ` function. - residual_scale: float, default = 10 - Manipulates scale of the distribution of residual. + residual_scale: float, default = 5 + Multiplicative factor to adjust scale (standard deviation) of the residuals relative to the labels. - frac_neighbors: float, default = 0.1 - Fraction of examples that should be considered as n_neighbors to NearestNeighbors. + frac_neighbors: float, default = 0.5 + Fraction of examples in dataset that should be considered as `n_neighbors` in the ``NearestNeighbors`` object used internally to assess outliers. neighbor_metric: str, default = "euclidean" The parameter is passed to sklearn NearestNeighbors. # TODO add reference to sklearn.NearestNeighbor? From 0d9fee88a9a8cd786f25bdb7441110e0af1b970e Mon Sep 17 00:00:00 2001 From: "Curtis G. Northcutt" Date: Wed, 4 Jan 2023 17:31:16 -0800 Subject: [PATCH 080/258] add maximum line length (#583) and clarify where black settings are found in DEVELOPMENT guide Co-authored-by: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> --- DEVELOPMENT.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index 80063408ba..19a1d739f0 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -100,12 +100,14 @@ examples/run_all_notebooks.sh ## How to style new code contributions -cleanlab follows the [Black](https://black.readthedocs.io/) code style. This is +cleanlab follows the [Black](https://black.readthedocs.io/) code style (see [pyproject.toml](pyproject.toml)). This is enforced by CI, so please format your code by invoking `black` before submitting a pull request. Generally aim to follow the [PEP-8 coding style](https://peps.python.org/pep-0008/). Please do not use wildcard `import *` in any files, instead you should always import the specific functions that you need from a module. 
+All cleanlab code should have a maximum line length of 100 characters. + ### Pre-commit hook This repo uses the [pre-commit framework](https://pre-commit.com/) to easily From 044c5aa2e4a6e878a96b3e4e821af9e608e22388 Mon Sep 17 00:00:00 2001 From: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> Date: Thu, 5 Jan 2023 12:50:36 -0800 Subject: [PATCH 081/258] ignore flake8 flagging unused submodule imports --- setup.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.cfg b/setup.cfg index 04a05501c6..b436333936 100644 --- a/setup.cfg +++ b/setup.cfg @@ -5,3 +5,4 @@ per-file-ignores = cleanlab/__init__.py: F401 cleanlab/token_classification/__init__.py: F401 cleanlab/benchmarking/__init__.py: F401 + cleanlab/regression/__init__.py: F401 From 7caed937842cedba91e148a4974cc6cd74276396 Mon Sep 17 00:00:00 2001 From: Ulyana Date: Thu, 5 Jan 2023 21:42:34 -0800 Subject: [PATCH 082/258] Update github actions (#589) update: github actions to v3, codecov to v3, setup-python to v4, setup node v2 to v3, setup cache v2 to v3 --- .github/workflows/ci.yml | 2 +- .github/workflows/gh-pages.yaml | 6 +++--- .github/workflows/links.yml | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0896e2a082..ec5e8bb98a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -37,7 +37,7 @@ jobs: pip install tensorflow-cpu - name: Test with coverage run: pytest --verbose --cov=cleanlab/ --cov-config .coveragerc --cov-report=xml - - uses: codecov/codecov-action@v2 + - uses: codecov/codecov-action@v3 typecheck: name: Type check runs-on: ubuntu-latest diff --git a/.github/workflows/gh-pages.yaml b/.github/workflows/gh-pages.yaml index a1f2f84fa7..ae0ce12e53 100644 --- a/.github/workflows/gh-pages.yaml +++ b/.github/workflows/gh-pages.yaml @@ -28,12 +28,12 @@ jobs: sudo tar xzvf pandoc-2.19.2-linux-amd64.tar.gz --strip-components 1 -C /usr/local - name: Setup Python - uses: 
actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: "3.10" - name: Setup Node - uses: actions/setup-node@v2 + uses: actions/setup-node@v3 with: node-version: "16" @@ -45,7 +45,7 @@ jobs: run: echo "::set-output name=dir::$(pip cache dir)" - name: Cache dependencies - uses: actions/cache@v2 + uses: actions/cache@v3 with: path: ${{ steps.pip-cache.outputs.dir }} key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }} diff --git a/.github/workflows/links.yml b/.github/workflows/links.yml index fba7abc18c..469100c88e 100644 --- a/.github/workflows/links.yml +++ b/.github/workflows/links.yml @@ -12,7 +12,7 @@ jobs: - run: >- sudo apt-get install -y pandoc - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - run: | find . -name '*.html' -delete - run: | From 248bb91378773aabb5bf59c26a3c375a8bc85fac Mon Sep 17 00:00:00 2001 From: Hui Wen <45724323+huiwengoh@users.noreply.github.com> Date: Sat, 7 Jan 2023 00:53:02 +0800 Subject: [PATCH 083/258] Revamp text tutorial to use cleanlab Keras wrapper (#584) Co-authored-by: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> --- cleanlab/experimental/keras.py | 39 ++++- docs/source/tutorials/text.ipynb | 263 +++++++++++++------------------ tests/test_frameworks.py | 8 + 3 files changed, 150 insertions(+), 160 deletions(-) diff --git a/cleanlab/experimental/keras.py b/cleanlab/experimental/keras.py index d4d3e8cf03..96f2caf545 100644 --- a/cleanlab/experimental/keras.py +++ b/cleanlab/experimental/keras.py @@ -37,7 +37,9 @@ import tensorflow as tf import numpy as np +import pandas as pd from typing import Callable, Optional +from cleanlab.internal.validation import assert_valid_inputs class KerasWrapperModel: @@ -87,11 +89,26 @@ def get_params(self, deep=True): } def fit(self, X, y=None, **kwargs): - """Note that `X` dataset object must already contain the labels as is required for standard Keras fit. 
- You can optionally provide the labels again here as argument `y` to be compatible with sklearn, but they are ignored. + """Trains a Keras classifier. + + Parameters + ---------- + X : tf.Dataset or np.array or pd.DataFrame + If `X` is a tensorflow dataset object, it must already contain the labels as is required for standard Keras fit. + + y : np.array or pd.DataFrame, default = None + If `X` is a tensorflow dataset object, you can optionally provide the labels again here as argument `y` to be compatible with sklearn, + but they are ignored. + If `X` is a numpy array or pandas dataframe, the labels have to be passed in using this argument. """ + self.net = self.model(**self.model_kwargs) self.net.compile(**self.compile_kwargs) + + if isinstance(X, (np.ndarray, pd.DataFrame)): + assert_valid_inputs(X, y) + kwargs["y"] = y + self.net.fit(X, **kwargs) def predict_proba(self, X, *, apply_softmax=True, **kwargs): @@ -151,11 +168,25 @@ def get_params(self, deep=True): } def fit(self, X, y=None, **kwargs): - """Note that `X` dataset object must already contain the labels as is required for standard Keras fit. - You can optionally provide the labels again here as argument `y` to be compatible with sklearn, but they are ignored. + """Trains a Sequential Keras classifier. + + Parameters + ---------- + X : tf.Dataset or np.array or pd.DataFrame + If `X` is a tensorflow dataset object, it must already contain the labels as is required for standard Keras fit. + + y : np.array or pd.DataFrame, default = None + If `X` is a tensorflow dataset object, you can optionally provide the labels again here as argument `y` to be compatible with sklearn, + but they are ignored. + If `X` is a numpy array or pandas dataframe, the labels have to be passed in using this argument. 
""" self.net = tf.keras.models.Sequential(self.layers, self.name) self.net.compile(**self.compile_kwargs) + + if isinstance(X, (np.ndarray, pd.DataFrame)): + assert_valid_inputs(X, y) + kwargs["y"] = y + self.net.fit(X, **kwargs) def predict_proba(self, X, *, apply_softmax=True, **kwargs): diff --git a/docs/source/tutorials/text.ipynb b/docs/source/tutorials/text.ipynb index 3d39c1a936..8ec3ee27be 100644 --- a/docs/source/tutorials/text.ipynb +++ b/docs/source/tutorials/text.ipynb @@ -11,17 +11,15 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "In this 5-minute quickstart tutorial, we use cleanlab to find potential label errors in a text classification dataset of [IMDb movie reviews](https://ai.stanford.edu/~amaas/data/sentiment/). This dataset contains 50,000 text reviews, each labeled with a binary sentiment polarity label indicating whether the review is positive (1) or negative (0). cleanlab will shortlist _hundreds_ of examples that confuse our ML model the most; many of which are potential label errors, edge cases, or otherwise ambiguous examples.\n", + "In this 5-minute quickstart tutorial, we use cleanlab to find potential label errors in a text classification dataset of [IMDB movie reviews](https://ai.stanford.edu/~amaas/data/sentiment/). This dataset contains 50,000 text reviews, each labeled with a binary sentiment polarity label indicating whether the review is positive (1) or negative (0). cleanlab will shortlist _hundreds_ of examples that confuse our ML model the most; many of which are potential label errors, edge cases, or otherwise ambiguous examples.\n", "\n", "**Overview of what we'll do in this tutorial:**\n", "\n", - "- Build a simple TensorFlow & Keras neural net and wrap it with [SciKeras](https://www.adriangb.com/scikeras/) to make it scikit-learn compatible.\n", + "- Build a simple TensorFlow & Keras neural network and wrap it with cleanlab's `KerasWrapperSequential`. 
This wrapper class makes *any* Keras/Tensorflow model compatible with scikit-learn (and some advanced cleanlab functionality like `CleanLearning` is easier to run with scikit-learn-compatible models).\n", "\n", - "- Use this classifier to compute out-of-sample predicted probabilities, `pred_probs`, via cross validation.\n", + "- Use `CleanLearning` to automatically compute out-of-sample predicted probabilities and identify potential label errors with the `find_label_issues` method.\n", "\n", - "- Identify potential label errors in the data with cleanlab's `find_label_issues` method.\n", - "\n", - "- Train a more robust version of the same neural net via cleanlab's `CleanLearning` wrapper.\n" + "- Train a more robust version of the same neural network after dropping the identified label errors using `CleanLearning`." ] }, { @@ -32,7 +30,9 @@ "Quickstart\n", "
\n", " \n", - "Already have an sklearn compatible `model`, text `data` and given `labels`? Run the code below to train your `model` and get label issues.\n", + "Already have an sklearn compatible `model`, `data` and given `labels`? Run the code below to train your `model` and get label issues using `CleanLearning`. \n", + " \n", + "You can subsequently use the same `CleanLearning` object to train a more robust model (only trained on the clean data) by calling the `.fit()` method and passing in the `label_issues` found earlier.\n", "\n", "\n", "
\n", @@ -42,10 +42,11 @@ "from cleanlab.classification import CleanLearning\n", "\n", "cl = CleanLearning(model)\n", - "_ = cl.fit(train_data, labels)\n", - "label_issues = cl.get_label_issues()\n", - "preds = cl.predict(test_data) # predictions from a version of your model \n", - " # trained on auto-cleaned data\n", + "label_issues = cl.find_label_issues(train_data, labels) # identify mislabeled examples \n", + " \n", + "cl.fit(train_data, labels, label_issues=label_issues)\n", + "preds = cl.predict(test_data) # predictions from a version of your model \n", + " # trained on auto-cleaned data\n", "\n", "\n", "```\n", @@ -88,7 +89,7 @@ "You can use `pip` to install all packages required for this tutorial as follows:\n", "\n", "```ipython3\n", - "!pip install sklearn tensorflow tensorflow-datasets scikeras\n", + "!pip install sklearn tensorflow tensorflow-datasets\n", "!pip install cleanlab\n", "# Make sure to install the version corresponding to this tutorial\n", "# E.g. if viewing master branch documentation:\n", @@ -106,9 +107,9 @@ "source": [ "# Package installation (hidden on docs.cleanlab.ai).\n", "# If running on Colab, may want to use GPU (select: Runtime > Change runtime type > Hardware accelerator > GPU)\n", - "# Package versions we used: tensorflow==2.9.1 scikeras==0.9.0 scikit-learn==1.1.3 tensorflow_datasets==4.5.2\n", + "# Package versions we used: tensorflow==2.9.1 scikit-learn==1.2.0 tensorflow_datasets==4.5.2\n", "\n", - "dependencies = [\"cleanlab\", \"sklearn\", \"tensorflow\", \"tensorflow_datasets\", \"scikeras\"]\n", + "dependencies = [\"cleanlab\", \"sklearn\", \"tensorflow\", \"tensorflow_datasets\"]\n", "\n", "# Supress outputs that may appear if tensorflow happens to be improperly installed: \n", "import os \n", @@ -148,7 +149,9 @@ "import tensorflow as tf \n", "from tensorflow.keras import layers \n", "import tensorflow_datasets as tfds \n", - "from scikeras.wrappers import KerasClassifier \n", + "\n", + "from cleanlab.classification import 
CleanLearning\n", + "from cleanlab.experimental.keras import KerasWrapperSequential\n", "\n", "SEED = 123456 # for reproducibility " ] @@ -194,11 +197,11 @@ "outputs": [], "source": [ "%%capture\n", + "raw_train_ds = tfds.load(name=\"imdb_reviews\", split=\"train\", batch_size=-1, as_supervised=True)\n", + "raw_test_ds = tfds.load(name=\"imdb_reviews\", split=\"test\", batch_size=-1, as_supervised=True)\n", "\n", - "raw_full_ds = tfds.load(\n", - " name=\"imdb_reviews\", split=(\"train+test\"), batch_size=-1, as_supervised=True\n", - ")\n", - "raw_full_texts, full_labels = tfds.as_numpy(raw_full_ds)" + "raw_train_texts, train_labels = tfds.as_numpy(raw_train_ds)\n", + "raw_test_texts, test_labels = tfds.as_numpy(raw_test_ds)" ] }, { @@ -207,15 +210,15 @@ "metadata": {}, "outputs": [], "source": [ - "num_classes = len(set(full_labels))\n", - "print(f\"Classes: {set(full_labels)}\")" + "num_classes = len(set(train_labels))\n", + "print(f\"Classes: {set(train_labels)}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Let's print the first example." + "Let's print the first example in the train set." ] }, { @@ -225,18 +228,18 @@ "outputs": [], "source": [ "i = 0\n", - "print(f\"Example Label: {full_labels[i]}\")\n", - "print(f\"Example Text: {raw_full_texts[i]}\")" + "print(f\"Example Label: {train_labels[i]}\")\n", + "print(f\"Example Text: {raw_train_texts[i]}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "The data are stored as two numpy arrays:\n", + "The data is stored as two numpy arrays for each the train and test set:\n", "\n", - "1. `raw_full_texts` for the movie reviews in text format,\n", - "2. `full_labels` for the labels.\n" + "1. `raw_train_texts` and `raw_test_texts` for the movie reviews in text format,\n", + "2. 
`train_labels` and `test_labels` for the labels.\n" ] }, { @@ -248,8 +251,8 @@ "\n", "You can easily replace the above with your own text dataset, and continue with the rest of the tutorial.\n", "\n", - "Your classes (and entries of `full_labels`) should be represented as integer indices 0, 1, ..., num_classes - 1.\n", - "For example, if your dataset has 7 examples from 3 classes, `full_labels` might be: `np.array([2,0,0,1,2,0,1])`\n", + "Your classes (and entries of `train_labels` / `test_labels`) should be represented as integer indices 0, 1, ..., num_classes - 1.\n", + "For example, if your dataset has 7 examples from 3 classes, `train_labels` might be: `np.array([2,0,0,1,2,0,1])`\n", "\n", "
\n" ] @@ -305,7 +308,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Adapting `vectorize_layer` to the text data creates a mapping of each token (i.e. word) to an integer index. Subsequently, we can vectorize our text data by using this mapping. Finally, we'll also convert our text data into a numpy array as required by cleanlab.\n" + "Adapting `vectorize_layer` to the text data creates a mapping of each token (i.e. word) to an integer index. Note that we only adapt the vectorization on the train set, as it is standard ML practice. \n", + "\n", + "Subsequently, we can vectorize our text data in the train and test sets by using this mapping. " ] }, { @@ -314,25 +319,27 @@ "metadata": {}, "outputs": [], "source": [ - "%%capture\n", + "vectorize_layer.reset_state()\n", + "vectorize_layer.adapt(raw_train_texts)\n", "\n", - "vectorize_layer.adapt(raw_full_texts)\n", - "full_texts = vectorize_layer(raw_full_texts)\n", - "full_texts = full_texts.numpy()" + "train_texts = vectorize_layer(raw_train_texts).numpy()\n", + "test_texts = vectorize_layer(raw_test_texts).numpy()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## 3. Define a classification model and compute out-of-sample predicted probabilities\n" + "## 3. Define a classification model and use cleanlab to find potential label errors\n", + "\n", + "" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Here, we build a simple neural network for classification with TensorFlow and Keras.\n" + "Here, we build a simple neural network for classification with TensorFlow and Keras. We will also wrap it with cleanlab's `KerasWrapperSequential` to make it compatible with sklearn (and hence`CleanLearning`). Note: you can wrap *any* existing Keras model this way, by just replacing `keras.Sequential` with `KerasWrapperSequential` in your code. 
\n" ] }, { @@ -341,9 +348,11 @@ "metadata": {}, "outputs": [], "source": [ - "def get_net():\n", - " net = tf.keras.Sequential(\n", - " [\n", + "def get_nn_model():\n", + " # simply replace `keras.Sequential(` with cleanlab's class in this line to make any keras model sklearn-compatible \n", + " # the rest of your existing keras code does not need to change at all \n", + " model = KerasWrapperSequential( \n", + " [ \n", " tf.keras.Input(shape=(None,), dtype=\"int64\"),\n", " layers.Embedding(max_features + 1, 16),\n", " layers.Dropout(0.2),\n", @@ -351,22 +360,24 @@ " layers.Dropout(0.2),\n", " layers.Dense(num_classes),\n", " layers.Softmax()\n", - " ]\n", - " ) # outputs probability that text belongs to class 1\n", - "\n", - " net.compile(\n", - " optimizer=\"adam\",\n", - " loss=tf.keras.losses.SparseCategoricalCrossentropy(),\n", - " metrics=tf.keras.metrics.CategoricalAccuracy(),\n", + " ], # outputs probability that text belongs to class 1\n", + " compile_kwargs= {\n", + " \"optimizer\":\"adam\",\n", + " \"loss\":tf.keras.losses.SparseCategoricalCrossentropy(),\n", + " \"metrics\":tf.keras.metrics.CategoricalAccuracy(),\n", + " },\n", " )\n", - " return net" + " \n", + " return model" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "As some of cleanlab's feature requires scikit-learn compatibility, we will need to adapt the above TensorFlow & Keras neural net accordingly. [SciKeras](https://www.adriangb.com/scikeras/stable/) is a convenient package that makes this really easy.\n" + "We can define the `CleanLearning` object with the neural network model and use `find_label_issues` to identify potential label errors.\n", + "\n", + "`CleanLearning` provides a wrapper class that can easily be applied to any scikit-learn compatible model, which can be used to find potential label issues or train a more robust model if the original data contains noisy labels." 
] }, { @@ -375,39 +386,38 @@ "metadata": {}, "outputs": [], "source": [ - "model = KerasClassifier(get_net(), epochs=10)" + "cv_n_folds = 3 # for efficiency; values like 5 or 10 will generally work better\n", + "num_epochs = 15 " ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - "To identify label issues, cleanlab requires a probabilistic prediction from your model for every datapoint that should be considered. However these predictions will be _overfit_ (and thus unreliable) for datapoints the model was previously trained on. cleanlab is intended to only be used with **out-of-sample** predicted probabilities, i.e. on datapoints held-out from the model during the training.\n", - "\n", - "K-fold cross-validation is a straightforward way to produce out-of-sample predicted probabilities for every datapoint in the dataset, by training K copies of our model on different data subsets and using each copy to predict on the subset of data it did not see during training. We can obtain cross-validated out-of-sample predicted probabilities from any classifier via a scikit-learn simple wrapper:\n" + "model = get_nn_model()\n", + "cl = CleanLearning(model, cv_n_folds=cv_n_folds)" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ - "num_crossval_folds = 3 # for efficiency; values like 5 or 10 will generally work better\n", - "pred_probs = cross_val_predict(\n", - " model,\n", - " full_texts,\n", - " full_labels,\n", - " cv=num_crossval_folds,\n", - " method=\"predict_proba\",\n", - ")" + "label_issues = cl.find_label_issues(X=train_texts, labels=train_labels, clf_kwargs={\"epochs\": num_epochs})" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "An additional benefit of cross-validation is that it facilitates more reliable evaluation of our model than a single training/validation split." 
+ "The `find_label_issues` method above will perform cross validation to compute out-of-sample predicted probabilities for each example, which are used to identify label issues.\n", + "\n", + "This method returns a dataframe containing a label quality score for each example. These numeric scores lie between 0 and 1, where lower scores indicate examples more likely to be mislabeled. The dataframe also contains a boolean column specifying whether or not each example is identified to have a label issue (indicating it is likely mislabeled)." ] }, { @@ -416,22 +426,24 @@ "metadata": {}, "outputs": [], "source": [ - "loss = log_loss(full_labels, pred_probs) # score to evaluate probabilistic predictions, lower values are better\n", - "print(f\"Cross-validated estimate of log loss: {loss:.3f}\")" + "label_issues.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## 4. Use cleanlab to find potential label errors\n" + "We can get the subset of examples flagged with label issues, and also sort by label quality score to find the indices of the 10 most likely mislabeled examples in our dataset." ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - "Based on the given labels and out-of-sample predicted probabilities, cleanlab can quickly help us identify label issues in our dataset. For a dataset with N examples from K classes, the labels should be a 1D array of length N and predicted probabilities should be a 2D (N x K) array.
Here we request that the indices of the identified label issues should be sorted by cleanlab's self-confidence score, which measures the quality of each given label via the probability assigned it in our model's prediction.\n" + "identified_issues = label_issues[label_issues[\"is_label_issue\"] == True]\n", + "lowest_quality_labels = label_issues[\"label_quality\"].argsort()[:10].to_numpy()" ] }, { @@ -440,10 +452,9 @@ "metadata": {}, "outputs": [], "source": [ - "from cleanlab.filter import find_label_issues\n", - "\n", - "ranked_label_issues = find_label_issues(\n", - " labels=full_labels, pred_probs=pred_probs, return_indices_ranked_by=\"self_confidence\"\n", + "print(\n", + " f\"cleanlab found {len(identified_issues)} potential label errors in the dataset.\\n\"\n", + " f\"Here are indices of the top 10 most likely errors: \\n {lowest_quality_labels}\"\n", ")" ] }, @@ -454,18 +465,6 @@ "Let's review some of the most likely label errors:\n" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(\n", - " f\"cleanlab found {len(ranked_label_issues)} potential label errors.\\n\"\n", - " f\"Here are indices of the top 10 most likely errors: \\n {ranked_label_issues[:10]}\"\n", - ")" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -481,7 +480,7 @@ "source": [ "def print_as_df(index):\n", " return pd.DataFrame(\n", - " {\"texts\": raw_full_texts[index], \"labels\": full_labels[index]},\n", + " {\"texts\": raw_train_texts[index], \"labels\": train_labels[index]},\n", " [index]\n", " )" ] @@ -514,7 +513,7 @@ "metadata": {}, "outputs": [], "source": [ - "print_as_df(44582)" + "print_as_df(22294)" ] }, { @@ -541,7 +540,7 @@ "metadata": {}, "outputs": [], "source": [ - "print_as_df(10404)" + "print_as_df(5204)" ] }, { @@ -570,7 +569,7 @@ "metadata": {}, "outputs": [], "source": [ - "print_as_df(30151)" + "print_as_df(15079)" ] }, { @@ -584,58 +583,33 @@ "cell_type": "markdown", "metadata": {}, "source": [ 
- "## 5. Train a more robust model from noisy labels\n" + "## 4. Train a more robust model from noisy labels\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Fixing the label issues manually may be time-consuming, but at least cleanlab can filter these noisy examples and train a model on the remaining clean data for you automatically.\n", - "To demonstrate this, we first reload the dataset, this time with separate train and test splits.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "raw_train_ds = tfds.load(name=\"imdb_reviews\", split=\"train\", batch_size=-1, as_supervised=True)\n", - "raw_test_ds = tfds.load(name=\"imdb_reviews\", split=\"test\", batch_size=-1, as_supervised=True)\n", - "\n", - "raw_train_texts, train_labels = tfds.as_numpy(raw_train_ds)\n", - "raw_test_texts, test_labels = tfds.as_numpy(raw_test_ds)" + "Fixing the label issues manually may be time-consuming, but cleanlab can filter these noisy examples and train a model on the remaining clean data for you automatically.\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "We featurize the raw text using the same `vectorize_layer` as before, but first, reset its state and adapt it only on the train set (as is proper ML practice). 
We finally convert the vectorized text data in the train/test sets into numpy arrays.\n" + "To establish a baseline, let's first train and evaluate our original neural network model.\n" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ - "vectorize_layer.reset_state()\n", - "vectorize_layer.adapt(raw_train_texts)\n", - "\n", - "train_texts = vectorize_layer(raw_train_texts)\n", - "test_texts = vectorize_layer(raw_test_texts)\n", - "\n", - "train_texts = train_texts.numpy()\n", - "test_texts = test_texts.numpy()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's now train and evaluate our original neural network model.\n" + "baseline_model = get_nn_model() # note we first re-instantiate the model\n", + "baseline_model.fit(X=train_texts, y=train_labels, epochs=num_epochs)" ] }, { @@ -644,10 +618,7 @@ "metadata": {}, "outputs": [], "source": [ - "model = KerasClassifier(get_net(), epochs=10)\n", - "model.fit(train_texts, train_labels)\n", - "\n", - "preds = model.predict(test_texts)\n", + "preds = baseline_model.predict(test_texts)\n", "acc_og = accuracy_score(test_labels, preds)\n", "print(f\"\\n Test accuracy of original neural net: {acc_og}\")" ] @@ -656,42 +627,22 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "cleanlab provides a wrapper class that can easily be applied to any scikit-learn compatible model. 
Once wrapped, the resulting model can still be used in the exact same manner, but it will now train more robustly if the data have noisy labels.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from cleanlab.classification import CleanLearning\n", + "Now that we have a baseline, let's check if using `CleanLearning` improves our test accuracy.\n", "\n", - "model = KerasClassifier(get_net(), epochs=10) # Note we first re-instantiate the model\n", - "cl = CleanLearning(clf=model, seed=SEED) # cl has same methods/attributes as model" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "When we train the cleanlab-wrapped model, the following operations take place: The original model is trained in a cross-validated fashion to produce out-of-sample predicted probabilities. Then, these predicted probabilities are used to identify label issues, which are then removed from the dataset. Finally, the original model is trained once more on the remaining clean subset of the data.\n" + "`CleanLearning` provides a wrapper that can be applied to any scikit-learn compatible model. The resulting model object can be used in the same manner, but it will now train more robustly if the data has noisy labels.\n", + "\n", + "We can use the same `CleanLearning` object defined above, and pass the label issues we already computed into `.fit()` via the `label_issues` argument. This accelerates things; if we did not provide the label issues, then they would be recomputed via cross-validation. After that `CleanLearning` simply deletes the examples with label issues and retrains your model on the remaining data." 
] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ - "_ = cl.fit(train_texts, train_labels)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can get predictions from the resulting cleanlab model and evaluate them, just like we did for our original neural network.\n" + "cl.fit(X=train_texts, labels=train_labels, label_issues=cl.get_label_issues(), clf_kwargs={\"epochs\": num_epochs})" ] }, { @@ -722,8 +673,8 @@ "source": [ "# Note: This cell is only for docs.cleanlab.ai, if running on local Jupyter or Colab, please ignore it.\n", "\n", - "highlighted_indices = [44582, 10404, 30151] # check these examples were found in find_label_issues\n", - "if not all(x in ranked_label_issues for x in highlighted_indices):\n", + "highlighted_indices = [5204, 22294, 15079] # check these examples were found in find_label_issues\n", + "if not all(x in identified_issues.index for x in highlighted_indices):\n", " raise Exception(\"Some highlighted examples are missing from ranked_label_issues.\")\n", "\n", "# Also check that cleanlab has improved prediction accuracy\n", @@ -753,7 +704,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.13" + "version": "3.10.8" } }, "nbformat": 4, diff --git a/tests/test_frameworks.py b/tests/test_frameworks.py index 9d9cdadbdb..09604cc88f 100644 --- a/tests/test_frameworks.py +++ b/tests/test_frameworks.py @@ -151,6 +151,10 @@ def test_tensorflow_sequential(batch_size, shuffle_config, data=DATA, hidden_uni assert issue_indices == data["error_indices"] assert err < 1e-3 + # Test wrapper works with numpy array + cl = CleanLearning(model) + cl.fit(data["X"], data["y"]) + @pytest.mark.skipif("not python_version_ok()", reason="need at least python 3.7") @pytest.mark.parametrize("batch_size,shuffle_config", [(1, 0), (32, 0), (32, 1), (32, 2)]) @@ -197,6 +201,10 @@ def make_model(num_features, 
num_classes): assert len(set(issue_indices) & set(data["error_indices"])) != 0 assert err < 1e-3 + # Test wrapper works with numpy array + cl = CleanLearning(model) + cl.fit(data["X"], data["y"]) + @pytest.mark.skipif("not python_version_ok()", reason="need at least python 3.7") @pytest.mark.parametrize("batch_size", [1, 32]) From fdfb0290016ed390c553695a03f5e539a4c38ff6 Mon Sep 17 00:00:00 2001 From: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> Date: Fri, 6 Jan 2023 08:55:42 -0800 Subject: [PATCH 084/258] clarify thresholding in issues_from_scores (#582) --- cleanlab/token_classification/rank.py | 12 +++++++++--- docs/source/tutorials/token_classification.ipynb | 4 ++-- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/cleanlab/token_classification/rank.py b/cleanlab/token_classification/rank.py index bd7a3f59d0..69fdb13931 100644 --- a/cleanlab/token_classification/rank.py +++ b/cleanlab/token_classification/rank.py @@ -154,9 +154,15 @@ def issues_from_scores( Converts scores output by :py:func:`token_classification.rank.get_label_quality_scores ` to a list of issues of similar format as output by :py:func:`token_classification.filter.find_label_issues `. - Only considers as issues those tokens with label quality score lower than `threshold`. - - Issues are sorted by label quality score, from most severe to least. + Issues are sorted by label quality score, from most to least severe. + + Only considers as issues those tokens with label quality score lower than `threshold`, + so this parameter determines the number of issues that are returned. + This method is intended for converting the most severely mislabeled examples to a format compatible with + ``summary`` methods like :py:func:`token_classification.summary.display_issues `. 
+ This method does not estimate the number of label errors since the `threshold` is arbitrary, + for that instead use :py:func:`token_classification.filter.find_label_issues `, + which estimates the label errors via Confident Learning rather than score thresholding. Parameters ---------- diff --git a/docs/source/tutorials/token_classification.ipynb b/docs/source/tutorials/token_classification.ipynb index 9e261fd57e..938559540d 100644 --- a/docs/source/tutorials/token_classification.ipynb +++ b/docs/source/tutorials/token_classification.ipynb @@ -445,7 +445,7 @@ "id": "a35ef843", "metadata": {}, "source": [ - "### Find issue sentences with particular word \n", + "### Find sentences containing a particular mislabeled word \n", "\n", "You can also only focus on the subset of potentially problematic sentences where a particular token may have been mislabeled." ] @@ -470,7 +470,7 @@ "source": [ "### Sentence label quality score \n", "\n", - "For best reviewing label issues in a token classification dataset, you want to look at sentences one at a time. Here sentences more likely to contain a label error should be ranked earlier. Cleanlab can provide an overall label quality score for each sentence (ranging from 0 to 1) such that lower scores indicate sentences more likely to contain some mislabeled token. We can also obtain label quality scores for each individual token and decide which of these are label issues by thresholding them. This may be a superior approach if high precision (or high recall) is specifically preferred for your label error detection." + "For best reviewing label issues in a token classification dataset, you want to look at sentences one at a time. Here sentences more likely to contain a label error should be ranked earlier. Cleanlab can provide an overall label quality score for each sentence (ranging from 0 to 1) such that lower scores indicate sentences more likely to contain some mislabeled token. 
We can also obtain label quality scores for each individual token and manually decide which of these are label issues by thresholding them. For automatically estimating which tokens are mislabeled (and the number of label errors), you should use `find_label_issues()` instead. `get_label_quality_scores()` is useful if you only have time to review a few sentences and want to prioritize which, or if you're specifically aiming to detect label errors with high precision (or high recall) rather than overall estimation of the set of mislabeled tokens." ] }, { From d911acc7e89d47b283907d17ce2b3b274009725e Mon Sep 17 00:00:00 2001 From: Hui Wen <45724323+huiwengoh@users.noreply.github.com> Date: Sat, 7 Jan 2023 01:54:01 +0800 Subject: [PATCH 085/258] Remove temp scaling from single annotator case (#590) Co-authored-by: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> --- cleanlab/multiannotator.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/cleanlab/multiannotator.py b/cleanlab/multiannotator.py index 214cd8b247..7b57e2e3cd 100644 --- a/cleanlab/multiannotator.py +++ b/cleanlab/multiannotator.py @@ -585,11 +585,10 @@ def get_active_learning_scores( num_classes = get_num_classes(pred_probs=pred_probs) - optimal_temp = find_best_temp_scaler(labels_multiannotator, pred_probs) - pred_probs = temp_scale_pred_probs(pred_probs, optimal_temp) - # if all examples are only labeled by a single annotator if labels_multiannotator.apply(lambda s: len(s.dropna()) == 1, axis=1).all(): + optimal_temp = 1.0 # do not temp scale for single annotator case, temperature is defined here for later use + assert_valid_inputs_multiannotator( labels_multiannotator, pred_probs, allow_single_label=True ) @@ -605,6 +604,9 @@ def get_active_learning_scores( avg_annotator_weight = np.mean(annotator_weight) else: + optimal_temp = find_best_temp_scaler(labels_multiannotator, pred_probs) + pred_probs = temp_scale_pred_probs(pred_probs, optimal_temp) + multiannotator_info 
= get_label_quality_multiannotator( labels_multiannotator, pred_probs, From fe9efda26933f5650cc256d4e11b686d9b66e73a Mon Sep 17 00:00:00 2001 From: Hui Wen <45724323+huiwengoh@users.noreply.github.com> Date: Sat, 7 Jan 2023 06:22:16 +0800 Subject: [PATCH 086/258] update docs dependencies (#593) --- docs/requirements.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index 00db3f8b9e..1c944c18a7 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -15,8 +15,6 @@ matplotlib==3.5.1 skorch==0.11.0 tensorflow-datasets==4.5.2 tensorflow==2.9.1 -scikeras==0.9.0 -scikit-learn<1.2.0 speechbrain==0.5.12 tensorflow-io==0.26.0 huggingface_hub==0.7 From ebadffdebe4df80dc4c9a210c042c661f31bbc21 Mon Sep 17 00:00:00 2001 From: Ulyana Date: Fri, 6 Jan 2023 21:25:23 -0800 Subject: [PATCH 087/258] Use euclidean distance for identifying outliers for lower dimensional features (#581) Co-authored-by: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> --- cleanlab/outlier.py | 12 ++++++++++-- tests/test_outlier.py | 26 ++++++++++++++++++++++---- 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/cleanlab/outlier.py b/cleanlab/outlier.py index b2190ada65..ad07691596 100644 --- a/cleanlab/outlier.py +++ b/cleanlab/outlier.py @@ -55,7 +55,8 @@ class OutOfDistribution: You can also pass in a subclass of ``sklearn.neighbors.NearestNeighbors`` which allows you to use faster approximate neighbor libraries as long as you wrap them behind the same sklearn API. If you specify ``knn`` here, there is no need to later call ``fit()`` before calling ``score()``. 
- If ``knn = None``, then by default: ``knn = sklearn.neighbors.NearestNeighbors(n_neighbors=k, metric="cosine").fit(features)`` + If ``knn = None``, then by default: ``knn = sklearn.neighbors.NearestNeighbors(n_neighbors=k, metric=dist_metric).fit(features)`` + where ``dist_metric == "cosine"`` if ``dim(features) > 3`` or ``dist_metric == "euclidean"`` otherwise. See: https://scikit-learn.org/stable/modules/neighbors.html * k : int, default=None Optional number of neighbors to use when calculating outlier score (average distance to neighbors). @@ -411,8 +412,15 @@ def _get_ood_features_scores( raise ValueError( f"Number of nearest neighbors k={k} cannot exceed the number of examples N={len(features)} passed into the estimator (knn)." ) - knn = NearestNeighbors(n_neighbors=k, metric="cosine").fit(features) + + if features.shape[1] > 3: # use euclidean distance for lower dimensional spaces + metric = "cosine" + else: + metric = "euclidean" + + knn = NearestNeighbors(n_neighbors=k, metric=metric).fit(features) features = None # features should be None in knn.kneighbors(features) to avoid counting duplicate data points + elif k is None: k = knn.n_neighbors diff --git a/tests/test_outlier.py b/tests/test_outlier.py index 545b008de5..6a07f16fdf 100644 --- a/tests/test_outlier.py +++ b/tests/test_outlier.py @@ -270,6 +270,26 @@ def test_class_public_func(): OOD_ood_already_fit.params["confident_thresholds"] == confident_thresholds ).all() # Assert not overwritten + # Testing fit uses correct metrics given feature dimensionality + X_small = np.random.rand(20, 3) + OOD_euclidean = OutOfDistribution() + OOD_euclidean.fit(features=X_small) + assert OOD_euclidean.params["knn"].metric == "euclidean" + X_small_with_ood = np.vstack([X_small, [999999.0] * 3]) + euclidean_score = OOD_euclidean.score(features=X_small_with_ood) + assert (np.max(euclidean_score) <= 1) and (np.min(euclidean_score) >= 0) + assert np.argmin(euclidean_score) == (euclidean_score.shape[0] - 1) + + # Re-run 
tests with high dimensional dataset + X_large = np.hstack([np.zeros((200, 400)), np.random.rand(200, 1)]) + OOD_cosine = OutOfDistribution() + OOD_cosine.fit(features=X_large) + assert OOD_cosine.params["knn"].metric == "cosine" + X_large_with_ood = np.vstack([X_large, [999999.0] * 401]) + cosine_score = OOD_cosine.score(features=X_large_with_ood) + assert (np.max(cosine_score) <= 1) and (np.min(cosine_score) >= 0) + assert np.argmin(cosine_score) == (cosine_score.shape[0] - 1) + #### TESTING SCORE ood_score = OOD_ood.score(pred_probs=pred_probs) outlier_score = OOD_outlier.score(features=features) @@ -329,14 +349,12 @@ def test_get_ood_features_scores(): X_test_with_ood = np.vstack([X_test, X_ood]) # Fit nearest neighbors on X_train - knn = NearestNeighbors(n_neighbors=5).fit(X_train) - + knn = NearestNeighbors(n_neighbors=5, metric="euclidean").fit(X_train) # Get KNN distance as outlier score k = 5 knn_distance_to_score, _ = outlier._get_ood_features_scores( features=X_test_with_ood, knn=knn, k=k ) - # Checking that X_ood has the smallest outlier score among all the datapoints assert np.argmin(knn_distance_to_score) == (knn_distance_to_score.shape[0] - 1) @@ -376,7 +394,7 @@ def test_default_k_and_model_get_ood_features_scores(): instantiated_k = 10 # Create NN class object with small instantiated k and fit on data - knn = NearestNeighbors(n_neighbors=instantiated_k, metric="cosine").fit(X_with_ood) + knn = NearestNeighbors(n_neighbors=instantiated_k, metric="euclidean").fit(X_with_ood) avg_knn_distances_default_model, _ = outlier._get_ood_features_scores( features=X_with_ood, From 7b589f6d63b8548cef79ffccce35e5d1ea6ccb1c Mon Sep 17 00:00:00 2001 From: Aditya Thyagarajan Date: Sat, 7 Jan 2023 10:55:57 +0530 Subject: [PATCH 088/258] updating copyright year to include 2023 (#594) --- README.md | 2 +- cleanlab/benchmarking/noise_generation.py | 2 +- cleanlab/classification.py | 2 +- cleanlab/count.py | 2 +- cleanlab/dataset.py | 2 +- 
cleanlab/experimental/cifar_cnn.py | 2 +- cleanlab/experimental/coteaching.py | 2 +- cleanlab/experimental/fasttext.py | 2 +- cleanlab/experimental/keras.py | 10 +++++----- cleanlab/experimental/mnist_pytorch.py | 2 +- cleanlab/filter.py | 2 +- cleanlab/internal/label_quality_utils.py | 2 +- cleanlab/internal/latent_algebra.py | 8 ++++---- cleanlab/internal/multiannotator_utils.py | 2 +- cleanlab/internal/multilabel_scorer.py | 2 +- cleanlab/internal/multilabel_utils.py | 2 +- cleanlab/internal/token_classification_utils.py | 2 +- cleanlab/internal/util.py | 2 +- cleanlab/internal/validation.py | 2 +- cleanlab/multiannotator.py | 2 +- cleanlab/multilabel_classification.py | 2 +- cleanlab/outlier.py | 2 +- cleanlab/rank.py | 2 +- cleanlab/token_classification/filter.py | 2 +- cleanlab/token_classification/rank.py | 2 +- cleanlab/token_classification/summary.py | 2 +- cleanlab/version.py | 2 +- tests/test_classification.py | 2 +- tests/test_dataset.py | 2 +- tests/test_filter_count.py | 2 +- tests/test_frameworks.py | 4 ++-- tests/test_latent_algebra.py | 2 +- tests/test_multilabel_classification.py | 2 +- tests/test_outlier.py | 2 +- tests/test_rank.py | 2 +- 35 files changed, 43 insertions(+), 43 deletions(-) diff --git a/README.md b/README.md index 80dc4a1f35..ad8387034e 100644 --- a/README.md +++ b/README.md @@ -556,7 +556,7 @@ Join our [\#help Slack channel](https://cleanlab.ai/slack) and message one of ou ## License -Copyright (c) 2017-2022 Cleanlab Inc. +Copyright (c) 2017-2023 Cleanlab Inc. cleanlab is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 
diff --git a/cleanlab/benchmarking/noise_generation.py b/cleanlab/benchmarking/noise_generation.py index 47937c4453..4c71b57264 100644 --- a/cleanlab/benchmarking/noise_generation.py +++ b/cleanlab/benchmarking/noise_generation.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2022 Cleanlab Inc. +# Copyright (C) 2017-2023 Cleanlab Inc. # This file is part of cleanlab. # # cleanlab is free software: you can redistribute it and/or modify diff --git a/cleanlab/classification.py b/cleanlab/classification.py index 3ce2362bcd..407afef0f6 100644 --- a/cleanlab/classification.py +++ b/cleanlab/classification.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2022 Cleanlab Inc. +# Copyright (C) 2017-2023 Cleanlab Inc. # This file is part of cleanlab. # # cleanlab is free software: you can redistribute it and/or modify diff --git a/cleanlab/count.py b/cleanlab/count.py index 65db814bfb..ce24e559b4 100644 --- a/cleanlab/count.py +++ b/cleanlab/count.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2022 Cleanlab Inc. +# Copyright (C) 2017-2023 Cleanlab Inc. # This file is part of cleanlab. # # cleanlab is free software: you can redistribute it and/or modify diff --git a/cleanlab/dataset.py b/cleanlab/dataset.py index 40e2905a3a..9020b34539 100644 --- a/cleanlab/dataset.py +++ b/cleanlab/dataset.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2022 Cleanlab Inc. +# Copyright (C) 2017-2023 Cleanlab Inc. # This file is part of cleanlab. # # cleanlab is free software: you can redistribute it and/or modify diff --git a/cleanlab/experimental/cifar_cnn.py b/cleanlab/experimental/cifar_cnn.py index 15e82d1b8e..13c08d35d3 100644 --- a/cleanlab/experimental/cifar_cnn.py +++ b/cleanlab/experimental/cifar_cnn.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2022 Cleanlab Inc. +# Copyright (C) 2017-2023 Cleanlab Inc. # This file is part of cleanlab. 
# # cleanlab is free software: you can redistribute it and/or modify diff --git a/cleanlab/experimental/coteaching.py b/cleanlab/experimental/coteaching.py index b6247ee57d..8d46f523f5 100644 --- a/cleanlab/experimental/coteaching.py +++ b/cleanlab/experimental/coteaching.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2022 Cleanlab Inc. +# Copyright (C) 2017-2023 Cleanlab Inc. # This file is part of cleanlab. # # cleanlab is free software: you can redistribute it and/or modify diff --git a/cleanlab/experimental/fasttext.py b/cleanlab/experimental/fasttext.py index 6247e3cf97..ad1a7923a4 100644 --- a/cleanlab/experimental/fasttext.py +++ b/cleanlab/experimental/fasttext.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2022 Cleanlab Inc. +# Copyright (C) 2017-2023 Cleanlab Inc. # This file is part of cleanlab. # # cleanlab is free software: you can redistribute it and/or modify diff --git a/cleanlab/experimental/keras.py b/cleanlab/experimental/keras.py index 96f2caf545..4e34567b47 100644 --- a/cleanlab/experimental/keras.py +++ b/cleanlab/experimental/keras.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2022 Cleanlab Inc. +# Copyright (C) 2017-2023 Cleanlab Inc. # This file is part of cleanlab. # # cleanlab is free software: you can redistribute it and/or modify @@ -16,9 +16,9 @@ """ Wrapper class you can use to make any Keras model compatible with :py:class:`CleanLearning ` and sklearn. -Use :py:class:`KerasWrapperModel` to wrap existing functional API code for ``keras.Model`` objects, +Use :py:class:`KerasWrapperModel` to wrap existing functional API code for ``keras.Model`` objects, and :py:class:`KerasWrapperSequential` to wrap existing ``tf.keras.models.Sequential`` objects. -Most of the instance methods of this class work the same as the ones for the wrapped Keras model, +Most of the instance methods of this class work the same as the ones for the wrapped Keras model, see the `Keras documentation `_ for details. 
This is a good example of making any bespoke neural network compatible with cleanlab. @@ -30,8 +30,8 @@ * If this class lacks certain functionality, you can alternatively try `scikeras `_. * Unlike scikeras, our `KerasWrapper` classes can operate directly on ``tensorflow.data.Dataset`` objects (like regular Keras models). * To call ``fit()`` on a tensorflow ``Dataset`` object with a Keras model, the ``Dataset`` should already be batched. -* Check out our `example `_ using this class: `huggingface_keras_imdb `_ -* Our `unit tests `_ also provide basic usage examples. +* Check out our `example `_ using this class: `huggingface_keras_imdb `_ +* Our `unit tests `_ also provide basic usage examples. """ diff --git a/cleanlab/experimental/mnist_pytorch.py b/cleanlab/experimental/mnist_pytorch.py index bdc9b6ffba..3166b68f16 100644 --- a/cleanlab/experimental/mnist_pytorch.py +++ b/cleanlab/experimental/mnist_pytorch.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2022 Cleanlab Inc. +# Copyright (C) 2017-2023 Cleanlab Inc. # This file is part of cleanlab. # # cleanlab is free software: you can redistribute it and/or modify diff --git a/cleanlab/filter.py b/cleanlab/filter.py index 7a65da6622..e507011416 100644 --- a/cleanlab/filter.py +++ b/cleanlab/filter.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2022 Cleanlab Inc. +# Copyright (C) 2017-2023 Cleanlab Inc. # This file is part of cleanlab. # # cleanlab is free software: you can redistribute it and/or modify diff --git a/cleanlab/internal/label_quality_utils.py b/cleanlab/internal/label_quality_utils.py index 2fbead808d..7317d134c0 100644 --- a/cleanlab/internal/label_quality_utils.py +++ b/cleanlab/internal/label_quality_utils.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2022 Cleanlab Inc. +# Copyright (C) 2017-2023 Cleanlab Inc. # This file is part of cleanlab. 
# # cleanlab is free software: you can redistribute it and/or modify diff --git a/cleanlab/internal/latent_algebra.py b/cleanlab/internal/latent_algebra.py index 20e1c62de2..af3b0ec177 100644 --- a/cleanlab/internal/latent_algebra.py +++ b/cleanlab/internal/latent_algebra.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2022 Cleanlab Inc. +# Copyright (C) 2017-2023 Cleanlab Inc. # This file is part of cleanlab. # # cleanlab is free software: you can redistribute it and/or modify @@ -18,8 +18,8 @@ """ Contains mathematical functions relating the latent terms, ``P(given_label)``, ``P(given_label | true_label)``, ``P(true_label | given_label)``, ``P(true_label)``, etc. together. -For every function here, if the inputs are exact, the output is guaranteed to be exact. -Every function herein is the computational equivalent of a mathematical equation having a closed, exact form. +For every function here, if the inputs are exact, the output is guaranteed to be exact. +Every function herein is the computational equivalent of a mathematical equation having a closed, exact form. If the inputs are inexact, the error will of course propagate. Throughout `K` denotes the number of classes in the classification task. """ @@ -150,7 +150,7 @@ def compute_noise_matrix_from_inverse(ps, inverse_noise_matrix, *, py=None) -> n Returns ------- - noise_matrix : np.ndarray + noise_matrix : np.ndarray Array of shape ``(K, K)``, where `K` = number of classes, whose columns sum to 1. A conditional probability matrix of the form ``P(label=k_s|true_label=k_y)`` containing the fraction of examples in every class, labeled as every other class. diff --git a/cleanlab/internal/multiannotator_utils.py b/cleanlab/internal/multiannotator_utils.py index 42e34d7903..93cec654fc 100644 --- a/cleanlab/internal/multiannotator_utils.py +++ b/cleanlab/internal/multiannotator_utils.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2022 Cleanlab Inc. +# Copyright (C) 2017-2023 Cleanlab Inc. # This file is part of cleanlab. 
# # cleanlab is free software: you can redistribute it and/or modify diff --git a/cleanlab/internal/multilabel_scorer.py b/cleanlab/internal/multilabel_scorer.py index 287a7a7e34..dd58312a2f 100644 --- a/cleanlab/internal/multilabel_scorer.py +++ b/cleanlab/internal/multilabel_scorer.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2022 Cleanlab Inc. +# Copyright (C) 2017-2023 Cleanlab Inc. # This file is part of cleanlab. # # cleanlab is free software: you can redistribute it and/or modify diff --git a/cleanlab/internal/multilabel_utils.py b/cleanlab/internal/multilabel_utils.py index 22df37cf8f..57fb5da17b 100644 --- a/cleanlab/internal/multilabel_utils.py +++ b/cleanlab/internal/multilabel_utils.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2022 Cleanlab Inc. +# Copyright (C) 2017-2023 Cleanlab Inc. # This file is part of cleanlab. # # cleanlab is free software: you can redistribute it and/or modify diff --git a/cleanlab/internal/token_classification_utils.py b/cleanlab/internal/token_classification_utils.py index fdc012e3a6..ab384294b4 100644 --- a/cleanlab/internal/token_classification_utils.py +++ b/cleanlab/internal/token_classification_utils.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2022 Cleanlab Inc. +# Copyright (C) 2017-2023 Cleanlab Inc. # This file is part of cleanlab. # # cleanlab is free software: you can redistribute it and/or modify diff --git a/cleanlab/internal/util.py b/cleanlab/internal/util.py index ea7ef5f8c3..79ee8e2337 100644 --- a/cleanlab/internal/util.py +++ b/cleanlab/internal/util.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2022 Cleanlab Inc. +# Copyright (C) 2017-2023 Cleanlab Inc. # This file is part of cleanlab. # # cleanlab is free software: you can redistribute it and/or modify diff --git a/cleanlab/internal/validation.py b/cleanlab/internal/validation.py index 5ce017d1d7..dce6cc9a6c 100644 --- a/cleanlab/internal/validation.py +++ b/cleanlab/internal/validation.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2022 Cleanlab Inc. 
+# Copyright (C) 2017-2023 Cleanlab Inc. # This file is part of cleanlab. # # cleanlab is free software: you can redistribute it and/or modify diff --git a/cleanlab/multiannotator.py b/cleanlab/multiannotator.py index 7b57e2e3cd..34ff224255 100644 --- a/cleanlab/multiannotator.py +++ b/cleanlab/multiannotator.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2022 Cleanlab Inc. +# Copyright (C) 2017-2023 Cleanlab Inc. # This file is part of cleanlab. # # cleanlab is free software: you can redistribute it and/or modify diff --git a/cleanlab/multilabel_classification.py b/cleanlab/multilabel_classification.py index 45307adbfa..f2d3a3c135 100644 --- a/cleanlab/multilabel_classification.py +++ b/cleanlab/multilabel_classification.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2022 Cleanlab Inc. +# Copyright (C) 2017-2023 Cleanlab Inc. # This file is part of cleanlab. # # cleanlab is free software: you can redistribute it and/or modify diff --git a/cleanlab/outlier.py b/cleanlab/outlier.py index ad07691596..57011788f8 100644 --- a/cleanlab/outlier.py +++ b/cleanlab/outlier.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2022 Cleanlab Inc. +# Copyright (C) 2017-2023 Cleanlab Inc. # This file is part of cleanlab. # # cleanlab is free software: you can redistribute it and/or modify diff --git a/cleanlab/rank.py b/cleanlab/rank.py index ef2e03b5a3..ae0f455a86 100644 --- a/cleanlab/rank.py +++ b/cleanlab/rank.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2022 Cleanlab Inc. +# Copyright (C) 2017-2023 Cleanlab Inc. # This file is part of cleanlab. # # cleanlab is free software: you can redistribute it and/or modify diff --git a/cleanlab/token_classification/filter.py b/cleanlab/token_classification/filter.py index fcaf25b2fe..d10ee050e5 100644 --- a/cleanlab/token_classification/filter.py +++ b/cleanlab/token_classification/filter.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2022 Cleanlab Inc. +# Copyright (C) 2017-2023 Cleanlab Inc. # This file is part of cleanlab. 
# # cleanlab is free software: you can redistribute it and/or modify diff --git a/cleanlab/token_classification/rank.py b/cleanlab/token_classification/rank.py index 69fdb13931..40d379e0f9 100644 --- a/cleanlab/token_classification/rank.py +++ b/cleanlab/token_classification/rank.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2022 Cleanlab Inc. +# Copyright (C) 2017-2023 Cleanlab Inc. # This file is part of cleanlab. # # cleanlab is free software: you can redistribute it and/or modify diff --git a/cleanlab/token_classification/summary.py b/cleanlab/token_classification/summary.py index dc93de720b..bf1fe45580 100644 --- a/cleanlab/token_classification/summary.py +++ b/cleanlab/token_classification/summary.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2022 Cleanlab Inc. +# Copyright (C) 2017-2023 Cleanlab Inc. # This file is part of cleanlab. # # cleanlab is free software: you can redistribute it and/or modify diff --git a/cleanlab/version.py b/cleanlab/version.py index 7c36fadd30..14c5fc056f 100644 --- a/cleanlab/version.py +++ b/cleanlab/version.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2022 Cleanlab Inc. +# Copyright (C) 2017-2023 Cleanlab Inc. # This file is part of cleanlab. # # cleanlab is free software: you can redistribute it and/or modify diff --git a/tests/test_classification.py b/tests/test_classification.py index 0f0277f098..41c9a47b69 100644 --- a/tests/test_classification.py +++ b/tests/test_classification.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2022 Cleanlab Inc. +# Copyright (C) 2017-2023 Cleanlab Inc. # This file is part of cleanlab. # # cleanlab is free software: you can redistribute it and/or modify diff --git a/tests/test_dataset.py b/tests/test_dataset.py index 4532ed6534..a96d26967a 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2022 Cleanlab Inc. +# Copyright (C) 2017-2023 Cleanlab Inc. # This file is part of cleanlab. 
# # cleanlab is free software: you can redistribute it and/or modify diff --git a/tests/test_filter_count.py b/tests/test_filter_count.py index d7984534f7..c48d4f1995 100644 --- a/tests/test_filter_count.py +++ b/tests/test_filter_count.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2022 Cleanlab Inc. +# Copyright (C) 2017-2023 Cleanlab Inc. # This file is part of cleanlab. # # cleanlab is free software: you can redistribute it and/or modify diff --git a/tests/test_frameworks.py b/tests/test_frameworks.py index 09604cc88f..e4fccb7eae 100644 --- a/tests/test_frameworks.py +++ b/tests/test_frameworks.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2022 Cleanlab Inc. +# Copyright (C) 2017-2023 Cleanlab Inc. # This file is part of cleanlab. # # cleanlab is free software: you can redistribute it and/or modify @@ -16,7 +16,7 @@ """ Scripts to test cleanlab usage with deep learning frameworks: -pytorch, skorch, tensorflow, keras +pytorch, skorch, tensorflow, keras """ import pytest diff --git a/tests/test_latent_algebra.py b/tests/test_latent_algebra.py index bfa5e0f0ca..b55ea711e5 100644 --- a/tests/test_latent_algebra.py +++ b/tests/test_latent_algebra.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2022 Cleanlab Inc. +# Copyright (C) 2017-2023 Cleanlab Inc. # This file is part of cleanlab. # # cleanlab is free software: you can redistribute it and/or modify diff --git a/tests/test_multilabel_classification.py b/tests/test_multilabel_classification.py index 74ce2e361c..9454b884d3 100644 --- a/tests/test_multilabel_classification.py +++ b/tests/test_multilabel_classification.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2022 Cleanlab Inc. +# Copyright (C) 2017-2023 Cleanlab Inc. # This file is part of cleanlab. # # cleanlab is free software: you can redistribute it and/or modify diff --git a/tests/test_outlier.py b/tests/test_outlier.py index 6a07f16fdf..8d330a5892 100644 --- a/tests/test_outlier.py +++ b/tests/test_outlier.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2022 Cleanlab Inc. 
+# Copyright (C) 2017-2023 Cleanlab Inc. # This file is part of cleanlab. # # cleanlab is free software: you can redistribute it and/or modify diff --git a/tests/test_rank.py b/tests/test_rank.py index 53344217c9..3d3c2e1155 100644 --- a/tests/test_rank.py +++ b/tests/test_rank.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2022 Cleanlab Inc. +# Copyright (C) 2017-2023 Cleanlab Inc. # This file is part of cleanlab. # # cleanlab is free software: you can redistribute it and/or modify From 4a5d0653fea0c0342eacb05042f112e702ce9d5a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?El=C3=ADas=20Snorrason?= Date: Sat, 7 Jan 2023 07:45:50 -0700 Subject: [PATCH 089/258] Handle missing type parameters for generic type "ndarray" (#587) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * refactor(multi-label): 🏷️ parametrize `NDArray` A generic type variable, `T`, that represents the precision of a data type. While the data types for probabilities and scores are identical, it conceptually makes sense to define separate type aliases for each. `np` is only used in defining the floating data type, which is only used for type hints. * mention shape of array in the docstring description + other clarity improvements Co-authored-by: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> --- cleanlab/multilabel_classification.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/cleanlab/multilabel_classification.py b/cleanlab/multilabel_classification.py index f2d3a3c135..d436d72dc8 100644 --- a/cleanlab/multilabel_classification.py +++ b/cleanlab/multilabel_classification.py @@ -20,8 +20,9 @@ Unlike in standard multi-class classification, predicted class probabilities from model need not sum to 1 for each row in multi-label classification. 
""" -import numpy as np -from typing import List +import numpy as np # noqa: F401: Imported for type annotations +import numpy.typing as npt +from typing import List, TypeVar from cleanlab.internal.validation import assert_valid_inputs from cleanlab.internal.util import get_num_classes @@ -29,14 +30,17 @@ from cleanlab.internal.multilabel_utils import int2onehot +T = TypeVar("T", bound=npt.NBitBase) + + def get_label_quality_scores( labels: List, - pred_probs: np.ndarray, + pred_probs: npt.NDArray["np.floating[T]"], *, method: str = "self_confidence", adjust_pred_probs: bool = False, aggregator_kwargs: dict = {"method": "exponential_moving_average", "alpha": 0.8} -) -> np.ndarray: +) -> npt.NDArray["np.floating[T]"]: """Computes a label quality score each example in a multi-label classification dataset. Scores are between 0 and 1 with lower scores indicating examples whose label more likely contains an error. @@ -57,12 +61,11 @@ def get_label_quality_scores( *Format requirements*: For dataset with K classes, individual class labels must be integers in 0, 1, ..., K-1. pred_probs : np.ndarray - An array of shape ``(N, K)`` of model-predicted probabilities, - ``P(label=k|x)``. Each row of this matrix corresponds - to an example `x` and contains the model-predicted probabilities that - `x` belongs to each possible class, for each of the K classes. The - columns must be ordered such that these probabilities correspond to - class 0, 1, ..., K-1. In multi-label classification, the rows of `pred_probs` need not sum to 1. + A 2D array of shape ``(N, K)`` of model-predicted class probabilities ``P(label=k|x)``. + Each row of this matrix corresponds to an example `x` and contains the predicted probabilities + that `x` belongs to each possible class, for each of the K classes. + The columns of this array must be ordered such that these probabilities correspond to class 0, 1, ..., K-1. 
+ In multi-label classification (where classes are not mutually exclusive), the rows of `pred_probs` need not sum to 1. Note ---- From feb369692437834302d805c6a1d6dde89021ca66 Mon Sep 17 00:00:00 2001 From: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> Date: Mon, 9 Jan 2023 09:31:11 -0800 Subject: [PATCH 090/258] decide -> suggest --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index ad8387034e..52e303655f 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ cleanlab.dataset.health_summary(labels, confident_joint=cl.confident_joint) Get started with: [documentation](https://docs.cleanlab.ai/), [tutorials](https://docs.cleanlab.ai/stable/tutorials/image.html), [examples](https://github.com/cleanlab/examples), and [blogs](https://cleanlab.ai/blog/). - Learn to run cleanlab on your data in 5 minutes for classification with: [image](https://docs.cleanlab.ai/stable/tutorials/image.html), [text](https://docs.cleanlab.ai/stable/tutorials/text.html), [audio](https://docs.cleanlab.ai/stable/tutorials/audio.html), or [tabular](https://docs.cleanlab.ai/stable/tutorials/tabular.html) data. -- Use cleanlab to automatically: [find mislabeled data + train robust models](https://docs.cleanlab.ai/stable/tutorials/indepth_overview.html), [detect outliers](https://docs.cleanlab.ai/stable/tutorials/outliers.html), [estimate consensus + annotator-quality for multi-annotator datasets](https://docs.cleanlab.ai/stable/tutorials/multiannotator.html), [decide what data is best to (re)label next](https://github.com/cleanlab/examples/blob/master/active_learning_multiannotator/active_learning.ipynb). 
+- Use cleanlab to automatically: [find mislabeled data + train robust models](https://docs.cleanlab.ai/stable/tutorials/indepth_overview.html), [detect outliers](https://docs.cleanlab.ai/stable/tutorials/outliers.html), [estimate consensus + annotator-quality for multi-annotator datasets](https://docs.cleanlab.ai/stable/tutorials/multiannotator.html), [suggest which data is best to (re)label next](https://github.com/cleanlab/examples/blob/master/active_learning_multiannotator/active_learning.ipynb). [![pypi](https://img.shields.io/pypi/v/cleanlab.svg)](https://pypi.org/pypi/cleanlab/) @@ -161,7 +161,7 @@ cleanlab is useful across a wide variety of Machine Learning tasks. Specific tas 2. [Multi-label classification](https://docs.cleanlab.ai/stable/tutorials/multilabel_classification.html) (e.g. image/document tagging) 3. [Token classification](https://docs.cleanlab.ai/stable/tutorials/token_classification.html) (e.g. entity recognition in text) 4. [Classification with data labeled by multiple annotators](https://docs.cleanlab.ai/stable/tutorials/multiannotator.html) -5. [Active learning with multiple annotators](https://github.com/cleanlab/examples/blob/master/active_learning_multiannotator/active_learning.ipynb) (decide which data to label or re-label to improve model most) +5. [Active learning with multiple annotators](https://github.com/cleanlab/examples/blob/master/active_learning_multiannotator/active_learning.ipynb) (suggest which data to label or re-label to improve model most) 6. [Out of distribution detection](https://docs.cleanlab.ai/stable/tutorials/outliers.html) For many other ML tasks, cleanlab can still help you improve your dataset if appropriately applied. 
From 71e21a983a561641b3794790fd9e5334e7846817 Mon Sep 17 00:00:00 2001 From: Hui Wen <45724323+huiwengoh@users.noreply.github.com> Date: Tue, 10 Jan 2023 06:12:19 +0800 Subject: [PATCH 091/258] remove temp scaling from ensemble active learning when data has single annotator (#597) --- cleanlab/multiannotator.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/cleanlab/multiannotator.py b/cleanlab/multiannotator.py index 34ff224255..bf653e1da0 100644 --- a/cleanlab/multiannotator.py +++ b/cleanlab/multiannotator.py @@ -700,15 +700,12 @@ def get_active_learning_scores_ensemble( num_classes = get_num_classes(pred_probs=pred_probs[0]) # temp scale pred_probs - optimal_temp = np.full(len(pred_probs), np.NaN) - for i in range(len(pred_probs)): - curr_pred_probs = pred_probs[i] - curr_optimal_temp = find_best_temp_scaler(labels_multiannotator, curr_pred_probs) - pred_probs[i] = temp_scale_pred_probs(curr_pred_probs, curr_optimal_temp) - optimal_temp[i] = curr_optimal_temp # if all examples are only labeled by a single annotator if labels_multiannotator.apply(lambda s: len(s.dropna()) == 1, axis=1).all(): + # do not temp scale for single annotator case, temperature is defined here for later use + optimal_temp = np.full(len(pred_probs), 1.0) + assert_valid_inputs_multiannotator( labels_multiannotator, pred_probs, ensemble=True, allow_single_label=True ) @@ -725,6 +722,13 @@ def get_active_learning_scores_ensemble( avg_annotator_weight = np.mean(annotator_weight) else: + optimal_temp = np.full(len(pred_probs), np.NaN) + for i in range(len(pred_probs)): + curr_pred_probs = pred_probs[i] + curr_optimal_temp = find_best_temp_scaler(labels_multiannotator, curr_pred_probs) + pred_probs[i] = temp_scale_pred_probs(curr_pred_probs, curr_optimal_temp) + optimal_temp[i] = curr_optimal_temp + multiannotator_info = get_label_quality_multiannotator_ensemble( labels_multiannotator, pred_probs, From 503a57adaf6318c4149f71523f7a232dbc34c4e4 Mon Sep 17 
00:00:00 2001 From: unna97 <31486108+unna97@users.noreply.github.com> Date: Tue, 10 Jan 2023 19:16:16 +0530 Subject: [PATCH 092/258] Adding type hints for mypy strict compatibility (#585) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Handling missing type & docs in cleanlab/token_classification_utils.py :label: - Added typehints for the cleanlabs/token_classification_utils.py file for mypy strict mode - Fixed corresponding docstrings refers #587 * Handling missing type & docs in cleanlab/multilabel_classification.py :label: - Fixing all errors for mypy strict for the file * docs(internal): fix "number of new classes" variable in docstring Co-authored-by: Elías Snorrason --- cleanlab/internal/token_classification_utils.py | 14 ++++++++++---- cleanlab/multilabel_classification.py | 6 +++--- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/cleanlab/internal/token_classification_utils.py b/cleanlab/internal/token_classification_utils.py index ab384294b4..525e3d9077 100644 --- a/cleanlab/internal/token_classification_utils.py +++ b/cleanlab/internal/token_classification_utils.py @@ -22,7 +22,11 @@ import string import numpy as np from termcolor import colored -from typing import List, Optional, Callable, Tuple +from typing import List, Optional, Callable, Tuple, TypeVar +import numpy.typing as npt + + +T = TypeVar("T", bound=npt.NBitBase) def get_sentence(words: List[str]) -> str: @@ -171,14 +175,16 @@ def mapping(entities: List[int], maps: List[int]) -> List[int]: return list(map(f, entities)) -def merge_probs(probs: np.ndarray, maps: List[int]) -> np.ndarray: +def merge_probs( + probs: npt.NDArray["np.floating[T]"], maps: List[int] +) -> npt.NDArray["np.floating[T]"]: """ Merges model-predictive probabilities with desired mapping Parameters ---------- probs: - np.array of shape `(N, K)`, where N is the number of tokens, and K is the number of classes for the model + A 2D np.array of shape `(N, K)`, where N is 
the number of tokens, and K is the number of classes for the model maps: a list of mapped index, such that the probability of the token being in the i'th class is mapped to the @@ -188,7 +194,7 @@ def merge_probs(probs: np.ndarray, maps: List[int]) -> np.ndarray: Returns --------- probs_merged: - np.array of shape ``(N, K')``, where `K` is the number of new classes. Probabilities are merged and + A 2D np.array of shape ``(N, K')``, where `K'` is the number of new classes. Probabilities are merged and re-normalized if necessary. Examples diff --git a/cleanlab/multilabel_classification.py b/cleanlab/multilabel_classification.py index d436d72dc8..b49b6203da 100644 --- a/cleanlab/multilabel_classification.py +++ b/cleanlab/multilabel_classification.py @@ -22,7 +22,7 @@ import numpy as np # noqa: F401: Imported for type annotations import numpy.typing as npt -from typing import List, TypeVar +from typing import List, TypeVar, Dict, Any from cleanlab.internal.validation import assert_valid_inputs from cleanlab.internal.util import get_num_classes @@ -34,12 +34,12 @@ def get_label_quality_scores( - labels: List, + labels: List[List[int]], pred_probs: npt.NDArray["np.floating[T]"], *, method: str = "self_confidence", adjust_pred_probs: bool = False, - aggregator_kwargs: dict = {"method": "exponential_moving_average", "alpha": 0.8} + aggregator_kwargs: Dict[str, Any] = {"method": "exponential_moving_average", "alpha": 0.8} ) -> npt.NDArray["np.floating[T]"]: """Computes a label quality score each example in a multi-label classification dataset. 
From 1cf563c2a1dc752c0e4ec4ac9c815f795860f04e Mon Sep 17 00:00:00 2001 From: Ikko Eltociear Ashimine Date: Fri, 20 Jan 2023 04:55:31 +0900 Subject: [PATCH 093/258] fix typo in outliers.ipynb (#603) occuring -> occurring --- docs/source/tutorials/outliers.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/tutorials/outliers.ipynb b/docs/source/tutorials/outliers.ipynb index f4888055f1..8f3fe5df71 100644 --- a/docs/source/tutorials/outliers.ipynb +++ b/docs/source/tutorials/outliers.ipynb @@ -420,7 +420,7 @@ "source": [ "### Scoring outliers in a given dataset (training data)\n", "\n", - "Fitting cleanlab's ``OutOfDistribution`` class on ``feature_embeddings`` will find any naturally occuring outliers in a given dataset. These examples are atypical images that look strange or different from the majority of examples in the dataset. In our case, these correspond to odd-looking images of animals that do not resemble typical animals depicted in **cifar10**. This method produces a score in [0,1] for each example, where lower values correspond to more atypical examples (more likely out-of-distribution)." + "Fitting cleanlab's ``OutOfDistribution`` class on ``feature_embeddings`` will find any naturally occurring outliers in a given dataset. These examples are atypical images that look strange or different from the majority of examples in the dataset. In our case, these correspond to odd-looking images of animals that do not resemble typical animals depicted in **cifar10**. This method produces a score in [0,1] for each example, where lower values correspond to more atypical examples (more likely out-of-distribution)." 
] }, { From a31b3cac78d0d1a23be1810903c0f319363610c5 Mon Sep 17 00:00:00 2001 From: Chris Mauck <38672284+cmauck10@users.noreply.github.com> Date: Thu, 19 Jan 2023 15:34:36 -0600 Subject: [PATCH 094/258] tags in links (#604) --- README.md | 6 +++--- docs/source/index.rst | 2 +- docs/source/tutorials/faq.ipynb | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 52e303655f..8cefb1e2e9 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,7 @@ Get started with: [documentation](https://docs.cleanlab.ai/), [tutorials](https: [![docs](https://img.shields.io/static/v1?logo=github&style=flat&color=pink&label=docs&message=cleanlab)](https://docs.cleanlab.ai/) [![Slack Community](https://img.shields.io/static/v1?logo=slack&style=flat&color=white&label=slack&message=community)](https://cleanlab.ai/slack) [![Twitter](https://img.shields.io/twitter/follow/CleanlabAI?style=social)](https://twitter.com/CleanlabAI) -[![Cleanlab Studio](https://raw.githubusercontent.com/cleanlab/assets/master/shields/cl-studio-shield.svg)](https://cleanlab.ai/studio) +[![Cleanlab Studio](https://raw.githubusercontent.com/cleanlab/assets/master/shields/cl-studio-shield.svg)](https://cleanlab.ai/studio/?utm_source=github&utm_medium=readme&utm_campaign=clostostudio) ----- @@ -535,9 +535,9 @@ To understand/cite other cleanlab functionality not described above, check out o - [NeurIPS 2021 paper: Pervasive Label Errors in Test Sets Destabilize Machine Learning Benchmarks](https://arxiv.org/abs/2103.14749) -- [Cleanlab Studio](https://cleanlab.ai/studio): No-code Data Improvement +- [Cleanlab Studio](https://cleanlab.ai/studio/?utm_source=github&utm_medium=readme&utm_campaign=clostostudio): No-code Data Improvement -While this open-source library **finds** data issues, an interface is needed to efficiently **fix** these issues in your dataset. 
[Cleanlab Studio](https://cleanlab.ai/studio) is a no-code platform to find and fix problems in real-world ML datasets. Studio automatically runs optimized versions of the algorithms from this open-source library on top of AutoML models fit to your data, and presents detected issues in a smart data editing interface. Think of it like a data cleaning assistant that helps you quickly improve the quality of your data (via AI/automation + streamlined UX). +While this open-source library **finds** data issues, an interface is needed to efficiently **fix** these issues in your dataset. [Cleanlab Studio](https://cleanlab.ai/studio/?utm_source=github&utm_medium=readme&utm_campaign=clostostudio) is a no-code platform to find and fix problems in real-world ML datasets. Studio automatically runs optimized versions of the algorithms from this open-source library on top of AutoML models fit to your data, and presents detected issues in a smart data editing interface. Think of it like a data cleaning assistant that helps you quickly improve the quality of your data (via AI/automation + streamlined UX). ## Join our community diff --git a/docs/source/index.rst b/docs/source/index.rst index bb36ac8a0c..7e62505e44 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -160,4 +160,4 @@ Please see our `contributing guidelines PyPI Conda - Cleanlab Studio + Cleanlab Studio diff --git a/docs/source/tutorials/faq.ipynb b/docs/source/tutorials/faq.ipynb index fdd9ddc04a..989f9b3711 100644 --- a/docs/source/tutorials/faq.ipynb +++ b/docs/source/tutorials/faq.ipynb @@ -202,7 +202,7 @@ "metadata": {}, "source": [ "We caution against just blindly taking the predicted label for granted, many of these suggestions may be wrong! \n", - "You will be able to produce a much better version of your dataset interactively using [Cleanlab Studio](https://cleanlab.ai/studio/), which helps you efficiently fix issues like this in large datasets." 
+ "You will be able to produce a much better version of your dataset interactively using [Cleanlab Studio](https://cleanlab.ai/studio/?utm_source=github&utm_medium=docs&utm_campaign=clostostudio), which helps you efficiently fix issues like this in large datasets." ] }, { @@ -372,8 +372,8 @@ "id": "1a117547", "metadata": {}, "source": [ - "These questions are automatically handled for you in [Cleanlab Studio](https://cleanlab.ai/studio) -- our platform for no-code data improvement.\n", - "While this open-source library **finds** data issues, an interface is needed to efficiently **fix** these issues in your dataset. [Cleanlab Studio](https://cleanlab.ai/studio) is a no-code platform to find and fix problems in real-world ML datasets. Studio automatically runs optimized versions of the algorithms from this open-source library on top of AutoML models fit to your data, and presents detected issues in a smart data editing interface. Think of it like a data cleaning assistant that helps you quickly improve the quality of your data (via AI/automation + streamlined UX)." + "These questions are automatically handled for you in [Cleanlab Studio](https://cleanlab.ai/studio/?utm_source=github&utm_medium=docs&utm_campaign=clostostudio) -- our platform for no-code data improvement.\n", + "While this open-source library **finds** data issues, an interface is needed to efficiently **fix** these issues in your dataset. [Cleanlab Studio](https://cleanlab.ai/studio/?utm_source=github&utm_medium=docs&utm_campaign=clostostudio) is a no-code platform to find and fix problems in real-world ML datasets. Studio automatically runs optimized versions of the algorithms from this open-source library on top of AutoML models fit to your data, and presents detected issues in a smart data editing interface. Think of it like a data cleaning assistant that helps you quickly improve the quality of your data (via AI/automation + streamlined UX)." 
] }, { From 2b6c56424d176e0cccd6ea4f523c8a1b791a2d4a Mon Sep 17 00:00:00 2001 From: clu0 <33559427+clu0@users.noreply.github.com> Date: Thu, 19 Jan 2023 20:14:03 -0500 Subject: [PATCH 095/258] 10x speedup in find_label_issues on linux via better multiprocessing (#596) and bug fixes related to partition vs sort Co-authored-by: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> --- cleanlab/filter.py | 249 ++++++++++++++++++------------------- requirements-dev.txt | 1 + tests/test_filter_count.py | 51 ++++++++ 3 files changed, 176 insertions(+), 125 deletions(-) diff --git a/cleanlab/filter.py b/cleanlab/filter.py index e507011416..6e31786f6c 100644 --- a/cleanlab/filter.py +++ b/cleanlab/filter.py @@ -26,11 +26,11 @@ import numpy as np from sklearn.metrics import confusion_matrix import multiprocessing -from multiprocessing.sharedctypes import RawArray import sys import warnings from typing import Any, Dict, List, Optional, Tuple, Union from functools import reduce +import platform from cleanlab.count import calibrate_confident_joint from cleanlab.rank import ( @@ -46,9 +46,8 @@ from cleanlab.internal.multilabel_utils import stack_complement, get_onehot_num_classes, int2onehot from cleanlab.typing import LabelLike -# tqdm is a module used to print time-to-complete when multiprocessing is used. -# This module is not necessary, and therefore is not a package dependency, but -# when installed it improves user experience for large datasets. +# tqdm is a package to print time-to-complete when multiprocessing is used. +# This package is not necessary, but when installed improves user experience for large datasets. 
try: import tqdm @@ -59,6 +58,19 @@ w = """To see estimated completion times for methods in cleanlab.filter, "pip install tqdm".""" warnings.warn(w) +# psutil is a package used to count physical cores for multiprocessing +# This package is not necessary, because we can always fall back to logical cores as the default +try: + import psutil + + psutil_exists = True +except ImportError as e: # pragma: no cover + psutil_exists = False + +# global variable for find_label_issues multiprocessing +pred_probs_by_class: Dict[int, np.ndarray] +prune_count_matrix_cols: Dict[int, np.ndarray] + def find_label_issues( labels: LabelLike, @@ -187,7 +199,7 @@ class 0, 1, ..., K-1. n_jobs : optional Number of processing threads used by multiprocessing. Default ``None`` - sets to the number of cores on your CPU. + sets to the number of cores on your CPU (physical cores if you have ``psutil`` package installed, otherwise logical cores). Set this to 1 to *disable* parallel processing (if its causing issues). Windows users may see a speed-up with ``n_jobs=1``. @@ -247,9 +259,34 @@ class 0, 1, ..., K-1. 
"filter_by 'confident_learning' or 'predicted_neq_given' is not supported (yet) when setting 'num_to_remove_per_class'" ) + K = get_num_classes( + labels=labels, pred_probs=pred_probs, label_matrix=confident_joint, multi_label=multi_label + ) + # Boolean set to true if dataset is large + big_dataset = K * len(labels) > 1e8 + # Set-up number of multiprocessing threads + # On Windows/macOS, when multi_label is True, multiprocessing is much slower + # even for faily large input arrays, so we default to n_jobs=1 in this case + os_name = platform.system() if n_jobs is None: - n_jobs = multiprocessing.cpu_count() + if multi_label and os_name != "Linux": + n_jobs = 1 + else: + if psutil_exists: + n_jobs = psutil.cpu_count(logical=False) # physical cores + elif big_dataset: + print( + "To default `n_jobs` to the number of physical cores for multiprocessing in find_label_issues(), please: `pip install psutil`.\n" + "Note: You can safely ignore this message. `n_jobs` only affects runtimes, results will be the same no matter its value.\n" + "Since psutil is not installed, `n_jobs` was set to the number of logical cores by default.\n" + "Disable this message by either installing psutil or specifying the `n_jobs` argument." + ) # pragma: no cover + if not n_jobs: + # either psutil does not exist + # or psutil can return None when physical cores cannot be determined + # switch to logical cores + n_jobs = multiprocessing.cpu_count() else: assert n_jobs >= 1 @@ -272,13 +309,8 @@ class 0, 1, ..., K-1. 
) # Else this is standard multi-class classification - K = get_num_classes( - labels=labels, pred_probs=pred_probs, label_matrix=confident_joint, multi_label=multi_label - ) # Number of examples in each class of labels label_counts = value_counts_fill_missing_classes(labels, K, multi_label=multi_label) - # Boolean set to true if dataset is large - big_dataset = K * len(labels) > 1e8 # Ensure labels are of type np.ndarray() labels = np.asarray(labels) if confident_joint is None or filter_by == "confident_learning": @@ -310,83 +342,66 @@ class 0, 1, ..., K-1. prune_count_matrix = round_preserving_row_totals(tmp) # Prepare multiprocessing shared data - if n_jobs > 1: - _labels = RawArray("I", labels) # type: ignore - _label_counts = RawArray("I", label_counts) # type: ignore - _prune_count_matrix = RawArray("I", prune_count_matrix.flatten()) # type: ignore - _pred_probs = RawArray("f", pred_probs.flatten()) # type: ignore - else: # Multiprocessing is turned off. Create tuple with all parameters - args = ( - labels, - label_counts, - prune_count_matrix, - pred_probs, - multi_label, - min_examples_per_class, - ) + # On Linux, multiprocessing is started with fork, + # so data can be shared with global vairables + COW + # On Window/macOS, processes are started with spawn, + # so data will need to be pickled to the subprocesses through input args + chunksize = max(1, K // n_jobs) + if n_jobs == 1 or os_name == "Linux": + global pred_probs_by_class, prune_count_matrix_cols + pred_probs_by_class = {k: pred_probs[labels == k] for k in range(K)} + prune_count_matrix_cols = {k: prune_count_matrix[:, k] for k in range(K)} + args = [[k, min_examples_per_class, None] for k in range(K)] + else: + args = [ + [k, min_examples_per_class, [pred_probs[labels == k], prune_count_matrix[:, k]]] + for k in range(K) + ] # Perform Pruning with threshold probabilities from BFPRT algorithm in O(n) # Operations are parallelized across all CPU processes if filter_by == "prune_by_class" or 
filter_by == "both": - if n_jobs > 1: # parallelize - with multiprocessing.Pool( - n_jobs, - initializer=_init, - initargs=( - _labels, - _label_counts, - _prune_count_matrix, - prune_count_matrix.shape, - _pred_probs, - pred_probs.shape, - multi_label, - min_examples_per_class, - ), - ) as p: + if n_jobs > 1: + with multiprocessing.Pool(n_jobs) as p: if verbose: # pragma: no cover print("Parallel processing label issues by class.") sys.stdout.flush() if big_dataset and tqdm_exists: label_issues_masks_per_class = list( - tqdm.tqdm(p.imap(_prune_by_class, range(K)), total=K), + tqdm.tqdm(p.imap(_prune_by_class, args, chunksize=chunksize), total=K) ) else: - label_issues_masks_per_class = p.map(_prune_by_class, range(K)) - else: # n_jobs = 1, so no parallelization - label_issues_masks_per_class = [_prune_by_class(k, args) for k in range(K)] - label_issues_mask = np.stack(label_issues_masks_per_class).any(axis=0) + label_issues_masks_per_class = p.map(_prune_by_class, args, chunksize=chunksize) + else: + label_issues_masks_per_class = [_prune_by_class(arg) for arg in args] + + label_issues_mask = np.zeros(len(labels), dtype=bool) + for k, mask in enumerate(label_issues_masks_per_class): + if len(mask) > 1: + label_issues_mask[labels == k] = mask if filter_by == "both": label_issues_mask_by_class = label_issues_mask if filter_by == "prune_by_noise_rate" or filter_by == "both": - if n_jobs > 1: # parallelize - with multiprocessing.Pool( - n_jobs, - initializer=_init, - initargs=( - _labels, - _label_counts, - _prune_count_matrix, - prune_count_matrix.shape, - _pred_probs, - pred_probs.shape, - multi_label, - min_examples_per_class, - ), - ) as p: + if n_jobs > 1: + with multiprocessing.Pool(n_jobs) as p: if verbose: # pragma: no cover print("Parallel processing label issues by noise rate.") sys.stdout.flush() if big_dataset and tqdm_exists: label_issues_masks_per_class = list( - tqdm.tqdm(p.imap(_prune_by_count, range(K)), total=K) + tqdm.tqdm(p.imap(_prune_by_count, 
args, chunksize=chunksize), total=K) ) else: - label_issues_masks_per_class = p.map(_prune_by_count, range(K)) - else: # n_jobs = 1, so no parallelization - label_issues_masks_per_class = [_prune_by_count(k, args) for k in range(K)] - label_issues_mask = np.stack(label_issues_masks_per_class).any(axis=0) + label_issues_masks_per_class = p.map(_prune_by_count, args, chunksize=chunksize) + else: + label_issues_masks_per_class = [_prune_by_count(arg) for arg in args] + + label_issues_mask = np.zeros(len(labels), dtype=bool) + for k, mask in enumerate(label_issues_masks_per_class): + if len(mask) > 1: + label_issues_mask[labels == k] = mask if filter_by == "both": label_issues_mask = label_issues_mask & label_issues_mask_by_class @@ -891,7 +906,7 @@ def _get_shared_data() -> Any: # pragma: no cover # TODO figure out what the types inside args are. -def _prune_by_class(k: int, args=None) -> np.ndarray: +def _prune_by_class(args: list) -> np.ndarray: """multiprocessing Helper function for find_label_issues() that assumes globals and produces a mask for class k for each example by removing the examples with *smallest probability* of @@ -902,41 +917,34 @@ def _prune_by_class(k: int, args=None) -> np.ndarray: k : int (between 0 and num classes - 1) The class of interest.""" - if args: # Single processing - params are passed in - ( - labels, - label_counts, - prune_count_matrix, - pred_probs, - multi_label, - min_examples_per_class, - ) = args - else: # Multiprocessing - data is shared across sub-processes - ( - labels, - label_counts, - prune_count_matrix, - pred_probs, - multi_label, - min_examples_per_class, - ) = _get_shared_data() + k, min_examples_per_class, arrays = args + if arrays is None: + pred_probs = pred_probs_by_class[k] + prune_count_matrix = prune_count_matrix_cols[k] + else: + pred_probs = arrays[0] + prune_count_matrix = arrays[1] - if label_counts[k] > min_examples_per_class: # No prune if not at least min_examples_per_class - num_issues = label_counts[k] 
- prune_count_matrix[k][k] + label_counts = pred_probs.shape[0] + label_issues = np.zeros(label_counts, dtype=bool) + if label_counts > min_examples_per_class: # No prune if not at least min_examples_per_class + num_issues = label_counts - prune_count_matrix[k] # Get return_indices_ranked_by of the smallest prob of class k for examples with noisy label k - label_filter = np.array([k in lst for lst in labels]) if multi_label else labels == k - class_probs = pred_probs[:, k] - rank = np.partition(class_probs[label_filter], num_issues)[num_issues] - return label_filter & (class_probs < rank) - else: - warnings.warn( - f"May not flag all label issues in class: {k}, it has too few examples (see argument: `min_examples_per_class`)" - ) - return np.zeros(len(labels), dtype=bool) + # rank = np.partition(class_probs, num_issues)[num_issues] + if num_issues >= 1: + class_probs = pred_probs[:, k] + order = np.argsort(class_probs) + label_issues[order[:num_issues]] = True + return label_issues + + warnings.warn( + f"May not flag all label issues in class: {k}, it has too few examples (see argument: `min_examples_per_class`)" + ) + return label_issues # TODO figure out what the types inside args are. 
-def _prune_by_count(k: int, args=None) -> np.ndarray: +def _prune_by_count(args: list) -> np.ndarray: """multiprocessing Helper function for find_label_issues() that assumes globals and produces a mask for class k for each example by removing the example with noisy label k having *largest margin*, @@ -948,43 +956,34 @@ def _prune_by_count(k: int, args=None) -> np.ndarray: k : int (between 0 and num classes - 1) The true_label class of interest.""" - if args: # Single processing - params are passed in - ( - labels, - label_counts, - prune_count_matrix, - pred_probs, - multi_label, - min_examples_per_class, - ) = args - else: # Multiprocessing - data is shared across sub-processes - ( - labels, - label_counts, - prune_count_matrix, - pred_probs, - multi_label, - min_examples_per_class, - ) = _get_shared_data() + k, min_examples_per_class, arrays = args + if arrays is None: + pred_probs = pred_probs_by_class[k] + prune_count_matrix = prune_count_matrix_cols[k] + else: + pred_probs = arrays[0] + prune_count_matrix = arrays[1] - label_issues_mask = np.zeros(len(pred_probs), dtype=bool) - pred_probs_k = pred_probs[:, k] - K = get_num_classes(labels, pred_probs, multi_label=multi_label) - if label_counts[k] <= min_examples_per_class: # No prune if not at least min_examples_per_class + label_counts = pred_probs.shape[0] + label_issues_mask = np.zeros(label_counts, dtype=bool) + if label_counts <= min_examples_per_class: warnings.warn( f"May not flag all label issues in class: {k}, it has too few examples (see `min_examples_per_class` argument)" ) - return np.zeros(len(labels), dtype=bool) - for j in range(K): # j is true label index (k is noisy label index) - num2prune = prune_count_matrix[j][k] + return label_issues_mask + + K = pred_probs.shape[1] + if K < 1: + raise ValueError("Must have at least 1 class.") + for j in range(K): + num2prune = prune_count_matrix[j] # Only prune for noise rates, not diagonal entries if k != j and num2prune > 0: # num2prune's largest 
p(true class k) - p(noisy class k) # for x with true label j - margin = pred_probs[:, j] - pred_probs_k - label_filter = np.array([k in lst for lst in labels]) if multi_label else labels == k - cut = -np.partition(-margin[label_filter], num2prune - 1)[num2prune - 1] - label_issues_mask = label_issues_mask | (label_filter & (margin >= cut)) + margin = pred_probs[:, j] - pred_probs[:, k] + order = np.argsort(-margin) + label_issues_mask[order[:num2prune]] = True return label_issues_mask diff --git a/requirements-dev.txt b/requirements-dev.txt index 76a5b7898c..d6c7a87a63 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -12,3 +12,4 @@ torch torchvision skorch tensorflow +psutil diff --git a/tests/test_filter_count.py b/tests/test_filter_count.py index c48d4f1995..36a391009d 100644 --- a/tests/test_filter_count.py +++ b/tests/test_filter_count.py @@ -875,6 +875,57 @@ def test_missing_classes(): assert all(filter.find_label_issues(labels, pred_probs, filter_by=fb) == issues) +@pytest.mark.filterwarnings("ignore:WARNING!") +def test_find_label_issues_match_multiprocessing(): + # minimal version of this test was run in test_missing_classes + # here testing with larger input matrices + + # test with ground truth labels: + n = 5000 # consider replacing this with larger value + # some past bugs observed only with larger sample-sizes like n=200000 + m = 100 + labels = np.ones(n, dtype=int) + labels[(n // 2) :] = 0 + pred_probs = np.zeros((n, 4)) + pred_probs[:, 0] = 0.95 + pred_probs[:, 1] = 0.05 + pred_probs[0, 0] = 0.94 + pred_probs[0, 1] = 0.06 + ground_truth = np.ones(n, dtype=bool) + ground_truth[(n // 2) :] = False + ground_truth[0] = False # leave one example for min_example_per_class + # TODO: consider also testing this line without psutil installed + issues = filter.find_label_issues(labels, pred_probs) + issues1 = filter.find_label_issues(labels, pred_probs, n_jobs=1) + issues2 = filter.find_label_issues(labels, pred_probs, n_jobs=2) + assert 
all(issues == ground_truth) + assert all(issues == issues1) + assert all(issues == issues2) + issues = filter.find_label_issues(labels, pred_probs, filter_by="prune_by_class") + issues1 = filter.find_label_issues(labels, pred_probs, n_jobs=1, filter_by="prune_by_class") + issues2 = filter.find_label_issues(labels, pred_probs, n_jobs=2, filter_by="prune_by_class") + assert all(issues == ground_truth) + assert all(issues == issues1) + assert all(issues == issues2) + + # test with random labels + normalize = np.random.randint(low=1, high=100, size=[n, m], dtype=np.uint8) + pred_probs = np.zeros((n, m)) + for i, col in enumerate(normalize): + pred_probs[i] = col / np.sum(col) + labels = np.repeat(np.arange(m), n // m) + issues = filter.find_label_issues(labels, pred_probs) + issues1 = filter.find_label_issues(labels, pred_probs, n_jobs=1) + issues2 = filter.find_label_issues(labels, pred_probs, n_jobs=2) + assert all(issues == issues1) + assert all(issues == issues2) + issues = filter.find_label_issues(labels, pred_probs, filter_by="prune_by_class") + issues1 = filter.find_label_issues(labels, pred_probs, n_jobs=1, filter_by="prune_by_class") + issues2 = filter.find_label_issues(labels, pred_probs, n_jobs=2, filter_by="prune_by_class") + assert all(issues == issues1) + assert all(issues == issues2) + + @pytest.mark.parametrize( "return_indices_ranked_by", [None, "self_confidence", "normalized_margin", "confidence_weighted_entropy"], From b9b981dc01609182e874a38de7d831e99edff4a6 Mon Sep 17 00:00:00 2001 From: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> Date: Thu, 19 Jan 2023 20:57:56 -0500 Subject: [PATCH 096/258] add cleanlab/projects to ignored URLs check list --- .github/workflows/links.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/links.yml b/.github/workflows/links.yml index 469100c88e..fbe9060f69 100644 --- a/.github/workflows/links.yml +++ b/.github/workflows/links.yml @@ -24,3 +24,4 @@ jobs: empty_alt_ignore: true 
url_ignore_re: | ^https:\/\/docs\.github\.com\/ + ^https:\/\/github\.com\/cleanlab\/cleanlab\/projects\/ From 9f655a7f6a39550e321dbc8a96a4289f40e7e59e Mon Sep 17 00:00:00 2001 From: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> Date: Thu, 19 Jan 2023 21:04:56 -0500 Subject: [PATCH 097/258] remove slash at end (#606) --- .github/workflows/links.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/links.yml b/.github/workflows/links.yml index fbe9060f69..8c40f9237b 100644 --- a/.github/workflows/links.yml +++ b/.github/workflows/links.yml @@ -24,4 +24,4 @@ jobs: empty_alt_ignore: true url_ignore_re: | ^https:\/\/docs\.github\.com\/ - ^https:\/\/github\.com\/cleanlab\/cleanlab\/projects\/ + ^https:\/\/github\.com\/cleanlab\/cleanlab\/projects From f9d32b1352730807231cb8bdf3ec2a88e46d8a68 Mon Sep 17 00:00:00 2001 From: Chris Mauck <38672284+cmauck10@users.noreply.github.com> Date: Tue, 24 Jan 2023 20:12:34 -0600 Subject: [PATCH 098/258] Update tabular tutorial with better language (#609) Co-authored-by: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> --- docs/source/tutorials/tabular.ipynb | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/source/tutorials/tabular.ipynb b/docs/source/tutorials/tabular.ipynb index 94e577c9fe..8f3ce43b5f 100644 --- a/docs/source/tutorials/tabular.ipynb +++ b/docs/source/tutorials/tabular.ipynb @@ -11,7 +11,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "In this 5-minute quickstart tutorial, we use cleanlab with scikit-learn models to find potential label errors in a classification dataset with tabular (numeric/categorical) features. Here we study the [German Credit](https://www.openml.org/d/31) dataset which contains 1,000 individuals described by 20 features, each labeled as either \"good\" or \"bad\" credit risk. 
cleanlab automatically shortlists _hundreds_ of examples from this dataset that confuse our ML model; many of which are potential label errors (due to annotator mistakes), edge cases, and otherwise ambiguous examples.\n", + "In this 5-minute quickstart tutorial, we use cleanlab with scikit-learn models to find potential label errors in a classification dataset with tabular (numeric/categorical) features. Tabular (or *structured*) data are typically organized in a row/column format and stored in a SQL database or file types like: CSV, Excel, or Parquet. Here we study the [German Credit](https://www.openml.org/d/31) dataset which contains 1,000 individuals described by 20 features, each labeled as either \"good\" or \"bad\" credit risk. cleanlab automatically shortlists _hundreds_ of examples from this dataset that confuse our ML model; many of which are potential label errors (due to annotator mistakes), edge cases, and otherwise ambiguous examples.\n", "\n", "**Overview of what we'll do in this tutorial:**\n", "\n", @@ -151,7 +151,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We first load the data features and labels.\n" + "We first load the data features and labels (which are possibly noisy).\n" ] }, { @@ -238,7 +238,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "To identify label issues, cleanlab requires a probabilistic prediction from your model for every datapoint. However, these predictions will be _overfitted_ (and thus unreliable) for examples the model was previously trained on. cleanlab is intended to only be used with **out-of-sample** predicted probabilities, i.e., on examples held out from the model during the training.\n", + "To find potential labeling errors, cleanlab requires a probabilistic prediction from your model for every datapoint. However, these predictions will be _overfitted_ (and thus unreliable) for examples the model was previously trained on. 
cleanlab is intended to only be used with **out-of-sample** predicted probabilities, i.e., on examples held out from the model during the training.\n", "\n", "K-fold cross-validation is a straightforward way to produce out-of-sample predicted probabilities for every datapoint in the dataset by training K copies of our model on different data subsets and using each copy to predict on the subset of data it did not see during training. An additional benefit of cross-validation is that it provides a more reliable evaluation of our model than a single training/validation split. We can obtain cross-validated out-of-sample predicted probabilities from any classifier via a simple scikit-learn wrapper:\n" ] @@ -272,7 +272,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Based on the given labels and out-of-sample predicted probabilities, cleanlab can quickly help us identify label issues in our dataset. For a dataset with N examples from K classes, the labels should be a 1D array of length N and predicted probabilities should be a 2D (N x K) array. Here we request that the indices of the identified label issues be sorted by cleanlab's self-confidence score, which measures the quality of each given label via the probability assigned to it in our model's prediction." + "Based on the given labels and out-of-sample predicted probabilities, cleanlab can quickly help us identify poorly labeled instances in our data table. For a dataset with N examples from K classes, the labels should be a 1D array of length N and predicted probabilities should be a 2D (N x K) array. Here we request that the indices of the identified label issues be sorted by cleanlab's self-confidence score, which measures the quality of each given label via the probability assigned to it in our model's prediction." ] }, { @@ -310,7 +310,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "These examples appear the most suspicious to our model and should be carefully re-examined. 
Perhaps the original annotators missed something when deciding on the labels for these individuals.\n" + "These examples appear the most suspicious to our model and should be carefully re-examined. Perhaps the original annotators missed something when deciding on the labels for these individuals. This is a straightforward approach to visualize the rows in a data table that might be mislabeled.\n" ] }, { @@ -480,7 +480,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.13" + "version": "3.10.8" } }, "nbformat": 4, From 888246e2d5e133d089783713f28915a7e1f9a89a Mon Sep 17 00:00:00 2001 From: Ulyana Date: Fri, 27 Jan 2023 14:47:02 -0800 Subject: [PATCH 099/258] Improve num_label_issues usage of confident_joint to match find_label_issues (#610) Co-authored-by: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> --- cleanlab/count.py | 66 +++++++++++++++++------ cleanlab/filter.py | 9 +++- tests/test_filter_count.py | 108 ++++++++++++++++++++++++++++++------- 3 files changed, 149 insertions(+), 34 deletions(-) diff --git a/cleanlab/count.py b/cleanlab/count.py index ce24e559b4..188f874c7a 100644 --- a/cleanlab/count.py +++ b/cleanlab/count.py @@ -87,7 +87,7 @@ def num_label_issues( Array of estimated class label error statisics used for identifying label issues, in same format expected by :py:func:`filter.find_label_issues ` function. The `confident_joint` can be computed using :py:func:`count.compute_confident_joint `. - If not provided, it is internally computed from the given (noisy) `labels` and `pred_probs`. + It is internally computed from the given (noisy) `labels` and `pred_probs`. estimation_method : Method for estimating the number of label issues in dataset by counting the examples in the off-diagonal of the `confident_joint` ``P(label=i, true_label=j)``. @@ -100,8 +100,11 @@ def num_label_issues( two cases: 1. 
As we add more label and data quality scoring functions in :py:mod:`cleanlab.rank`, this approach will always work. 2. If you have a custom score to rank your data by label quality and you just need to know the cut-off of likely label issues. + - ``'off_diagonal_custom'``: Counts the number of examples in the off-diagonal of a provided `confident_joint` matrix. + - TL;DR: use this method to get the most accurate estimate of number of label issues when you don't need the indices of the label issues. + TL;DR: Use this method to get the most accurate estimate of number of label issues when you don't need the indices of the label issues. + Note: ``'off_diagonal'`` may sometimes underestimate issues for data with few classes, so consider using ``'off_diagonal_calibrated'`` instead if your data has < 4 classes. multi_label : bool, optional Set ``False`` if your dataset is for regular (multi-class) classification, where each example belongs to exactly one class. @@ -114,6 +117,14 @@ def num_label_issues( The estimated number of examples with label issues in the dataset. """ valid_methods = ["off_diagonal", "off_diagonal_calibrated"] + if isinstance(confident_joint, np.ndarray): + warn_str = ( + "The supplied `confident_joint` is ignored as `confident_joint` is recomuputed internally using " + "the supplied `labels` and `pred_probs`. If you still want to use custom `confident_joint` call function " + "with `estimation_method='off_diagonal_custom'`." + ) + warnings.warn(warn_str) + if multi_label: return _num_label_issues_multilabel( labels=labels, @@ -123,29 +134,54 @@ def num_label_issues( labels = labels_to_array(labels) assert_valid_inputs(X=None, y=labels, pred_probs=pred_probs) - if confident_joint is None: - # Original non-calibrated counts of confidently correctly and incorrectly labeled examples. 
- computed_confident_joint = compute_confident_joint( - labels=labels, pred_probs=pred_probs, calibrate=False + if estimation_method == "off_diagonal": + _, cl_error_indices = compute_confident_joint( + labels=labels, + pred_probs=pred_probs, + calibrate=False, + return_indices_of_off_diagonals=True, ) - else: - computed_confident_joint = confident_joint - assert isinstance(computed_confident_joint, np.ndarray) + label_issues_mask = np.zeros(len(labels), dtype=bool) + for idx in cl_error_indices: + label_issues_mask[idx] = True - if estimation_method == "off_diagonal": - num_issues: int = np.sum(computed_confident_joint) - np.trace(computed_confident_joint) + # Remove label issues if given label == model prediction + pred = pred_probs.argmax(axis=1) + for i, pred_label in enumerate(pred): + if pred_label == labels[i]: + label_issues_mask[i] = False + num_issues = np.sum(label_issues_mask) elif estimation_method == "off_diagonal_calibrated": + calculated_confident_joint = compute_confident_joint( + labels=labels, + pred_probs=pred_probs, + calibrate=True, + ) + assert isinstance(calculated_confident_joint, np.ndarray) # Estimate_joint calibrates the row sums to match the prior distribution of given labels and normalizes to sum to 1 - joint = estimate_joint(labels, pred_probs, confident_joint=computed_confident_joint) + joint = estimate_joint(labels, pred_probs, confident_joint=calculated_confident_joint) frac_issues = 1.0 - joint.trace() num_issues = np.rint(frac_issues * len(labels)).astype(int) + elif estimation_method == "off_diagonal_custom": + if not isinstance(confident_joint, np.ndarray): + raise ValueError( + f""" + No `confident_joint` provided. For 'estimation_method' = {estimation_method} you need to provide pre-calculated + `confident_joint` matrix. Use a different `estimation_method` if you want the `confident_joint` matrix to + be calculated for you. 
+ """ + ) + else: + joint = estimate_joint(labels, pred_probs, confident_joint=confident_joint) + frac_issues = 1.0 - joint.trace() + num_issues = np.rint(frac_issues * len(labels)).astype(int) else: raise ValueError( f""" - {estimation_method} is not a valid estimation method! - Please choose a valid estimation method: {valid_methods} - """ + {estimation_method} is not a valid estimation method! + Please choose a valid estimation method: {valid_methods} + """ ) return num_issues diff --git a/cleanlab/filter.py b/cleanlab/filter.py index 6e31786f6c..0e07ea191a 100644 --- a/cleanlab/filter.py +++ b/cleanlab/filter.py @@ -245,7 +245,7 @@ class 0, 1, ..., K-1. frac_noise != 1.0 or num_to_remove_per_class is not None ): warn_str = ( - "WARNING! frac_noise and num_to_remove_per_class parameters are only supported" + "frac_noise and num_to_remove_per_class parameters are only supported" " for filter_by 'prune_by_noise_rate', 'prune_by_class', and 'both'. They " "are not supported for methods 'confident_learning' or " "'predicted_neq_given'." @@ -258,6 +258,13 @@ class 0, 1, ..., K-1. raise ValueError( "filter_by 'confident_learning' or 'predicted_neq_given' is not supported (yet) when setting 'num_to_remove_per_class'" ) + if filter_by is "confident_learning" and isinstance(confident_joint, np.ndarray): + warn_str = ( + "The supplied `confident_joint` is ignored when `filter_by = 'confident_learning'`; confident joint will be " + "re-estimated from the given labels. To use your supplied `confident_joint`, please specify a different " + "`filter_by` value." 
+ ) + warnings.warn(warn_str) K = get_num_classes( labels=labels, pred_probs=pred_probs, label_matrix=confident_joint, multi_label=multi_label diff --git a/tests/test_filter_count.py b/tests/test_filter_count.py index 36a391009d..81962b9a81 100644 --- a/tests/test_filter_count.py +++ b/tests/test_filter_count.py @@ -715,22 +715,51 @@ def test_find_label_issue_filters_match_origin_functions(): assert "not supported" in str(e) -@pytest.mark.parametrize("confident_joint", [None, True]) -def test_num_label_issues(confident_joint): - cj_calibrated_off_diag_sum = data["cj"].sum() - data["cj"].trace() - n = count.num_label_issues( - labels=data["labels"], - pred_probs=data["pred_probs"], - confident_joint=data["cj"], +def test_num_label_issues_different_estimation_types(): + # these numbers are hardcoded as data[] does not create a difference in both functions + y = np.array([0, 1, 1, 1, 1, 0, 0, 1, 0]) + pred_probs = np.array( + [ + [0.7110397298505661, 0.2889602701494339], + [0.6367131487519773, 0.36328685124802274], + [0.7571834730987641, 0.24281652690123584], + [0.6394163729473307, 0.3605836270526695], + [0.5853684039196656, 0.4146315960803345], + [0.6675968116482668, 0.33240318835173316], + [0.7240647829106976, 0.2759352170893023], + [0.740474240697777, 0.25952575930222266], + [0.7148252196621883, 0.28517478033781196], + ] + ) + + n3 = count.num_label_issues( + labels=y, + pred_probs=pred_probs, + estimation_method="off_diagonal_calibrated", + ) + + n2 = count.num_label_issues( + labels=y, + pred_probs=pred_probs, estimation_method="off_diagonal", - ) # data["cj"] is already calibrated and estimation method does not do extra calibration + ) + + f2 = filter.find_label_issues(labels=y, pred_probs=pred_probs, filter_by="confident_learning") + + assert np.sum(f2) == n2 + assert n3 != n2 - n1 = count.num_label_issues( + +@pytest.mark.filterwarnings() +def test_num_label_issues(): + cj_calibrated_off_diag_sum = data["cj"].sum() - data["cj"].trace() + + n1 = 
count.num_label_issues( # should throw warning as cj is passed in but also recalculated labels=data["labels"], pred_probs=data["pred_probs"], confident_joint=data["cj"], estimation_method="off_diagonal_calibrated", - ) # data["cj"] is already calibrated but recalibrating it should not change the values + ) n2 = count.num_label_issues( labels=data["labels"], @@ -738,21 +767,47 @@ def test_num_label_issues(confident_joint): estimation_method="off_diagonal_calibrated", ) # this should calculate and calibrate the confident joint into same matrix as data["cj"] - # data["cj"] is already calibrated and estimation method does not do extra calibration - assert n == cj_calibrated_off_diag_sum - # data["cj"] is already calibrated but recalibrating it should not change the values - assert n == n1 + n_custom = count.num_label_issues( + labels=data["labels"], + pred_probs=data["pred_probs"], + confident_joint=data["cj"], + estimation_method="off_diagonal_custom", + ) + + ones_joint = np.ones_like(data["cj"]) + n_custom_bad = count.num_label_issues( + labels=data["labels"], + pred_probs=data["pred_probs"], + confident_joint=ones_joint, + estimation_method="off_diagonal_custom", + ) + + # data["cj"] is already calibrated and recalibrating it should not change the values + assert n2 == cj_calibrated_off_diag_sum # should calculate and calibrate the confident joint into same matrix as data["cj"] - assert n == n2 + assert n1 == n2 + # estimation_method='off_diagonal_custom' should use the passed in confident joint correctly + assert n_custom == n1 + assert n_custom_bad != n1 - f = filter.find_label_issues( + f = filter.find_label_issues( # this should throw warning since cj passed in and filter by confident_learning labels=data["labels"], pred_probs=data["pred_probs"], confident_joint=data["cj"] ) assert sum(f) == 35 - f1 = filter.find_label_issues( - labels=data["labels"], pred_probs=data["pred_probs"], filter_by="confident_learning" + f1 = filter.find_label_issues( # this should 
throw warning since cj passed in and filter by confident_learning + labels=data["labels"], + pred_probs=data["pred_probs"], + filter_by="confident_learning", + confident_joint=data["cj"], + ) + + n = count.num_label_issues( # should throw warning as cj is passed in but also recalculated + labels=data["labels"], + pred_probs=data["pred_probs"], + confident_joint=data["cj"], + estimation_method="off_diagonal", ) n3 = count.num_label_issues( @@ -761,6 +816,7 @@ def test_num_label_issues(confident_joint): ) assert sum(f1) == n3 # values should be equivalent for `filter_by='confident_learning'` + assert n == n3 # passing in cj should not affect calculation # check wrong estimation_method throws ValueError try: @@ -778,6 +834,22 @@ def test_num_label_issues(confident_joint): estimation_method="not_a_real_method", ) + # check not passing in cj with estimation_method_custom throws ValueError + try: + count.num_label_issues( + labels=data["labels"], + pred_probs=data["pred_probs"], + estimation_method="off_diagonal_custom", + ) + except Exception as e: + assert "you need to provide pre-calculated" in str(e) + with pytest.raises(ValueError) as e: + count.num_label_issues( + labels=data["labels"], + pred_probs=data["pred_probs"], + estimation_method="off_diagonal_custom", + ) + @pytest.mark.parametrize("confident_joint", [None, True]) def test_num_label_issues_multilabel(confident_joint): From 5f9fd95c454dffa10b8cb089384b88d87e53ed50 Mon Sep 17 00:00:00 2001 From: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> Date: Sun, 29 Jan 2023 23:48:49 -0800 Subject: [PATCH 100/258] update crowdlab paper name --- docs/source/tutorials/multiannotator.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/tutorials/multiannotator.ipynb b/docs/source/tutorials/multiannotator.ipynb index 2fb03e5e38..a4263da465 100644 --- a/docs/source/tutorials/multiannotator.ipynb +++ b/docs/source/tutorials/multiannotator.ipynb @@ -715,9 +715,9 @@ "\n", "## How 
does cleanlab.multiannotator work?\n", "\n", - "All estimates above are produced via the CROWDLAB algorithm, described and benchmarked in this paper:\n", + "All estimates above are produced via the CROWDLAB algorithm, described in this paper that contains extensive benchmarks which show CROWDLAB can produce better estimates than popular methods like Dawid-Skene and GLAD:\n", "\n", - "[Utilizing supervised models to infer consensus labels and their quality from data with multiple annotators](https://arxiv.org/abs/2210.06812)" + "[CROWDLAB: Supervised learning to infer consensus labels and quality scores for data with multiple annotators](https://arxiv.org/abs/2210.06812)" ] }, { From d6f40d89b85c1ed347fe3bfcd4b09fd6785daf0e Mon Sep 17 00:00:00 2001 From: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> Date: Mon, 30 Jan 2023 10:55:11 -0800 Subject: [PATCH 101/258] crowdlab paper name update --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 8cefb1e2e9..1fc52ef554 100644 --- a/README.md +++ b/README.md @@ -515,7 +515,7 @@ cleanlab is based on peer-reviewed research. Here are relevant papers to cite if
CROWDLAB for data with multiple annotators (NeurIPS '22) (click to show bibtex) @inproceedings{goh2022crowdlab, - title={Utilizing supervised models to infer consensus labels and their quality from data with multiple annotators}, + title={CROWDLAB: Supervised learning to infer consensus labels and quality scores for data with multiple annotators}, author={Goh, Hui Wen and Tkachenko, Ulyana and Mueller, Jonas}, booktitle={NeurIPS Human in the Loop Learning Workshop}, year={2022} From 5c1ba64bea3892709a7f558c04725c468e09974f Mon Sep 17 00:00:00 2001 From: Sanjana Date: Mon, 30 Jan 2023 15:56:40 -0800 Subject: [PATCH 102/258] Removed duplicate classifier from setup.py (#612) --- setup.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/setup.py b/setup.py index e4b03739c8..edeefce993 100644 --- a/setup.py +++ b/setup.py @@ -62,10 +62,8 @@ def run(self): "Programming Language :: Python", "Topic :: Software Development", "Topic :: Scientific/Engineering", - "Topic :: Scientific/Engineering", "Topic :: Scientific/Engineering :: Mathematics", "Topic :: Scientific/Engineering :: Artificial Intelligence", - "Topic :: Software Development", "Topic :: Software Development :: Libraries", "Topic :: Software Development :: Libraries :: Python Modules", ], From 0c02ec953bc980e8c1d6ca970c3090c4da8d1384 Mon Sep 17 00:00:00 2001 From: "Curtis G. 
Northcutt" Date: Tue, 31 Jan 2023 23:26:20 -0800 Subject: [PATCH 103/258] Add two methods to filter.find_label_issues (#595) "low_self_confidence" or "low_ normalized_margin" Co-authored-by: huiwengoh <45724323+huiwengoh@users.noreply.github.com> Co-authored-by: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> --- cleanlab/count.py | 9 +++- cleanlab/filter.py | 98 ++++++++++++++++++++++++++++++++------ tests/test_filter_count.py | 62 ++++++++++++++++++++++++ 3 files changed, 153 insertions(+), 16 deletions(-) diff --git a/cleanlab/count.py b/cleanlab/count.py index 188f874c7a..de86abd9f6 100644 --- a/cleanlab/count.py +++ b/cleanlab/count.py @@ -205,12 +205,19 @@ def _num_label_issues_multilabel( ------- num_issues : int The estimated number of examples with label issues in the multi-label dataset. + + Note: We set the filter_by method as 'confident_learning' to match the non-multilabel case + (analog to the off_diagonal estimation method) """ from cleanlab.filter import find_label_issues issues_idx = find_label_issues( - labels=labels, pred_probs=pred_probs, confident_joint=confident_joint, multi_label=True + labels=labels, + pred_probs=pred_probs, + confident_joint=confident_joint, + multi_label=True, + filter_by="confident_learning", # specified to match num_label_issues ) return sum(issues_idx) diff --git a/cleanlab/filter.py b/cleanlab/filter.py index 0e07ea191a..bcc33e1037 100644 --- a/cleanlab/filter.py +++ b/cleanlab/filter.py @@ -32,10 +32,8 @@ from functools import reduce import platform -from cleanlab.count import calibrate_confident_joint -from cleanlab.rank import ( - order_label_issues, -) +from cleanlab.count import calibrate_confident_joint, num_label_issues +from cleanlab.rank import order_label_issues, get_label_quality_scores import cleanlab.internal.multilabel_scorer as ml_scorer from cleanlab.internal.validation import assert_valid_inputs from cleanlab.internal.util import ( @@ -141,7 +139,7 @@ class 0, 1, ..., K-1. 
label quality score (see :py:func:`rank.get_label_quality_scores `). - filter_by : {'prune_by_class', 'prune_by_noise_rate', 'both', 'confident_learning', 'predicted_neq_given'}, default='prune_by_noise_rate' + filter_by : {'prune_by_class', 'prune_by_noise_rate', 'both', 'confident_learning', 'predicted_neq_given', 'low_normalized_margin', 'low_self_confidence'}, default='prune_by_noise_rate' Method to determine which examples are flagged as having label issue, so you can filter/prune them from the dataset. Options: - ``'prune_by_noise_rate'``: filters examples with *high probability* of being mislabeled for every non-diagonal in the confident joint (see `prune_counts_matrix` in `filter.py`). These are the examples where (with high confidence) the given label is unlikely to match the predicted label for the example. @@ -149,6 +147,8 @@ class 0, 1, ..., K-1. - ``'both'``: filters only those examples that would be filtered by both ``'prune_by_noise_rate'`` and ``'prune_by_class'``. - ``'confident_learning'``: filters the examples counted as part of the off-diagonals of the confident joint. These are the examples that are confidently predicted to be a different label than their given label. - ``'predicted_neq_given'``: filters examples for which the predicted class (i.e. argmax of the predicted probabilities) does not match the given label. + - ``'low_normalized_margin'``: filters the examples with *smallest* normalized margin label quality score. The number of issues returned matches :py:func:`count.num_label_issues `. + - ``'low_self_confidence'``: filters the examples with *smallest* self confidence label quality score. The number of issues returned matches :py:func:`count.num_label_issues `. multi_label : bool, optional If ``True``, labels should be an iterable (e.g. list) of iterables, containing a @@ -223,6 +223,8 @@ class 0, 1, ..., K-1. 
rank_by_kwargs = {} assert filter_by in [ + "low_normalized_margin", + "low_self_confidence", "prune_by_noise_rate", "prune_by_class", "both", @@ -241,22 +243,32 @@ class 0, 1, ..., K-1. allow_one_class=allow_one_class, ) - if filter_by in ["confident_learning", "predicted_neq_given"] and ( - frac_noise != 1.0 or num_to_remove_per_class is not None - ): + if filter_by in [ + "confident_learning", + "predicted_neq_given", + "low_normalized_margin", + "low_self_confidences", + ] and (frac_noise != 1.0 or num_to_remove_per_class is not None): warn_str = ( "frac_noise and num_to_remove_per_class parameters are only supported" " for filter_by 'prune_by_noise_rate', 'prune_by_class', and 'both'. They " - "are not supported for methods 'confident_learning' or " - "'predicted_neq_given'." + "are not supported for methods 'confident_learning', 'predicted_neq_given', " + "'low_normalized_margin' or 'low_self_confidence'." ) warnings.warn(warn_str) if (num_to_remove_per_class is not None) and ( - filter_by in ["confident_learning", "predicted_neq_given"] + filter_by + in [ + "confident_learning", + "predicted_neq_given", + "low_normalized_margin", + "low_self_confidences", + ] ): - # TODO - add support for these two filters + # TODO - add support for these filters raise ValueError( - "filter_by 'confident_learning' or 'predicted_neq_given' is not supported (yet) when setting 'num_to_remove_per_class'" + "filter_by 'confident_learning', 'predicted_neq_given', 'low_normalized_margin' " + "or 'low_self_confidence' is not supported (yet) when setting 'num_to_remove_per_class'" ) if filter_by is "confident_learning" and isinstance(confident_joint, np.ndarray): warn_str = ( @@ -329,6 +341,25 @@ class 0, 1, ..., K-1. 
multi_label=multi_label, return_indices_of_off_diagonals=True, ) + + if filter_by in ["low_normalized_margin", "low_self_confidence"]: + # TODO: consider setting adjust_pred_probs to true based on benchmarks (or adding it kwargs, or ignoring and leaving as false by default) + scores = get_label_quality_scores( + labels, + pred_probs, + method=filter_by[4:], + adjust_pred_probs=False, + ) + num_errors = num_label_issues( + labels, pred_probs, multi_label=multi_label # TODO: Check usage of multilabel + ) + # Find label issues O(nlogn) solution (mapped to boolean mask later in the method) + cl_error_indices = np.argsort(scores)[:num_errors] + # The following is the O(n) fastest solution (check for one-off errors), but the problem is if lots of the scores are identical you will overcount, + # you can end up returning more or less and they aren't ranked in the boolean form so there's no way to drop the highest scores randomly + # boundary = np.partition(scores, num_errors)[num_errors] # O(n) solution + # label_issues_mask = scores <= boundary + if filter_by in ["prune_by_noise_rate", "prune_by_class", "both"]: # Create `prune_count_matrix` with the number of examples to remove in each class and # leave at least min_examples_per_class examples per class. @@ -413,7 +444,7 @@ class 0, 1, ..., K-1. if filter_by == "both": label_issues_mask = label_issues_mask & label_issues_mask_by_class - if filter_by == "confident_learning": + if filter_by in ["confident_learning", "low_normalized_margin", "low_self_confidence"]: label_issues_mask = np.zeros(len(labels), dtype=bool) for idx in cl_error_indices: label_issues_mask[idx] = True @@ -462,6 +493,42 @@ def _find_label_issues_multilabel( This is done via a one-vs-rest reduction for each class and the results are subsequently aggregated across all classes. Here `labels` must be formatted as an iterable of iterables, e.g. ``List[List[int]]``. 
""" + + if filter_by in ["low_normalized_margin", "low_self_confidence"]: + num_errors = sum( + find_label_issues( + labels=labels, + pred_probs=pred_probs, + confident_joint=confident_joint, + multi_label=True, + filter_by="confident_learning", + ) + ) + + y_one, num_classes = get_onehot_num_classes(labels, pred_probs) + label_quality_scores = ml_scorer.get_label_quality_scores( + labels=y_one, + pred_probs=pred_probs, + ) + + cl_error_indices = np.argsort(label_quality_scores)[:num_errors] + label_issues_mask = np.zeros(len(labels), dtype=bool) + for idx in cl_error_indices: + label_issues_mask[idx] = True + + if return_indices_ranked_by is not None: + label_quality_scores_issues = ml_scorer.get_label_quality_scores( + labels=y_one[label_issues_mask], + pred_probs=pred_probs[label_issues_mask], + method=ml_scorer.MultilabelScorer( + base_scorer=ml_scorer.ClassLabelScorer.from_str(return_indices_ranked_by), + ), + base_scorer_kwargs=rank_by_kwargs, + ) + return cl_error_indices[np.argsort(label_quality_scores_issues)] + + return label_issues_mask + per_class_issues = _find_multilabel_issues_per_class( labels, pred_probs, @@ -529,7 +596,8 @@ class 0, 1, ..., K-1. They need not sum to 1.0 rank_by_kwargs : dict, optional Refer to documentation for this argument in filter.find_label_issues() for details. - filter_by : {'prune_by_class', 'prune_by_noise_rate', 'both', 'confident_learning', 'predicted_neq_given'}, default='prune_by_noise_rate' + filter_by : {'prune_by_class', 'prune_by_noise_rate', 'both', 'confident_learning', 'predicted_neq_given', + 'low_normalized_margin', 'low_self_confidences'}, default='prune_by_noise_rate' Refer to documentation for this argument in filter.find_label_issues() for details. 
frac_noise : float, default=1.0 diff --git a/tests/test_filter_count.py b/tests/test_filter_count.py index 81962b9a81..4e0de2c08a 100644 --- a/tests/test_filter_count.py +++ b/tests/test_filter_count.py @@ -865,6 +865,7 @@ def test_num_label_issues_multilabel(confident_joint): labels=dataset["labels"], pred_probs=dataset["pred_probs"], confident_joint=dataset["cj"] if confident_joint else None, + filter_by="confident_learning", multi_label=True, ) assert sum(f) == n @@ -1131,3 +1132,64 @@ def test_estimate_py_and_noise_matrices_missing_classes(): ] ) _ = estimate_py_and_noise_matrices_from_probabilities(labels, pred_probs3) + + +def test_low_filter_by_methods(): + dataset = data + num_issues = count.num_label_issues(dataset["labels"], dataset["pred_probs"]) + + # test filter by low_normalized_margin, check num issues is same as using count.num_label_issues + label_issues_nm = filter.find_label_issues( + dataset["labels"], dataset["pred_probs"], filter_by="low_normalized_margin" + ) + assert sum(label_issues_nm) == num_issues + + # test filter by low_self_confidence, check num issues is same as using count.num_label_issues + label_issues_sc = filter.find_label_issues( + dataset["labels"], + dataset["pred_probs"], + filter_by="low_self_confidence", + return_indices_ranked_by="normalized_margin", + ) + assert len(label_issues_sc) == num_issues + + label_issues_sc_sort = filter.find_label_issues( + dataset["labels"], + dataset["pred_probs"], + filter_by="low_self_confidence", + return_indices_ranked_by="confidence_weighted_entropy", + ) + assert set(label_issues_sc) == set(label_issues_sc_sort) + + +def test_low_filter_by_methods_multilabel(): + dataset = multilabel_data + num_issues = count.num_label_issues(dataset["labels"], dataset["pred_probs"], multi_label=True) + + # test filter by low_normalized_margin, check num issues is same as using count.num_label_issues + label_issues_nm = filter.find_label_issues( + dataset["labels"], + dataset["pred_probs"], + 
filter_by="low_normalized_margin", + multi_label=True, + ) + assert sum(label_issues_nm) == num_issues + + # test filter by low_self_confidence, check num issues is same as using count.num_label_issues + label_issues_sc = filter.find_label_issues( + dataset["labels"], + dataset["pred_probs"], + filter_by="low_self_confidence", + multi_label=True, + return_indices_ranked_by="confidence_weighted_entropy", + ) + assert len(label_issues_sc) == num_issues + + label_issues_sc_sort = filter.find_label_issues( + dataset["labels"], + dataset["pred_probs"], + filter_by="low_self_confidence", + multi_label=True, + return_indices_ranked_by="self_confidence", + ) + assert set(label_issues_sc) == set(label_issues_sc_sort) From 35d5323479b340c6076f0136c8e0f7d161762dad Mon Sep 17 00:00:00 2001 From: Ulyana Date: Wed, 1 Feb 2023 20:07:31 -0800 Subject: [PATCH 104/258] Fix dictionary type annotation for OutOfDistribution object (#616) * Also improve params dict default to be None --- cleanlab/outlier.py | 8 +++++--- tests/test_outlier.py | 5 +++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/cleanlab/outlier.py b/cleanlab/outlier.py index 57011788f8..0ac6ce5d4b 100644 --- a/cleanlab/outlier.py +++ b/cleanlab/outlier.py @@ -93,7 +93,7 @@ class OutOfDistribution: OUTLIER_PARAMS = {"k", "t", "knn"} OOD_PARAMS = {"confident_thresholds", "adjust_pred_probs", "method"} - DEFAULT_PARAM_DICT: Dict[Union[str, int, None], Union[str, int, None, np.ndarray]] = { + DEFAULT_PARAM_DICT: Dict[str, Union[str, int, None, np.ndarray]] = { "k": None, # ood features param "t": 1, # ood features param "knn": None, # ood features param @@ -102,9 +102,11 @@ class OutOfDistribution: "confident_thresholds": None, # ood pred_probs param } - def __init__(self, params: dict = {}): + def __init__(self, params: Optional[dict] = None) -> None: self._assert_valid_params(params, self.DEFAULT_PARAM_DICT) self.params = self.DEFAULT_PARAM_DICT + if params is None: + params = {} self.params = 
{**self.params, **params} def fit_score( @@ -280,7 +282,7 @@ def _assert_valid_params(params, param_keys): """ Helper method to check passed in params valid and get list of parameters in param that are not in param_keys. """ - if len(params) > 0: + if params is not None: wrong_params = list(set(params.keys()).difference(set(param_keys))) if len(wrong_params) > 0: raise ValueError( diff --git a/tests/test_outlier.py b/tests/test_outlier.py index 8d330a5892..f1f5b2bfd6 100644 --- a/tests/test_outlier.py +++ b/tests/test_outlier.py @@ -251,11 +251,12 @@ def test_class_public_func(): # Testing regular fit OOD_ood = OutOfDistribution() + print(OOD_ood.params) OOD_ood.fit(pred_probs=pred_probs, labels=labels) - + print(OOD_ood.params) OOD_outlier = OutOfDistribution() OOD_outlier.fit(features=features) - + print(OOD_outlier.params) assert OOD_ood.params["confident_thresholds"] is not None and OOD_ood.params["knn"] is None assert ( OOD_outlier.params["knn"] is not None and OOD_outlier.params["confident_thresholds"] is None From 266d9479dbc7a55eed8582081aa6b71862fa8596 Mon Sep 17 00:00:00 2001 From: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> Date: Wed, 1 Feb 2023 22:53:02 -0800 Subject: [PATCH 105/258] shorten notebook name in link (#617) --- docs/source/tutorials/multiannotator.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/tutorials/multiannotator.ipynb b/docs/source/tutorials/multiannotator.ipynb index a4263da465..617551272b 100644 --- a/docs/source/tutorials/multiannotator.ipynb +++ b/docs/source/tutorials/multiannotator.ipynb @@ -15,7 +15,7 @@ "source": [ "This 5-minute quickstart tutorial shows how to use cleanlab for classification data that has been labeled by multiple annotators (where each example has been labeled by at least one annotator, but not every annotator has labeled every example). 
Compared to existing crowdsourcing tools, cleanlab helps you better analyze such data by leveraging a trained classifier model in addition to the raw annotations. With one line of code, you can automatically compute:\n", "\n", - "- A **consensus label** for each example that aggregates the individual annotations more accurately than alternative aggregation via majority-vote or other algorithms used in crowdsourcing.\n", + "- A **consensus label** for each example (i.e. *truth inference*) that aggregates the individual annotations (more accurately than algorithms from crowdsourcing like majority-vote, Dawid-Skene, or GLAD).\n", "- a **quality score for each consensus label** which measures our confidence that this label is correct (via well-calibrated estimates that account for the: number annotators which have labeled this example, overall quality of each annotator, and quality of our trained ML models).\n", "- An analogous **label quality score** for each individual label chosen by one annotator for a particular example.\n", "- An **overall quality score for each annotator** which measures our confidence in the overall correctness of labels obtained from this annotator.\n", @@ -710,7 +710,7 @@ "You can also repeatedly iterate this process of getting better consensus labels using the model's out-of-sample predicted probabilities and then retraining the model with the improved labels to get even better predicted probabilities!\n", "For details, see our [examples](https://github.com/cleanlab/examples) notebook on [Iterative use of Cleanlab to Improve Classification Models (and Consensus Labels) from Data Labeled by Multiple Annotators](https://github.com/cleanlab/examples/blob/master/multiannotator_cifar10/multiannotator_cifar10.ipynb).\n", "\n", - "If possible, the best way to improve your model is to collect additional labels for both previously annotated data and extra not-yet-labeled examples. 
To decide which data is most informative to label next, use `cleanlab.multiannotator.get_active_learning_scores()` rather than the methods shown here. This is demonstrated in our [examples](https://github.com/cleanlab/examples) notebook on [Active Learning with Multiple Data Annotators via CROWDLAB](https://github.com/cleanlab/examples/blob/master/active_learning_multiannotator/active_learning.ipynb).\n", + "If possible, the best way to improve your model is to collect additional labels for both previously annotated data and extra not-yet-labeled examples. To decide which data is most informative to label next, use `cleanlab.multiannotator.get_active_learning_scores()` rather than the methods shown here. This is demonstrated in our [examples](https://github.com/cleanlab/examples) notebook on [Active Learning with Multiple Data Annotators](https://github.com/cleanlab/examples/blob/master/active_learning_multiannotator/active_learning.ipynb).\n", "\n", "\n", "## How does cleanlab.multiannotator work?\n", From 6ed61e0bb30e023fb8b1dedbcdc44d9bcb3eb8d0 Mon Sep 17 00:00:00 2001 From: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> Date: Thu, 2 Feb 2023 01:13:59 -0800 Subject: [PATCH 106/258] cap black version in CI (#618) --- .github/workflows/ci.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ec5e8bb98a..3dddf8e3f6 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -59,6 +59,8 @@ jobs: steps: - uses: actions/checkout@v3 - uses: psf/black@stable + with: + version: "~= 22.0" flake8: name: Check for unused/wildcard imports runs-on: ubuntu-latest From c191d8781daf92e5a8c962685814a1a18a267e94 Mon Sep 17 00:00:00 2001 From: Ulyana Date: Tue, 7 Feb 2023 10:01:45 -0800 Subject: [PATCH 107/258] Fix format compatibility with latest black==23. 
release (#620) --- .github/workflows/ci.yml | 2 -- cleanlab/classification.py | 2 -- cleanlab/experimental/coteaching.py | 1 + cleanlab/experimental/mnist_pytorch.py | 1 - cleanlab/internal/multiannotator_utils.py | 2 +- cleanlab/multiannotator.py | 1 - cleanlab/rank.py | 5 ----- tests/test_multilabel_classification.py | 2 -- tests/test_rank.py | 1 - 9 files changed, 2 insertions(+), 15 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3dddf8e3f6..ec5e8bb98a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -59,8 +59,6 @@ jobs: steps: - uses: actions/checkout@v3 - uses: psf/black@stable - with: - version: "~= 22.0" flake8: name: Check for unused/wildcard imports runs-on: ubuntu-latest diff --git a/cleanlab/classification.py b/cleanlab/classification.py index 407afef0f6..66126de932 100644 --- a/cleanlab/classification.py +++ b/cleanlab/classification.py @@ -229,7 +229,6 @@ def __init__( label_quality_scores_kwargs={}, verbose=False, ): - if clf is None: # Use logistic regression if no classifier is provided. 
clf = LogReg(multi_class="auto", solver="lbfgs") @@ -645,7 +644,6 @@ def score(self, X, y, sample_weight=None) -> float: """ if hasattr(self.clf, "score"): - # Check if sample_weight in clf.score() if "sample_weight" in inspect.getfullargspec(self.clf.score).args: return self.clf.score(X, y, sample_weight=sample_weight) diff --git a/cleanlab/experimental/coteaching.py b/cleanlab/experimental/coteaching.py index 8d46f523f5..83223ff523 100644 --- a/cleanlab/experimental/coteaching.py +++ b/cleanlab/experimental/coteaching.py @@ -35,6 +35,7 @@ MINIMUM_BATCH_SIZE = 16 + # Loss function for Co-Teaching def loss_coteaching( y_1, diff --git a/cleanlab/experimental/mnist_pytorch.py b/cleanlab/experimental/mnist_pytorch.py index 3166b68f16..180f628f82 100644 --- a/cleanlab/experimental/mnist_pytorch.py +++ b/cleanlab/experimental/mnist_pytorch.py @@ -311,7 +311,6 @@ def fit(self, train_idx, train_labels=None, sample_weight=None, loader="train"): # Train for self.epochs epochs for epoch in range(1, self.epochs + 1): - # Enable dropout and batch norm layers self.model.train() for batch_idx, (data, target) in enumerate(train_loader): diff --git a/cleanlab/internal/multiannotator_utils.py b/cleanlab/internal/multiannotator_utils.py index 93cec654fc..27e73eb079 100644 --- a/cleanlab/internal/multiannotator_utils.py +++ b/cleanlab/internal/multiannotator_utils.py @@ -200,7 +200,7 @@ def format_multiannotator_labels(labels: LabelLike) -> Tuple[pd.DataFrame, dict] try: unique_labels = unique_labels[~np.isnan(unique_labels)] unique_labels.sort() - except (TypeError): # np.unique / np.sort cannot handle string values or pd.NA types + except TypeError: # np.unique / np.sort cannot handle string values or pd.NA types nan_mask = np.array([(l is np.NaN) or (l is pd.NA) or (l == "nan") for l in unique_labels]) unique_labels = unique_labels[~nan_mask] unique_labels.sort() diff --git a/cleanlab/multiannotator.py b/cleanlab/multiannotator.py index bf653e1da0..682142bff7 100644 --- 
a/cleanlab/multiannotator.py +++ b/cleanlab/multiannotator.py @@ -1227,7 +1227,6 @@ def _get_post_pred_probs_and_weights( quality_method: str = "crowdlab", verbose: bool = True, ) -> Tuple[np.ndarray, Any, Any]: - """Return the posterior predicted probabilities of each example given a specified quality method. Parameters diff --git a/cleanlab/rank.py b/cleanlab/rank.py index ae0f455a86..b00deafff3 100644 --- a/cleanlab/rank.py +++ b/cleanlab/rank.py @@ -140,7 +140,6 @@ class 0, 1, ..., K-1. # Adjust predicted probabilities if adjust_pred_probs: - # Check if adjust_pred_probs is supported for the chosen method if method == "confidence_weighted_entropy": raise ValueError(f"adjust_pred_probs is not currently supported for {method}.") @@ -259,14 +258,12 @@ def get_label_quality_ensemble_scores( # This weighting scheme performs search of t in log_loss_search_T_values for "best" log loss if weight_ensemble_members_by == "log_loss_search": - # Initialize variables for log loss search pred_probs_avg_log_loss_weighted = None neg_log_loss_weights = None best_eval_log_loss = float("inf") for t in log_loss_search_T_values: - neg_log_loss_list = [] # pred_probs for each model @@ -299,7 +296,6 @@ def get_label_quality_ensemble_scores( scores_list = [] accuracy_list = [] for pred_probs in pred_probs_list: - # Calculate scores and accuracy scores = get_label_quality_scores( labels=labels, @@ -349,7 +345,6 @@ def get_label_quality_ensemble_scores( label_quality_scores = (scores_ensemble * weights).sum(axis=1) elif weight_ensemble_members_by == "custom": - # Check custom_weights for errors assert ( custom_weights is not None diff --git a/tests/test_multilabel_classification.py b/tests/test_multilabel_classification.py index 9454b884d3..ac1d3afb57 100644 --- a/tests/test_multilabel_classification.py +++ b/tests/test_multilabel_classification.py @@ -388,7 +388,6 @@ def test_multilabel_py(given_labels, expected): @pytest.mark.parametrize("K", [2, 3, 4], ids=["K=2", "K=3", "K=4"]) def 
test_get_split_generator(cv, K): - all_configurations = np.array(list(itertools.product([0, 1], repeat=K))) given_labels = np.repeat(all_configurations, 2, axis=0) @@ -412,7 +411,6 @@ def test_get_split_generator(cv, K): # Test split_generator with rare/missing multilabel configurations @pytest.mark.parametrize("K", [2, 3, 4], ids=["K=2", "K=3", "K=4"]) def test_get_split_generator_rare_configurations(cv, K): - all_configurations = np.array(list(itertools.product([0, 1], repeat=K))) given_labels = np.repeat(all_configurations, 2, axis=0) diff --git a/tests/test_rank.py b/tests/test_rank.py index 3d3c2e1155..e928587774 100644 --- a/tests/test_rank.py +++ b/tests/test_rank.py @@ -153,7 +153,6 @@ def test_order_label_issues_using_scoring_func_ranking(scoring_method_func, adju # do not run the test below if the method does not support adjust_pred_probs # confidence_weighted_entropy scoring method does not support adjust_pred_probs if not (adjust_pred_probs == True and method == "confidence_weighted_entropy"): - indices = np.arange(len(data["label_errors_mask"]))[ data["label_errors_mask"] ] # indices of label issues From 6aaee8332b92c377f800c6c0263cc2b90a308cee Mon Sep 17 00:00:00 2001 From: Hui Wen <45724323+huiwengoh@users.noreply.github.com> Date: Wed, 8 Feb 2023 02:03:32 +0800 Subject: [PATCH 108/258] Create new cleanlab.models module (#601) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> Co-authored-by: Elías Snorrason --- .github/workflows/ci.yml | 6 + cleanlab/experimental/README.md | 4 - cleanlab/models/README.md | 11 ++ cleanlab/models/__init__.py | 0 cleanlab/{experimental => models}/fasttext.py | 2 +- cleanlab/{experimental => models}/keras.py | 91 +++++++++--- docs/source/cleanlab/experimental/index.rst | 2 - .../{experimental => models}/fasttext.rst | 2 +- docs/source/cleanlab/models/index.rst | 15 ++ .../{experimental => models}/keras.rst | 
2 +- docs/source/index.rst | 1 + docs/source/tutorials/text.ipynb | 22 ++- requirements-dev.txt | 1 + setup.cfg | 1 + tests/test_frameworks.py | 133 +++++++++++++++++- 15 files changed, 257 insertions(+), 36 deletions(-) create mode 100644 cleanlab/models/README.md create mode 100644 cleanlab/models/__init__.py rename cleanlab/{experimental => models}/fasttext.py (99%) rename cleanlab/{experimental => models}/keras.py (69%) rename docs/source/cleanlab/{experimental => models}/fasttext.rst (65%) create mode 100644 docs/source/cleanlab/models/index.rst rename docs/source/cleanlab/{experimental => models}/keras.rst (58%) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ec5e8bb98a..73ac0fa216 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -30,6 +30,10 @@ jobs: run: python3 -c "import cleanlab" - name: Install development dependencies run: pip install -r requirements-dev.txt + - name: Install fasttext for non-Windows machines + if: matrix.os != 'windows-latest' + run: | + pip install fasttext - name: Overwrite tensorflow version on Windows if: matrix.os == 'windows-latest' run: | @@ -37,6 +41,8 @@ jobs: pip install tensorflow-cpu - name: Test with coverage run: pytest --verbose --cov=cleanlab/ --cov-config .coveragerc --cov-report=xml + env: + TEST_FASTTEXT: true - uses: codecov/codecov-action@v3 typecheck: name: Type check diff --git a/cleanlab/experimental/README.md b/cleanlab/experimental/README.md index 3c93ec9c71..d16e67c99b 100644 --- a/cleanlab/experimental/README.md +++ b/cleanlab/experimental/README.md @@ -5,10 +5,6 @@ Methods in this `experimental` module are bleeding edge and may have sharp edges Some of these files include various models that can be used with cleanlab to find issues in specific types of data. These require dependencies on deep learning and other machine learning packages that are not official cleanlab dependencies. You must install these dependencies on your own if you wish to use them. 
The dependencies are as follows: -* keras.py - a wrapper to make any Keras model compatible with cleanlab and sklearn - - tensorflow -* fasttext.py - a cleanlab-compatible FastText classifier for text data - - fasttext * mnist_pytorch.py - a cleanlab-compatible simplified AlexNet for MNIST using PyTorch - torch - torchvision diff --git a/cleanlab/models/README.md b/cleanlab/models/README.md new file mode 100644 index 0000000000..6551aa13ea --- /dev/null +++ b/cleanlab/models/README.md @@ -0,0 +1,11 @@ +# Useful models adapted for use with cleanlab + +Methods in this ``models`` module are not guaranteed to be stable between different ``cleanlab`` versions. + +Some of these files include various models that can be used with cleanlab to find issues in specific types of data. These require dependencies on deep learning and other machine learning packages that are not official cleanlab dependencies. You must install these dependencies on your own if you wish to use them. + +The dependencies are as follows: +* keras.py - a wrapper to make any Keras model compatible with cleanlab and sklearn + - tensorflow +* fasttext.py - a cleanlab-compatible FastText classifier for text data + - fasttext diff --git a/cleanlab/models/__init__.py b/cleanlab/models/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/cleanlab/experimental/fasttext.py b/cleanlab/models/fasttext.py similarity index 99% rename from cleanlab/experimental/fasttext.py rename to cleanlab/models/fasttext.py index ad1a7923a4..1da6d766ad 100644 --- a/cleanlab/experimental/fasttext.py +++ b/cleanlab/models/fasttext.py @@ -151,7 +151,7 @@ def _create_train_data(self, data_indices): masked_fn = "fastTextClf_" + str(int(time.time())) + ".txt" open(masked_fn, "w").close() # Read in training data one line at a time - with open(self.train_data_fn, "rU") as rf: + with open(self.train_data_fn, "r") as rf: idx = 0 data_idx = data_indices.pop() for line in rf: diff --git a/cleanlab/experimental/keras.py 
b/cleanlab/models/keras.py similarity index 69% rename from cleanlab/experimental/keras.py rename to cleanlab/models/keras.py index 4e34567b47..d44ea6559f 100644 --- a/cleanlab/experimental/keras.py +++ b/cleanlab/models/keras.py @@ -36,10 +36,9 @@ """ import tensorflow as tf +import keras # type: ignore import numpy as np -import pandas as pd from typing import Callable, Optional -from cleanlab.internal.validation import assert_valid_inputs class KerasWrapperModel: @@ -75,44 +74,63 @@ def __init__( compile_kwargs: dict = { "loss": tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) }, + params: Optional[dict] = None, ): + if params is None: + params = {} + self.model = model self.model_kwargs = model_kwargs self.compile_kwargs = compile_kwargs + self.params = params self.net = None def get_params(self, deep=True): + """Returns the parameters of the Keras model.""" return { "model": self.model, "model_kwargs": self.model_kwargs, "compile_kwargs": self.compile_kwargs, + "params": self.params, } + def set_params(self, **params): + """Set the parameters of the Keras model.""" + self.params.update(params) + return self + def fit(self, X, y=None, **kwargs): - """Trains a Keras classifier. + """Trains a Keras model. Parameters ---------- X : tf.Dataset or np.array or pd.DataFrame - If `X` is a tensorflow dataset object, it must already contain the labels as is required for standard Keras fit. + If ``X`` is a tensorflow dataset object, it must already contain the labels as is required for standard Keras fit. y : np.array or pd.DataFrame, default = None - If `X` is a tensorflow dataset object, you can optionally provide the labels again here as argument `y` to be compatible with sklearn, + If ``X`` is a tensorflow dataset object, you can optionally provide the labels again here as argument `y` to be compatible with sklearn, but they are ignored. - If `X` is a numpy array or pandas dataframe, the labels have to be passed in using this argument. 
+ If ``X`` is a numpy array or pandas dataframe, the labels have to be passed in using this argument. """ self.net = self.model(**self.model_kwargs) self.net.compile(**self.compile_kwargs) - if isinstance(X, (np.ndarray, pd.DataFrame)): - assert_valid_inputs(X, y) + # TODO: check for generators + if y is not None and not isinstance(X, (tf.data.Dataset, keras.utils.Sequence)): kwargs["y"] = y - self.net.fit(X, **kwargs) + self.net.fit(X, **{**self.params, **kwargs}) def predict_proba(self, X, *, apply_softmax=True, **kwargs): - """Set extra argument `apply_softmax` to True to indicate your network only outputs logits not probabilities.""" + """Predict class probabilities for all classes using the wrapped Keras model. + Set extra argument `apply_softmax` to True to indicate your network only outputs logits not probabilities. + + Parameters + ---------- + X : tf.Dataset or np.array or pd.DataFrame + Data in the same format as the original ``X`` provided to ``fit()``. + """ if self.net is None: raise ValueError("must call fit() before predict()") pred_probs = self.net.predict(X, **kwargs) @@ -121,10 +139,19 @@ def predict_proba(self, X, *, apply_softmax=True, **kwargs): return pred_probs def predict(self, X, **kwargs): + """Predict class labels using the wrapped Keras model. + + Parameters + ---------- + X : tf.Dataset or np.array or pd.DataFrame + Data in the same format as the original ``X`` provided to ``fit()``. 
+ + """ pred_probs = self.predict_proba(X, **kwargs) return np.argmax(pred_probs, axis=1) def summary(self, **kwargs): + """Returns the summary of the Keras model.""" self.net.summary(**kwargs) @@ -154,43 +181,63 @@ def __init__( compile_kwargs: dict = { "loss": tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) }, + params: Optional[dict] = None, ): + if params is None: + params = {} + self.layers = layers self.name = name self.compile_kwargs = compile_kwargs + self.params = params self.net = None def get_params(self, deep=True): + """Returns the parameters of the Keras model.""" return { "layers": self.layers, "name": self.name, "compile_kwargs": self.compile_kwargs, + "params": self.params, } + def set_params(self, **params): + """Set the parameters of the Keras model.""" + for key, value in params.items(): + self.params[key] = value + return self + def fit(self, X, y=None, **kwargs): - """Trains a Sequential Keras classifier. + """Trains a Sequential Keras model. Parameters ---------- X : tf.Dataset or np.array or pd.DataFrame - If `X` is a tensorflow dataset object, it must already contain the labels as is required for standard Keras fit. + If ``X`` is a tensorflow dataset object, it must already contain the labels as is required for standard Keras fit. y : np.array or pd.DataFrame, default = None - If `X` is a tensorflow dataset object, you can optionally provide the labels again here as argument `y` to be compatible with sklearn, + If ``X`` is a tensorflow dataset object, you can optionally provide the labels again here as argument `y` to be compatible with sklearn, but they are ignored. - If `X` is a numpy array or pandas dataframe, the labels have to be passed in using this argument. + If ``X`` is a numpy array or pandas dataframe, the labels have to be passed in using this argument. 
""" self.net = tf.keras.models.Sequential(self.layers, self.name) self.net.compile(**self.compile_kwargs) - if isinstance(X, (np.ndarray, pd.DataFrame)): - assert_valid_inputs(X, y) + # TODO: check for generators + if y is not None and not isinstance(X, (tf.data.Dataset, keras.utils.Sequence)): kwargs["y"] = y - self.net.fit(X, **kwargs) + self.net.fit(X, **{**self.params, **kwargs}) def predict_proba(self, X, *, apply_softmax=True, **kwargs): - """Set extra argument `apply_softmax` to True to indicate your network only outputs logits not probabilities.""" + """Predict class probabilities for all classes using the wrapped Keras model. + Set extra argument `apply_softmax` to True to indicate your network only outputs logits not probabilities. + + Parameters + ---------- + X : tf.Dataset or np.array or pd.DataFrame + Data in the same format as the original ``X`` provided to ``fit()``. + """ if self.net is None: raise ValueError("must call fit() before predict()") pred_probs = self.net.predict(X, **kwargs) @@ -199,8 +246,16 @@ def predict_proba(self, X, *, apply_softmax=True, **kwargs): return pred_probs def predict(self, X, **kwargs): + """Predict class labels using the wrapped Keras model. + + Parameters + ---------- + X : tf.Dataset or np.array or pd.DataFrame + Data in the same format as the original ``X`` provided to ``fit()``. + """ pred_probs = self.predict_proba(X, **kwargs) return np.argmax(pred_probs, axis=1) def summary(self, **kwargs): + """Returns the summary of the Keras model.""" self.net.summary(**kwargs) diff --git a/docs/source/cleanlab/experimental/index.rst b/docs/source/cleanlab/experimental/index.rst index 4d32910206..935152c86c 100644 --- a/docs/source/cleanlab/experimental/index.rst +++ b/docs/source/cleanlab/experimental/index.rst @@ -11,8 +11,6 @@ experimental :show-inheritance: .. 
toctree:: - keras - fasttext mnist_pytorch coteaching cifar_cnn diff --git a/docs/source/cleanlab/experimental/fasttext.rst b/docs/source/cleanlab/models/fasttext.rst similarity index 65% rename from docs/source/cleanlab/experimental/fasttext.rst rename to docs/source/cleanlab/models/fasttext.rst index 792b769342..78efe7677a 100644 --- a/docs/source/cleanlab/experimental/fasttext.rst +++ b/docs/source/cleanlab/models/fasttext.rst @@ -1,7 +1,7 @@ fasttext ======== -.. automodule:: cleanlab.experimental.fasttext +.. automodule:: cleanlab.models.fasttext :autosummary: :members: :undoc-members: diff --git a/docs/source/cleanlab/models/index.rst b/docs/source/cleanlab/models/index.rst new file mode 100644 index 0000000000..d095a7b18f --- /dev/null +++ b/docs/source/cleanlab/models/index.rst @@ -0,0 +1,15 @@ +models +====== + +.. warning:: + Methods in this ``models`` module are not guaranteed to be stable between different ``cleanlab`` versions. + +.. automodule:: cleanlab.models + :autosummary: + :members: + :undoc-members: + :show-inheritance: + +.. toctree:: + keras + fasttext \ No newline at end of file diff --git a/docs/source/cleanlab/experimental/keras.rst b/docs/source/cleanlab/models/keras.rst similarity index 58% rename from docs/source/cleanlab/experimental/keras.rst rename to docs/source/cleanlab/models/keras.rst index f35f93c455..c9ff1b3138 100644 --- a/docs/source/cleanlab/experimental/keras.rst +++ b/docs/source/cleanlab/models/keras.rst @@ -1,7 +1,7 @@ keras ===== -.. automodule:: cleanlab.experimental.keras +.. 
automodule:: cleanlab.models.keras :autosummary: :members: :undoc-members: diff --git a/docs/source/index.rst b/docs/source/index.rst index 7e62505e44..76bebb5ace 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -142,6 +142,7 @@ Please see our `contributing guidelines `\n", @@ -284,7 +291,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We use a `TextVectorization` layer to preprocess, tokenize, and vectorize our text data, thus making it suitable as input for a neural network.\n" + "Then, we use a `TextVectorization` layer to preprocess, tokenize, and vectorize our text data to a suitable format for a neural network." ] }, { @@ -326,6 +333,13 @@ "test_texts = vectorize_layer(raw_test_texts).numpy()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Our subsequent neural network models will directly operate on elements of `train_texts` and `test_texts` in order to classify reviews." + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -377,7 +391,7 @@ "source": [ "We can define the `CleanLearning` object with the neural network model and use `find_label_issues` to identify potential label errors.\n", "\n", - "`CleanLearning` provides a wrapper class that can easily be applied to any scikit-learn compatible model, which can be used to find potential label issues or train a more robust model if the original data contains noisy labels." + "`CleanLearning` provides a wrapper class that can easily be applied to any scikit-learn compatible model, which can be used to find potential label issues and train a more robust model if the original data contains noisy labels." 
] }, { diff --git a/requirements-dev.txt b/requirements-dev.txt index d6c7a87a63..1dc5dc705c 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -13,3 +13,4 @@ torchvision skorch tensorflow psutil +wget diff --git a/setup.cfg b/setup.cfg index 04a05501c6..8888d3b898 100644 --- a/setup.cfg +++ b/setup.cfg @@ -5,3 +5,4 @@ per-file-ignores = cleanlab/__init__.py: F401 cleanlab/token_classification/__init__.py: F401 cleanlab/benchmarking/__init__.py: F401 + cleanlab/models/__init__.py: F401 diff --git a/tests/test_frameworks.py b/tests/test_frameworks.py index e4fccb7eae..f26e532b89 100644 --- a/tests/test_frameworks.py +++ b/tests/test_frameworks.py @@ -15,8 +15,8 @@ # along with cleanlab. If not, see . """ -Scripts to test cleanlab usage with deep learning frameworks: -pytorch, skorch, tensorflow, keras +Scripts to test cleanlab usage with various ML frameworks: +pytorch, skorch, tensorflow, keras, fasttext """ import pytest @@ -27,6 +27,7 @@ import sys import os +import wget from copy import deepcopy import random import numpy as np @@ -38,9 +39,13 @@ import tensorflow as tf import torch import skorch +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler +from sklearn.model_selection import GridSearchCV from cleanlab.classification import CleanLearning -from cleanlab.experimental.keras import KerasWrapperSequential, KerasWrapperModel +from cleanlab.models.keras import KerasWrapperSequential, KerasWrapperModel +from cleanlab.internal.util import format_labels def python_version_ok(): # tensorflow and torch do not play nice with older Python @@ -48,6 +53,11 @@ def python_version_ok(): # tensorflow and torch do not play nice with older Pyt return (version.major >= 3) and (version.minor >= 7) +def run_fasttext_test(): + # run test only if the TEST_FASTTEXT environment variable is set to "true" and the OS is not Windows + return os.environ.get("TEST_FASTTEXT") == "true" and os.name != "nt" + + def dataset_w_errors(): num_classes = 2 num_features = 3 @@
-158,7 +168,7 @@ def test_tensorflow_sequential(batch_size, shuffle_config, data=DATA, hidden_uni @pytest.mark.skipif("not python_version_ok()", reason="need at least python 3.7") @pytest.mark.parametrize("batch_size,shuffle_config", [(1, 0), (32, 0), (32, 1), (32, 2)]) -def test_tensorflow_functional(batch_size, shuffle_config, data=DATA, hidden_units=128): +def test_tensorflow_functional(batch_size, shuffle_config, data=DATA, hidden_units=64): dataset_tf = tf.data.Dataset.from_tensor_slices((data["X"], data["y"])) if shuffle_config == 0: # proper shuffling for SGD dataset_shuffled = dataset_tf.shuffle(buffer_size=len(data["X"])) @@ -172,7 +182,7 @@ def test_tensorflow_functional(batch_size, shuffle_config, data=DATA, hidden_uni def make_model(num_features, num_classes): inputs = tf.keras.Input(shape=(num_features,)) - x = tf.keras.layers.Dense(64, activation="relu")(inputs) + x = tf.keras.layers.Dense(hidden_units, activation="relu")(inputs) outputs = tf.keras.layers.Dense(num_classes)(x) model = tf.keras.Model(inputs=inputs, outputs=outputs, name="test_model") @@ -226,6 +236,66 @@ def test_tensorflow_rarelabel(batch_size, data=DATA_RARE_LABEL, hidden_units=8): preds = cl.predict(dataset_tf) +def test_keras_sklearn_compatability(data=DATA, hidden_units=32): + # test pipeline on Sequential API + model = KerasWrapperSequential( + [ + tf.keras.layers.Dense(128, input_shape=[data["num_features"]], activation="relu"), + tf.keras.layers.Dense(data["num_classes"]), + ], + ) + + pipeline = Pipeline([("scale", StandardScaler()), ("net", model)]) + pipeline.fit(data["X"], data["y"]) + preds = pipeline.predict(data["X"]) + + # test gridsearch on Sequential API + model = KerasWrapperSequential( + [ + tf.keras.layers.Dense( + hidden_units, input_shape=[data["num_features"]], activation="relu" + ), + tf.keras.layers.Dense(data["num_classes"]), + ], + ) + + params = {"batch_size": [32, 64], "epochs": [2, 3]} + gs = GridSearchCV( + model, params, refit=False, cv=3, verbose=2, 
scoring="accuracy", error_score="raise" + ) + gs.fit(data["X"], data["y"]) + + # test pipeline on functional API + def make_model(num_features, num_classes): + inputs = tf.keras.Input(shape=(num_features,)) + x = tf.keras.layers.Dense(64, activation="relu")(inputs) + outputs = tf.keras.layers.Dense(num_classes)(x) + model = tf.keras.Model(inputs=inputs, outputs=outputs, name="test_model") + + return model + + model = KerasWrapperModel( + make_model, + model_kwargs={"num_features": data["num_features"], "num_classes": data["num_classes"]}, + ) + + pipeline = Pipeline([("scale", StandardScaler()), ("net", model)]) + pipeline.fit(data["X"], data["y"]) + preds = pipeline.predict(data["X"]) + + # test gridsearch on functional API + model = KerasWrapperModel( + make_model, + model_kwargs={"num_features": data["num_features"], "num_classes": data["num_classes"]}, + ) + + params = {"batch_size": [32, 64], "epochs": [2, 3]} + gs = GridSearchCV( + model, params, refit=False, cv=3, verbose=2, scoring="accuracy", error_score="raise" + ) + gs.fit(data["X"], data["y"]) + + @pytest.mark.skipif("not python_version_ok()", reason="need at least python 3.7") def test_torch(data=DATA, hidden_units=128): dataset = torch.utils.data.TensorDataset( @@ -286,3 +356,56 @@ def forward(self, X): cl = CleanLearning(net) cl.fit(dataset, data["y"], clf_kwargs={"epochs": 2}) pred_probs = cl.predict(dataset) + + +# test fasttext only if not on windows and environment variable TEST_FASTTEXT has been set to "true" +@pytest.mark.skipif( + "not run_fasttext_test()", reason="fasttext is not easily pip install-able on windows" +) +def test_fasttext(): + from cleanlab.models.fasttext import FastTextClassifier, data_loader + + dir = "tests/fasttext_data" + if not os.path.isdir(dir): + os.makedirs(dir) + + try: + if not os.path.isfile("tests/fasttext_data/tweets_train.txt"): + wget.download( + "http://s.cleanlab.ai/tweets_fasttext/tweets_train.txt", "tests/fasttext_data" + ) + if not
os.path.isfile("tests/fasttext_data/tweets_test.txt"): + wget.download( + "http://s.cleanlab.ai/tweets_fasttext/tweets_test.txt", "tests/fasttext_data" + ) + except: + raise RuntimeError( + "Download failed (potentially due to lack of internet connection or invalid url). " + "To skip this unittest, set the env variable TEST_FASTTEXT = false." + ) + + labels = np.ravel([x[0] for x in data_loader("tests/fasttext_data/tweets_train.txt")]) + labels = [lab[9:] for lab in labels] + labels, label_map = format_labels(labels) + X = np.array(range(len(labels))) + + # test basic fasttext methods + ftc = FastTextClassifier( + train_data_fn="tests/fasttext_data/tweets_train.txt", + test_data_fn="tests/fasttext_data/tweets_test.txt", + ) + ftc.fit() + pred_labels = ftc.predict() + pred_probs = ftc.predict_proba() + + # test CleanLearning + ftc = FastTextClassifier( + train_data_fn="tests/fasttext_data/tweets_train.txt", + test_data_fn="tests/fasttext_data/tweets_test.txt", + ) + cl = CleanLearning(ftc) + + issues = cl.find_label_issues(X=X, labels=labels) + cl.fit(X=X, labels=labels, label_issues=issues) + pred_labels = cl.predict() + pred_probs = cl.predict_proba() From e2611e76a1a11595a7b4ec84ac699e39ba1424bc Mon Sep 17 00:00:00 2001 From: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> Date: Tue, 7 Feb 2023 13:26:35 -0800 Subject: [PATCH 109/258] upgrade torch in docs (#607) * upgrade versions of docs dependencies: torch, torchvision, speechbrain, etc --- docs/requirements.txt | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index 1c944c18a7..070524aa36 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -8,19 +8,19 @@ ipython==8.0.1 ipykernel==6.8.0 ipywidgets==7.6.5 sphinx-multiversion==0.2.4 -torchvision==0.12.0 sphinx-copybutton==0.5.0 sphinxcontrib-katex==0.8.6 -matplotlib==3.5.1 -skorch==0.11.0 +sphinx-autodoc-typehints==1.19.2 +matplotlib==3.6.3 
+requests==2.28.2 tensorflow-datasets==4.5.2 tensorflow==2.9.1 -speechbrain==0.5.12 tensorflow-io==0.26.0 -huggingface_hub==0.7 -torchaudio==0.11.0 +speechbrain==0.5.13 +huggingface_hub==0.11.1 fasttext==0.9.2 -timm==0.6.5 -torch==1.11.0 -requests==2.28.0 -sphinx-autodoc-typehints==1.19.2 +torch==1.13.1 +skorch==0.12.1 +torchvision==0.14.1 +torchaudio==0.13.1 +timm==0.6.12 From 5f6493f5098e03460c887ab28d073f9432750686 Mon Sep 17 00:00:00 2001 From: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> Date: Thu, 9 Feb 2023 16:33:47 -0800 Subject: [PATCH 110/258] fix bug: confidences -> confidence (#623) --- cleanlab/filter.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cleanlab/filter.py b/cleanlab/filter.py index bcc33e1037..5752c4fc7d 100644 --- a/cleanlab/filter.py +++ b/cleanlab/filter.py @@ -247,7 +247,7 @@ class 0, 1, ..., K-1. "confident_learning", "predicted_neq_given", "low_normalized_margin", - "low_self_confidences", + "low_self_confidence", ] and (frac_noise != 1.0 or num_to_remove_per_class is not None): warn_str = ( "frac_noise and num_to_remove_per_class parameters are only supported" @@ -262,7 +262,7 @@ class 0, 1, ..., K-1. "confident_learning", "predicted_neq_given", "low_normalized_margin", - "low_self_confidences", + "low_self_confidence", ] ): # TODO - add support for these filters @@ -597,7 +597,7 @@ class 0, 1, ..., K-1. They need not sum to 1.0 Refer to documentation for this argument in filter.find_label_issues() for details. filter_by : {'prune_by_class', 'prune_by_noise_rate', 'both', 'confident_learning', 'predicted_neq_given', - 'low_normalized_margin', 'low_self_confidences'}, default='prune_by_noise_rate' + 'low_normalized_margin', 'low_self_confidence'}, default='prune_by_noise_rate' Refer to documentation for this argument in filter.find_label_issues() for details. 
frac_noise : float, default=1.0 From d99788b9de1ec065b6a1c4ef87fb4314db79cadb Mon Sep 17 00:00:00 2001 From: Ulyana Date: Fri, 10 Feb 2023 08:28:29 -0800 Subject: [PATCH 111/258] Fixed duplicate issue removal in find_label_issues (#624) Co-authored-by: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> --- cleanlab/filter.py | 12 ++++++------ tests/test_filter_count.py | 23 +++++++++++++++++++++++ 2 files changed, 29 insertions(+), 6 deletions(-) diff --git a/cleanlab/filter.py b/cleanlab/filter.py index 5752c4fc7d..2838fc441f 100644 --- a/cleanlab/filter.py +++ b/cleanlab/filter.py @@ -452,12 +452,12 @@ class 0, 1, ..., K-1. if filter_by == "predicted_neq_given": label_issues_mask = find_predicted_neq_given(labels, pred_probs, multi_label=multi_label) - # Remove label issues if given label == model prediction - # TODO: consider use of _multiclass_crossval_predict() here - pred = pred_probs.argmax(axis=1) - for i, pred_label in enumerate(pred): - if pred_label == labels[i]: - label_issues_mask[i] = False + if filter_by not in ["low_self_confidence", "low_normalized_margin"]: + # Remove label issues if given label == model prediction if issues haven't been removed yet + pred = pred_probs.argmax(axis=1) + for i, pred_label in enumerate(pred): + if pred_label == labels[i]: + label_issues_mask[i] = False if verbose: print("Number of label issues found: {}".format(sum(label_issues_mask))) diff --git a/tests/test_filter_count.py b/tests/test_filter_count.py index 4e0de2c08a..2da04a8482 100644 --- a/tests/test_filter_count.py +++ b/tests/test_filter_count.py @@ -750,6 +750,29 @@ def test_num_label_issues_different_estimation_types(): assert n3 != n2 +def test_find_label_issues_same_value(): + f1 = filter.find_label_issues( + labels=data["labels"], + pred_probs=data["pred_probs"], + filter_by="confident_learning", + ) + + f2 = filter.find_label_issues( + labels=data["labels"], + pred_probs=data["pred_probs"], + filter_by="low_self_confidence", + ) + + f3 = 
filter.find_label_issues( + labels=data["labels"], + pred_probs=data["pred_probs"], + filter_by="low_normalized_margin", + ) + + assert np.sum(f1) == np.sum(f2) + assert np.sum(f2) == np.sum(f3) + + @pytest.mark.filterwarnings() def test_num_label_issues(): cj_calibrated_off_diag_sum = data["cj"].sum() - data["cj"].trace() From 686cbf63f1c14799814de06c8be1f9974a4502d0 Mon Sep 17 00:00:00 2001 From: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> Date: Fri, 10 Feb 2023 16:59:16 -0800 Subject: [PATCH 112/258] Method to estimate label issues with limited memory via mini-batches (#615) Co-authored-by: clu0 <33559427+clu0@users.noreply.github.com> --- cleanlab/experimental/README.md | 2 +- cleanlab/experimental/label_issues_batched.py | 542 ++++++++++++++++++ cleanlab/filter.py | 2 +- cleanlab/rank.py | 41 +- docs/source/tutorials/faq.ipynb | 179 +++++- tests/test_filter_count.py | 78 +++ 6 files changed, 821 insertions(+), 23 deletions(-) create mode 100644 cleanlab/experimental/label_issues_batched.py diff --git a/cleanlab/experimental/README.md b/cleanlab/experimental/README.md index d16e67c99b..91155f9801 100644 --- a/cleanlab/experimental/README.md +++ b/cleanlab/experimental/README.md @@ -2,7 +2,7 @@ Methods in this `experimental` module are bleeding edge and may have sharp edges. They are not guaranteed to be stable between different cleanlab versions. -Some of these files include various models that can be used with cleanlab to find issues in specific types of data. These require dependencies on deep learning and other machine learning packages that are not official cleanlab dependencies. You must install these dependencies on your own if you wish to use them. +Some of these files include various models that can be used with cleanlab to find issues in specific types of data. These require dependencies on deep learning and other machine learning packages that are not official cleanlab dependencies. 
You must install these dependencies on your own if you wish to use them. The dependencies are as follows: * mnist_pytorch.py - a cleanlab-compatible simplified AlexNet for MNIST using PyTorch diff --git a/cleanlab/experimental/label_issues_batched.py b/cleanlab/experimental/label_issues_batched.py new file mode 100644 index 0000000000..509e56b790 --- /dev/null +++ b/cleanlab/experimental/label_issues_batched.py @@ -0,0 +1,542 @@ +# Copyright (C) 2017-2023 Cleanlab Inc. +# This file is part of cleanlab. +# +# cleanlab is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published +# by the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# cleanlab is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with cleanlab. If not, see . + +""" +Implementation of :py:func:`filter.find_label_issues ` +that does not need much memory by operating in mini-batches. +You can also use this approach to estimate label quality scores or the number of label issues +for big datasets with limited memory. + +With default settings, the results returned from this approach closely approximate those returned from: +``cleanlab.filter.find_label_issues(..., filter_by="low_self_confidence", return_indices_ranked_by="self_confidence")`` + +To run this approach, either follow the examples script below, +or use the ``find_label_issues_batched()`` convenience function defined in this module. + +The recommended usage demonstrated in the examples script involves two passes over your data: +one pass to compute `confident_thresholds`, another to evaluate each label. 
+To maximize efficiency, try to use the largest batch_size your memory allows. +To reduce runtime further, you can run the first pass on a subset of your dataset +as long as it contains enough data from each class to estimate `confident_thresholds` accurately. + +In the examples script below: +- `labels` is a (big) 1D ``np.ndarray`` of class labels represented as integers in ``0,1,...,K-1``. +- `pred_probs` is a (big) 2D ``np.ndarray`` of predicted class probabilities, +where each row is an example, each column represents a class. + +`labels` and `pred_probs` can be stored in a file instead where you load chunks of them at a time. +Methods to load arrays in chunks include: ``np.load(...,mmap_mode='r')``, ``numpy.memmap()``, +HDF5 or Zarr files, see: https://pythonspeed.com/articles/mmap-vs-zarr-hdf5/ + +Examples +-------- +>>> n = len(labels) +>>> batch_size = 10000 # you can change this in between batches, set as big as your RAM allows +>>> lab = LabelInspector(num_class = pred_probs.shape[1]) +>>> # First compute confident thresholds (for faster results, can also do this on a random subset of your data): +>>> i = 0 +>>> while i < n: +>>> end_index = i + batch_size +>>> labels_batch = labels[i:end_index] +>>> pred_probs_batch = pred_probs[i:end_index,:] +>>> i = end_index +>>> lab.update_confident_thresholds(labels_batch, pred_probs_batch) +>>> # See what we calculated: +>>> confident_thresholds = lab.get_confident_thresholds() +>>> # Evaluate the quality of the labels (run this on full dataset you want to evaluate): +>>> i = 0 +>>> while i < n: +>>> end_index = i + batch_size +>>> labels_batch = labels[i:end_index] +>>> pred_probs_batch = pred_probs[i:end_index,:] +>>> i = end_index +>>> batch_results = lab.score_label_quality(labels_batch, pred_probs_batch) +>>> # Indices of examples with label issues, sorted by label quality score (most severe to least severe): +>>> indices_of_examples_with_issues = lab.get_label_issues() +>>> # If your `pred_probs` and
`labels` are arrays already in memory, +>>> # then you can use this shortcut for all of the above: +>>> indices_of_examples_with_issues = find_label_issues_batched(labels=labels, pred_probs=pred_probs, batch_size=10000) +""" + +import numpy as np +from typing import Optional, List + +from cleanlab.count import get_confident_thresholds +from cleanlab.rank import find_top_issues, _compute_label_quality_scores +from cleanlab.typing import LabelLike +from cleanlab.internal.util import value_counts_fill_missing_classes + + +EPS = 1e-6 # small number + + +class LabelInspector: + """ + Class for finding label issues in big datasets where memory becomes a problem for other cleanlab methods. + Only create one such object per dataset and do not try to use the same ``LabelInspector`` across 2 datasets. + For efficiency, this class does little input checking. + You can first run :py:func:`filter.find_label_issues ` + on a small subset of your data to verify your inputs are properly formatted. + Do NOT modify any of the attributes of this class yourself! + Multi-label classification is not supported by this class, it is only for multi-class classification. + + Parameters + ---------- + num_class : int + The number of classes in your multi-class classification task. + + store_results : bool, optional + Whether this object will store all label quality scores, a 1D array of shape ``(N,)`` + where ``N`` is the total number of examples in your dataset. + Set this to False if you encounter memory problems even for small batch sizes (~1000). + If ``False``, you can still identify the label issues yourself by aggregating + the label quality scores for each batch, sorting them across all batches, and returning the top ``T`` indices + with ``T = self.get_num_issues()``. + + verbose : bool, optional + Whether to print progress messages; set to ``False`` to suppress them. + + quality_score_kwargs : dict, optional + Keyword arguments to pass into :py:func:`rank.get_label_quality_scores `.
+ + num_issue_kwargs : dict, optional + Keyword arguments to :py:func:`count.num_label_issues()` ` + to control estimation of the number of label issues. + The only supported kwarg here for now is: `estimation_method`. + """ + + def __init__( + self, + *, + num_class: int, + store_results: bool = True, + verbose: bool = True, + quality_score_kwargs: Optional[dict] = None, + num_issue_kwargs: Optional[dict] = None, + ): + if quality_score_kwargs is None: + quality_score_kwargs = {} + if num_issue_kwargs is None: + num_issue_kwargs = {} + + self.num_class = num_class + self.store_results = store_results + self.verbose = verbose + self.quality_score_kwargs = quality_score_kwargs # extra arguments for ``rank.get_label_quality_scores()`` to control label quality scoring + self.num_issue_kwargs = num_issue_kwargs # extra arguments for ``count.num_label_issues()`` to control estimation of the number of label issues (only supported argument for now is: `estimation_method`). + self.off_diagonal_calibrated = False + if num_issue_kwargs.get("estimation_method") == "off_diagonal_calibrated": + # store extra attributes later needed for calibration: + self.off_diagonal_calibrated = True + self.prune_counts = np.zeros(self.num_class) + self.class_counts = np.zeros(self.num_class) + self.normalization = np.zeros(self.num_class) + else: + self.prune_count = 0 # number of label issues estimated based on data seen so far (only used when estimation_method is not calibrated) + + if self.store_results: + self.label_quality_scores: List[float] = [] + + self.confident_thresholds = np.zeros( + (num_class,) + ) # current estimate of thresholds based on data seen so far + self.examples_per_class = np.zeros( + (num_class,) + ) # current counts of examples with each given label seen so far + self.examples_processed_thresh = ( + 0 # number of examples seen so far for estimating thresholds + ) + self.examples_processed_quality = 0 # number of examples seen so far for estimating label quality and 
number of label issues + + def get_confident_thresholds(self, silent: bool = False) -> np.ndarray: + """ + Fetches already-computed confident thresholds from the data seen so far + in same format as: :py:func:`count.get_confident_thresholds `. + + + Returns + ------- + confident_thresholds : np.ndarray + An array of shape ``(K, )`` where ``K`` is the number of classes. + """ + if self.examples_processed_thresh < 1: + raise ValueError( + "Have not computed any confident_thresholds yet. Call `update_confident_thresholds()` first." + ) + else: + if self.verbose and not silent: + print( + f"Total number of examples used to estimate confident thresholds: {self.examples_processed_thresh}" + ) + return self.confident_thresholds + + def get_num_issues(self, silent: bool = False) -> int: + """ + Fetches already-computed estimate of the number of label issues in the data seen so far + in the same format as: :py:func:`count.num_label_issues `. + + Note: The estimated number of issues may differ from :py:func:`count.num_label_issues ` + by 1 due to rounding differences. + + Returns + ------- + num_issues : + The estimated number of examples with label issues in the data seen so far. + """ + if self.examples_processed_quality < 1: + raise ValueError( + "Have not evaluated any labels yet. Call `score_label_quality()` first." + ) + else: + if self.verbose and not silent: + print( + f"Total number of examples whose labels have been evaluated: {self.examples_processed_quality}" + ) + if self.off_diagonal_calibrated: + calibrated_prune_counts = ( + self.prune_counts + * self.class_counts + / np.clip(self.normalization, a_min=EPS, a_max=None) + ) # avoid division by 0 + return np.rint(np.sum(calibrated_prune_counts)).astype("int") + else: # not calibrated + return self.prune_count + + def get_quality_scores(self) -> np.ndarray: + """ + Fetches already-computed estimate of the label quality of each example seen so far + in the same format as: :py:func:`rank.get_label_quality_scores `. 
+ + Returns + ------- + label_quality_scores : np.ndarray + Contains one score (between 0 and 1) per example seen so far. + Lower scores indicate more likely mislabeled examples. + """ + if not self.store_results: + raise ValueError( + "Must initialize the LabelInspector with `store_results` == True. " + "Otherwise you can assemble the label quality scores yourself based on " + "the scores returned for each batch of data from `score_label_quality()`" + ) + else: + return np.asarray(self.label_quality_scores) + + def get_label_issues(self) -> np.ndarray: + """ + Fetches already-computed estimate of indices of examples with label issues in the data seen so far, + in the same format as: :py:func:`filter.find_label_issues ` + with its `return_indices_ranked_by` argument specified. + + Note: this method corresponds to ``filter.find_label_issues(..., filter_by=METHOD1, return_indices_ranked_by=METHOD2) + where by default: ``METHOD1="low_self_confidence"``, ``METHOD2="self_confidence"`` + or if this object was instantiated with ``quality_score_kwargs = {"method": "normalized_margin"}`` then we instead have: + ``METHOD1="low_normalized_margin"``, ``METHOD2="normalized_margin"``. + + Note: The estimated number of issues may differ from :py:func:`filter.find_label_issues ` + by 1 due to rounding differences. + + Returns + ------- + issue_indices : np.ndarray + Indices of examples with label issues, sorted by label quality score. + """ + if not self.store_results: + raise ValueError( + "Must initialize the LabelInspector with `store_results` == True. " + "Otherwise you can identify label issues yourself based on the scores from all " + "the batches of data and the total number of issues returned by `get_num_issues()`" + ) + if self.examples_processed_quality < 1: + raise ValueError( + "Have not evaluated any labels yet. Call `score_label_quality()` first." 
+ ) + if self.verbose: + print( + f"Total number of examples whose labels have been evaluated: {self.examples_processed_quality}" + ) + return find_top_issues(self.get_quality_scores(), top=self.get_num_issues(silent=True)) + + def update_confident_thresholds(self, labels: LabelLike, pred_probs: np.ndarray): + """ + Updates the estimate of confident_thresholds stored in this class using a new batch of data. + Inputs should be in same format as for: :py:func:`count.get_confident_thresholds `. + + Parameters + ---------- + labels: np.ndarray or list + Given class labels for each example in the batch, values in ``0,1,2,...,K-1``. + + pred_probs: np.ndarray + 2D array of model-predicted class probabilities for each example in the batch. + """ + labels = _batch_check(labels, pred_probs, self.num_class) + batch_size = len(labels) + batch_thresholds = get_confident_thresholds( + labels, pred_probs + ) # values for missing classes may exceed 1 but should not matter since we multiply by this class counts in the batch + batch_class_counts = value_counts_fill_missing_classes(labels, num_classes=self.num_class) + self.confident_thresholds = ( + self.examples_per_class * self.confident_thresholds + + batch_class_counts * batch_thresholds + ) / np.clip( + self.examples_per_class + batch_class_counts, a_min=1, a_max=None + ) # avoid division by 0 + self.examples_per_class += batch_class_counts + self.examples_processed_thresh += batch_size + + def score_label_quality( + self, labels: LabelLike, pred_probs: np.ndarray, *, update_num_issues: bool = True + ) -> np.ndarray: + """ + Scores the label quality of each example in the provided batch of data, + and also updates the number of label issues stored in this class. + Inputs should be in same format as for: :py:func:`rank.get_label_quality_scores `.. + + Parameters + ---------- + labels: np.ndarray or list + Given class labels for each example in the batch, values in ``0,1,2,...,K-1``. 
+ + pred_probs: np.ndarray + 2D array of model-predicted class probabilities for each example in the batch of data. + + update_num_issues: bool, optional + Whether or not to update the number of label issues or only compute label quality scores. + For lower runtimes, set this to ``False`` if you only want to score label quality and not find label issues. + + Returns + ------- + label_quality_scores : np.ndarray + Contains one score (between 0 and 1) for each example in the batch of data. + """ + labels = _batch_check(labels, pred_probs, self.num_class) + batch_size = len(labels) + scores = _compute_label_quality_scores( + labels, + pred_probs, + confident_thresholds=self.get_confident_thresholds(silent=True), + **self.quality_score_kwargs, + ) + class_counts = value_counts_fill_missing_classes(labels, num_classes=self.num_class) + if update_num_issues: + self._update_num_label_issues(labels, pred_probs, **self.num_issue_kwargs) + self.examples_processed_quality += batch_size + if self.store_results: + self.label_quality_scores += list(scores) + + return scores + + def _update_num_label_issues(self, labels: LabelLike, pred_probs: np.ndarray, **kwargs): + """ + Update the estimate of num_label_issues stored in this class using a new batch of data. + Kwargs are ignored here for now (included for forwards compatibility). + Instead of being specified here, `estimation_method` should be declared when this class is initialized. + """ + if self.examples_processed_thresh < 1: + raise ValueError( + "Have not computed any confident_thresholds yet. Call `update_confident_thresholds()` first." 
+ ) + batch_size = len(labels) + pred_class = np.argmax(pred_probs, axis=1) + # for efficiency, this pred_class index is also used where older implementation instead used: + # max_ind = np.argmax(pred_probs * (pred_probs >= adj_confident_thresholds), axis=1) + pred_confidence = pred_probs[np.arange(batch_size), pred_class] + # add margin for floating point comparison operations: + adj_confident_thresholds = self.confident_thresholds - EPS + if not self.off_diagonal_calibrated: + prune_count_batch = np.sum( + ( + pred_probs[np.arange(batch_size), pred_class] + >= adj_confident_thresholds[pred_class] + ) + & (pred_class != labels) + ) + self.prune_count += prune_count_batch + else: # calibrated + self.class_counts += value_counts_fill_missing_classes( + labels, num_classes=self.num_class + ) + to_increment = ( + pred_probs[np.arange(batch_size), pred_class] + >= adj_confident_thresholds[pred_class] + ) + for class_label in range(self.num_class): + labels_equal_to_class = labels == class_label + self.normalization[class_label] += np.sum(labels_equal_to_class & to_increment) + self.prune_counts[class_label] += np.sum( + labels_equal_to_class & to_increment & (pred_class != labels) + ) + + +def _batch_check(labels: LabelLike, pred_probs: np.ndarray, num_class: int) -> np.ndarray: + """ + Basic checks to ensure batch of data looks ok. For efficiency, this check is quite minimal. + + Returns + ------- + labels : np.ndarray + `labels` formatted as a 1D array. 
+ """ + batch_size = pred_probs.shape[0] + labels = np.asarray(labels) + if batch_size < 10: + raise ValueError("Please run this with batches containing at least 10 examples.") + if len(labels) != batch_size: + raise ValueError("labels and pred_probs must have same length") + if pred_probs.shape[1] != num_class: + raise ValueError("num_class must equal pred_probs.shape[1]") + + return labels + + +def find_label_issues_batched( + *, + labels_file: Optional[str] = None, + pred_probs_file: Optional[str] = None, + labels: Optional[LabelLike] = None, + pred_probs: Optional[np.ndarray] = None, + batch_size: int = 10000, + verbose: bool = True, + quality_score_kwargs: Optional[dict] = None, + num_issue_kwargs: Optional[dict] = None, +) -> np.ndarray: + """ + Variant of :py:func:`filter.find_label_issues ` + that requires less memory by reading `pred_probs`, `labels` in mini-batches, if provided as files. + Only .npy files are supported (not .npz), and these must be loadable via: ``np.load(your_file, mmap_mode="r")``. + If you want to read from other file-types (eg. HDF5 or Zarr) instead, + see the example usage of the ``LabelInspector`` class. + + This function basically implements the example ``LabelInspector`` usage script, + but you can further customize that script by running it yourself. + See the documentation of ``LabelInspector`` to learn more about how this method works internally. + + With default settings, the results returned from this method closely approximate those returned from: + ``cleanlab.filter.find_label_issues(..., filter_by="low_self_confidence", return_indices_ranked_by="self_confidence")`` + + Parameters + ---------- + labels_file: str, optional + Path to .npy file where the entire 1D `labels` numpy array is stored on disk (list format is not supported). + This is loaded using: ``np.load(labels_file, mmap_mode="r")`` + so make sure this file was created via: ``np.save()`` or other compatible methods. 
+ + pred_probs_file: str, optional + Path to .npy file where the entire `pred_probs` numpy array is stored on disk. + This is loaded using: ``np.load(pred_probs_file, mmap_mode="r")`` + so make sure this file was created via: ``np.save()`` or other compatible methods. + + labels: np.ndarray or list, optional + Given class labels for each example in the dataset, (int) values in ``0,1,2,...,K-1``. + Recommend providing `labels_file` instead of `labels` to avoid loading big objects into memory. + + pred_probs: np.ndarray, optional + 2D array of model-predicted class probabilities (floats) for each example in the dataset. + Recommend providing `pred_probs_file` instead of `pred_probs` to avoid loading big objects into memory. + + batch_size : int, optional + Size of mini-batches to use for estimating the label issues. + To maximize efficiency, try to use the largest `batch_size` your memory allows. + + verbose : bool, optional + Whether to suppress print statements or not. + + quality_score_kwargs : dict, optional + Keyword arguments to pass into :py:func:`rank.get_label_quality_scores `. + + num_issue_kwargs : dict, optional + Keyword arguments to :py:func:`count.num_label_issues()` ` + to control estimation of the number of label issues. + The only supported kwarg here for now is: `estimation_method`. + + Returns + ------- + issue_indices : np.ndarray + Indices of examples with label issues, sorted by label quality score. 
+ """ + if labels_file is not None: + if labels is not None: + raise ValueError("only specify one of: `labels` or `labels_file`") + if not isinstance(labels_file, str): + raise ValueError( + "labels_file must be str specifying path to .npy file containing the array of labels" + ) + labels = np.load(labels_file, mmap_mode="r") + assert isinstance(labels, np.ndarray) + + if pred_probs_file is not None: + if pred_probs is not None: + raise ValueError("only specify one of: `pred_probs` or `pred_probs_file`") + if not isinstance(pred_probs_file, str): + raise ValueError( + "pred_probs_file must be str specifying path to .npy file containing 2D array of pred_probs" + ) + pred_probs = np.load(pred_probs_file, mmap_mode="r") + assert isinstance(pred_probs, np.ndarray) + if verbose: + print( + f"mmap-loaded numpy arrays have: {len(pred_probs)} examples, {pred_probs.shape[1]} classes" + ) + if labels is None: + raise ValueError("must provide one of: `labels` or `labels_file`") + if pred_probs is None: + raise ValueError("must provide one of: `pred_probs` or `pred_probs_file`") + + assert isinstance(pred_probs, np.ndarray) + if len(labels) != len(pred_probs): + raise ValueError( + f"len(labels)={len(labels)} does not match len(pred_probs)={len(pred_probs)}. Perhaps an issue loading mmap numpy arrays from file." 
+ ) + lab = LabelInspector( + num_class=pred_probs.shape[1], + verbose=verbose, + quality_score_kwargs=quality_score_kwargs, + num_issue_kwargs=num_issue_kwargs, + ) + n = len(labels) + if verbose: + from tqdm.auto import tqdm + + pbar = tqdm(desc="number of examples processed for estimating thresholds", total=n) + i = 0 + while i < n: + end_index = i + batch_size + labels_batch = labels[i:end_index] + pred_probs_batch = pred_probs[i:end_index, :] + i = end_index + lab.update_confident_thresholds(labels_batch, pred_probs_batch) + if verbose: + pbar.update(batch_size) + + # Next evaluate the quality of the labels (run this on full dataset you want to evaluate): + if verbose: + pbar.close() + pbar = tqdm(desc="number of examples processed for checking labels", total=n) + i = 0 + while i < n: + end_index = i + batch_size + labels_batch = labels[i:end_index] + pred_probs_batch = pred_probs[i:end_index, :] + i = end_index + batch_results = lab.score_label_quality(labels_batch, pred_probs_batch) + if verbose: + pbar.update(batch_size) + + if verbose: + pbar.close() + + return lab.get_label_issues() diff --git a/cleanlab/filter.py b/cleanlab/filter.py index 2838fc441f..e6f2cc803d 100644 --- a/cleanlab/filter.py +++ b/cleanlab/filter.py @@ -270,7 +270,7 @@ class 0, 1, ..., K-1. "filter_by 'confident_learning', 'predicted_neq_given', 'low_normalized_margin' " "or 'low_self_confidence' is not supported (yet) when setting 'num_to_remove_per_class'" ) - if filter_by is "confident_learning" and isinstance(confident_joint, np.ndarray): + if filter_by == "confident_learning" and isinstance(confident_joint, np.ndarray): warn_str = ( "The supplied `confident_joint` is ignored when `filter_by = 'confident_learning'`; confident joint will be " "re-estimated from the given labels. 
To use your supplied `confident_joint`, please specify a different " diff --git a/cleanlab/rank.py b/cleanlab/rank.py index b00deafff3..5be235f6bf 100644 --- a/cleanlab/rank.py +++ b/cleanlab/rank.py @@ -119,15 +119,28 @@ class 0, 1, ..., K-1. assert_valid_inputs( X=None, y=labels, pred_probs=pred_probs, multi_label=False, allow_one_class=True ) + return _compute_label_quality_scores( + labels=labels, pred_probs=pred_probs, method=method, adjust_pred_probs=adjust_pred_probs + ) + - # Available scoring functions to choose from +def _compute_label_quality_scores( + labels: np.ndarray, + pred_probs: np.ndarray, + *, + method: str = "self_confidence", + adjust_pred_probs: bool = False, + confident_thresholds: Optional[np.ndarray] = None, +) -> np.ndarray: + """Internal implementation of get_label_quality_scores that assumes inputs + have already been checked and are valid. This speeds things up. + Can also take in pre-computed confident_thresholds to further accelerate things. + """ scoring_funcs = { "self_confidence": get_self_confidence_for_each_label, "normalized_margin": get_normalized_margin_for_each_label, "confidence_weighted_entropy": get_confidence_weighted_entropy_for_each_label, } - - # Select scoring function try: scoring_func = scoring_funcs[method] except KeyError: @@ -137,21 +150,15 @@ class 0, 1, ..., K-1. 
Please choose a valid rank_by: self_confidence, normalized_margin, confidence_weighted_entropy """ ) - - # Adjust predicted probabilities if adjust_pred_probs: - # Check if adjust_pred_probs is supported for the chosen method if method == "confidence_weighted_entropy": raise ValueError(f"adjust_pred_probs is not currently supported for {method}.") + pred_probs = _subtract_confident_thresholds( + labels=labels, pred_probs=pred_probs, confident_thresholds=confident_thresholds + ) - pred_probs = _subtract_confident_thresholds(labels, pred_probs) - - # Pass keyword arguments for scoring function - input = {"labels": labels, "pred_probs": pred_probs} - - # Calculate scores - label_quality_scores = scoring_func(**input) - + scoring_inputs = {"labels": labels, "pred_probs": pred_probs} + label_quality_scores = scoring_func(**scoring_inputs) return label_quality_scores @@ -494,9 +501,9 @@ def get_self_confidence_for_each_label( Lower scores indicate more likely mislabeled examples. """ - # np.mean is used so that this works for multi-labels (list of lists) - label_quality_scores = np.array([np.mean(pred_probs[i, l]) for i, l in enumerate(labels)]) - return label_quality_scores + # To make this work for multi-label (but it will slow down runtime), replace: + # pred_probs[i, l] -> np.mean(pred_probs[i, l]) + return np.array([pred_probs[i, l] for i, l in enumerate(labels)]) def get_normalized_margin_for_each_label( diff --git a/docs/source/tutorials/faq.ipynb b/docs/source/tutorials/faq.ipynb index 989f9b3711..6f9972505a 100644 --- a/docs/source/tutorials/faq.ipynb +++ b/docs/source/tutorials/faq.ipynb @@ -19,7 +19,7 @@ }, "outputs": [], "source": [ - "# This cell is for internal CI purposes. Run it to ensure all other cells below can be executed in your own notebook\n", + "# This cell is hidden on docs.cleanlab.ai. 
Execute it to ensure all other cells below can be executed in your own notebook\n", "\n", "import os \n", "import logging \n", @@ -38,7 +38,10 @@ "pred_probs[np.arange(N),labels] += 4 # make pred_probs accurate\n", "pred_probs = pred_probs/pred_probs.sum(axis=1)[:, np.newaxis]\n", "data = np.array([[label+np.random.uniform(), label+np.random.uniform()] for label in labels])\n", - "labels[-num_errors:] = 0 # introduce label errors\n", + "# introduce label errors in last few examples:\n", + "og0_indices = labels[-num_errors:] == 0\n", + "labels[-num_errors:] = 0\n", + "labels[-num_errors:][og0_indices] = 1\n", "\n", "your_classifier=sklearn.linear_model.LogisticRegression() # toy classifier" ] @@ -205,6 +208,174 @@ "You will be able to produce a much better version of your dataset interactively using [Cleanlab Studio](https://cleanlab.ai/studio/?utm_source=github&utm_medium=docs&utm_campaign=clostostudio), which helps you efficiently fix issues like this in large datasets." ] }, + { + "cell_type": "markdown", + "id": "21f42f24", + "metadata": {}, + "source": [ + "### How can I find label issues in big datasets with limited memory? " + ] + }, + { + "cell_type": "markdown", + "id": "089f505e", + "metadata": {}, + "source": [ + "For a dataset with many rows and/or classes, there are more efficient methods in the `label_issues_batched` module. These methods read data in mini-batches and you can reduce the `batch_size` to control how much memory they require. Below is an example of how to use the `find_label_issues_batched()` method from this module, which can load mini-batches of data from `labels`, `pred_probs` saved as .npy files on disk. Check out the `LabelInspector` class from this module if you instead have `labels`, `pred_probs` saved as other file-types (eg. HDF5 or Zarr)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "41714b51", + "metadata": {}, + "outputs": [], + "source": [ + "# We'll assume your big arrays of labels, pred_probs have been saved to file like this:\n", + "from tempfile import mkdtemp\n", + "import os.path as path\n", + "\n", + "labels_file = path.join(mkdtemp(), \"labels.npy\")\n", + "pred_probs_file = path.join(mkdtemp(), \"pred_probs.npy\")\n", + "np.save(labels_file, labels)\n", + "np.save(pred_probs_file, pred_probs)\n", + "\n", + "# Code to find label issues by loading data from file in batches:\n", + "from cleanlab.experimental.label_issues_batched import find_label_issues_batched\n", + "\n", + "batch_size = 10000 # for efficiency, set this to as large of a value as your memory can handle\n", + "\n", + "# Indices of examples with label issues, sorted by label quality score (most severe to least severe):\n", + "indices_of_examples_with_issues = find_label_issues_batched(\n", + " labels_file=labels_file, pred_probs_file=pred_probs_file, batch_size=batch_size\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "20476c70", + "metadata": { + "nbsphinx": "hidden" + }, + "outputs": [], + "source": [ + "# This cell is hidden on docs.cleanlab.ai, and is only for internal testing. 
You can ignore it.\n", + "\n", + "issue_indices = cleanlab.filter.find_label_issues(labels, pred_probs, filter_by = \"confident_learning\", return_indices_ranked_by=\"self_confidence\")\n", + "assert np.abs(len(issue_indices) - len(indices_of_examples_with_issues)) < 2, \"num issues differ in batched mode\"\n", + "set1 = set(issue_indices)\n", + "set2 = set(indices_of_examples_with_issues)\n", + "intersection = len(list(set1.intersection(set2)))\n", + "union = len(set1) + len(set2) - intersection\n", + "assert float(intersection) / union > 0.9, \"issue indices differ in batched mode\"" + ] + }, + { + "cell_type": "markdown", + "id": "438b424d", + "metadata": {}, + "source": [ + "**To use less memory and get results faster if your dataset has many classes:** Try merging the rare classes into a single \"Other\" class before you find label issues. The resulting issues won't be affected much since cleanlab anyway does not have enough data to accurately diagnose label errors in classes that are rarely seen. To do this, you should aggregate all the probability assigned to the rare classes in `pred_probs` into a single new dimension of `pred_probs_merged` (where this new array no longer has columns for the rare classes). 
Here is a function that does this for you, which you can also modify as needed:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6983cdad", + "metadata": { + "nbsphinx": "hidden" + }, + "outputs": [], + "source": [ + "# This cell is hidden on docs.cleanlab.ai\n", + "# Add two rare additional classes to the dataset:\n", + "\n", + "num_rare_instances = 3\n", + "small_prob = 1e-4\n", + "pred_probs = np.hstack((pred_probs, np.ones((len(pred_probs),2))*small_prob))\n", + "pred_probs = pred_probs / np.sum(pred_probs, axis=1)[:, np.newaxis]\n", + "labels[:num_rare_instances] = 3\n", + "labels[num_rare_instances:(2*num_rare_instances)] = 4" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9092b8a0", + "metadata": {}, + "outputs": [], + "source": [ + "from cleanlab.internal.util import value_counts # use this to count how often each class occurs in labels\n", + "\n", + "def merge_rare_classes(labels, pred_probs, count_threshold = 10):\n", + " \"\"\" \n", + " Returns: labels, pred_probs after we merge all rare classes into a single 'Other' class.\n", + " Merged pred_probs has less columns. 
Rare classes are any occuring less than `count_threshold` times.\n", + " Also returns: `class_mapping_orig2new`, a dict to map new classes in merged labels back to classes \n", + " in original labels, useful for interpreting outputs from `dataset.heath_summary()` or `count.confident_joint()`.\n", + " \"\"\"\n", + " num_classes = pred_probs.shape[1]\n", + " num_examples_per_class = value_counts(labels, num_classes=num_classes)\n", + " rare_classes = [c for c in range(num_classes) if num_examples_per_class[c] < count_threshold]\n", + " if len(rare_classes) < 1:\n", + " raise ValueError(\"No rare classes found at the given `count_threshold`, merging is unnecessary unless you increase it.\")\n", + "\n", + " num_classes_merged = num_classes - len(rare_classes) + 1 # one extra class for all the merged ones\n", + " other_class = num_classes_merged - 1\n", + " labels_merged = labels.copy()\n", + " class_mapping_orig2new = {} # key = original class in `labels`, value = new class in `labels_merged`\n", + " new_c = 0\n", + " for c in range(num_classes):\n", + " if c in rare_classes:\n", + " class_mapping_orig2new[c] = other_class\n", + " else:\n", + " class_mapping_orig2new[c] = new_c\n", + " new_c += 1\n", + " labels_merged[labels == c] = class_mapping_orig2new[c]\n", + "\n", + " merged_prob = np.sum(pred_probs[:, rare_classes], axis=1, keepdims=True) # total probability over all merged classes for each example\n", + " pred_probs_merged = np.hstack((np.delete(pred_probs, rare_classes, axis=1), merged_prob)) # assumes new_class is as close to original_class in sorted order as is possible after removing the merged original classes\n", + " # check a few rows of probabilities after merging to verify they still sum to 1:\n", + " num_check = 1000 # only check a few rows for efficiency\n", + " ones_array_ref = np.ones(min(num_check,len(pred_probs)))\n", + " if np.isclose(np.sum(pred_probs[:num_check], axis=1), ones_array_ref).all() and (not 
np.isclose(np.sum(pred_probs_merged[:num_check], axis=1), ones_array_ref).all()):\n", + " raise ValueError(\"merged pred_probs do not sum to 1 in each row, check that merging was correctly done.\")\n", + " \n", + " return (labels_merged, pred_probs_merged, class_mapping_orig2new)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b0a01109", + "metadata": {}, + "outputs": [], + "source": [ + "from cleanlab.filter import find_label_issues # can alternatively use find_label_issues_batched() shown above\n", + "\n", + "labels_merged, pred_probs_merged, class_mapping_orig2new = merge_rare_classes(labels, pred_probs, count_threshold=5)\n", + "examples_w_issues = find_label_issues(labels_merged, pred_probs_merged, return_indices_ranked_by=\"self_confidence\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8b1da032", + "metadata": { + "nbsphinx": "hidden" + }, + "outputs": [], + "source": [ + "# This cell is hidden on docs.cleanlab.ai, and is only for internal testing. You can ignore it.\n", + "\n", + "rare_classes = [c for c in class_mapping_orig2new.keys() if class_mapping_orig2new[c] == pred_probs_merged.shape[1]-1]\n", + "og_examples_w_issues = find_label_issues(labels, pred_probs, return_indices_ranked_by=\"self_confidence\")\n", + "examples_of_interest = [x for x in examples_w_issues if labels[x] not in rare_classes]\n", + "og_examples_of_interest = [x for x in og_examples_w_issues if labels[x] not in rare_classes]\n", + "assert set(examples_of_interest) == set(og_examples_of_interest), \"merged label issues differ from non-merged label issues\"" + ] + }, { "cell_type": "markdown", "id": "3868ee8b", @@ -224,7 +395,7 @@ "You can still use cleanlab with other data formats though! Just separately obtain predicted probabilities (`pred_probs`) from your model via cross-validation and pass them as inputs. 
\n", "\n", "\n", - "If CleanLearning is running successfully but not improving predictive accuracy of your model, here are some tips:\n", + "If CleanLearning is running successfully but not improving predictive accuracy of your model, here are some tips:\n", "\n", "1. Use cleanlab to find label issues in your test data as well (we recommend pooling `labels` across both training and test data into one input for `find_label_issues()`). Then manually review and fix label issues identified in the test data to verify accuracy measurements are actually meaningful.\n", "\n", @@ -426,7 +597,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.13" + "version": "3.9.15" } }, "nbformat": 4, diff --git a/tests/test_filter_count.py b/tests/test_filter_count.py index 2da04a8482..8cc624f6a3 100644 --- a/tests/test_filter_count.py +++ b/tests/test_filter_count.py @@ -24,12 +24,15 @@ from cleanlab.benchmarking.noise_generation import generate_noisy_labels from cleanlab.internal.util import value_counts from cleanlab.internal.multilabel_utils import int2onehot +from cleanlab.experimental.label_issues_batched import find_label_issues_batched import numpy as np import scipy import pytest from sklearn.multioutput import MultiOutputClassifier from sklearn.linear_model import LogisticRegression from sklearn.model_selection import cross_val_predict +from tempfile import mkdtemp +import os.path as path def make_data( @@ -894,6 +897,81 @@ def test_num_label_issues_multilabel(confident_joint): assert sum(f) == n +def test_batched_label_issues(): + f1 = filter.find_label_issues( + labels=data["labels"], + pred_probs=data["pred_probs"], + return_indices_ranked_by="self_confidence", + filter_by="confident_learning", + # TODO: replace the above line with: + # filter_by="low_self_confidence", + ) + f2 = find_label_issues_batched( + labels=data["labels"], + pred_probs=data["pred_probs"], + batch_size=int(len(data["labels"]) / 4.0), + ) + f3 = 
find_label_issues_batched( + labels=data["labels"], + pred_probs=data["pred_probs"], + batch_size=int(len(data["labels"]) / 2.0), + ) + f4 = find_label_issues_batched( + labels=data["labels"], pred_probs=data["pred_probs"], batch_size=len(data["labels"]) + 100 + ) + assert np.all(f4 == f3) + assert np.all(f4 == f2) + assert len(f2) == len(f1) + # check jaccard similarity: + intersection = len(list(set(f1).intersection(set(f2)))) + union = len(set(f1)) + len(set(f2)) - intersection + assert float(intersection) / union > 0.6 + n1 = count.num_label_issues( + labels=data["labels"], + pred_probs=data["pred_probs"], + estimation_method="off_diagonal_calibrated", + ) + quality_score_kwargs = {"method": "normalized_margin"} + num_issue_kwargs = {"estimation_method": "off_diagonal_calibrated"} + extra_args = { + "quality_score_kwargs": quality_score_kwargs, + "num_issue_kwargs": num_issue_kwargs, + } + f5 = find_label_issues_batched( + labels=data["labels"], + pred_probs=data["pred_probs"], + batch_size=int(len(data["labels"]) / 4.0), + **extra_args, + ) + f6 = find_label_issues_batched( + labels=data["labels"], + pred_probs=data["pred_probs"], + batch_size=int(len(data["labels"]) / 2.0), + **extra_args, + ) + f7 = find_label_issues_batched( + labels=data["labels"], + pred_probs=data["pred_probs"], + batch_size=len(data["labels"]) + 100, + **extra_args, + ) + assert not np.array_equal(f5, f2) + assert np.all(f7 == f5) + assert np.all(f6 == f5) + assert np.abs(len(f5) - n1) < 2 + # Test batches loaded from file: + labels_file = path.join(mkdtemp(), "labels.npy") + pred_probs_file = path.join(mkdtemp(), "pred_probs.npy") + np.save(labels_file, data["labels"]) + np.save(pred_probs_file, data["pred_probs"]) + f8 = find_label_issues_batched( + labels_file=labels_file, + pred_probs_file=pred_probs_file, + batch_size=int(len(data["labels"]) / 4.0), + ) + assert np.all(f8 == f3) + + def test_issue_158(): # ref: https://github.com/cleanlab/cleanlab/issues/158 pred_probs = np.array( 
From 5319a124d4f759b80acbc9eaa4cd7ac5b3cfd4ac Mon Sep 17 00:00:00 2001 From: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> Date: Fri, 10 Feb 2023 17:30:03 -0800 Subject: [PATCH 113/258] Make label_issues_batched documentation appear (#627) --- docs/source/cleanlab/experimental/index.rst | 1 + .../source/cleanlab/experimental/label_issues_batched.rst | 8 ++++++++ 2 files changed, 9 insertions(+) create mode 100644 docs/source/cleanlab/experimental/label_issues_batched.rst diff --git a/docs/source/cleanlab/experimental/index.rst b/docs/source/cleanlab/experimental/index.rst index 935152c86c..35af09b504 100644 --- a/docs/source/cleanlab/experimental/index.rst +++ b/docs/source/cleanlab/experimental/index.rst @@ -11,6 +11,7 @@ experimental :show-inheritance: .. toctree:: + label_issues_batched mnist_pytorch coteaching cifar_cnn diff --git a/docs/source/cleanlab/experimental/label_issues_batched.rst b/docs/source/cleanlab/experimental/label_issues_batched.rst new file mode 100644 index 0000000000..1262958fed --- /dev/null +++ b/docs/source/cleanlab/experimental/label_issues_batched.rst @@ -0,0 +1,8 @@ +label_issues_batched +==================== + +.. 
automodule:: cleanlab.experimental.label_issues_batched + :autosummary: + :members: + :undoc-members: + :show-inheritance: From ac98282dcb0f375d1f7b90527dbde3f117648f28 Mon Sep 17 00:00:00 2001 From: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> Date: Fri, 10 Feb 2023 18:29:00 -0800 Subject: [PATCH 114/258] add example script for find_label_issues_batched (#629) --- cleanlab/experimental/label_issues_batched.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cleanlab/experimental/label_issues_batched.py b/cleanlab/experimental/label_issues_batched.py index 509e56b790..ebd164a867 100644 --- a/cleanlab/experimental/label_issues_batched.py +++ b/cleanlab/experimental/label_issues_batched.py @@ -467,6 +467,11 @@ def find_label_issues_batched( ------- issue_indices : np.ndarray Indices of examples with label issues, sorted by label quality score. + + Examples + -------- + >>> batch_size = 10000 # for efficiency, set this to as large of a value as your memory can handle + >>> issues = find_label_issues_batched(labels_file="LABELS.npy", pred_probs_file="PREDPROBS.npy", batch_size=batch_size) """ if labels_file is not None: if labels is not None: From 4d2753ae058fd10ee3a997282e718f5b78a840a8 Mon Sep 17 00:00:00 2001 From: Hui Wen <45724323+huiwengoh@users.noreply.github.com> Date: Mon, 13 Feb 2023 10:40:52 +0800 Subject: [PATCH 115/258] Fix KerasWrapper summary method (#631) --- cleanlab/models/keras.py | 26 +++++++++++++++++--------- tests/test_frameworks.py | 4 ++++ 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/cleanlab/models/keras.py b/cleanlab/models/keras.py index d44ea6559f..5935cd5ef9 100644 --- a/cleanlab/models/keras.py +++ b/cleanlab/models/keras.py @@ -112,9 +112,9 @@ def fit(self, X, y=None, **kwargs): but they are ignored. If ``X`` is a numpy array or pandas dataframe, the labels have to be passed in using this argument. 
""" - - self.net = self.model(**self.model_kwargs) - self.net.compile(**self.compile_kwargs) + if self.net is None: + self.net = self.model(**self.model_kwargs) + self.net.compile(**self.compile_kwargs) # TODO: check for generators if y is not None and not isinstance(X, (tf.data.Dataset, keras.utils.Sequence)): @@ -152,7 +152,11 @@ def predict(self, X, **kwargs): def summary(self, **kwargs): """Returns the summary of the Keras model.""" - self.net.summary(**kwargs) + if self.net is None: + self.net = self.model(**self.model_kwargs) + self.net.compile(**self.compile_kwargs) + + return self.net.summary(**kwargs) class KerasWrapperSequential: @@ -203,8 +207,7 @@ def get_params(self, deep=True): def set_params(self, **params): """Set the parameters of the Keras model.""" - for key, value in params.items(): - self.params[key] = value + self.params.update(params) return self def fit(self, X, y=None, **kwargs): @@ -220,8 +223,9 @@ def fit(self, X, y=None, **kwargs): but they are ignored. If ``X`` is a numpy array or pandas dataframe, the labels have to be passed in using this argument. 
""" - self.net = tf.keras.models.Sequential(self.layers, self.name) - self.net.compile(**self.compile_kwargs) + if self.net is None: + self.net = tf.keras.models.Sequential(self.layers, self.name) + self.net.compile(**self.compile_kwargs) # TODO: check for generators if y is not None and not isinstance(X, (tf.data.Dataset, keras.utils.Sequence)): @@ -258,4 +262,8 @@ def predict(self, X, **kwargs): def summary(self, **kwargs): """Returns the summary of the Keras model.""" - self.net.summary(**kwargs) + if self.net is None: + self.net = tf.keras.models.Sequential(self.layers, self.name) + self.net.compile(**self.compile_kwargs) + + return self.net.summary(**kwargs) diff --git a/tests/test_frameworks.py b/tests/test_frameworks.py index f26e532b89..d14f9f0a0f 100644 --- a/tests/test_frameworks.py +++ b/tests/test_frameworks.py @@ -143,6 +143,8 @@ def test_tensorflow_sequential(batch_size, shuffle_config, data=DATA, hidden_uni ], ) + model.summary() + # Test base model works: model.fit( X=dataset_tf, @@ -193,6 +195,8 @@ def make_model(num_features, num_classes): model_kwargs={"num_features": data["num_features"], "num_classes": data["num_classes"]}, ) + model.summary() + # Test base model works: model.fit( X=dataset_tf, From 6acc7aead1bbac26c6158f7b16df892cad774dc2 Mon Sep 17 00:00:00 2001 From: Ulyana Date: Sun, 12 Feb 2023 19:45:20 -0800 Subject: [PATCH 116/258] Clarify rank.py not for multi-label classification (#626) Co-authored-by: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> --- cleanlab/rank.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cleanlab/rank.py b/cleanlab/rank.py index 5be235f6bf..60def50a09 100644 --- a/cleanlab/rank.py +++ b/cleanlab/rank.py @@ -25,6 +25,7 @@ Note: multi-label classification is not supported by most methods in this module, each example must be labeled as belonging to a single class, e.g. format: ``labels = np.ndarray([1,0,2,1,1,0...])``. 
+For multi-label classification, instead see :py:func:`multilabel_classification.get_label_quality_scores `. Note: Label quality scores are most accurate when they are computed based on out-of-sample `pred_probs` from your model. To obtain out-of-sample predicted probabilities for every datapoint in your dataset, you can use :ref:`cross-validation `. This is encouraged to get better results. From 2adb8b68b0fb2cea71a62d363466db4d80963492 Mon Sep 17 00:00:00 2001 From: Sanjana Date: Sun, 12 Feb 2023 19:46:45 -0800 Subject: [PATCH 117/258] Removed $ from shell commands to avoid it being copied (#625) --- DEVELOPMENT.md | 48 +++++++++++++++++++++++++++--------------------- 1 file changed, 27 insertions(+), 21 deletions(-) diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index 19a1d739f0..6bc352cf3c 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -13,9 +13,9 @@ and [venv](https://docs.python.org/3/library/venv.html). You can the tools and choose what is right for you. Here, we'll explain how to get set up with venv, which is built in to Python 3. -```console -$ python3 -m venv ./ENV # create a new virtual environment in the directory ENV -$ source ./ENV/bin/activate # switch to using the virtual environment +```shell +python3 -m venv ./ENV # create a new virtual environment in the directory ENV +source ./ENV/bin/activate # switch to using the virtual environment ``` You only need to create the virtual environment once, but you will need to @@ -27,9 +27,15 @@ virtual environment rather than your system Python installation. Run the following commands in the repository's root directory. -1. Install development requirements with `pip install -r requirements-dev.txt` +1. Install development requirements +```shell +pip install -r requirements-dev.txt +``` -2. Install cleanlab as an editable package with `pip install -e .` +2. Install cleanlab as an editable package +```shell +pip install -e . 
+``` For Macs with Apple silicon: replace `tensorflow` in requirements-dev.txt with: `tensorflow-macos==2.9.2` and `tensorflow-metal==0.5.1` @@ -38,26 +44,26 @@ For Macs with Apple silicon: replace `tensorflow` in requirements-dev.txt with: **Run all the tests:** -```console -$ pytest +```shell +pytest ``` **Run a specific file or test:** -``` -$ pytest -k +```shell +pytest -k ``` **Run with verbose output:** -``` -$ pytest --verbose +```shell +pytest --verbose ``` **Run with code coverage:** -``` -$ pytest --cov=cleanlab/ --cov-config .coveragerc --cov-report=html +```shell +pytest --cov=cleanlab/ --cov-config .coveragerc --cov-report=html ``` The coverage report will be available in `coverage_html_report/index.html`, @@ -69,13 +75,13 @@ Cleanlab uses [mypy](https://mypy.readthedocs.io/en/stable/) typing. Type checki **Check typing in all files:** -``` -$ mypy cleanlab +```shell +mypy cleanlab ``` The above is just a simplified command for demonstration, do NOT run this for testing your own type annotations! Our CI adds a few additional flags to the `mypy` command it uses in the file: -**.github/workflows/ci.yml**. +**.github/workflows/ci.yml**. To exactly match the `mypy` command that is executed in CI, copy these flags, and also ensure your version of `mypy` and related packages like `pandas-stubs` match the latest released versions (used in our CI). ### Examples @@ -84,7 +90,7 @@ You can check that the [examples](https://github.com/cleanlab/examples) still work with changes you make to cleanlab by manually running the notebooks. You can also run all example notebooks as follows: -```console +```shell git clone https://github.com/cleanlab/examples.git ``` @@ -93,7 +99,7 @@ E.g. 
you can edit this line to point to your local version of cleanlab as a rela Finally execute the bash script: -```console +```shell examples/run_all_notebooks.sh ``` @@ -103,7 +109,7 @@ examples/run_all_notebooks.sh cleanlab follows the [Black](https://black.readthedocs.io/) code style (see [pyproject.toml](pyproject.toml)). This is enforced by CI, so please format your code by invoking `black` before submitting a pull request. -Generally aim to follow the [PEP-8 coding style](https://peps.python.org/pep-0008/). +Generally aim to follow the [PEP-8 coding style](https://peps.python.org/pep-0008/). Please do not use wildcard `import *` in any files, instead you should always import the specific functions that you need from a module. All cleanlab code should have a maximum line length of 100 characters. @@ -114,8 +120,8 @@ This repo uses the [pre-commit framework](https://pre-commit.com/) to easily set up code style checks that run automatically whenever you make a commit. You can install the git hook scripts with: -```console -$ pre-commit install +```shell +pre-commit install ``` ### EditorConfig From 485005080398b0088c8d2f640201e234514f3f6a Mon Sep 17 00:00:00 2001 From: clu0 <33559427+clu0@users.noreply.github.com> Date: Sun, 12 Feb 2023 23:25:02 -0500 Subject: [PATCH 118/258] label_issues_batched multiprocessing (#630) 4x speedup (48 cores, 10k classes x 1M examples) Co-authored-by: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> --- cleanlab/experimental/label_issues_batched.py | 217 +++++++++++++++--- tests/test_filter_count.py | 15 ++ 2 files changed, 199 insertions(+), 33 deletions(-) diff --git a/cleanlab/experimental/label_issues_batched.py b/cleanlab/experimental/label_issues_batched.py index ebd164a867..9b528973d3 100644 --- a/cleanlab/experimental/label_issues_batched.py +++ b/cleanlab/experimental/label_issues_batched.py @@ -26,7 +26,7 @@ To run this approach, either follow the examples script below, or use the ``find_label_issues_batched()`` 
convenience function defined in this module. -The recommended usage demonstrated in the examples script involves two passes over your data: +The recommended usage demonstrated in the examples script involves two passes over your data: one pass to compute `confident_thresholds`, another to evaluate each label. To maximize efficiency, try to use the largest batch_size your memory allows. To reduce runtime further, you can run the first pass on a subset of your dataset @@ -72,16 +72,30 @@ """ import numpy as np -from typing import Optional, List +from typing import Optional, List, Tuple, Any from cleanlab.count import get_confident_thresholds from cleanlab.rank import find_top_issues, _compute_label_quality_scores from cleanlab.typing import LabelLike from cleanlab.internal.util import value_counts_fill_missing_classes +import platform +import multiprocessing as mp + +try: + import psutil + + PSUTIL_EXISTS = True +except ImportError: # pragma: no cover + PSUTIL_EXISTS = False EPS = 1e-6 # small number +# global variable for multiproc on linux +adj_confident_thresholds_shared: np.ndarray +labels_shared: LabelLike +pred_probs_shared: np.ndarray + class LabelInspector: """ @@ -109,6 +123,10 @@ class LabelInspector: verbose : bool, optional Whether to suppress print statements or not. + n_jobs: int, optional + Number of processes for multiprocessing. Only used on Linux. + If `n_jobs=None`, will use either the number of: physical cores if psutil is installed, or logical cores otherwise. + quality_score_kwargs : dict, optional Keyword arguments to pass into :py:func:`rank.get_label_quality_scores `. 
@@ -126,6 +144,7 @@ def __init__( verbose: bool = True, quality_score_kwargs: Optional[dict] = None, num_issue_kwargs: Optional[dict] = None, + n_jobs: Optional[int] = None, ): if quality_score_kwargs is None: quality_score_kwargs = {} @@ -160,6 +179,26 @@ def __init__( 0 # number of examples seen so far for estimating thresholds ) self.examples_processed_quality = 0 # number of examples seen so far for estimating label quality and number of label issues + # Determine number of cores for multiprocessing: + os_name = platform.system() + if os_name != "Linux": + self.n_jobs = 1 + if n_jobs is not None and n_jobs != 1 and self.verbose: + print( + "n_jobs is overridden to 1 because multiprocessing is only supported for Linux." + ) + elif n_jobs is not None: + self.n_jobs = n_jobs + else: + if PSUTIL_EXISTS: + self.n_jobs = psutil.cpu_count(logical=False) # physical cores + if not self.n_jobs: + # switch to logical cores + self.n_jobs = mp.cpu_count() + if self.verbose: + print( + f"Multiprocessing will default to using the number of logical cores ({self.n_jobs}). 
To default to number of physical cores: pip install psutil" + ) def get_confident_thresholds(self, silent: bool = False) -> np.ndarray: """ @@ -299,7 +338,11 @@ def update_confident_thresholds(self, labels: LabelLike, pred_probs: np.ndarray) self.examples_processed_thresh += batch_size def score_label_quality( - self, labels: LabelLike, pred_probs: np.ndarray, *, update_num_issues: bool = True + self, + labels: LabelLike, + pred_probs: np.ndarray, + *, + update_num_issues: bool = True, ) -> np.ndarray: """ Scores the label quality of each example in the provided batch of data, @@ -340,46 +383,148 @@ def score_label_quality( return scores - def _update_num_label_issues(self, labels: LabelLike, pred_probs: np.ndarray, **kwargs): + def _update_num_label_issues( + self, + labels: LabelLike, + pred_probs: np.ndarray, + **kwargs, + ): """ Update the estimate of num_label_issues stored in this class using a new batch of data. Kwargs are ignored here for now (included for forwards compatibility). Instead of being specified here, `estimation_method` should be declared when this class is initialized. """ + + # whether to match the output of count.num_label_issues exactly + # default is False, which gives significant speedup on large batches + # and empirically matches num_label_issues even on input sizes of + # 1M x 10k + thorough = False if self.examples_processed_thresh < 1: raise ValueError( "Have not computed any confident_thresholds yet. Call `update_confident_thresholds()` first." 
) - batch_size = len(labels) - pred_class = np.argmax(pred_probs, axis=1) - # for efficiency, this pred_class index is also used where older implementation instead used: - # max_ind = np.argmax(pred_probs * (pred_probs >= adj_confident_thresholds), axis=1) - pred_confidence = pred_probs[np.arange(batch_size), pred_class] - # add margin for floating point comparison operations: - adj_confident_thresholds = self.confident_thresholds - EPS - if not self.off_diagonal_calibrated: - prune_count_batch = np.sum( - ( - pred_probs[np.arange(batch_size), pred_class] - >= adj_confident_thresholds[pred_class] + + if self.n_jobs == 1: + adj_confident_thresholds = self.confident_thresholds - EPS + pred_class = np.argmax(pred_probs, axis=1) + batch_size = len(labels) + if thorough: + # add margin for floating point comparison operations: + pred_gt_thresholds = pred_probs >= adj_confident_thresholds + max_ind = np.argmax(pred_probs * pred_gt_thresholds, axis=1) + if not self.off_diagonal_calibrated: + mask = (max_ind != labels) & (pred_class != labels) + else: + # calibrated + # should we change to above? 
+ mask = pred_class != labels + else: + max_ind = pred_class + mask = pred_class != labels + + if not self.off_diagonal_calibrated: + prune_count_batch = np.sum( + ( + pred_probs[np.arange(batch_size), max_ind] + >= adj_confident_thresholds[max_ind] + ) + & mask ) - & (pred_class != labels) - ) - self.prune_count += prune_count_batch - else: # calibrated - self.class_counts += value_counts_fill_missing_classes( - labels, num_classes=self.num_class - ) - to_increment = ( - pred_probs[np.arange(batch_size), pred_class] - >= adj_confident_thresholds[pred_class] - ) - for class_label in range(self.num_class): - labels_equal_to_class = labels == class_label - self.normalization[class_label] += np.sum(labels_equal_to_class & to_increment) - self.prune_counts[class_label] += np.sum( - labels_equal_to_class & to_increment & (pred_class != labels) + self.prune_count += prune_count_batch + else: # calibrated + self.class_counts += value_counts_fill_missing_classes( + labels, num_classes=self.num_class ) + to_increment = ( + pred_probs[np.arange(batch_size), max_ind] >= adj_confident_thresholds[max_ind] + ) + for class_label in range(self.num_class): + labels_equal_to_class = labels == class_label + self.normalization[class_label] += np.sum(labels_equal_to_class & to_increment) + self.prune_counts[class_label] += np.sum( + labels_equal_to_class + & to_increment + & (max_ind != labels) + # & (pred_class != labels) + # This is not applied in num_label_issues(..., estimation_method="off_diagonal_custom"). Do we want to add it? 
+ ) + else: # multiprocessing implementation + global adj_confident_thresholds_shared + adj_confident_thresholds_shared = self.confident_thresholds - EPS + + global labels_shared, pred_probs_shared + labels_shared = labels + pred_probs_shared = pred_probs + if thorough: + use_thorough = np.ones(len(labels_shared), dtype=bool) + else: + use_thorough = np.zeros(len(labels_shared), dtype=bool) + inds = np.arange(len(labels_shared)) + args = zip(inds, use_thorough) + with mp.Pool(self.n_jobs) as pool: + if not self.off_diagonal_calibrated: + prune_count_batch = np.sum( + np.asarray(list(pool.imap_unordered(_compute_num_issues, args))) + ) + self.prune_count += prune_count_batch + else: + results = list(pool.imap_unordered(_compute_num_issues_calibrated, args)) + for result in results: + class_label = result[0] + self.class_counts[class_label] += 1 + self.normalization[class_label] += result[1] + self.prune_counts[class_label] += result[2] + + +def _compute_num_issues(arg: Tuple[int, bool]) -> int: + """ + Helper function for `_update_num_label_issues` multiprocessing without calibration + """ + ind = arg[0] + thorough = arg[1] + label = labels_shared[ind] + pred_prob = pred_probs_shared[ind, :] + pred_class = np.argmax(pred_prob, axis=-1) + if thorough: + pred_gt_thresholds = pred_prob >= adj_confident_thresholds_shared + max_ind = np.argmax(pred_prob * pred_gt_thresholds, axis=-1) + prune_count_batch = ( + (pred_prob[max_ind] >= adj_confident_thresholds_shared[max_ind]) + & (max_ind != label) + & (pred_class != label) + ) + else: + prune_count_batch = np.sum( + (pred_prob[pred_class] >= adj_confident_thresholds_shared[pred_class]) + & (pred_class != label) + ) + return prune_count_batch + + +def _compute_num_issues_calibrated(arg: Tuple[int, bool]) -> Tuple[Any, int, int]: + """ + Helper function for `_update_num_label_issues` multiprocessing with calibration + """ + ind = arg[0] + thorough = arg[1] + label = labels_shared[ind] + pred_prob = pred_probs_shared[ind, :] 
+ + pred_class = np.argmax(pred_prob, axis=-1) + if thorough: + pred_gt_thresholds = pred_prob >= adj_confident_thresholds_shared + max_ind = np.argmax(pred_prob * pred_gt_thresholds, axis=-1) + to_inc = pred_prob[max_ind] >= adj_confident_thresholds_shared[max_ind] + + prune_count_batch = to_inc & (max_ind != label) + normalization_batch = to_inc + else: + to_inc = pred_prob[pred_class] >= adj_confident_thresholds_shared[pred_class] + normalization_batch = to_inc + prune_count_batch = to_inc & (pred_class != label) + + return (label, normalization_batch, prune_count_batch) def _batch_check(labels: LabelLike, pred_probs: np.ndarray, num_class: int) -> np.ndarray: @@ -410,6 +555,7 @@ def find_label_issues_batched( labels: Optional[LabelLike] = None, pred_probs: Optional[np.ndarray] = None, batch_size: int = 10000, + n_jobs: Optional[int] = None, verbose: bool = True, quality_score_kwargs: Optional[dict] = None, num_issue_kwargs: Optional[dict] = None, @@ -452,6 +598,10 @@ def find_label_issues_batched( Size of mini-batches to use for estimating the label issues. To maximize efficiency, try to use the largest `batch_size` your memory allows. + n_jobs: int, optional + Number of processes for multiprocessing. Only used on Linux. + If `n_jobs=None`, will use either the number of: physical cores if psutil is installed, or logical cores otherwise. + verbose : bool, optional Whether to suppress print statements or not. 
@@ -509,6 +659,7 @@ def find_label_issues_batched( lab = LabelInspector( num_class=pred_probs.shape[1], verbose=verbose, + n_jobs=n_jobs, quality_score_kwargs=quality_score_kwargs, num_issue_kwargs=num_issue_kwargs, ) @@ -537,7 +688,7 @@ def find_label_issues_batched( labels_batch = labels[i:end_index] pred_probs_batch = pred_probs[i:end_index, :] i = end_index - batch_results = lab.score_label_quality(labels_batch, pred_probs_batch) + _ = lab.score_label_quality(labels_batch, pred_probs_batch) if verbose: pbar.update(batch_size) diff --git a/tests/test_filter_count.py b/tests/test_filter_count.py index 8cc624f6a3..d4a1e8da8b 100644 --- a/tests/test_filter_count.py +++ b/tests/test_filter_count.py @@ -919,8 +919,15 @@ def test_batched_label_issues(): f4 = find_label_issues_batched( labels=data["labels"], pred_probs=data["pred_probs"], batch_size=len(data["labels"]) + 100 ) + f_single = find_label_issues_batched( + labels=data["labels"], + pred_probs=data["pred_probs"], + batch_size=len(data["labels"]), + n_jobs=1, + ) assert np.all(f4 == f3) assert np.all(f4 == f2) + assert np.all(f_single == f4) assert len(f2) == len(f1) # check jaccard similarity: intersection = len(list(set(f1).intersection(set(f2)))) @@ -955,9 +962,17 @@ def test_batched_label_issues(): batch_size=len(data["labels"]) + 100, **extra_args, ) + f_single = find_label_issues_batched( + labels=data["labels"], + pred_probs=data["pred_probs"], + batch_size=len(data["labels"]), + n_jobs=1, + **extra_args, + ) assert not np.array_equal(f5, f2) assert np.all(f7 == f5) assert np.all(f6 == f5) + assert np.all(f_single == f5) assert np.abs(len(f5) - n1) < 2 # Test batches loaded from file: labels_file = path.join(mkdtemp(), "labels.npy") From f4572dc543a1f39011f9f03d8a94dd075d3992bb Mon Sep 17 00:00:00 2001 From: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> Date: Sun, 12 Feb 2023 21:31:59 -0800 Subject: [PATCH 119/258] bugfix: missing self.n_jobs --- 
cleanlab/experimental/label_issues_batched.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cleanlab/experimental/label_issues_batched.py b/cleanlab/experimental/label_issues_batched.py index 9b528973d3..2ca60f3e15 100644 --- a/cleanlab/experimental/label_issues_batched.py +++ b/cleanlab/experimental/label_issues_batched.py @@ -190,6 +190,7 @@ def __init__( elif n_jobs is not None: self.n_jobs = n_jobs else: + self.n_jobs = None if PSUTIL_EXISTS: self.n_jobs = psutil.cpu_count(logical=False) # physical cores if not self.n_jobs: From 8c424d3b0d7d40cb133fb09e7f512b867f80e697 Mon Sep 17 00:00:00 2001 From: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> Date: Sun, 12 Feb 2023 22:06:40 -0800 Subject: [PATCH 120/258] mypy tpye annotations --- cleanlab/experimental/label_issues_batched.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cleanlab/experimental/label_issues_batched.py b/cleanlab/experimental/label_issues_batched.py index 2ca60f3e15..1a1c7be424 100644 --- a/cleanlab/experimental/label_issues_batched.py +++ b/cleanlab/experimental/label_issues_batched.py @@ -180,6 +180,7 @@ def __init__( ) self.examples_processed_quality = 0 # number of examples seen so far for estimating label quality and number of label issues # Determine number of cores for multiprocessing: + self.n_jobs: Optional[int] = None os_name = platform.system() if os_name != "Linux": self.n_jobs = 1 @@ -190,7 +191,6 @@ def __init__( elif n_jobs is not None: self.n_jobs = n_jobs else: - self.n_jobs = None if PSUTIL_EXISTS: self.n_jobs = psutil.cpu_count(logical=False) # physical cores if not self.n_jobs: From 731dd419a15f817fd468c323ce6d055abed7b4f9 Mon Sep 17 00:00:00 2001 From: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> Date: Sun, 12 Feb 2023 23:59:26 -0800 Subject: [PATCH 121/258] Support Zarr files in find_label_issues_batched (#632) --- cleanlab/experimental/label_issues_batched.py | 418 +++++++++--------- docs/source/tutorials/faq.ipynb | 2 +- 2 
files changed, 222 insertions(+), 198 deletions(-) diff --git a/cleanlab/experimental/label_issues_batched.py b/cleanlab/experimental/label_issues_batched.py index 1a1c7be424..ee59a7958e 100644 --- a/cleanlab/experimental/label_issues_batched.py +++ b/cleanlab/experimental/label_issues_batched.py @@ -23,52 +23,8 @@ With default settings, the results returned from this approach closely approximate those returned from: ``cleanlab.filter.find_label_issues(..., filter_by="low_self_confidence", return_indices_ranked_by="self_confidence")`` -To run this approach, either follow the examples script below, -or use the ``find_label_issues_batched()`` convenience function defined in this module. - -The recommended usage demonstrated in the examples script involves two passes over your data: -one pass to compute `confident_thresholds`, another to evaluate each label. -To maximize efficiency, try to use the largest batch_size your memory allows. -To reduce runtime further, you can run the first pass on a subset of your dataset -as long as it contains enough data from each class to estimate `confident_thresholds` accurately. - -In the examples script below: -- `labels` is a (big) 1D ``np.ndarray`` of class labels represented as integers in ``0,1,...,K-1``. -- ``pred_probs`` = is a (big) 2D ``np.ndarray`` of predicted class probabilities, -where each row is an example, each column represents a class. - -`labels` and `pred_probs` can be stored in a file instead where you load chunks of them at a time. 
-Methods to load arrays in chunks include: ``np.load(...,mmap_mode='r')``, ``numpy.memmap()``, -HDF5 or Zarr files, see: https://pythonspeed.com/articles/mmap-vs-zarr-hdf5/ - -Examples --------- ->>> n = len(labels) ->>> batch_size = 10000 # you can change this in between batches, set as big as your RAM allows ->>> lab = LabelInspector(num_class = pred_probs.shape[1]) ->>> # First compute confident thresholds (for faster results, can also do this on a random subset of your data): ->>> i = 0 ->>> while i < n: ->>> end_index = i + batch_size ->>> labels_batch = labels[i:end_index] ->>> pred_probs_batch = pred_probs[i:end_index,:] ->>> i = end_index ->>> lab.update_confident_thresholds(labels_batch, pred_probs_batch) ->>> # See what we calculated: ->>> confident_thresholds = lab.get_confident_thresholds() ->>> # Evaluate the quality of the labels (run this on full dataset you want to evaluate): ->>> i = 0 ->>> while i < n: ->>> end_index = i + batch_size ->>> labels_batch = labels[i:end_index] ->>> pred_probs_batch = pred_probs[i:end_index,:] ->>> i = end_index ->>> batch_results = lab.score_label_quality(labels_batch, pred_probs_batch) ->>> # Indices of examples with label issues, sorted by label quality score (most severe to least severe): ->>> indices_of_examples_with_issues = lab.get_label_issues() ->>> # If your `pred_probs` and `labels` are arrays already in memory, ->>> # then you can use this shortcut for all of the above: ->>> indices_of_examples_with_issues = find_label_issues_batched(labels, pred_probs, batch_size=10000) +To run this approach, either use the ``find_label_issues_batched()`` convenience function defined in this module, +or follow the examples script for the ``LabelInspector`` class if you require greater customization. 
""" import numpy as np @@ -97,6 +53,180 @@ pred_probs_shared: np.ndarray +def find_label_issues_batched( + labels: Optional[LabelLike] = None, + pred_probs: Optional[np.ndarray] = None, + *, + labels_file: Optional[str] = None, + pred_probs_file: Optional[str] = None, + batch_size: int = 10000, + n_jobs: Optional[int] = None, + verbose: bool = True, + quality_score_kwargs: Optional[dict] = None, + num_issue_kwargs: Optional[dict] = None, +) -> np.ndarray: + """ + Variant of :py:func:`filter.find_label_issues ` + that requires less memory by reading from `pred_probs`, `labels` in mini-batches. + To avoid loading big `pred_probs`, `labels` arrays into memory, + provide these as memory-mapped objects like Zarr arrays or memmap arrays instead of regular numpy arrays. + See: https://pythonspeed.com/articles/mmap-vs-zarr-hdf5/ + + With default settings, the results returned from this method closely approximate those returned from: + ``cleanlab.filter.find_label_issues(..., filter_by="low_self_confidence", return_indices_ranked_by="self_confidence")`` + + This function internally implements the example usage script of the ``LabelInspector`` class, + but you can further customize that script by running it yourself instead of this function. + See the documentation of ``LabelInspector`` to learn more about how this method works internally. + + Parameters + ---------- + labels: np.ndarray-like object, optional + 1D array of given class labels for each example in the dataset, (int) values in ``0,1,2,...,K-1``. + To avoid loading big objects into memory, you should pass this as a memory-mapped object like: + Zarr array loaded with ``zarr.convenience.open(YOURFILE.zarr, mode="r")``, + or memmap array loaded with ``np.load(YOURFILE.npy, mmap_mode="r")``. + + Tip: You can save an existing numpy array to Zarr via: ``zarr.convenience.save_array(YOURFILE.zarr, your_array)``, + or to .npy file that can be loaded with mmap via: ``np.save(YOURFILE.npy, your_array)``. 
+ + pred_probs: np.ndarray-like object, optional + 2D array of model-predicted class probabilities (floats) for each example in the dataset. + To avoid loading big objects into memory, you should pass this as a memory-mapped object like: + Zarr array loaded with ``zarr.convenience.open(YOURFILE.zarr, mode="r")`` + or memmap array loaded with ``np.load(YOURFILE.npy, mmap_mode="r")``. + + labels_file: str, optional + Specify this instead of `labels` if you want this method to load from file for you into a memmap array. + Path to .npy file where the entire 1D `labels` numpy array is stored on disk (list format is not supported). + This is loaded using: ``np.load(labels_file, mmap_mode="r")`` + so make sure this file was created via: ``np.save()`` or other compatible methods (.npz not supported). + + pred_probs_file: str, optional + Specify this instead of `pred_probs` if you want this method to load from file for you into a memmap array. + Path to .npy file where the entire `pred_probs` numpy array is stored on disk. + This is loaded using: ``np.load(pred_probs_file, mmap_mode="r")`` + so make sure this file was created via: ``np.save()`` or other compatible methods (.npz not supported). + + batch_size : int, optional + Size of mini-batches to use for estimating the label issues. + To maximize efficiency, try to use the largest `batch_size` your memory allows. + + n_jobs: int, optional + Number of processes for multiprocessing. Only used on Linux. + If `n_jobs=None`, will use either the number of: physical cores if psutil is installed, or logical cores otherwise. + + verbose : bool, optional + Whether to suppress print statements or not. + + quality_score_kwargs : dict, optional + Keyword arguments to pass into :py:func:`rank.get_label_quality_scores `. + + num_issue_kwargs : dict, optional + Keyword arguments to :py:func:`count.num_label_issues()` ` + to control estimation of the number of label issues. + The only supported kwarg here for now is: `estimation_method`. 
+ + Returns + ------- + issue_indices : np.ndarray + Indices of examples with label issues, sorted by label quality score. + + Examples + -------- + >>> batch_size = 10000 # for efficiency, set this to as large of a value as your memory can handle + >>> # Just demonstrating how to save your existing numpy labels, pred_probs arrays to compatible .npy files: + >>> np.save("LABELS.npy", labels_array) + >>> np.save("PREDPROBS.npy", pred_probs_array) + >>> # You can load these back into memmap arrays via: labels = np.load("LABELS.npy", mmap_mode="r") + >>> # and then run this method on the memmap arrays, or just run it directly on the .npy files like this: + >>> issues = find_label_issues_batched(labels_file="LABELS.npy", pred_probs_file="PREDPROBS.npy", batch_size=batch_size) + >>> # This method also works with Zarr arrays: + >>> import zarr + >>> # Just demonstrating how to save your existing numpy labels, pred_probs arrays to compatible .zarr files: + >>> zarr.convenience.save_array("LABELS.zarr", labels_array) + >>> zarr.convenience.save_array("PREDPROBS.zarr", pred_probs_array) + >>> # You can load from such files into Zarr arrays: + >>> labels = zarr.convenience.open("LABELS.zarr", mode="r") + >>> pred_probs = zarr.convenience.open("PREDPROBS.zarr", mode="r") + >>> # This method can be directly run on Zarr arrays, memmap arrays, or regular numpy arrays: + >>> issues = find_label_issues_batched(labels=labels, pred_probs=pred_probs, batch_size=batch_size) + """ + if labels_file is not None: + if labels is not None: + raise ValueError("only specify one of: `labels` or `labels_file`") + if not isinstance(labels_file, str): + raise ValueError( + "labels_file must be str specifying path to .npy file containing the array of labels" + ) + labels = np.load(labels_file, mmap_mode="r") + assert isinstance(labels, np.ndarray) + + if pred_probs_file is not None: + if pred_probs is not None: + raise ValueError("only specify one of: `pred_probs` or `pred_probs_file`") + if not 
isinstance(pred_probs_file, str): + raise ValueError( + "pred_probs_file must be str specifying path to .npy file containing 2D array of pred_probs" + ) + pred_probs = np.load(pred_probs_file, mmap_mode="r") + assert isinstance(pred_probs, np.ndarray) + if verbose: + print( + f"mmap-loaded numpy arrays have: {len(pred_probs)} examples, {pred_probs.shape[1]} classes" + ) + if labels is None: + raise ValueError("must provide one of: `labels` or `labels_file`") + if pred_probs is None: + raise ValueError("must provide one of: `pred_probs` or `pred_probs_file`") + + assert pred_probs is not None + if len(labels) != len(pred_probs): + raise ValueError( + f"len(labels)={len(labels)} does not match len(pred_probs)={len(pred_probs)}. Perhaps an issue loading mmap numpy arrays from file." + ) + lab = LabelInspector( + num_class=pred_probs.shape[1], + verbose=verbose, + n_jobs=n_jobs, + quality_score_kwargs=quality_score_kwargs, + num_issue_kwargs=num_issue_kwargs, + ) + n = len(labels) + if verbose: + from tqdm.auto import tqdm + + pbar = tqdm(desc="number of examples processed for estimating thresholds", total=n) + i = 0 + while i < n: + end_index = i + batch_size + labels_batch = labels[i:end_index] + pred_probs_batch = pred_probs[i:end_index, :] + i = end_index + lab.update_confident_thresholds(labels_batch, pred_probs_batch) + if verbose: + pbar.update(batch_size) + + # Next evaluate the quality of the labels (run this on full dataset you want to evaluate): + if verbose: + pbar.close() + pbar = tqdm(desc="number of examples processed for checking labels", total=n) + i = 0 + while i < n: + end_index = i + batch_size + labels_batch = labels[i:end_index] + pred_probs_batch = pred_probs[i:end_index, :] + i = end_index + _ = lab.score_label_quality(labels_batch, pred_probs_batch) + if verbose: + pbar.update(batch_size) + + if verbose: + pbar.close() + + return lab.get_label_issues() + + class LabelInspector: """ Class for finding label issues in big datasets where memory 
becomes a problem for other cleanlab methods. @@ -107,6 +237,50 @@ class LabelInspector: Do NOT modify any of the attributes of this class yourself! Multi-label classification is not supported by this class, it is only for multi-class classification. + The recommended usage demonstrated in the examples script below involves two passes over your data: + one pass to compute `confident_thresholds`, another to evaluate each label. + To maximize efficiency, try to use the largest batch_size your memory allows. + To reduce runtime further, you can run the first pass on a subset of your dataset + as long as it contains enough data from each class to estimate `confident_thresholds` accurately. + + In the examples script below: + - `labels` is a (big) 1D ``np.ndarray`` of class labels represented as integers in ``0,1,...,K-1``. + - ``pred_probs`` = is a (big) 2D ``np.ndarray`` of predicted class probabilities, + where each row is an example, each column represents a class. + + `labels` and `pred_probs` can be stored in a file instead where you load chunks of them at a time. 
+ Methods to load arrays in chunks include: ``np.load(...,mmap_mode='r')``, ``numpy.memmap()``, + HDF5 or Zarr files, see: https://pythonspeed.com/articles/mmap-vs-zarr-hdf5/ + + Examples + -------- + >>> n = len(labels) + >>> batch_size = 10000 # you can change this in between batches, set as big as your RAM allows + >>> lab = LabelInspector(num_class = pred_probs.shape[1]) + >>> # First compute confident thresholds (for faster results, can also do this on a random subset of your data): + >>> i = 0 + >>> while i < n: + >>> end_index = i + batch_size + >>> labels_batch = labels[i:end_index] + >>> pred_probs_batch = pred_probs[i:end_index,:] + >>> i = end_index + >>> lab.update_confident_thresholds(labels_batch, pred_probs_batch) + >>> # See what we calculated: + >>> confident_thresholds = lab.get_confident_thresholds() + >>> # Evaluate the quality of the labels (run this on full dataset you want to evaluate): + >>> i = 0 + >>> while i < n: + >>> end_index = i + batch_size + >>> labels_batch = labels[i:end_index] + >>> pred_probs_batch = pred_probs[i:end_index,:] + >>> i = end_index + >>> batch_results = lab.score_label_quality(labels_batch, pred_probs_batch) + >>> # Indices of examples with label issues, sorted by label quality score (most severe to least severe): + >>> indices_of_examples_with_issues = lab.get_label_issues() + >>> # If your `pred_probs` and `labels` are arrays already in memory, + >>> # then you can use this shortcut for all of the above: + >>> indices_of_examples_with_issues = find_label_issues_batched(labels, pred_probs, batch_size=10000) + Parameters ---------- num_class : int @@ -352,7 +526,7 @@ def score_label_quality( Parameters ---------- - labels: np.ndarray or list + labels: np.ndarray Given class labels for each example in the batch, values in ``0,1,2,...,K-1``. 
pred_probs: np.ndarray @@ -547,153 +721,3 @@ def _batch_check(labels: LabelLike, pred_probs: np.ndarray, num_class: int) -> n raise ValueError("num_class must equal pred_probs.shape[1]") return labels - - -def find_label_issues_batched( - *, - labels_file: Optional[str] = None, - pred_probs_file: Optional[str] = None, - labels: Optional[LabelLike] = None, - pred_probs: Optional[np.ndarray] = None, - batch_size: int = 10000, - n_jobs: Optional[int] = None, - verbose: bool = True, - quality_score_kwargs: Optional[dict] = None, - num_issue_kwargs: Optional[dict] = None, -) -> np.ndarray: - """ - Variant of :py:func:`filter.find_label_issues ` - that requires less memory by reading `pred_probs`, `labels` in mini-batches, if provided as files. - Only .npy files are supported (not .npz), and these must be loadable via: ``np.load(your_file, mmap_mode="r")``. - If you want to read from other file-types (eg. HDF5 or Zarr) instead, - see the example usage of the ``LabelInspector`` class. - - This function basically implements the example ``LabelInspector`` usage script, - but you can further customize that script by running it yourself. - See the documentation of ``LabelInspector`` to learn more about how this method works internally. - - With default settings, the results returned from this method closely approximate those returned from: - ``cleanlab.filter.find_label_issues(..., filter_by="low_self_confidence", return_indices_ranked_by="self_confidence")`` - - Parameters - ---------- - labels_file: str, optional - Path to .npy file where the entire 1D `labels` numpy array is stored on disk (list format is not supported). - This is loaded using: ``np.load(labels_file, mmap_mode="r")`` - so make sure this file was created via: ``np.save()`` or other compatible methods. - - pred_probs_file: str, optional - Path to .npy file where the entire `pred_probs` numpy array is stored on disk. 
- This is loaded using: ``np.load(pred_probs_file, mmap_mode="r")`` - so make sure this file was created via: ``np.save()`` or other compatible methods. - - labels: np.ndarray or list, optional - Given class labels for each example in the dataset, (int) values in ``0,1,2,...,K-1``. - Recommend providing `labels_file` instead of `labels` to avoid loading big objects into memory. - - pred_probs: np.ndarray, optional - 2D array of model-predicted class probabilities (floats) for each example in the dataset. - Recommend providing `pred_probs_file` instead of `pred_probs` to avoid loading big objects into memory. - - batch_size : int, optional - Size of mini-batches to use for estimating the label issues. - To maximize efficiency, try to use the largest `batch_size` your memory allows. - - n_jobs: int, optional - Number of processes for multiprocessing. Only used on Linux. - If `n_jobs=None`, will use either the number of: physical cores if psutil is installed, or logical cores otherwise. - - verbose : bool, optional - Whether to suppress print statements or not. - - quality_score_kwargs : dict, optional - Keyword arguments to pass into :py:func:`rank.get_label_quality_scores `. - - num_issue_kwargs : dict, optional - Keyword arguments to :py:func:`count.num_label_issues()` ` - to control estimation of the number of label issues. - The only supported kwarg here for now is: `estimation_method`. - - Returns - ------- - issue_indices : np.ndarray - Indices of examples with label issues, sorted by label quality score. 
- - Examples - -------- - >>> batch_size = 10000 # for efficiency, set this to as large of a value as your memory can handle - >>> issues = find_label_issues_batched(labels_file="LABELS.npy", pred_probs_file="PREDPROBS.npy", batch_size=batch_size) - """ - if labels_file is not None: - if labels is not None: - raise ValueError("only specify one of: `labels` or `labels_file`") - if not isinstance(labels_file, str): - raise ValueError( - "labels_file must be str specifying path to .npy file containing the array of labels" - ) - labels = np.load(labels_file, mmap_mode="r") - assert isinstance(labels, np.ndarray) - - if pred_probs_file is not None: - if pred_probs is not None: - raise ValueError("only specify one of: `pred_probs` or `pred_probs_file`") - if not isinstance(pred_probs_file, str): - raise ValueError( - "pred_probs_file must be str specifying path to .npy file containing 2D array of pred_probs" - ) - pred_probs = np.load(pred_probs_file, mmap_mode="r") - assert isinstance(pred_probs, np.ndarray) - if verbose: - print( - f"mmap-loaded numpy arrays have: {len(pred_probs)} examples, {pred_probs.shape[1]} classes" - ) - if labels is None: - raise ValueError("must provide one of: `labels` or `labels_file`") - if pred_probs is None: - raise ValueError("must provide one of: `pred_probs` or `pred_probs_file`") - - assert isinstance(pred_probs, np.ndarray) - if len(labels) != len(pred_probs): - raise ValueError( - f"len(labels)={len(labels)} does not match len(pred_probs)={len(pred_probs)}. Perhaps an issue loading mmap numpy arrays from file." 
- ) - lab = LabelInspector( - num_class=pred_probs.shape[1], - verbose=verbose, - n_jobs=n_jobs, - quality_score_kwargs=quality_score_kwargs, - num_issue_kwargs=num_issue_kwargs, - ) - n = len(labels) - if verbose: - from tqdm.auto import tqdm - - pbar = tqdm(desc="number of examples processed for estimating thresholds", total=n) - i = 0 - while i < n: - end_index = i + batch_size - labels_batch = labels[i:end_index] - pred_probs_batch = pred_probs[i:end_index, :] - i = end_index - lab.update_confident_thresholds(labels_batch, pred_probs_batch) - if verbose: - pbar.update(batch_size) - - # Next evaluate the quality of the labels (run this on full dataset you want to evaluate): - if verbose: - pbar.close() - pbar = tqdm(desc="number of examples processed for checking labels", total=n) - i = 0 - while i < n: - end_index = i + batch_size - labels_batch = labels[i:end_index] - pred_probs_batch = pred_probs[i:end_index, :] - i = end_index - _ = lab.score_label_quality(labels_batch, pred_probs_batch) - if verbose: - pbar.update(batch_size) - - if verbose: - pbar.close() - - return lab.get_label_issues() diff --git a/docs/source/tutorials/faq.ipynb b/docs/source/tutorials/faq.ipynb index 6f9972505a..1152e5c86e 100644 --- a/docs/source/tutorials/faq.ipynb +++ b/docs/source/tutorials/faq.ipynb @@ -221,7 +221,7 @@ "id": "089f505e", "metadata": {}, "source": [ - "For a dataset with many rows and/or classes, there are more efficient methods in the `label_issues_batched` module. These methods read data in mini-batches and you can reduce the `batch_size` to control how much memory they require. Below is an example of how to use the `find_label_issues_batched()` method from this module, which can load mini-batches of data from `labels`, `pred_probs` saved as .npy files on disk. Check out the `LabelInspector` class from this module if you instead have `labels`, `pred_probs` saved as other file-types (eg. HDF5 or Zarr)." 
+ "For a dataset with many rows and/or classes, there are more efficient methods in the `label_issues_batched` module. These methods read data in mini-batches and you can reduce the `batch_size` to control how much memory they require. Below is an example of how to use the `find_label_issues_batched()` method from this module, which can load mini-batches of data from `labels`, `pred_probs` saved as .npy files on disk. You can also run this method on Zarr arrays loaded from .zarr files. If you need greater flexibility, check out the `LabelInspector` class from this module." ] }, { From c85bc7e58910750c2cb602216fb1367f0d817725 Mon Sep 17 00:00:00 2001 From: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> Date: Mon, 13 Feb 2023 17:04:55 -0800 Subject: [PATCH 122/258] default n_jobs in label issues batched to =1 (#633) --- cleanlab/experimental/label_issues_batched.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cleanlab/experimental/label_issues_batched.py b/cleanlab/experimental/label_issues_batched.py index ee59a7958e..8ccaadd10e 100644 --- a/cleanlab/experimental/label_issues_batched.py +++ b/cleanlab/experimental/label_issues_batched.py @@ -60,7 +60,7 @@ def find_label_issues_batched( labels_file: Optional[str] = None, pred_probs_file: Optional[str] = None, batch_size: int = 10000, - n_jobs: Optional[int] = None, + n_jobs: Optional[int] = 1, verbose: bool = True, quality_score_kwargs: Optional[dict] = None, num_issue_kwargs: Optional[dict] = None, @@ -113,7 +113,7 @@ def find_label_issues_batched( To maximize efficiency, try to use the largest `batch_size` your memory allows. n_jobs: int, optional - Number of processes for multiprocessing. Only used on Linux. + Number of processes for multiprocessing (default value = 1). Only used on Linux. If `n_jobs=None`, will use either the number of: physical cores if psutil is installed, or logical cores otherwise. 
verbose : bool, optional @@ -298,7 +298,7 @@ class LabelInspector: Whether to suppress print statements or not. n_jobs: int, optional - Number of processes for multiprocessing. Only used on Linux. + Number of processes for multiprocessing (default value = 1). Only used on Linux. If `n_jobs=None`, will use either the number of: physical cores if psutil is installed, or logical cores otherwise. quality_score_kwargs : dict, optional @@ -318,7 +318,7 @@ def __init__( verbose: bool = True, quality_score_kwargs: Optional[dict] = None, num_issue_kwargs: Optional[dict] = None, - n_jobs: Optional[int] = None, + n_jobs: Optional[int] = 1, ): if quality_score_kwargs is None: quality_score_kwargs = {} From 3757637d8cf30ad0712b2a2080c9f9c1147da7f4 Mon Sep 17 00:00:00 2001 From: clu0 <33559427+clu0@users.noreply.github.com> Date: Wed, 15 Feb 2023 16:14:39 -0500 Subject: [PATCH 123/258] Fix batched multiprocessing being slower on tall matrices (#634) Co-authored-by: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> --- cleanlab/experimental/label_issues_batched.py | 45 ++++++++++++++----- docs/source/tutorials/faq.ipynb | 2 +- tests/test_filter_count.py | 8 +++- 3 files changed, 43 insertions(+), 12 deletions(-) diff --git a/cleanlab/experimental/label_issues_batched.py b/cleanlab/experimental/label_issues_batched.py index 8ccaadd10e..8fe52a9be6 100644 --- a/cleanlab/experimental/label_issues_batched.py +++ b/cleanlab/experimental/label_issues_batched.py @@ -631,11 +631,19 @@ def _update_num_label_issues( global labels_shared, pred_probs_shared labels_shared = labels pred_probs_shared = pred_probs + + # good values for this are ~1000-10000 in benchmarks where pred_probs has 1B entries: + processes = 5000 + if len(labels) <= processes: + chunksize = 1 + else: + chunksize = len(labels) // processes + inds = split_arr(np.arange(len(labels)), chunksize) + if thorough: - use_thorough = np.ones(len(labels_shared), dtype=bool) + use_thorough = np.ones(len(inds), dtype=bool) 
else: - use_thorough = np.zeros(len(labels_shared), dtype=bool) - inds = np.arange(len(labels_shared)) + use_thorough = np.zeros(len(inds), dtype=bool) args = zip(inds, use_thorough) with mp.Pool(self.n_jobs) as pool: if not self.off_diagonal_calibrated: @@ -652,7 +660,14 @@ def _update_num_label_issues( self.prune_counts[class_label] += result[2] -def _compute_num_issues(arg: Tuple[int, bool]) -> int: +def split_arr(arr: np.ndarray, chunksize: int) -> List[np.ndarray]: + """ + Helper function to split array into chunks for multiprocessing + """ + return np.split(arr, np.arange(chunksize, arr.shape[0], chunksize), axis=0) + + +def _compute_num_issues(arg: Tuple[np.ndarray, bool]) -> int: """ Helper function for `_update_num_label_issues` multiprocessing without calibration """ @@ -661,23 +676,27 @@ def _compute_num_issues(arg: Tuple[int, bool]) -> int: label = labels_shared[ind] pred_prob = pred_probs_shared[ind, :] pred_class = np.argmax(pred_prob, axis=-1) + batch_size = len(label) if thorough: pred_gt_thresholds = pred_prob >= adj_confident_thresholds_shared max_ind = np.argmax(pred_prob * pred_gt_thresholds, axis=-1) - prune_count_batch = ( - (pred_prob[max_ind] >= adj_confident_thresholds_shared[max_ind]) + prune_count_batch = np.sum( + (pred_prob[np.arange(batch_size), max_ind] >= adj_confident_thresholds_shared[max_ind]) & (max_ind != label) & (pred_class != label) ) else: prune_count_batch = np.sum( - (pred_prob[pred_class] >= adj_confident_thresholds_shared[pred_class]) + ( + pred_prob[np.arange(batch_size), pred_class] + >= adj_confident_thresholds_shared[pred_class] + ) & (pred_class != label) ) return prune_count_batch -def _compute_num_issues_calibrated(arg: Tuple[int, bool]) -> Tuple[Any, int, int]: +def _compute_num_issues_calibrated(arg: Tuple[np.ndarray, bool]) -> Tuple[Any, int, int]: """ Helper function for `_update_num_label_issues` multiprocessing with calibration """ @@ -685,17 +704,23 @@ def _compute_num_issues_calibrated(arg: Tuple[int, 
bool]) -> Tuple[Any, int, int thorough = arg[1] label = labels_shared[ind] pred_prob = pred_probs_shared[ind, :] + batch_size = len(label) pred_class = np.argmax(pred_prob, axis=-1) if thorough: pred_gt_thresholds = pred_prob >= adj_confident_thresholds_shared max_ind = np.argmax(pred_prob * pred_gt_thresholds, axis=-1) - to_inc = pred_prob[max_ind] >= adj_confident_thresholds_shared[max_ind] + to_inc = ( + pred_prob[np.arange(batch_size), max_ind] >= adj_confident_thresholds_shared[max_ind] + ) prune_count_batch = to_inc & (max_ind != label) normalization_batch = to_inc else: - to_inc = pred_prob[pred_class] >= adj_confident_thresholds_shared[pred_class] + to_inc = ( + pred_prob[np.arange(batch_size), pred_class] + >= adj_confident_thresholds_shared[pred_class] + ) normalization_batch = to_inc prune_count_batch = to_inc & (pred_class != label) diff --git a/docs/source/tutorials/faq.ipynb b/docs/source/tutorials/faq.ipynb index 1152e5c86e..1ecdc4f0f1 100644 --- a/docs/source/tutorials/faq.ipynb +++ b/docs/source/tutorials/faq.ipynb @@ -221,7 +221,7 @@ "id": "089f505e", "metadata": {}, "source": [ - "For a dataset with many rows and/or classes, there are more efficient methods in the `label_issues_batched` module. These methods read data in mini-batches and you can reduce the `batch_size` to control how much memory they require. Below is an example of how to use the `find_label_issues_batched()` method from this module, which can load mini-batches of data from `labels`, `pred_probs` saved as .npy files on disk. You can also run this method on Zarr arrays loaded from .zarr files. If you need greater flexibility, check out the `LabelInspector` class from this module." + "For a dataset with many rows and/or classes, there are more efficient methods in the `label_issues_batched` module. These methods read data in mini-batches and you can reduce the `batch_size` to control how much memory they require. 
Below is an example of how to use the `find_label_issues_batched()` method from this module, which can load mini-batches of data from `labels`, `pred_probs` saved as .npy files on disk. You can also run this method on Zarr arrays loaded from .zarr files. Try playing with the `n_jobs` argument for further multiprocessing speedups. If you need greater flexibility, check out the `LabelInspector` class from this module." ] }, { diff --git a/tests/test_filter_count.py b/tests/test_filter_count.py index d4a1e8da8b..a7b7afd259 100644 --- a/tests/test_filter_count.py +++ b/tests/test_filter_count.py @@ -915,9 +915,13 @@ def test_batched_label_issues(): labels=data["labels"], pred_probs=data["pred_probs"], batch_size=int(len(data["labels"]) / 2.0), + n_jobs=None, ) f4 = find_label_issues_batched( - labels=data["labels"], pred_probs=data["pred_probs"], batch_size=len(data["labels"]) + 100 + labels=data["labels"], + pred_probs=data["pred_probs"], + batch_size=len(data["labels"]) + 100, + n_jobs=4, ) f_single = find_label_issues_batched( labels=data["labels"], @@ -954,12 +958,14 @@ def test_batched_label_issues(): labels=data["labels"], pred_probs=data["pred_probs"], batch_size=int(len(data["labels"]) / 2.0), + n_jobs=None, **extra_args, ) f7 = find_label_issues_batched( labels=data["labels"], pred_probs=data["pred_probs"], batch_size=len(data["labels"]) + 100, + n_jobs=4, **extra_args, ) f_single = find_label_issues_batched( From 6eb60198e60c7425c52fe88f0b1c2ea2d1d856bc Mon Sep 17 00:00:00 2001 From: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> Date: Wed, 15 Feb 2023 17:34:11 -0800 Subject: [PATCH 124/258] update tests to be more stringent (#635) --- docs/source/tutorials/faq.ipynb | 4 ++-- tests/test_filter_count.py | 6 ++---- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/docs/source/tutorials/faq.ipynb b/docs/source/tutorials/faq.ipynb index 1ecdc4f0f1..a1978e1417 100644 --- a/docs/source/tutorials/faq.ipynb +++ 
b/docs/source/tutorials/faq.ipynb @@ -262,13 +262,13 @@ "source": [ "# This cell is hidden on docs.cleanlab.ai, and is only for internal testing. You can ignore it.\n", "\n", - "issue_indices = cleanlab.filter.find_label_issues(labels, pred_probs, filter_by = \"confident_learning\", return_indices_ranked_by=\"self_confidence\")\n", + "issue_indices = cleanlab.filter.find_label_issues(labels, pred_probs, filter_by = \"low_self_confidence\", return_indices_ranked_by=\"self_confidence\")\n", "assert np.abs(len(issue_indices) - len(indices_of_examples_with_issues)) < 2, \"num issues differ in batched mode\"\n", "set1 = set(issue_indices)\n", "set2 = set(indices_of_examples_with_issues)\n", "intersection = len(list(set1.intersection(set2)))\n", "union = len(set1) + len(set2) - intersection\n", - "assert float(intersection) / union > 0.9, \"issue indices differ in batched mode\"" + "assert float(intersection) / union > 0.95, \"issue indices differ in batched mode\"" ] }, { diff --git a/tests/test_filter_count.py b/tests/test_filter_count.py index a7b7afd259..26c82c6214 100644 --- a/tests/test_filter_count.py +++ b/tests/test_filter_count.py @@ -902,9 +902,7 @@ def test_batched_label_issues(): labels=data["labels"], pred_probs=data["pred_probs"], return_indices_ranked_by="self_confidence", - filter_by="confident_learning", - # TODO: replace the above line with: - # filter_by="low_self_confidence", + filter_by="low_self_confidence", ) f2 = find_label_issues_batched( labels=data["labels"], @@ -936,7 +934,7 @@ def test_batched_label_issues(): # check jaccard similarity: intersection = len(list(set(f1).intersection(set(f2)))) union = len(set(f1)) + len(set(f2)) - intersection - assert float(intersection) / union > 0.6 + assert float(intersection) / union > 0.95 n1 = count.num_label_issues( labels=data["labels"], pred_probs=data["pred_probs"], From 570ecbdcb81074a31a7b5a61997de7ae37b32c49 Mon Sep 17 00:00:00 2001 From: Anish Athalye Date: Sun, 19 Feb 2023 09:55:41 -0500 
Subject: [PATCH 125/258] Switch to typing.Self (#489) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PEP 673 [1] introduces `typing.Self` as a simple way to annotate methods that return an instance of their class. This patch adopts `typing.Self`. Because this is only included in Python 3.11, for backwards compatibility, this patch adds typing-extensions [2], an officially-supported module that is designed for enabling use of new type system features on older Python versions. We don't want to make typing-extensions a runtime dependency, so we add a guard and only import it when we're `TYPE_CHECKING`. [1]: https://peps.python.org/pep-0673/ [2]: https://pypi.org/project/typing-extensions/ Co-authored-by: Elías Snorrason --- cleanlab/classification.py | 12 ++++++------ requirements-dev.txt | 7 ++++--- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/cleanlab/classification.py b/cleanlab/classification.py index 66126de932..c419ef5463 100644 --- a/cleanlab/classification.py +++ b/cleanlab/classification.py @@ -120,7 +120,10 @@ def score(self, X, y, sample_weight=None): import pandas as pd import inspect import warnings -from typing import TypeVar, Optional +from typing import Optional, TYPE_CHECKING + +if TYPE_CHECKING: # pragma: no cover + from typing_extensions import Self from cleanlab.rank import get_label_quality_scores from cleanlab import filter @@ -147,9 +150,6 @@ def score(self, X, y, sample_weight=None): ) -TCleanLearning = TypeVar("TCleanLearning", bound="CleanLearning") # self type for the class - - class CleanLearning(BaseEstimator): # Inherits sklearn classifier """ CleanLearning = Machine Learning with cleaned data (even when training on messy, error-ridden data). 
@@ -265,7 +265,7 @@ def __init__( self.clf_final_kwargs = None def fit( - self: TCleanLearning, + self, X, labels=None, *, @@ -279,7 +279,7 @@ def fit( clf_final_kwargs={}, validation_func=None, y=None, - ) -> TCleanLearning: + ) -> "Self": """ Train the model `clf` with error-prone, noisy labels as if the model had been instead trained on a dataset with the correct labels. diff --git a/requirements-dev.txt b/requirements-dev.txt index 1dc5dc705c..dd3d55e4fe 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -3,14 +3,15 @@ coverage != 6.3, != 6.3.* mypy pandas-stubs pre-commit +psutil pytest pytest-cov pytest-lazy-fixture requests scipy -torch -torchvision skorch tensorflow -psutil +torch +torchvision +typing-extensions>=4.1.1 wget From 1deb349e86f43a132d002108f990462bfd875cf1 Mon Sep 17 00:00:00 2001 From: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> Date: Mon, 27 Feb 2023 15:38:47 -0800 Subject: [PATCH 126/258] list more tasks in quickstart (#640) --- docs/source/index.rst | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index 76bebb5ace..79050a77e5 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -1,9 +1,9 @@ cleanlab documentation ====================== -`cleanlab `_ **automatically finds and fixes label issues in your ML datasets.** +`cleanlab `_ **automatically detects data and label issues in your ML datasets.** -| This reduces manual work needed to fix data errors and helps train reliable ML models on noisy real-world datasets. cleanlab has already found thousands of `label errors `_ in ImageNet, MNIST, and other popular ML benchmarking datasets, so let's get started with yours! +| This helps you improve your data and train reliable ML models on noisy real-world datasets. cleanlab has already found thousands of `label errors `_ in ImageNet, MNIST, and other popular ML benchmarking datasets. 
Beyond handling label errors, this is a comprehensive open-source library implementing many data-centric AI capabilities. Start using automation to improve your data in 5 minutes! Quickstart ========== @@ -35,7 +35,7 @@ Quickstart 2. Find label errors in your data --------------------------------- -cleanlab finds issues in *any dataset that a classifier can be trained on*. The cleanlab package *works with any model* by using model outputs (predicted probabilities) as input -- it doesn't depend on which model created those outputs. +cleanlab finds issues in *any dataset that a classifier can be trained on*. The cleanlab package *works with any ML model* by using model outputs (predicted probabilities) as input -- it doesn't depend on which model created those outputs. If you're using a scikit-learn-compatible model (option 1), you don't need to train a model -- you can pass the model, data, and labels into :py:meth:`CleanLearning.find_label_issues ` and cleanlab will handle model training for you. If you want to use any non-sklearn-compatible model (option 2), you can input the trained model's out-of-sample predicted probabilities into :py:meth:`find_label_issues `. Examples for both options are below. @@ -54,15 +54,15 @@ If you're using a scikit-learn-compatible model (option 1), you don't need to tr return_indices_ranked_by='self_confidence', ) -:py:class:`CleanLearning ` (option 1) also works with models from most standard ML frameworks by wrapping the model for scikit-learn compliance, e.g. huggingface/tensorflow/keras (using our KerasWrapperModel), pytorch (using skorch package), etc. +:py:class:`CleanLearning ` (option 1) also works with models from most standard ML frameworks by wrapping the model for scikit-learn compliance, e.g. :ref:`tensorflow/keras ` (using our KerasWrapperModel), :ref:`pytorch ` (using skorch package), etc. By default, :py:meth:`find_label_issues ` returns a boolean mask of label issues. 
You can instead return the indices of potential mislabeled examples by setting `return_indices_ranked_by` in :py:meth:`find_label_issues `. The indices are ordered by likelihood of a label error (estimated via :py:meth:`rank.get_label_quality_scores `). +Beyond standard classification tasks, cleanlab can also detect mislabeled examples in: :ref:`multi-label data ` (e.g. image/document tagging), :ref:`sequence prediction ` (e.g. entity recognition), and :ref:`data labeled by multiple annotators ` (e.g. crowdsourcing). + .. important:: Cleanlab performs better if the ``pred_probs`` from your model are **out-of-sample**. Details on how to compute out-of-sample predicted probabilities for your entire dataset are :ref:`here `. -.. - TODO - include the url for tf and torch beginner tutorials 3. Train robust models with noisy labels ---------------------------------------- @@ -86,9 +86,9 @@ When the :py:meth:`.fit() ` method is 4. Dataset curation: fix dataset-level issues --------------------------------------------- -cleanlab's :py:mod:`dataset ` module helps you deal with dataset-level issues by :ref:`finding overlapping classes ` (classes to merge), :ref:`rank class-level label quality ` (classes to keep/delete), and :ref:`measure overall dataset health ` (to track dataset quality as you make adjustments). +cleanlab's :ref:`dataset ` module helps you deal with dataset-level issues -- :py:meth:`find overlapping classes ` (classes to merge), :py:meth:`rank class-level label quality ` (classes to keep/delete), and :py:meth:`measure overall dataset health ` (to track dataset quality as you make adjustments). -The example below shows how to view all dataset-level issues in one line of code with :py:meth:`dataset.health_summary() `. Check out the dataset tutorial for more examples. +View all dataset-level issues in one line of code with :py:meth:`dataset.health_summary() `. .. 
code-block:: python @@ -97,6 +97,18 @@ The example below shows how to view all dataset-level issues in one line of code health_summary(labels, pred_probs, class_names=class_names) +5. Improve your data via many other techniques +---------------------------------------------- + +Beyond handling label errors, cleanlab supports other data-centric AI capabilities including: + +- Detecting outliers and out-of-distribution examples in both training and future test data :ref:`(tutorial) ` +- Analyzing data labeled by multiple annotators to estimate consensus labels and their quality :ref:`(tutorial) ` +- Active learning with multiple annotators to identify which data is most informative to label or re-label next `(tutorial) `_ + + +If you have questions, check out our :ref:`FAQ ` and feel free to ask in `Slack `_! + Contributing ------------ From 497947a867086bdeec80cb139c8c93f3c0b5b39d Mon Sep 17 00:00:00 2001 From: Hui Wen <45724323+huiwengoh@users.noreply.github.com> Date: Tue, 28 Feb 2023 12:15:19 -0500 Subject: [PATCH 127/258] Fix tutorial hyperlinks in docs (#642) --- DEVELOPMENT.md | 3 ++- docs/source/index.rst | 12 ++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index 6bc352cf3c..8b912a591f 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -213,7 +213,8 @@ Use relative linking to connect information between docs and jupyter notebooks, - Link another function from within a source code docstring: ``:py:func:`function_name ` `` - Link another class from within a source code docstring: ``:py:class:`class_name ` `` -- Link a tutorial notebook from within a source code docstring: ``:ref:`notebook_name ` `` +- Link a tutorial (rst file) from within a source code docstring or rst file: ``:ref:`tutorial_name ` `` +- Link a tutorial notebook (ipynb file) from within a source code docstring or rst file: `` `notebook_name `_ `` . 
(If the notebook is not the in the same folder as the source code, use a relative path) - Link a function from within a tutorial notebook: `[function_name](../cleanlab/file.rst#cleanlab.file.function_name)` - Link a specific section of a notebook from within the notebook: `[section title](#section-title)` - Link a different tutorial notebook from within a tutorial notebook: `[another notebook](another_notebook.html)`. (Note this only works when the other notebook is in same folder as this notebook, otherwise may need to try relative path) diff --git a/docs/source/index.rst b/docs/source/index.rst index 79050a77e5..a99ef826a8 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -54,11 +54,11 @@ If you're using a scikit-learn-compatible model (option 1), you don't need to tr return_indices_ranked_by='self_confidence', ) -:py:class:`CleanLearning ` (option 1) also works with models from most standard ML frameworks by wrapping the model for scikit-learn compliance, e.g. :ref:`tensorflow/keras ` (using our KerasWrapperModel), :ref:`pytorch ` (using skorch package), etc. +:py:class:`CleanLearning ` (option 1) also works with models from most standard ML frameworks by wrapping the model for scikit-learn compliance, e.g. `tensorflow/keras `_ (using our KerasWrapperModel), `pytorch `_ (using skorch package), etc. By default, :py:meth:`find_label_issues ` returns a boolean mask of label issues. You can instead return the indices of potential mislabeled examples by setting `return_indices_ranked_by` in :py:meth:`find_label_issues `. The indices are ordered by likelihood of a label error (estimated via :py:meth:`rank.get_label_quality_scores `). -Beyond standard classification tasks, cleanlab can also detect mislabeled examples in: :ref:`multi-label data ` (e.g. image/document tagging), :ref:`sequence prediction ` (e.g. entity recognition), and :ref:`data labeled by multiple annotators ` (e.g. crowdsourcing). 
+Beyond standard classification tasks, cleanlab can also detect mislabeled examples in: `multi-label data `_ (e.g. image/document tagging), `sequence prediction `_ (e.g. entity recognition), and `data labeled by multiple annotators `_ (e.g. crowdsourcing). .. important:: Cleanlab performs better if the ``pred_probs`` from your model are **out-of-sample**. Details on how to compute out-of-sample predicted probabilities for your entire dataset are :ref:`here `. @@ -86,7 +86,7 @@ When the :py:meth:`.fit() ` method is 4. Dataset curation: fix dataset-level issues --------------------------------------------- -cleanlab's :ref:`dataset ` module helps you deal with dataset-level issues -- :py:meth:`find overlapping classes ` (classes to merge), :py:meth:`rank class-level label quality ` (classes to keep/delete), and :py:meth:`measure overall dataset health ` (to track dataset quality as you make adjustments). +cleanlab's `dataset `_ module helps you deal with dataset-level issues -- :py:meth:`find overlapping classes ` (classes to merge), :py:meth:`rank class-level label quality ` (classes to keep/delete), and :py:meth:`measure overall dataset health ` (to track dataset quality as you make adjustments). View all dataset-level issues in one line of code with :py:meth:`dataset.health_summary() `. 
@@ -102,12 +102,12 @@ View all dataset-level issues in one line of code with :py:meth:`dataset.health_ Beyond handling label errors, cleanlab supports other data-centric AI capabilities including: -- Detecting outliers and out-of-distribution examples in both training and future test data :ref:`(tutorial) ` -- Analyzing data labeled by multiple annotators to estimate consensus labels and their quality :ref:`(tutorial) ` +- Detecting outliers and out-of-distribution examples in both training and future test data `(tutorial) `_ +- Analyzing data labeled by multiple annotators to estimate consensus labels and their quality `(tutorial) `_ - Active learning with multiple annotators to identify which data is most informative to label or re-label next `(tutorial) `_ -If you have questions, check out our :ref:`FAQ ` and feel free to ask in `Slack `_! +If you have questions, check out our `FAQ `_ and feel free to ask in `Slack `_! Contributing ------------ From dbc87116eaa4070a3dd8fa1000405e5e144f6a79 Mon Sep 17 00:00:00 2001 From: Hui Wen <45724323+huiwengoh@users.noreply.github.com> Date: Tue, 28 Feb 2023 19:06:17 -0500 Subject: [PATCH 128/258] Documentation improvements (#643) --- cleanlab/count.py | 30 +++++++++++-------- cleanlab/experimental/README.md | 2 +- cleanlab/experimental/label_issues_batched.py | 16 +++++----- cleanlab/models/fasttext.py | 21 +++++++++++-- cleanlab/models/keras.py | 4 +-- cleanlab/multiannotator.py | 11 ++++--- docs/source/cleanlab/experimental/index.rst | 4 +++ docs/source/cleanlab/models/index.rst | 5 ++++ .../cleanlab/token_classification/index.rst | 2 ++ 9 files changed, 65 insertions(+), 30 deletions(-) diff --git a/cleanlab/count.py b/cleanlab/count.py index de86abd9f6..cbfdb13f2e 100644 --- a/cleanlab/count.py +++ b/cleanlab/count.py @@ -91,19 +91,23 @@ def num_label_issues( estimation_method : Method for estimating the number of label issues in dataset by counting the examples in the off-diagonal of the `confident_joint` ``P(label=i, 
true_label=j)``. - - ``'off_diagonal'``: Counts the number of examples in the off-diagonal of the `confident_joint`. Returns the same value as ``sum(find_label_issues(filter_by='confident_learning'))`` - - ``'off_diagonal_calibrated'``: Calibrates confident joint estimate ``P(label=i, true_label=j)`` such that - ``np.sum(cj) == len(labels)`` and ``np.sum(cj, axis = 1) == np.bincount(labels)`` before counting the number - of examples in the off-diagonal. Number will always be equal to or greater than - ``estimate_issues='off_diagonal'``. You can use this value as the cutoff threshold used with ranking/scoring - functions from :py:mod:`cleanlab.rank` with `num_label_issues` over ``estimation_method='off_diagonal'`` in - two cases: - 1. As we add more label and data quality scoring functions in :py:mod:`cleanlab.rank`, this approach will always work. - 2. If you have a custom score to rank your data by label quality and you just need to know the cut-off of likely label issues. - - ``'off_diagonal_custom'``: Counts the number of examples in the off-diagonal of a provided `confident_joint` matrix. - - - TL;DR: Use this method to get the most accurate estimate of number of label issues when you don't need the indices of the label issues. + + * ``'off_diagonal'``: Counts the number of examples in the off-diagonal of the `confident_joint`. Returns the same value as ``sum(find_label_issues(filter_by='confident_learning'))`` + + * ``'off_diagonal_calibrated'``: Calibrates confident joint estimate ``P(label=i, true_label=j)`` such that + ``np.sum(cj) == len(labels)`` and ``np.sum(cj, axis = 1) == np.bincount(labels)`` before counting the number + of examples in the off-diagonal. Number will always be equal to or greater than + ``estimation_method='off_diagonal'``. You can use this value as the cutoff threshold used with ranking/scoring + functions from :py:mod:`cleanlab.rank` with `num_label_issues` over ``estimation_method='off_diagonal'`` in + two cases: + + #. 
As we add more label and data quality scoring functions in :py:mod:`cleanlab.rank`, this approach will always work. + #. If you have a custom score to rank your data by label quality and you just need to know the cut-off of likely label issues. + + * ``'off_diagonal_custom'``: Counts the number of examples in the off-diagonal of a provided `confident_joint` matrix. + + TL;DR: Use this method to get the most accurate estimate of number of label issues when you don't need the indices of the label issues. + Note: ``'off_diagonal'`` may sometimes underestimate issues for data with few classes, so consider using ``'off_diagonal_calibrated'`` instead if your data has < 4 classes. multi_label : bool, optional diff --git a/cleanlab/experimental/README.md b/cleanlab/experimental/README.md index 91155f9801..3b24007b89 100644 --- a/cleanlab/experimental/README.md +++ b/cleanlab/experimental/README.md @@ -4,7 +4,7 @@ Methods in this `experimental` module are bleeding edge and may have sharp edges Some of these files include various models that can be used with cleanlab to find issues in specific types of data. These require dependencies on deep learning and other machine learning packages that are not official cleanlab dependencies. You must install these dependencies on your own if you wish to use them. -The dependencies are as follows: +The modules and required dependencies are as follows: * mnist_pytorch.py - a cleanlab-compatible simplified AlexNet for MNIST using PyTorch - torch - torchvision diff --git a/cleanlab/experimental/label_issues_batched.py b/cleanlab/experimental/label_issues_batched.py index 8fe52a9be6..d30df370dd 100644 --- a/cleanlab/experimental/label_issues_batched.py +++ b/cleanlab/experimental/label_issues_batched.py @@ -123,7 +123,7 @@ def find_label_issues_batched( Keyword arguments to pass into :py:func:`rank.get_label_quality_scores `. 
num_issue_kwargs : dict, optional - Keyword arguments to :py:func:`count.num_label_issues()` ` + Keyword arguments to :py:func:`count.num_label_issues ` to control estimation of the number of label issues. The only supported kwarg here for now is: `estimation_method`. @@ -305,7 +305,7 @@ class LabelInspector: Keyword arguments to pass into :py:func:`rank.get_label_quality_scores `. num_issue_kwargs : dict, optional - Keyword arguments to :py:func:`count.num_label_issues()` ` + Keyword arguments to :py:func:`count.num_label_issues ` to control estimation of the number of label issues. The only supported kwarg here for now is: `estimation_method`. """ @@ -407,7 +407,7 @@ def get_num_issues(self, silent: bool = False) -> int: Returns ------- - num_issues : + num_issues : int The estimated number of examples with label issues in the data seen so far. """ if self.examples_processed_quality < 1: @@ -455,7 +455,7 @@ def get_label_issues(self) -> np.ndarray: in the same format as: :py:func:`filter.find_label_issues ` with its `return_indices_ranked_by` argument specified. - Note: this method corresponds to ``filter.find_label_issues(..., filter_by=METHOD1, return_indices_ranked_by=METHOD2) + Note: this method corresponds to ``filter.find_label_issues(..., filter_by=METHOD1, return_indices_ranked_by=METHOD2)`` where by default: ``METHOD1="low_self_confidence"``, ``METHOD2="self_confidence"`` or if this object was instantiated with ``quality_score_kwargs = {"method": "normalized_margin"}`` then we instead have: ``METHOD1="low_normalized_margin"``, ``METHOD2="normalized_margin"``. @@ -522,7 +522,7 @@ def score_label_quality( """ Scores the label quality of each example in the provided batch of data, and also updates the number of label issues stored in this class. - Inputs should be in same format as for: :py:func:`rank.get_label_quality_scores `.. + Inputs should be in same format as for: :py:func:`rank.get_label_quality_scores `. 
Parameters ---------- @@ -662,14 +662,14 @@ def _update_num_label_issues( def split_arr(arr: np.ndarray, chunksize: int) -> List[np.ndarray]: """ - Helper function to split array into chunks for multiprocessing + Helper function to split array into chunks for multiprocessing. """ return np.split(arr, np.arange(chunksize, arr.shape[0], chunksize), axis=0) def _compute_num_issues(arg: Tuple[np.ndarray, bool]) -> int: """ - Helper function for `_update_num_label_issues` multiprocessing without calibration + Helper function for `_update_num_label_issues` multiprocessing without calibration. """ ind = arg[0] thorough = arg[1] @@ -698,7 +698,7 @@ def _compute_num_issues(arg: Tuple[np.ndarray, bool]) -> int: def _compute_num_issues_calibrated(arg: Tuple[np.ndarray, bool]) -> Tuple[Any, int, int]: """ - Helper function for `_update_num_label_issues` multiprocessing with calibration + Helper function for `_update_num_label_issues` multiprocessing with calibration. """ ind = arg[0] thorough = arg[1] diff --git a/cleanlab/models/fasttext.py b/cleanlab/models/fasttext.py index 1da6d766ad..3b2cd66259 100644 --- a/cleanlab/models/fasttext.py +++ b/cleanlab/models/fasttext.py @@ -15,10 +15,16 @@ # along with cleanlab. If not, see . """ -Text classification with FastText models that are compatible with cleanlab. +Text classification with fastText models that are compatible with cleanlab. This module allows you to easily find label issues in your text datasets. -You must first ``pip install fasttext`` +You must have fastText installed: ``pip install fasttext``. + +Tips: + +* Check out our example using this class: `fasttext_amazon_reviews `_ +* Our `unit tests `_ also provide basic usage examples. + """ import time @@ -96,6 +102,17 @@ def _split_labels_and_text(batch): class FastTextClassifier(BaseEstimator): # Inherits sklearn base classifier + """Instantiate a fastText classifier that is compatible with :py:class:`CleanLearning `. 
+ + Parameters + ---------- + train_data_fn: str + File name of the training data in the format compatible with fastText. + + test_data_fn: str, optional + File name of the test data in the format compatible with fastText. + """ + def __init__( self, train_data_fn, diff --git a/cleanlab/models/keras.py b/cleanlab/models/keras.py index 5935cd5ef9..c297c6efd8 100644 --- a/cleanlab/models/keras.py +++ b/cleanlab/models/keras.py @@ -30,7 +30,7 @@ * If this class lacks certain functionality, you can alternatively try `scikeras `_. * Unlike scikeras, our `KerasWrapper` classes can operate directly on ``tensorflow.data.Dataset`` objects (like regular Keras models). * To call ``fit()`` on a tensorflow ``Dataset`` object with a Keras model, the ``Dataset`` should already be batched. -* Check out our `example `_ using this class: `huggingface_keras_imdb `_ +* Check out our example using this class: `huggingface_keras_imdb `_ * Our `unit tests `_ also provide basic usage examples. """ @@ -51,7 +51,7 @@ class KerasWrapperModel: Parameters ---------- model: Callable - A callable function to construct the Keras Model (using functional API). Pass in the function here, not the constructed model! + A callable function to construct the Keras Model (using functional API). Pass in the function here, not the constructed model! For example:: diff --git a/cleanlab/multiannotator.py b/cleanlab/multiannotator.py index 682142bff7..7b9f984de8 100644 --- a/cleanlab/multiannotator.py +++ b/cleanlab/multiannotator.py @@ -31,8 +31,9 @@ use the :py:func:`get_active_learning_scores ` function, which is intended for active learning. This function estimates an active learning quality score for each example, which can be used to prioritize which examples are most informative to collect additional labels for. 
-This function is effective for settings where some examples have been labeled by one or more annotators and other examples can have no labels at all so far, -as well as settings where new labels are collected either in batches of examples or one at a time. +This function is effective for settings where some examples have been labeled by one or more annotators and other examples can have no labels at all so far, +as well as settings where new labels are collected either in batches of examples or one at a time. +Here is an `example notebook `_ showcasing the use of this function in multiple active learning rounds. Each of the main functions in this module utilizes any trained classifier model. Variants of these functions are provided for settings where you have trained an ensemble of multiple models. @@ -540,7 +541,7 @@ def get_active_learning_scores( ) -> Tuple[np.ndarray, np.ndarray]: """Returns an active learning quality score for each example in the dataset. - We consider settings where one example can be labeled by multiple annotators and some examples have no labels at all so far. + We consider settings where one example can be labeled by one or more annotators and some examples have no labels at all so far. The score is in between 0 and 1, and can be used to prioritize what data to collect additional labels for. Lower scores indicate examples whose true label we are least confident about based on the current data; @@ -555,7 +556,8 @@ def get_active_learning_scores( ---------- labels_multiannotator : pd.DataFrame of np.ndarray 2D pandas DataFrame or array of multiple given labels for each example with shape ``(N, M)``, - where N is the number of examples and M is the number of annotators. + where N is the number of examples and M is the number of annotators. Note that this function also works with + datasets where there is only one annotator (M=1). For more details, labels in the same format expected by the :py:func:`get_label_quality_multiannotator `. 
Note that examples that have no annotator labels should not be included in this DataFrame/array. pred_probs : np.ndarray @@ -672,6 +674,7 @@ def get_active_learning_scores_ensemble( Multiannotator labels in the same format expected by :py:func:`get_active_learning_scores `. pred_probs : np.ndarray An array of shape ``(P, N, K)`` where P is the number of models, consisting of predicted class probabilities from the ensemble models. + Note that this function also works with datasets where there is only one annotator (M=1). Each set of predicted probabilities with shape ``(N, K)`` is in the same format expected by the :py:func:`get_label_quality_scores `. pred_probs_unlabeled : np.ndarray, optional An array of shape ``(P, N, K)`` where P is the number of models, consisting of predicted class probabilities from a trained classifier model diff --git a/docs/source/cleanlab/experimental/index.rst b/docs/source/cleanlab/experimental/index.rst index 35af09b504..798be77934 100644 --- a/docs/source/cleanlab/experimental/index.rst +++ b/docs/source/cleanlab/experimental/index.rst @@ -4,6 +4,10 @@ experimental .. warning:: Methods in this ``experimental`` module are bleeding edge and may have sharp edges. They are not guaranteed to be stable between different ``cleanlab`` versions. +Useful methods/models adapted for use with cleanlab. + +Some of these files include various models that can be used with cleanlab to find issues in specific types of data. These require dependencies on deep learning and other machine learning packages that are not official cleanlab dependencies. You must install these dependencies on your own if you wish to use them. + .. automodule:: cleanlab.experimental :autosummary: :members: diff --git a/docs/source/cleanlab/models/index.rst b/docs/source/cleanlab/models/index.rst index d095a7b18f..8c34d0e267 100644 --- a/docs/source/cleanlab/models/index.rst +++ b/docs/source/cleanlab/models/index.rst @@ -4,6 +4,11 @@ models .. 
warning:: Methods in this ``models`` module are not guaranteed to be stable between different ``cleanlab`` versions. +Useful models adapted for use with cleanlab. + +Some of these files include various models that can be used with cleanlab to find issues in specific types of data. These require dependencies on deep learning and other machine learning packages that are not official cleanlab dependencies. You must install these dependencies on your own if you wish to use them. + + .. automodule:: cleanlab.models :autosummary: :members: diff --git a/docs/source/cleanlab/token_classification/index.rst b/docs/source/cleanlab/token_classification/index.rst index 1681cfa2f6..69ea1dfcde 100644 --- a/docs/source/cleanlab/token_classification/index.rst +++ b/docs/source/cleanlab/token_classification/index.rst @@ -1,6 +1,8 @@ token_classification ==================== +Methods to detect data and label issues in token classification datasets. + .. automodule:: cleanlab.token_classification :autosummary: :members: From 38e1dab34de89757f918d01db513f8bcaa08574e Mon Sep 17 00:00:00 2001 From: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> Date: Wed, 1 Mar 2023 00:39:35 -0800 Subject: [PATCH 129/258] bump version number --- cleanlab/version.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/cleanlab/version.py b/cleanlab/version.py index 14c5fc056f..831688b145 100644 --- a/cleanlab/version.py +++ b/cleanlab/version.py @@ -15,9 +15,20 @@ # along with cleanlab. If not, see . -__version__ = "2.2.1" +__version__ = "2.3.0" -# 2.2.1 - Not yet released, you are using developer version. 
See documentation at: https://docs.cleanlab.ai/master/ +# 2.3.0 - Extending cleanlab beyond label errors into a complete library for data-centric AI +# +# Major new functionalities include: +# - Active learning with data re-labeling (ActiveLab) +# - KerasWrapperModel and KerasSequentialWrapper to make arbitrary Keras models compatible with scikit-learn +# - Computational improvements for detecting label issues (better efficiency and mini-batch estimation that works with lower memory) +# +# See release for a full changelog. + +# ------------------------------------------ +# | PREVIOUS VERSION RELEASE NOTES SUMMARY | +# ------------------------------------------ # 2.2.0 - Re-invented algorithms for multi-label classification and support for datasets with missing classes # @@ -30,12 +41,6 @@ # - cleanlab now works much better for datasets in which some classes happen to not be present. # - Algorithmic improvements to ensure count.num_label_issues() returns more accurate estimates. # - For developers: introduction of flake8 code linter and more comprehensive mypy type annotations. -# -# See release for a full changelog. 
- -# ------------------------------------------ -# | PREVIOUS VERSION RELEASE NOTES SUMMARY | -# ------------------------------------------ # 2.1.0 - "Multiannotator, Outlier detection, and Token Classification" - Cleanlab supports several new features # From b8c034f34f95fa473ee44fb5c0f52d5e8e262b5c Mon Sep 17 00:00:00 2001 From: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> Date: Wed, 1 Mar 2023 01:02:29 -0800 Subject: [PATCH 130/258] add 2.3.0 to release versions (#644) --- docs/source/conf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/conf.py b/docs/source/conf.py index 3dc11aade2..2665fc5a31 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -147,6 +147,7 @@ # Add new tags to RELEASE_VERSIONS before release # fmt: off "RELEASE_VERSIONS": [ + "v2.3.0", "v2.2.0", "v2.1.0", "v2.0.0", From 4606fecfd47433b850d30768e20e19bc207c845e Mon Sep 17 00:00:00 2001 From: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> Date: Wed, 1 Mar 2023 15:35:53 -0800 Subject: [PATCH 131/258] update readme links to point to latest docs (#645) --- README.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 1fc52ef554..706c32d1a4 100644 --- a/README.md +++ b/README.md @@ -45,7 +45,7 @@ Get started with: [documentation](https://docs.cleanlab.ai/), [tutorials](https:
  • Sep 2022 📖 cleanlab 2.1.0 released! Added support for: data labeled by multiple annotators in cleanlab.multiannotator, token classification with text data in cleanlab.token_classification, out-of-distribution detection in cleanlab.outlier, and CleanLearning with non-numpy-array data (e.g. pandas dataframes, tensorflow/pytorch datasets, etc) in cleanlab.classification.CleanLearning.
  • April 2022 📖 cleanlab 2.0.0 released! Lays foundations for this library to grow into a general-purpose data-centric AI toolkit.
  • March 2022 📖 Documentation migrated to new website: docs.cleanlab.ai with quickstart tutorials for image/text/audio/tabular data.
  • -
  • Feb 2022 💻 APIs simplified to make cleanlab accessible for everybody, not just ML researchers
  • +
  • Feb 2022 💻 APIs simplified to make cleanlab accessible for everybody, not just ML researchers
  • Long-time cleanlab user? Here's how to migrate to cleanlab versions >= 2.0.0.
  • @@ -65,9 +65,9 @@ Get started with: [documentation](https://docs.cleanlab.ai/), [tutorials](https:

    • Dec 2020 🎉 cleanlab supports NeurIPS workshop paper (Northcutt, Athalye, & Lin, 2020).
    • -
    • Dec 2020 🤖 cleanlab supports PU learning.
    • +
    • Dec 2020 🤖 cleanlab supports Positive-Unlabeled (PU) learning.
    • Feb 2020 🤖 cleanlab now natively supports Mac, Linux, and Windows.
    • -
    • Feb 2020 🤖 cleanlab now supports Co-Teaching (Han et al., 2018).
    • +
    • Feb 2020 🤖 cleanlab now supports Co-Teaching (Han et al., 2018).
    • Jan 2020 🎉 cleanlab achieves state-of-the-art on CIFAR-10 with noisy labels. Code to reproduce: examples/cifar10. This is a great place to see how to use cleanlab on real datasets (with predicted probabilities from trained model already precomputed for you).

    @@ -151,7 +151,7 @@ cl.predict(test_data) More details are provided in documentation of [cleanlab.classification.CleanLearning](https://docs.cleanlab.ai/stable/cleanlab/classification.html). -Note, some libraries exist to give you sklearn-compatibility for free. For PyTorch, check out the [skorch](https://skorch.readthedocs.io/) Python library which will wrap your PyTorch model into a sklearn-compatible model ([example](https://docs.cleanlab.ai/stable/tutorials/image.html)). For TensorFlow/Keras, check out [SciKeras](https://www.adriangb.com/scikeras/) ([example](https://docs.cleanlab.ai/stable/tutorials/text.html)) or [our own Keras wrapper](https://docs.cleanlab.ai/stable/cleanlab/experimental/keras.html). Many libraries also already offer a special scikit-learn API, for example: [XGBoost](https://xgboost.readthedocs.io/en/stable/python/python_api.html#module-xgboost.sklearn) or [LightGBM](https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMClassifier.html). +Note, some libraries exist to give you sklearn-compatibility for free. For PyTorch, check out the [skorch](https://skorch.readthedocs.io/) Python library which will wrap your PyTorch model into a sklearn-compatible model ([example](https://docs.cleanlab.ai/stable/tutorials/image.html)). For TensorFlow/Keras, check out our [Keras wrapper](https://docs.cleanlab.ai/stable/cleanlab/models/keras.html). Many libraries also already offer a special scikit-learn API, for example: [XGBoost](https://xgboost.readthedocs.io/en/stable/python/python_api.html#module-xgboost.sklearn) or [LightGBM](https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMClassifier.html).
    @@ -217,7 +217,7 @@ cleanlab is a general tool that can learn with noisy labels regardless of datase ![](https://raw.githubusercontent.com/cleanlab/assets/master/cleanlab/demo_cleanlab_across_datasets_and_classifiers.png) -Each sub-figure above depicts the decision boundary learned using [cleanlab.classification.CleanLearning](https://github.com/cleanlab/cleanlab/blob/master/cleanlab/classification.py#L141) in the presence of extreme (\~35%) label errors (circled in green). Label noise is class-conditional (not uniformly random). Columns are organized by the classifier used, except the left-most column which depicts the ground-truth data distribution. Rows are organized by dataset. +Each sub-figure above depicts the decision boundary learned using [cleanlab.classification.CleanLearning](https://docs.cleanlab.ai/stable/cleanlab/classification.html) in the presence of extreme (\~35%) label errors (circled in green). Label noise is class-conditional (not uniformly random). Columns are organized by the classifier used, except the left-most column which depicts the ground-truth data distribution. Rows are organized by dataset. Each sub-figure depicts accuracy scores on a test set (with correct non-noisy labels) as decimal values: @@ -289,12 +289,12 @@ cleanlab for advanced users
    -Many methods and their default parameters are not covered here. Check out the [documentation for the master branch version](https://docs.cleanlab.ai/master/) for the full suite of features supported by the cleanlab API. +Many methods and their default parameters are not covered here. Check out the [documentation for the developer version (aka master branch)](https://docs.cleanlab.ai/master/) for the full suite of features supported by the cleanlab API. ## Use any custom model's predicted probabilities to find label errors in 1 line of code pred_probs (num_examples x num_classes matrix of predicted probabilities) should already be computed on your own, with any classifier. For best results, pred_probs should be obtained in a holdout/out-of-sample manner (e.g. via cross-validation). -* cleanlab can do this for you via [`cleanlab.count.estimate_cv_predicted_probabilities`](https://docs.cleanlab.ai/master/cleanlab/count.html)] +* cleanlab can do this for you via [[`cleanlab.count.estimate_cv_predicted_probabilities`](https://docs.cleanlab.ai/stable/cleanlab/count.html)] * Tutorial with more info: [[here](https://docs.cleanlab.ai/stable/tutorials/pred_probs_cross_val.html)] * Examples how to compute pred_probs with: [[CNN image classifier (PyTorch)](https://docs.cleanlab.ai/stable/tutorials/image.html)], [[NN text classifier (TensorFlow)](https://docs.cleanlab.ai/stable/tutorials/text.html)] @@ -387,11 +387,11 @@ Positive-Unlabeled Learning
    -Positive-Unlabeled (PU) learning (in which your data only contains a few positively labeled examples with the rest unlabeled) is just a special case of [CleanLearning](https://github.com/cleanlab/cleanlab/blob/master/cleanlab/classification.py#L141) when one of the classes has no error. `P` stands for the positive class and **is assumed to have zero label errors** and `U` stands for unlabeled data, but in practice, we just assume the `U` class is a noisy negative class that actually contains some positive examples. Thus, the goal of PU learning is to (1) estimate the proportion of negatively labeled examples that actually belong to the positive class (see`fraction\_noise\_in\_unlabeled\_class` in the last example), (2) find the errors (see last example), and (3) train on clean data (see first example below). cleanlab does all three, taking into account that there are no label errors in whichever class you specify as positive. +Positive-Unlabeled (PU) learning (in which your data only contains a few positively labeled examples with the rest unlabeled) is just a special case of [CleanLearning](https://docs.cleanlab.ai/stable/cleanlab/classification.html) when one of the classes has no error. `P` stands for the positive class and **is assumed to have zero label errors** and `U` stands for unlabeled data, but in practice, we just assume the `U` class is a noisy negative class that actually contains some positive examples. Thus, the goal of PU learning is to (1) estimate the proportion of negatively labeled examples that actually belong to the positive class (see`fraction\_noise\_in\_unlabeled\_class` in the last example), (2) find the errors (see last example), and (3) train on clean data (see first example below). cleanlab does all three, taking into account that there are no label errors in whichever class you specify as positive. There are two ways to use cleanlab for PU learning. We'll look at each here. -Method 1. 
If you are using the cleanlab classifier [CleanLearning()](https://github.com/cleanlab/cleanlab/blob/master/cleanlab/classification.py#L141), and your dataset has exactly two classes (positive = 1, and negative = 0), PU +Method 1. If you are using the cleanlab classifier [CleanLearning()](https://docs.cleanlab.ai/stable/cleanlab/classification.html), and your dataset has exactly two classes (positive = 1, and negative = 0), PU learning is supported directly in cleanlab. You can perform PU learning like this: ``` python @@ -405,7 +405,7 @@ cl.fit(X=X_train_data, labels=train_noisy_labels) predicted_test_labels = cl.predict(X_test) ``` -Method 2. However, you might be using a more complicated classifier that doesn't work well with [CleanLearning](https://github.com/cleanlab/cleanlab/blob/master/cleanlab/classification.py#L141) (see this example for CIFAR-10). Or you might have 3 or more classes. Here's how to use cleanlab for PU learning in this situation. To let cleanlab know which class has no error (in standard PU learning, this is the P class), you need to set the threshold for that class to 1 (1 means the probability that the labels of that class are correct is 1, i.e. that class has no +Method 2. However, you might be using a more complicated classifier that doesn't work well with [CleanLearning](https://docs.cleanlab.ai/stable/cleanlab/classification.html) (see this example for CIFAR-10). Or you might have 3 or more classes. Here's how to use cleanlab for PU learning in this situation. To let cleanlab know which class has no error (in standard PU learning, this is the P class), you need to set the threshold for that class to 1 (1 means the probability that the labels of that class are correct is 1, i.e. that class has no error). 
Here's the code: ``` python From b1bfdecf45d9945d8980edbaa94532f4f95a2813 Mon Sep 17 00:00:00 2001 From: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> Date: Thu, 9 Mar 2023 23:33:10 -0800 Subject: [PATCH 132/258] resize readme image (#650) --- README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 706c32d1a4..169c43dfd5 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,7 @@ -![](https://raw.githubusercontent.com/cleanlab/assets/master/cleanlab/cleanlab_logo_open_source_transparent_optimized_size.png) +

    + +

    + cleanlab automatically detects problems in a ML dataset. This data-centric AI package facilitates **machine learning with messy, real-world data** by providing **clean lab**els for robust training and flagging errors in your data. From 2a16c473fc2d01bbbca565f4f4acdd52312e80a7 Mon Sep 17 00:00:00 2001 From: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> Date: Sun, 12 Mar 2023 20:23:37 -0700 Subject: [PATCH 133/258] post v2.3.0 release version bump (#646) --- cleanlab/version.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cleanlab/version.py b/cleanlab/version.py index 831688b145..c3ec42f9af 100644 --- a/cleanlab/version.py +++ b/cleanlab/version.py @@ -15,7 +15,9 @@ # along with cleanlab. If not, see . -__version__ = "2.3.0" +__version__ = "2.3.1" + +# 2.3.1 - Not yet released, you are using bleeding-edge developer version. See its documentation at: https://docs.cleanlab.ai/master/ # 2.3.0 - Extending cleanlab beyond label errors into a complete library for data-centric AI # From 44081c6538360f72e8c2acdb3a8fc5c4ba7d6cda Mon Sep 17 00:00:00 2001 From: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> Date: Sun, 12 Mar 2023 20:26:04 -0700 Subject: [PATCH 134/258] add activelab name to docs (#648) Co-authored-by: huiwengoh <45724323+huiwengoh@users.noreply.github.com> --- cleanlab/multiannotator.py | 24 ++++++++++++---------- docs/source/tutorials/multiannotator.ipynb | 2 +- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/cleanlab/multiannotator.py b/cleanlab/multiannotator.py index 7b9f984de8..0b0dfb02d1 100644 --- a/cleanlab/multiannotator.py +++ b/cleanlab/multiannotator.py @@ -25,15 +25,17 @@ * An analogous label quality score for each individual label chosen by one annotator for a particular example. * An overall quality score for each annotator which measures our confidence in the overall correctness of labels obtained from this annotator. 
-The underlying algorithms used to compute the statistics are described in `the CROWDLAB paper `_. +The algorithms to compute these estimates are described in `the CROWDLAB paper `_. If you have some labeled and unlabeled data (with multiple annotators for some labeled examples) and want to decide what data to collect additional labels for, use the :py:func:`get_active_learning_scores ` function, which is intended for active learning. -This function estimates an active learning quality score for each example, +This function estimates an ActiveLab quality score for each example, which can be used to prioritize which examples are most informative to collect additional labels for. This function is effective for settings where some examples have been labeled by one or more annotators and other examples can have no labels at all so far, as well as settings where new labels are collected either in batches of examples or one at a time. -Here is an `example notebook `_ showcasing the use of this function in multiple active learning rounds. +Here is an `example notebook `_ showcasing the use of this ActiveLab method for active learning with data re-labeling. + +The algorithms to compute these active learning scores are described in `the ActiveLab paper `_. Each of the main functions in this module utilizes any trained classifier model. Variants of these functions are provided for settings where you have trained an ensemble of multiple models. @@ -69,7 +71,7 @@ def get_label_quality_multiannotator( verbose: bool = True, label_quality_score_kwargs: dict = {}, ) -> Dict[str, Any]: - """Returns label quality scores for each example and for each annotator. + """Returns label quality scores for each example and for each annotator in a dataset labeled by multiple annotators. This function is for multiclass classification datasets where examples have been labeled by multiple annotators (not necessarily the same number of annotators per example). 
@@ -77,10 +79,10 @@ def get_label_quality_multiannotator( It computes one consensus label for each example that best accounts for the labels chosen by each annotator (and their quality), as well as a consensus quality score for how confident we are that this consensus label is actually correct. It also computes similar quality scores for each annotator's individual labels, and the quality of each annotator. - Scores are between 0 and 1; lower scores indicate labels/annotators less likely to be correct. + Scores are between 0 and 1 (estimated via methods like CROWDLAB); lower scores indicate labels/annotators less likely to be correct. To decide what data to collect additional labels for, try the :py:func:`get_active_learning_scores ` - function, which is intended for active learning with multiple annotators. + (ActiveLab) function, which is intended for active learning with multiple annotators. Parameters ---------- @@ -539,7 +541,7 @@ def get_active_learning_scores( pred_probs: np.ndarray, pred_probs_unlabeled: Optional[np.ndarray] = None, ) -> Tuple[np.ndarray, np.ndarray]: - """Returns an active learning quality score for each example in the dataset. + """Returns an ActiveLab quality score for each example in the dataset, to estimate which examples are most informative to (re)label next in active learning. We consider settings where one example can be labeled by one or more annotators and some examples have no labels at all so far. @@ -550,7 +552,7 @@ def get_active_learning_scores( and repeat this process after retraining your classifier. To analyze a fixed dataset labeled by multiple annotators rather than collecting additional labels, try the - :py:func:`get_label_quality_multiannotator ` function instead. + :py:func:`get_label_quality_multiannotator ` (CROWDLAB) function instead. 
Parameters ---------- @@ -570,7 +572,7 @@ def get_active_learning_scores( Returns ------- active_learning_scores : np.ndarray - Array of shape ``(N,)`` indicating the active learning quality scores for each example. + Array of shape ``(N,)`` indicating the ActiveLab quality scores for each example. Examples with the lowest scores are those we should label next in order to maximally improve our classifier model. active_learning_scores_unlabeled : np.ndarray @@ -663,10 +665,10 @@ def get_active_learning_scores_ensemble( pred_probs: np.ndarray, pred_probs_unlabeled: Optional[np.ndarray] = None, ) -> Tuple[np.ndarray, np.ndarray]: - """Returns an active learning quality score for each example in the dataset, based on predictions from an ensemble of models. + """Returns an ActiveLab quality score for each example in the dataset, based on predictions from an ensemble of models. This function is similar to :py:func:`get_active_learning_scores ` but allows for an - ensemble of multiple classifier models to be trained and will aggregate predictions from the models to compute the active learning quality score. + ensemble of multiple classifier models to be trained and will aggregate predictions from the models to compute the ActiveLab quality score. 
Parameters ---------- diff --git a/docs/source/tutorials/multiannotator.ipynb b/docs/source/tutorials/multiannotator.ipynb index 617551272b..b22a2d97c9 100644 --- a/docs/source/tutorials/multiannotator.ipynb +++ b/docs/source/tutorials/multiannotator.ipynb @@ -710,7 +710,7 @@ "You can also repeatedly iterate this process of getting better consensus labels using the model's out-of-sample predicted probabilities and then retraining the model with the improved labels to get even better predicted probabilities!\n", "For details, see our [examples](https://github.com/cleanlab/examples) notebook on [Iterative use of Cleanlab to Improve Classification Models (and Consensus Labels) from Data Labeled by Multiple Annotators](https://github.com/cleanlab/examples/blob/master/multiannotator_cifar10/multiannotator_cifar10.ipynb).\n", "\n", - "If possible, the best way to improve your model is to collect additional labels for both previously annotated data and extra not-yet-labeled examples. To decide which data is most informative to label next, use `cleanlab.multiannotator.get_active_learning_scores()` rather than the methods shown here. This is demonstrated in our [examples](https://github.com/cleanlab/examples) notebook on [Active Learning with Multiple Data Annotators](https://github.com/cleanlab/examples/blob/master/active_learning_multiannotator/active_learning.ipynb).\n", + "If possible, the best way to improve your model is to collect additional labels for both previously annotated data and extra not-yet-labeled examples (i.e. *active learning*). To decide which data is most informative to label next, use `cleanlab.multiannotator.get_active_learning_scores()` rather than the methods from this tutorial. 
This is demonstrated in our examples notebook on [Active Learning with Multiple Data Annotators via ActiveLab](https://github.com/cleanlab/examples/blob/master/active_learning_multiannotator/active_learning.ipynb).\n", "\n", "\n", "## How does cleanlab.multiannotator work?\n", From 70a2ed2fa2ac137af8d2522eea8796336d74c17b Mon Sep 17 00:00:00 2001 From: Ulyana Date: Sun, 12 Mar 2023 20:51:03 -0700 Subject: [PATCH 135/258] Add clipping of small probabilities to address issue #639 (#647) Co-authored-by: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> --- cleanlab/benchmarking/noise_generation.py | 13 +++++------ cleanlab/count.py | 8 +++++-- cleanlab/experimental/label_issues_batched.py | 16 +++++++++---- cleanlab/internal/constants.py | 23 +++++++++++++++++++ cleanlab/internal/label_quality_utils.py | 5 +++- cleanlab/internal/latent_algebra.py | 9 ++++---- cleanlab/internal/util.py | 6 ++--- cleanlab/internal/validation.py | 5 +++- cleanlab/multiannotator.py | 22 ++++++++++-------- cleanlab/rank.py | 13 ++++++----- 10 files changed, 80 insertions(+), 40 deletions(-) create mode 100644 cleanlab/internal/constants.py diff --git a/cleanlab/benchmarking/noise_generation.py b/cleanlab/benchmarking/noise_generation.py index 4c71b57264..63c1352bbb 100644 --- a/cleanlab/benchmarking/noise_generation.py +++ b/cleanlab/benchmarking/noise_generation.py @@ -26,6 +26,7 @@ import numpy as np from cleanlab.internal.util import value_counts +from cleanlab.internal.constants import FLOATING_POINT_COMPARISON def noise_matrix_is_valid(noise_matrix, py, *, verbose=False) -> bool: @@ -65,7 +66,7 @@ def noise_matrix_is_valid(noise_matrix, py, *, verbose=False) -> bool: joint_noise = np.multiply(noise_matrix, py) # / float(N) # Check that joint_probs is valid probability matrix - if not (abs(joint_noise.sum() - 1.0) < 1e-6): + if not (abs(joint_noise.sum() - 1.0) < FLOATING_POINT_COMPARISON): return False # Check that noise_matrix is a valid matrix @@ -386,11 +387,9 @@ def 
generate_n_rand_probabilities_that_sum_to_m( An array of probabilities. """ - epsilon = 1e-6 # Imprecision allowed for inequalities with floats - if n == 0: return np.array([]) - if (max_prob + epsilon) < m / float(n): + if (max_prob + FLOATING_POINT_COMPARISON) < m / float(n): raise ValueError( "max_prob must be greater or equal to m / n, but " + "max_prob = " @@ -402,7 +401,7 @@ def generate_n_rand_probabilities_that_sum_to_m( + ", m / n = " + str(m / float(n)) ) - if min_prob > (m + epsilon) / float(n): + if min_prob > (m + FLOATING_POINT_COMPARISON) / float(n): raise ValueError( "min_prob must be less or equal to m / n, but " + "max_prob = " @@ -422,7 +421,7 @@ def generate_n_rand_probabilities_that_sum_to_m( min_val = min(result) max_val = max(result) - while max_val > (max_prob + epsilon): + while max_val > (max_prob + FLOATING_POINT_COMPARISON): new_min = min_val + (max_val - max_prob) # This adjustment prevents the new max from always being max_prob. adjustment = (max_prob - new_min) * np.random.rand() @@ -433,7 +432,7 @@ def generate_n_rand_probabilities_that_sum_to_m( min_val = min(result) max_val = max(result) - while min_val < (min_prob - epsilon): + while min_val < (min_prob - FLOATING_POINT_COMPARISON): min_val = min(result) max_val = max(result) new_max = max_val - (min_prob - min_val) diff --git a/cleanlab/count.py b/cleanlab/count.py index cbfdb13f2e..c47590773e 100644 --- a/cleanlab/count.py +++ b/cleanlab/count.py @@ -38,6 +38,8 @@ from cleanlab.typing import LabelLike from cleanlab.internal.multilabel_utils import stack_complement, get_onehot_num_classes +from cleanlab.internal.constants import TINY_VALUE, CONFIDENT_THRESHOLDS_LOWER_BOUND + from cleanlab.internal.util import ( value_counts_fill_missing_classes, clip_values, @@ -49,7 +51,6 @@ get_unique_classes, is_torch_dataset, is_tensorflow_dataset, - TINY_VALUE, ) from cleanlab.internal.latent_algebra import ( compute_inv_noise_matrix, @@ -1449,7 +1450,10 @@ def get_confident_thresholds( 
np.mean(pred_probs[:, k][labels == k]) if k in unique_classes else BIG_VALUE for k in all_classes ] - return np.array(confident_thresholds) + confident_thresholds = np.clip( + confident_thresholds, a_min=CONFIDENT_THRESHOLDS_LOWER_BOUND, a_max=None + ) + return confident_thresholds def _get_confident_thresholds_multilabel( diff --git a/cleanlab/experimental/label_issues_batched.py b/cleanlab/experimental/label_issues_batched.py index d30df370dd..5cd395837f 100644 --- a/cleanlab/experimental/label_issues_batched.py +++ b/cleanlab/experimental/label_issues_batched.py @@ -34,6 +34,11 @@ from cleanlab.rank import find_top_issues, _compute_label_quality_scores from cleanlab.typing import LabelLike from cleanlab.internal.util import value_counts_fill_missing_classes +from cleanlab.internal.constants import ( + CONFIDENT_THRESHOLDS_LOWER_BOUND, + FLOATING_POINT_COMPARISON, + CLIPPING_LOWER_BOUND, +) import platform import multiprocessing as mp @@ -45,8 +50,6 @@ except ImportError: # pragma: no cover PSUTIL_EXISTS = False -EPS = 1e-6 # small number - # global variable for multiproc on linux adj_confident_thresholds_shared: np.ndarray labels_shared: LabelLike @@ -423,7 +426,7 @@ def get_num_issues(self, silent: bool = False) -> int: calibrated_prune_counts = ( self.prune_counts * self.class_counts - / np.clip(self.normalization, a_min=EPS, a_max=None) + / np.clip(self.normalization, a_min=CLIPPING_LOWER_BOUND, a_max=None) ) # avoid division by 0 return np.rint(np.sum(calibrated_prune_counts)).astype("int") else: # not calibrated @@ -509,6 +512,9 @@ def update_confident_thresholds(self, labels: LabelLike, pred_probs: np.ndarray) ) / np.clip( self.examples_per_class + batch_class_counts, a_min=1, a_max=None ) # avoid division by 0 + self.confident_thresholds = np.clip( + self.confident_thresholds, a_min=CONFIDENT_THRESHOLDS_LOWER_BOUND, a_max=None + ) self.examples_per_class += batch_class_counts self.examples_processed_thresh += batch_size @@ -581,7 +587,7 @@ def 
    _update_num_label_issues( ) if self.n_jobs == 1: - adj_confident_thresholds = self.confident_thresholds - EPS + adj_confident_thresholds = self.confident_thresholds - FLOATING_POINT_COMPARISON pred_class = np.argmax(pred_probs, axis=1) batch_size = len(labels) if thorough: @@ -626,7 +632,7 @@ def _update_num_label_issues( ) else: # multiprocessing implementation global adj_confident_thresholds_shared - adj_confident_thresholds_shared = self.confident_thresholds - EPS + adj_confident_thresholds_shared = self.confident_thresholds - FLOATING_POINT_COMPARISON global labels_shared, pred_probs_shared labels_shared = labels diff --git a/cleanlab/internal/constants.py b/cleanlab/internal/constants.py new file mode 100644 index 0000000000..9a6452ae78 --- /dev/null +++ b/cleanlab/internal/constants.py @@ -0,0 +1,23 @@ +# Copyright (C) 2017-2023 Cleanlab Inc. +# This file is part of cleanlab. +# +# cleanlab is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published +# by the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# cleanlab is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with cleanlab. If not, see <https://www.gnu.org/licenses/>.
+ + +FLOATING_POINT_COMPARISON = 1e-6 # floating point comparison for fuzzy equals +CLIPPING_LOWER_BOUND = 1e-6 # lower-bound clipping threshold for expected behavior +CONFIDENT_THRESHOLDS_LOWER_BOUND = ( + 2 * FLOATING_POINT_COMPARISON +) # lower bound imposed to clip confident thresholds from below, has to be larger than floating point comparison +TINY_VALUE = 1e-100 # very tiny value for clipping diff --git a/cleanlab/internal/label_quality_utils.py b/cleanlab/internal/label_quality_utils.py index 7317d134c0..3428144c13 100644 --- a/cleanlab/internal/label_quality_utils.py +++ b/cleanlab/internal/label_quality_utils.py @@ -22,6 +22,7 @@ from typing import Optional from cleanlab.count import get_confident_thresholds +from cleanlab.internal.constants import CLIPPING_LOWER_BOUND def _subtract_confident_thresholds( @@ -83,7 +84,9 @@ def _subtract_confident_thresholds( return pred_probs_adj -def get_normalized_entropy(pred_probs: np.ndarray, min_allowed_prob: float = 1e-6) -> np.ndarray: +def get_normalized_entropy( + pred_probs: np.ndarray, min_allowed_prob: float = CLIPPING_LOWER_BOUND +) -> np.ndarray: """Returns the normalized entropy of pred_probs. Normalized entropy is between 0 and 1. Higher values of entropy indicate higher uncertainty in the model's prediction of the correct label. 
diff --git a/cleanlab/internal/latent_algebra.py b/cleanlab/internal/latent_algebra.py index af3b0ec177..d42c16c436 100644 --- a/cleanlab/internal/latent_algebra.py +++ b/cleanlab/internal/latent_algebra.py @@ -28,7 +28,8 @@ import numpy as np from typing import Tuple -from cleanlab.internal.util import value_counts, clip_values, clip_noise_rates, TINY_VALUE +from cleanlab.internal.util import value_counts, clip_values, clip_noise_rates +from cleanlab.internal.constants import TINY_VALUE, CLIPPING_LOWER_BOUND def compute_ps_py_inv_noise_matrix( @@ -73,7 +74,7 @@ def compute_py_inv_noise_matrix(ps, noise_matrix) -> Tuple[np.ndarray, np.ndarra # No class should have probability 0, so we use .000001 # Make sure valid probabilities that sum to 1.0 - py = clip_values(py, low=1e-6, high=1.0, new_sum=1.0) + py = clip_values(py, low=CLIPPING_LOWER_BOUND, high=1.0, new_sum=1.0) # All the work is done in this function (below) return py, compute_inv_noise_matrix(py=py, noise_matrix=noise_matrix, ps=ps) @@ -267,8 +268,8 @@ def compute_py( err += " should be in [cnt, eqn, marginal, marginal_ps]" raise ValueError(err) - # Clip py (0,1), s.t. no class should have prob 0, hence 1e-5 - py = clip_values(py, low=1e-5, high=1.0, new_sum=1.0) + # Clip py (0,1), s.t. 
no class should have prob 0, hence 1e-6 + py = clip_values(py, low=CLIPPING_LOWER_BOUND, high=1.0, new_sum=1.0) return py diff --git a/cleanlab/internal/util.py b/cleanlab/internal/util.py index 79ee8e2337..ee2d53f2b0 100644 --- a/cleanlab/internal/util.py +++ b/cleanlab/internal/util.py @@ -25,9 +25,7 @@ from cleanlab.typing import DatasetLike, LabelLike from cleanlab.internal.validation import labels_to_array - - -TINY_VALUE = 1e-100 +from cleanlab.internal.constants import FLOATING_POINT_COMPARISON, TINY_VALUE def remove_noise_from_class(noise_matrix, class_without_noise) -> np.ndarray: @@ -234,7 +232,7 @@ def round_preserving_sum(iterable) -> np.ndarray: orig_sum = np.sum(floats).round() int_sum = np.sum(ints).round() # Adjust the integers so that they sum to orig_sum - while abs(int_sum - orig_sum) > 1e-6: + while abs(int_sum - orig_sum) > FLOATING_POINT_COMPARISON: diff = np.round(orig_sum - int_sum) increment = -1 if int(diff < 0.0) else 1 changes = min(int(abs(diff)), len(iterable)) diff --git a/cleanlab/internal/validation.py b/cleanlab/internal/validation.py index dce6cc9a6c..f36603515f 100644 --- a/cleanlab/internal/validation.py +++ b/cleanlab/internal/validation.py @@ -19,6 +19,7 @@ """ from cleanlab.typing import LabelLike, DatasetLike +from cleanlab.internal.constants import FLOATING_POINT_COMPARISON from typing import Any, List, Optional, Union import warnings import numpy as np @@ -95,7 +96,9 @@ def assert_valid_inputs( f"pred_probs must have at least {highest_class} columns, based on the largest class index which appears in labels." ) # Check for valid probabilities. 
- if (np.min(pred_probs) < 0) or (np.max(pred_probs) > 1): + if (np.min(pred_probs) < 0 - FLOATING_POINT_COMPARISON) or ( + np.max(pred_probs) > 1 + FLOATING_POINT_COMPARISON + ): raise ValueError("Values in pred_probs must be between 0 and 1.") if X is not None: warnings.warn("When X and pred_probs are both provided, the former may be ignored.") diff --git a/cleanlab/multiannotator.py b/cleanlab/multiannotator.py index 0b0dfb02d1..d708d05713 100644 --- a/cleanlab/multiannotator.py +++ b/cleanlab/multiannotator.py @@ -49,6 +49,8 @@ from cleanlab.rank import get_label_quality_scores from cleanlab.internal.util import get_num_classes, value_counts +from cleanlab.internal.constants import CLIPPING_LOWER_BOUND + from cleanlab.internal.multiannotator_utils import ( assert_valid_inputs_multiannotator, assert_valid_pred_probs, @@ -1296,7 +1298,7 @@ def _get_post_pred_probs_and_weights( consensus_label_subset != np.argmax(np.bincount(consensus_label_subset, minlength=num_classes)) ), - a_min=1e-6, + a_min=CLIPPING_LOWER_BOUND, a_max=None, ) @@ -1306,14 +1308,14 @@ def _get_post_pred_probs_and_weights( ) annotator_error = 1 - annotator_agreement_with_annotators adjusted_annotator_agreement = np.clip( - 1 - (annotator_error / most_likely_class_error), a_min=1e-6, a_max=None + 1 - (annotator_error / most_likely_class_error), a_min=CLIPPING_LOWER_BOUND, a_max=None ) # compute model weight model_error = np.mean(np.argmax(prior_pred_probs_subset, axis=1) != consensus_label_subset) - model_weight = np.max([(1 - (model_error / most_likely_class_error)), 1e-6]) * np.sqrt( - np.mean(num_annotations) - ) + model_weight = np.max( + [(1 - (model_error / most_likely_class_error)), CLIPPING_LOWER_BOUND] + ) * np.sqrt(np.mean(num_annotations)) # compute weighted average post_pred_probs = np.full(prior_pred_probs.shape, np.nan) @@ -1422,7 +1424,7 @@ def _get_post_pred_probs_and_weights_ensemble( consensus_label_subset != np.argmax(np.bincount(consensus_label_subset, minlength=num_classes)) 
    ), - a_min=1e-6, + a_min=CLIPPING_LOWER_BOUND, a_max=None, ) @@ -1432,7 +1434,7 @@ def _get_post_pred_probs_and_weights_ensemble( ) annotator_error = 1 - annotator_agreement_with_annotators adjusted_annotator_agreement = np.clip( - 1 - (annotator_error / most_likely_class_error), a_min=1e-6, a_max=None + 1 - (annotator_error / most_likely_class_error), a_min=CLIPPING_LOWER_BOUND, a_max=None ) # compute model weight @@ -1441,9 +1443,9 @@ def _get_post_pred_probs_and_weights_ensemble( prior_pred_probs_subset = prior_pred_probs[idx][mask] model_error = np.mean(np.argmax(prior_pred_probs_subset, axis=1) != consensus_label_subset) - model_weight[idx] = np.max([(1 - (model_error / most_likely_class_error)), 1e-6]) * np.sqrt( - np.mean(num_annotations) - ) + model_weight[idx] = np.max( + [(1 - (model_error / most_likely_class_error)), CLIPPING_LOWER_BOUND] + ) * np.sqrt(np.mean(num_annotations)) # compute weighted average post_pred_probs = np.full(prior_pred_probs[0].shape, np.nan) diff --git a/cleanlab/rank.py b/cleanlab/rank.py index 60def50a09..9e51da9fd6 100644 --- a/cleanlab/rank.py +++ b/cleanlab/rank.py @@ -37,6 +37,10 @@ import warnings from cleanlab.internal.validation import assert_valid_inputs +from cleanlab.internal.constants import ( + CLIPPING_LOWER_BOUND, +) # lower-bound clipping threshold to prevent 0 in logs and division + from cleanlab.internal.label_quality_utils import ( _subtract_confident_thresholds, get_normalized_entropy, @@ -236,8 +240,6 @@ def get_label_quality_ensemble_scores( get_label_quality_scores """ - MIN_ALLOWED = 1e-6 # lower-bound clipping threshold to prevents 0 in logs and division - # Check pred_probs_list for errors assert isinstance( pred_probs_list, list @@ -277,7 +279,7 @@ def get_label_quality_ensemble_scores( # pred_probs for each model for pred_probs in pred_probs_list: pred_probs_clipped = np.clip( - pred_probs, a_min=MIN_ALLOWED, a_max=None + pred_probs, a_min=CLIPPING_LOWER_BOUND, a_max=None ) # lower-bound clipping
threshold to prevents 0 in logs when calculating log loss pred_probs_clipped /= pred_probs_clipped.sum(axis=1)[:, np.newaxis] # renormalize @@ -574,15 +576,14 @@ def get_confidence_weighted_entropy_for_each_label( Lower scores indicate more likely mislabeled examples. """ - MIN_ALLOWED = 1e-6 # lower-bound clipping threshold to prevents 0 in logs and division self_confidence = get_self_confidence_for_each_label(labels, pred_probs) - self_confidence = np.clip(self_confidence, a_min=MIN_ALLOWED, a_max=None) + self_confidence = np.clip(self_confidence, a_min=CLIPPING_LOWER_BOUND, a_max=None) # Divide entropy by self confidence label_quality_scores = get_normalized_entropy(pred_probs) / self_confidence # Rescale - clipped_scores = np.clip(label_quality_scores, a_min=MIN_ALLOWED, a_max=None) + clipped_scores = np.clip(label_quality_scores, a_min=CLIPPING_LOWER_BOUND, a_max=None) label_quality_scores = np.log(label_quality_scores + 1) / clipped_scores return label_quality_scores From 00776646a7764e07cc4078e8c25e873ee413c5f7 Mon Sep 17 00:00:00 2001 From: Hui Wen <45724323+huiwengoh@users.noreply.github.com> Date: Mon, 13 Mar 2023 11:37:00 -0400 Subject: [PATCH 136/258] Fix bug with call to find_overlapping_issues without specifying labels (#652) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix None labels issue * build: ➕ add hypothesis to dev dependencies. 
Used for property-based testing * test: ✅ test that find_overlapping_classes can run by only providing a confident joint Resolves #651 --------- Co-authored-by: Elías Snorrason --- cleanlab/count.py | 7 ++++++- cleanlab/dataset.py | 15 ++++++++------- requirements-dev.txt | 1 + tests/test_dataset.py | 43 +++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 58 insertions(+), 8 deletions(-) diff --git a/cleanlab/count.py b/cleanlab/count.py index c47590773e..faa2260118 100644 --- a/cleanlab/count.py +++ b/cleanlab/count.py @@ -389,7 +389,12 @@ def estimate_joint( multi_label=multi_label, ) else: - calibrated_cj = calibrate_confident_joint(confident_joint, labels, multi_label=multi_label) + if labels is not None: + calibrated_cj = calibrate_confident_joint( + confident_joint, labels, multi_label=multi_label + ) + else: + calibrated_cj = confident_joint assert isinstance(calibrated_cj, np.ndarray) if multi_label: diff --git a/cleanlab/dataset.py b/cleanlab/dataset.py index 9020b34539..78c9da670f 100644 --- a/cleanlab/dataset.py +++ b/cleanlab/dataset.py @@ -22,6 +22,7 @@ and which classes to merge (see :py:func:`find_overlapping_classes `). """ +from typing import Optional, cast import numpy as np import pandas as pd from cleanlab.count import estimate_joint @@ -248,7 +249,7 @@ def _2d_matrix_to_row_column_value_list(matrix): multi_label=multi_label, ) if num_examples is None: - num_examples = _get_num_examples(labels=labels) + num_examples = _get_num_examples(labels=labels, confident_joint=confident_joint) if asymmetric: rcv_list = _2d_matrix_to_row_column_value_list(joint) # Remove diagonal elements @@ -444,7 +445,7 @@ def health_summary( } -def _get_num_examples(labels=None) -> int: +def _get_num_examples(labels=None, confident_joint: Optional[np.ndarray] = None) -> int: """Helper method that finds the number of examples from the parameters or throws an error if neither parameter is provided. 
@@ -462,11 +463,11 @@ def _get_num_examples(labels=None) -> int: ValueError If `labels` is None.""" - if labels is not None: - num_examples = len(labels) - else: + if labels is None and confident_joint is None: raise ValueError( - "Error: num_examples is None. You must provide a value for num_examples " - "when calling this method using the joint as an input parameter." + "Error: num_examples is None. You must either provide confident_joint, " + "or provide both num_example and joint as input parameters." ) + _confident_joint = cast(np.ndarray, confident_joint) + num_examples = len(labels) if labels is not None else cast(int, np.sum(_confident_joint)) return num_examples diff --git a/requirements-dev.txt b/requirements-dev.txt index dd3d55e4fe..cab341d45a 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,5 +1,6 @@ # Python dependencies for development coverage != 6.3, != 6.3.* +hypothesis mypy pandas-stubs pre-commit diff --git a/tests/test_dataset.py b/tests/test_dataset.py index a96d26967a..1331fe5677 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -16,8 +16,11 @@ import requests import pytest +import hypothesis.extra.numpy as npst +import hypothesis.strategies as st import io import numpy as np +from hypothesis import given from cleanlab.dataset import ( health_summary, find_overlapping_classes, @@ -492,3 +495,43 @@ def test_value_error_missing_num_examples_with_joint(use_num_examples, use_label joint=joint, num_examples=len(labels) if use_num_examples else None, ) + + +confident_joint_strategy = npst.arrays( + np.int32, + shape=npst.array_shapes(min_dims=2, max_dims=2, min_side=2, max_side=10), + elements=st.integers(min_value=0, max_value=int(1e6)), +).filter(lambda arr: arr.shape[0] == arr.shape[1]) + + +@pytest.mark.issue_651 +@given(confident_joint=confident_joint_strategy) +def test_find_overlapping_classes_with_confident_joint(confident_joint): + # Setup + K = confident_joint.shape[0] + overlapping_classes = 
find_overlapping_classes(confident_joint=confident_joint) + + # Test that the output dataframe has the expected columns + expected_columns = [ + "Class Index A", + "Class Index B", + "Num Overlapping Examples", + "Joint Probability", + ] + assert set(overlapping_classes.columns) == set(expected_columns) + + # Class indices must be valid + assert overlapping_classes["Class Index A"].between(0, K - 1).all() + assert overlapping_classes["Class Index B"].between(0, K - 1).all() + + # Overlapping example count should be non-negative integers + assert (overlapping_classes["Num Overlapping Examples"] >= 0).all() + assert overlapping_classes["Num Overlapping Examples"].dtype == int + + # Joint probabilities should be between 0 and 1 + assert (overlapping_classes["Joint Probability"] >= 0).all() + assert (overlapping_classes["Joint Probability"] <= 1).all() + + # Joint probabilities sorted in descending order + if K > 2: + assert (overlapping_classes["Joint Probability"].diff()[1:] <= 0).all() From fa1db6e1d5013538f7fe3c6570b96cb04e30390b Mon Sep 17 00:00:00 2001 From: Hui Wen <45724323+huiwengoh@users.noreply.github.com> Date: Tue, 21 Mar 2023 02:36:41 -0400 Subject: [PATCH 137/258] Bug fixes + improvements to multiannotator module (#654) Co-authored-by: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> --- cleanlab/internal/multiannotator_utils.py | 35 +-- cleanlab/multiannotator.py | 285 +++++++++++++--------- tests/test_multiannotator.py | 29 ++- 3 files changed, 222 insertions(+), 127 deletions(-) diff --git a/cleanlab/internal/multiannotator_utils.py b/cleanlab/internal/multiannotator_utils.py index 27e73eb079..429321b13e 100644 --- a/cleanlab/internal/multiannotator_utils.py +++ b/cleanlab/internal/multiannotator_utils.py @@ -129,17 +129,23 @@ def assert_valid_inputs_multiannotator( def assert_valid_pred_probs( - pred_probs: np.ndarray, + pred_probs: Optional[np.ndarray] = None, pred_probs_unlabeled: Optional[np.ndarray] = None, ensemble: bool = False, ): - 
"""Validate format of pred_probs for multiannotator functions""" + """Validate format of pred_probs for multiannotator active learning functions""" + if pred_probs is None and pred_probs_unlabeled is None: + raise ValueError( + "pred_probs and pred_probs_unlabeled cannot both be None, specify at least one of the two." + ) + if ensemble: - if pred_probs.ndim != 3: - error_message = "pred_probs must be a 3d array." - if pred_probs.ndim == 2: # pragma: no cover - error_message += " If you have a 2d pred_probs array, use the non-ensemble version of this function." - raise ValueError(error_message) + if pred_probs is not None: + if pred_probs.ndim != 3: + error_message = "pred_probs must be a 3d array." + if pred_probs.ndim == 2: # pragma: no cover + error_message += " If you have a 2d pred_probs array (ie. only one predictor), use the non-ensemble version of this function." + raise ValueError(error_message) if pred_probs_unlabeled is not None: if pred_probs_unlabeled.ndim != 3: @@ -148,19 +154,19 @@ def assert_valid_pred_probs( error_message += " If you have a 2d pred_probs_unlabeled array, use the non-ensemble version of this function." raise ValueError(error_message) + if pred_probs is not None and pred_probs_unlabeled is not None: if pred_probs.shape[2] != pred_probs_unlabeled.shape[2]: raise ValueError( "pred_probs and pred_probs_unlabeled must have the same number of classes" ) else: - if pred_probs.ndim != 2: - error_message = "pred_probs must be a 2d array." - if pred_probs.ndim == 3: # pragma: no cover - error_message += ( - " If you have a 3d pred_probs array, use the ensemble version of this function." - ) - raise ValueError(error_message) + if pred_probs is not None: + if pred_probs.ndim != 2: + error_message = "pred_probs must be a 2d array." + if pred_probs.ndim == 3: # pragma: no cover + error_message += " If you have a 3d pred_probs array, use the ensemble version of this function." 
+ raise ValueError(error_message) if pred_probs_unlabeled is not None: if pred_probs_unlabeled.ndim != 2: @@ -169,6 +175,7 @@ def assert_valid_pred_probs( error_message += " If you have a 3d pred_probs_unlabeled array, use the non-ensemble version of this function." raise ValueError(error_message) + if pred_probs is not None and pred_probs_unlabeled is not None: if pred_probs.shape[1] != pred_probs_unlabeled.shape[1]: raise ValueError( "pred_probs and pred_probs_unlabeled must have the same number of classes" diff --git a/cleanlab/multiannotator.py b/cleanlab/multiannotator.py index d708d05713..c22a91548d 100644 --- a/cleanlab/multiannotator.py +++ b/cleanlab/multiannotator.py @@ -539,8 +539,8 @@ def get_label_quality_multiannotator_ensemble( def get_active_learning_scores( - labels_multiannotator: Union[pd.DataFrame, np.ndarray], - pred_probs: np.ndarray, + labels_multiannotator: Optional[Union[pd.DataFrame, np.ndarray]] = None, + pred_probs: Optional[np.ndarray] = None, pred_probs_unlabeled: Optional[np.ndarray] = None, ) -> Tuple[np.ndarray, np.ndarray]: """Returns an ActiveLab quality score for each example in the dataset, to estimate which examples are most informative to (re)label next in active learning. @@ -553,28 +553,35 @@ def get_active_learning_scores( To use an annotation budget most efficiently, select a batch of examples with the lowest scores and collect one additional label for each example, and repeat this process after retraining your classifier. + You can use this function to get active learning scores for: examples that already have one or more labels (specify ``labels_multiannotator`` and ``pred_probs`` + as arguments), or for unlabeled examples (specify ``pred_probs_unlabeled``), or for both types of examples (specify all of the above arguments). + To analyze a fixed dataset labeled by multiple annotators rather than collecting additional labels, try the :py:func:`get_label_quality_multiannotator ` (CROWDLAB) function instead. 
    Parameters ---------- - labels_multiannotator : pd.DataFrame of np.ndarray + labels_multiannotator : pd.DataFrame or np.ndarray, optional 2D pandas DataFrame or array of multiple given labels for each example with shape ``(N, M)``, where N is the number of examples and M is the number of annotators. Note that this function also works with datasets where there is only one annotator (M=1). For more details, labels in the same format expected by the :py:func:`get_label_quality_multiannotator `. Note that examples that have no annotator labels should not be included in this DataFrame/array. - pred_probs : np.ndarray + This argument is optional if ``pred_probs`` is not provided (you might only provide ``pred_probs_unlabeled`` to only get active learning scores for the unlabeled examples). + pred_probs : np.ndarray, optional An array of shape ``(N, K)`` of predicted class probabilities from a trained classifier model. Predicted probabilities in the same format expected by the :py:func:`get_label_quality_scores `. + This argument is optional if you only want to get active learning scores for unlabeled examples (specify only ``pred_probs_unlabeled`` instead). pred_probs_unlabeled : np.ndarray, optional An array of shape ``(N, K)`` of predicted class probabilities from a trained classifier model for examples that have no annotator labels. Predicted probabilities in the same format expected by the :py:func:`get_label_quality_scores `. + This argument is optional if you only want to get active learning scores for already-labeled examples (specify only ``pred_probs`` instead). Returns ------- active_learning_scores : np.ndarray Array of shape ``(N,)`` indicating the ActiveLab quality scores for each example. + This array is empty if no already-labeled data was provided via ``labels_multiannotator``. Examples with the lowest scores are those we should label next in order to maximally improve our classifier model.
active_learning_scores_unlabeled : np.ndarray @@ -584,60 +591,83 @@ def get_active_learning_scores( (scores for unlabeled data are directly comparable with the `active_learning_scores` for labeled data). """ - if isinstance(labels_multiannotator, np.ndarray): - labels_multiannotator = pd.DataFrame(labels_multiannotator) - assert_valid_pred_probs(pred_probs=pred_probs, pred_probs_unlabeled=pred_probs_unlabeled) - num_classes = get_num_classes(pred_probs=pred_probs) + # compute multiannotator stats if labeled data is provided + if pred_probs is not None: + if labels_multiannotator is None: + raise ValueError( + "labels_multiannotator cannot be None when passing in pred_probs. ", + "Either provide labels_multiannotator to obtain active learning scores for the labeled examples, " + "or just pass in pred_probs_unlabeled to get active learning scores for unlabeled examples.", + ) - # if all examples are only labeled by a single annotator - if labels_multiannotator.apply(lambda s: len(s.dropna()) == 1, axis=1).all(): - optimal_temp = 1.0 # do not temp scale for single annotator case, temperature is defined here for later use + if isinstance(labels_multiannotator, np.ndarray): + labels_multiannotator = pd.DataFrame(labels_multiannotator) - assert_valid_inputs_multiannotator( - labels_multiannotator, pred_probs, allow_single_label=True - ) + num_classes = get_num_classes(pred_probs=pred_probs) - consensus_label = get_majority_vote_label( - labels_multiannotator=labels_multiannotator, - pred_probs=pred_probs, - verbose=False, - ) - quality_of_consensus_labeled = get_label_quality_scores(consensus_label, pred_probs) - model_weight = 1 - annotator_weight = np.full(labels_multiannotator.shape[1], 1) - avg_annotator_weight = np.mean(annotator_weight) + # if all examples are only labeled by a single annotator + if labels_multiannotator.apply(lambda s: len(s.dropna()) == 1, axis=1).all(): + optimal_temp = 1.0 # do not temp scale for single annotator case, temperature is defined 
here for later use - else: - optimal_temp = find_best_temp_scaler(labels_multiannotator, pred_probs) - pred_probs = temp_scale_pred_probs(pred_probs, optimal_temp) + assert_valid_inputs_multiannotator( + labels_multiannotator, pred_probs, allow_single_label=True + ) - multiannotator_info = get_label_quality_multiannotator( - labels_multiannotator, - pred_probs, - return_annotator_stats=False, - return_detailed_quality=False, - return_weights=True, - ) + consensus_label = get_majority_vote_label( + labels_multiannotator=labels_multiannotator, + pred_probs=pred_probs, + verbose=False, + ) + quality_of_consensus_labeled = get_label_quality_scores(consensus_label, pred_probs) + model_weight = 1 + annotator_weight = np.full(labels_multiannotator.shape[1], 1) + avg_annotator_weight = np.mean(annotator_weight) - quality_of_consensus_labeled = multiannotator_info["label_quality"][ - "consensus_quality_score" - ] - model_weight = multiannotator_info["model_weight"] - annotator_weight = multiannotator_info["annotator_weight"] - avg_annotator_weight = np.mean(annotator_weight) - - # compute scores for labeled data - active_learning_scores = np.full(len(labels_multiannotator), np.nan) - for i in range(len(active_learning_scores)): - annotator_labels = labels_multiannotator.iloc[i] - active_learning_scores[i] = np.average( - (quality_of_consensus_labeled[i], 1 / num_classes), - weights=( - np.sum(annotator_weight[annotator_labels.notna()]) + model_weight, - avg_annotator_weight, - ), + # examples are annotated by multiple annotators + else: + optimal_temp = find_best_temp_scaler(labels_multiannotator, pred_probs) + pred_probs = temp_scale_pred_probs(pred_probs, optimal_temp) + + multiannotator_info = get_label_quality_multiannotator( + labels_multiannotator, + pred_probs, + return_annotator_stats=False, + return_detailed_quality=False, + return_weights=True, + ) + + quality_of_consensus_labeled = multiannotator_info["label_quality"][ + "consensus_quality_score" + ] + 
model_weight = multiannotator_info["model_weight"] + annotator_weight = multiannotator_info["annotator_weight"] + avg_annotator_weight = np.mean(annotator_weight) + + # compute scores for labeled data + active_learning_scores = np.full(len(labels_multiannotator), np.nan) + for i in range(len(active_learning_scores)): + annotator_labels = labels_multiannotator.iloc[i] + active_learning_scores[i] = np.average( + (quality_of_consensus_labeled[i], 1 / num_classes), + weights=( + np.sum(annotator_weight[annotator_labels.notna()]) + model_weight, + avg_annotator_weight, + ), + ) + + # no labeled data provided so do not estimate temperature and model/annotator weights + elif pred_probs_unlabeled is not None: + num_classes = get_num_classes(pred_probs=pred_probs_unlabeled) + optimal_temp = 1 + model_weight = 1 + avg_annotator_weight = 1 + active_learning_scores = np.array([]) + + else: + raise ValueError( + "pred_probs and pred_probs_unlabeled cannot both be None, specify at least one of the two." ) # compute scores for unlabeled data @@ -663,8 +693,8 @@ def get_active_learning_scores( def get_active_learning_scores_ensemble( - labels_multiannotator: Union[pd.DataFrame, np.ndarray], - pred_probs: np.ndarray, + labels_multiannotator: Optional[Union[pd.DataFrame, np.ndarray]] = None, + pred_probs: Optional[np.ndarray] = None, pred_probs_unlabeled: Optional[np.ndarray] = None, ) -> Tuple[np.ndarray, np.ndarray]: """Returns an ActiveLab quality score for each example in the dataset, based on predictions from an ensemble of models. @@ -676,14 +706,17 @@ def get_active_learning_scores_ensemble( ---------- labels_multiannotator : pd.DataFrame or np.ndarray Multiannotator labels in the same format expected by :py:func:`get_active_learning_scores `. + This argument is optional if ``pred_probs`` is not provided (in cases where you only provide ``pred_probs_unlabeled`` to get active learning scores for unlabeled examples). 
pred_probs : np.ndarray An array of shape ``(P, N, K)`` where P is the number of models, consisting of predicted class probabilities from the ensemble models. Note that this function also works with datasets where there is only one annotator (M=1). Each set of predicted probabilities with shape ``(N, K)`` is in the same format expected by the :py:func:`get_label_quality_scores `. + This argument is optional if you only want to get active learning scores for unlabeled examples (pass in ``pred_probs_unlabeled`` instead). pred_probs_unlabeled : np.ndarray, optional An array of shape ``(P, N, K)`` where P is the number of models, consisting of predicted class probabilities from a trained classifier model for examples that have no annotated labels so far (but which we may want to label in the future, and hence compute active learning quality scores for). Each set of predicted probabilities with shape ``(N, K)`` is in the same format expected by the :py:func:`get_label_quality_scores `. + This argument is optional if you only want to get active learning scores for labeled examples (pass in ``pred_probs`` instead). Returns ------- @@ -697,70 +730,91 @@ def get_active_learning_scores_ensemble( get_active_learning_scores """ - if isinstance(labels_multiannotator, np.ndarray): - labels_multiannotator = pd.DataFrame(labels_multiannotator) - assert_valid_pred_probs( pred_probs=pred_probs, pred_probs_unlabeled=pred_probs_unlabeled, ensemble=True ) - num_classes = get_num_classes(pred_probs=pred_probs[0]) + # compute multiannotator stats if labeled data is provided + if pred_probs is not None: + if labels_multiannotator is None: + raise ValueError( + "labels_multiannotator cannot be None when passing in pred_probs. 
", + "You can either provide labels_multiannotator to obtain active learning scores for the labeled examples, " + "or just pass in pred_probs_unlabeled to get active learning scores for unlabeled examples.", + ) - # temp scale pred_probs + if isinstance(labels_multiannotator, np.ndarray): + labels_multiannotator = pd.DataFrame(labels_multiannotator) - # if all examples are only labeled by a single annotator - if labels_multiannotator.apply(lambda s: len(s.dropna()) == 1, axis=1).all(): - # do not temp scale for single annotator case, temperature is defined here for later use - optimal_temp = np.full(len(pred_probs), 1.0) + num_classes = get_num_classes(pred_probs=pred_probs[0]) - assert_valid_inputs_multiannotator( - labels_multiannotator, pred_probs, ensemble=True, allow_single_label=True - ) + # if all examples are only labeled by a single annotator + if labels_multiannotator.apply(lambda s: len(s.dropna()) == 1, axis=1).all(): + # do not temp scale for single annotator case, temperature is defined here for later use + optimal_temp = np.full(len(pred_probs), 1.0) - avg_pred_probs = np.mean(pred_probs, axis=0) - consensus_label = get_majority_vote_label( - labels_multiannotator=labels_multiannotator, - pred_probs=avg_pred_probs, - verbose=False, - ) - quality_of_consensus_labeled = get_label_quality_scores(consensus_label, avg_pred_probs) - model_weight = np.full(len(pred_probs), 1) - annotator_weight = np.full(labels_multiannotator.shape[1], 1) - avg_annotator_weight = np.mean(annotator_weight) + assert_valid_inputs_multiannotator( + labels_multiannotator, pred_probs, ensemble=True, allow_single_label=True + ) - else: - optimal_temp = np.full(len(pred_probs), np.NaN) - for i in range(len(pred_probs)): - curr_pred_probs = pred_probs[i] - curr_optimal_temp = find_best_temp_scaler(labels_multiannotator, curr_pred_probs) - pred_probs[i] = temp_scale_pred_probs(curr_pred_probs, curr_optimal_temp) - optimal_temp[i] = curr_optimal_temp - - multiannotator_info = 
get_label_quality_multiannotator_ensemble( - labels_multiannotator, - pred_probs, - return_annotator_stats=False, - return_detailed_quality=False, - return_weights=True, - ) + avg_pred_probs = np.mean(pred_probs, axis=0) + consensus_label = get_majority_vote_label( + labels_multiannotator=labels_multiannotator, + pred_probs=avg_pred_probs, + verbose=False, + ) + quality_of_consensus_labeled = get_label_quality_scores(consensus_label, avg_pred_probs) + model_weight = np.full(len(pred_probs), 1) + annotator_weight = np.full(labels_multiannotator.shape[1], 1) + avg_annotator_weight = np.mean(annotator_weight) - quality_of_consensus_labeled = multiannotator_info["label_quality"][ - "consensus_quality_score" - ] - model_weight = multiannotator_info["model_weight"] - annotator_weight = multiannotator_info["annotator_weight"] - avg_annotator_weight = np.mean(annotator_weight) - - # compute scores for labeled data - active_learning_scores = np.full(len(labels_multiannotator), np.nan) - for i in range(len(active_learning_scores)): - annotator_labels = labels_multiannotator.iloc[i] - active_learning_scores[i] = np.average( - (quality_of_consensus_labeled[i], 1 / num_classes), - weights=( - np.sum(annotator_weight[annotator_labels.notna()]) + np.sum(model_weight), - avg_annotator_weight, - ), + # examples are annotated by multiple annotators + else: + optimal_temp = np.full(len(pred_probs), np.NaN) + for i in range(len(pred_probs)): + curr_pred_probs = pred_probs[i] + curr_optimal_temp = find_best_temp_scaler(labels_multiannotator, curr_pred_probs) + pred_probs[i] = temp_scale_pred_probs(curr_pred_probs, curr_optimal_temp) + optimal_temp[i] = curr_optimal_temp + + multiannotator_info = get_label_quality_multiannotator_ensemble( + labels_multiannotator, + pred_probs, + return_annotator_stats=False, + return_detailed_quality=False, + return_weights=True, + ) + + quality_of_consensus_labeled = multiannotator_info["label_quality"][ + "consensus_quality_score" + ] + model_weight = 
multiannotator_info["model_weight"] + annotator_weight = multiannotator_info["annotator_weight"] + avg_annotator_weight = np.mean(annotator_weight) + + # compute scores for labeled data + active_learning_scores = np.full(len(labels_multiannotator), np.nan) + for i in range(len(active_learning_scores)): + annotator_labels = labels_multiannotator.iloc[i] + active_learning_scores[i] = np.average( + (quality_of_consensus_labeled[i], 1 / num_classes), + weights=( + np.sum(annotator_weight[annotator_labels.notna()]) + np.sum(model_weight), + avg_annotator_weight, + ), + ) + + # no labeled data provided so do not estimate temperature and model/annotator weights + elif pred_probs_unlabeled is not None: + num_classes = get_num_classes(pred_probs=pred_probs_unlabeled[0]) + optimal_temp = np.full(len(pred_probs_unlabeled), 1.0) + model_weight = np.full(len(pred_probs_unlabeled), 1) + avg_annotator_weight = 1 + active_learning_scores = np.array([]) + + else: + raise ValueError( + "pred_probs and pred_probs_unlabeled cannot both be None, specify at least one of the two." 
) # compute scores for unlabeled data @@ -861,6 +915,7 @@ def get_majority_vote_label( tied_idx[idx] = label_mode[max_pred_probs] # tiebreak 2: using empirical class frequencies + # current tiebreak will select the minority class (to prevent larger class imbalance) if len(tied_idx) > 0: if pred_probs is not None: num_classes = pred_probs.shape[1] @@ -872,14 +927,14 @@ def get_majority_vote_label( lambda s: pd.Series(np.bincount(s[s.notna()], minlength=num_classes)), axis=1 ).sum() for idx, label_mode in tied_idx.copy().items(): - max_frequency = np.where( - class_frequencies[label_mode] == np.max(class_frequencies[label_mode]) + min_frequency = np.where( + class_frequencies[label_mode] == np.min(class_frequencies[label_mode]) )[0] - if len(max_frequency) == 1: - majority_vote_label[idx] = label_mode[max_frequency[0]] + if len(min_frequency) == 1: + majority_vote_label[idx] = label_mode[min_frequency[0]] del tied_idx[idx] else: - tied_idx[idx] = label_mode[max_frequency] + tied_idx[idx] = label_mode[min_frequency] # tiebreak 3: using initial annotator quality scores if len(tied_idx) > 0: @@ -888,7 +943,13 @@ def get_majority_vote_label( annotator_agreement_with_consensus = nontied_labels_multiannotator.apply( lambda s: np.mean(s[pd.notna(s)] == nontied_majority_vote_label[pd.notna(s)]), axis=0, - ).to_numpy() + ) + + # impute average annotator accuracy for any annotator that do not overlap with consensus + mask = annotator_agreement_with_consensus.isna() + avg_annotator_agreement = np.mean(annotator_agreement_with_consensus[~mask]) + annotator_agreement_with_consensus[mask] = avg_annotator_agreement + for idx, label_mode in tied_idx.copy().items(): label_quality_score = np.array( [ diff --git a/tests/test_multiannotator.py b/tests/test_multiannotator.py index a428897870..74be98796b 100644 --- a/tests/test_multiannotator.py +++ b/tests/test_multiannotator.py @@ -434,6 +434,13 @@ def test_get_active_learning_scores(): assert len(active_learning_scores) == 
len(pred_probs) assert len(active_learning_scores_unlabeled) == 0 + # test case where only passing unlabeled examples + active_learning_scores, active_learning_scores_unlabeled = get_active_learning_scores( + pred_probs_unlabeled=pred_probs_unlabeled + ) + assert len(active_learning_scores) == 0 + assert len(active_learning_scores_unlabeled) == len(pred_probs_unlabeled) + # test case where number of classes do not match try: active_learning_scores, active_learning_scores_unlabeled = get_active_learning_scores( @@ -485,6 +492,13 @@ def test_get_active_learning_scores_ensemble(): assert len(active_learning_scores) == len(labels) assert len(active_learning_scores_unlabeled) == 0 + # test case where only passing unlabeled examples + active_learning_scores, active_learning_scores_unlabeled = get_active_learning_scores_ensemble( + pred_probs_unlabeled=pred_probs_unlabeled + ) + assert len(active_learning_scores) == 0 + assert len(active_learning_scores_unlabeled) == len(labels_unlabeled) + # test case where number of classes do not match try: ( @@ -640,9 +654,22 @@ def test_get_consensus_label(): [0.2, 0.4, 0.4], ] ) - consensus_label = get_majority_vote_label(labels_tiebreaks, pred_probs_tiebreaks) + # more tiebreak testing (without pred_probs + non-overlapping annotators) + labels_tiebreaks = np.array( + [ + [1, np.NaN, np.NaN, 2, np.NaN], + [np.NaN, 1, 0, np.NaN, np.NaN], + [np.NaN, np.NaN, 0, np.NaN, np.NaN], + [np.NaN, 2, np.NaN, np.NaN, np.NaN], + [2, np.NaN, 0, 2, np.NaN], + [np.NaN, np.NaN, np.NaN, 2, 1], + ] + ) + consensus_label = get_majority_vote_label(labels_tiebreaks) + assert all(consensus_label == np.array([1, 1, 0, 2, 2, 1])) + def test_impute_nonoverlaping_annotators(): labels = np.array( From a5f35f3b7b9bc031aaff4502429ff4e770c41acc Mon Sep 17 00:00:00 2001 From: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> Date: Fri, 24 Mar 2023 01:15:39 -0700 Subject: [PATCH 138/258] additional faq question on handling train vs test data (#655) --- 
docs/source/tutorials/faq.ipynb | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/docs/source/tutorials/faq.ipynb b/docs/source/tutorials/faq.ipynb index a1978e1417..6854a1d0c1 100644 --- a/docs/source/tutorials/faq.ipynb +++ b/docs/source/tutorials/faq.ipynb @@ -7,7 +7,9 @@ "source": [ "# FAQ\n", "\n", - "Answers to frequently asked questions about the [cleanlab](https://github.com/cleanlab/cleanlab) open-source package." + "Answers to frequently asked questions about the [cleanlab](https://github.com/cleanlab/cleanlab) open-source package.\n", + "\n", + "The code snippets in this FAQ come from a fully executable notebook you can run via Colab or locally by downloading it [here](https://github.com/cleanlab/cleanlab/blob/master/docs/source/tutorials/faq.ipynb).\n" ] }, { @@ -150,7 +152,7 @@ "id": "b386dfc8", "metadata": {}, "source": [ - "If you have already found issues via:" + "Otherwise if you have already found issues via:" ] }, { @@ -168,7 +170,7 @@ "id": "ad9ca03e", "metadata": {}, "source": [ - "then you can see your trained classifier's class prediction for each flagged example via: " + "then you can see your trained classifier's class prediction for each flagged example like this: " ] }, { @@ -186,7 +188,7 @@ "id": "a668b74b", "metadata": {}, "source": [ - "where you can see the classifier's class prediction for every example via:" + "Here you can see the classifier's class prediction for every example via:" ] }, { @@ -208,6 +210,24 @@ "You will be able to produce a much better version of your dataset interactively using [Cleanlab Studio](https://cleanlab.ai/studio/?utm_source=github&utm_medium=docs&utm_campaign=clostostudio), which helps you efficiently fix issues like this in large datasets." 
] }, + { + "cell_type": "markdown", + "id": "bcc97591", + "metadata": {}, + "source": [ + "### How shoud I handle label errors in train vs test data?\n", + "\n", + "If you do not address label errors in your test data, you may not even know when you have produced a better ML model because the evaluation is too noisy. For the best-trained models and most reliable evaluation of them, you should fix label errors in both your training and testing data.\n", + "\n", + "To do this efficiently, first use cleanlab to automatically find label issues in both sets. You can simply merge these two sets into one larger dataset and run cross-validation training + `find_label_issues()` on the merged datataset. Calling the [`CleanLearning.find_label_issues()`](../cleanlab/classification.html) method on your merged dataset does both these steps for you with any scikit-learn compatible classifier you choose.\n", + "\n", + "After finding the label issues, be **wary** about auto-correcting the labels for test examples (as cautioned against above). Instead make sure you are only manually fixing labels for your test data. You can use [Cleanlab Studio](https://cleanlab.ai/studio/) to fix labels efficiently.\n", + "\n", + "Auto-correcting labels for your training data is fair game, which should improve your ML performance. 
You can often boost ML performance further by manually fixing the training examples flagged with label issues, as demonstrated in this article:\n", + "\n", + "[**Handling Mislabeled Tabular Data to Improve Your XGBoost Model**](https://cleanlab.ai/blog/label-errors-tabular-datasets/)" + ] + }, { "cell_type": "markdown", "id": "21f42f24", @@ -597,7 +617,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.15" + "version": "3.9.12" } }, "nbformat": 4, From bed94f19e1cce6f672e879cecea51afb2445a202 Mon Sep 17 00:00:00 2001 From: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> Date: Fri, 24 Mar 2023 02:42:10 -0700 Subject: [PATCH 139/258] Update readme to better reflect current package (#656) --- README.md | 86 +++++++++++++++++++++++-------------------------------- 1 file changed, 36 insertions(+), 50 deletions(-) diff --git a/README.md b/README.md index 169c43dfd5..2b75e6dab9 100644 --- a/README.md +++ b/README.md @@ -3,11 +3,11 @@

    -cleanlab automatically detects problems in a ML dataset. This data-centric AI package facilitates **machine learning with messy, real-world data** by providing **clean lab**els for robust training and flagging errors in your data. +cleanlab helps you **clean** data and **lab**els by automatically detecting issues in a ML dataset. To facilitate **machine learning with messy, real-world data**, this data-centric AI package uses your *existing* models to estimate dataset problems that can be fixed to train even *better* models. ```python -# cleanlab works with **any classifier**. Yup, you can use sklearn/PyTorch/TensorFlow/XGBoost/etc. +# cleanlab works with **any classifier**. Yup, you can use PyTorch/TensorFlow/OpenAI/XGBoost/etc. cl = cleanlab.classification.CleanLearning(sklearn.YourFavoriteClassifier()) # cleanlab finds data and label issues in **any dataset**... in ONE line of code! @@ -41,59 +41,20 @@ Get started with: [documentation](https://docs.cleanlab.ai/), [tutorials](https: ----- -
    News! (2022) -- cleanlab made accessible for everybody, not just ML researchers (click to learn more) -

    -

      -
    • Nov 2022 📖 cleanlab 2.2.0 released! Added better algorithms for: label issues in multi-label classification, data with some classes absent, and estimating the number of label errors in a dataset.
    • -
    • Sep 2022 📖 cleanlab 2.1.0 released! Added support for: data labeled by multiple annotators in cleanlab.multiannotator, token classification with text data in cleanlab.token_classification, out-of-distribution detection in cleanlab.outlier, and CleanLearning with non-numpy-array data (e.g. pandas dataframes, tensorflow/pytorch datasets, etc) in cleanlab.classification.CleanLearning.
    • -
    • April 2022 📖 cleanlab 2.0.0 released! Lays foundations for this library to grow into a general-purpose data-centric AI toolkit.
    • -
    • March 2022 📖 Documentation migrated to new website: docs.cleanlab.ai with quickstart tutorials for image/text/audio/tabular data.
    • -
    • Feb 2022 💻 APIs simplified to make cleanlab accessible for everybody, not just ML researchers
    • -
    • Long-time cleanlab user? Here's how to migrate to cleanlab versions >= 2.0.0.
    • -
    -

    -
    - -
    News! (2021) -- cleanlab finds pervasive label errors in the most common ML datasets (click to learn more) -

    -

    -

    -
    - -
    News! (2020) -- cleanlab supports all OS, achieves state-of-the-art performance (click to learn more) -

    -

      -
    • Dec 2020 🎉 cleanlab supports NeurIPS workshop paper (Northcutt, Athalye, & Lin, 2020).
    • -
    • Dec 2020 🤖 cleanlab supports Positive-Unlabeled (PU) learning.
    • -
    • Feb 2020 🤖 cleanlab now natively supports Mac, Linux, and Windows.
    • -
    • Feb 2020 🤖 cleanlab now supports Co-Teaching (Han et al., 2018).
    • -
    • Jan 2020 🎉 cleanlab achieves state-of-the-art on CIFAR-10 with noisy labels. Code to reproduce: examples/cifar10. This is a great place to see how to use cleanlab on real datasets (with predicted probabilities from trained model already precomputed for you).
    • -
    -

    -
    - -Release notes for past versions are [here](https://github.com/cleanlab/cleanlab/releases). -Details behind updates are explained in our [blog](https://cleanlab.ai/blog/) and [research papers](https://cleanlab.ai/research/). - - ## So fresh, so cleanlab -cleanlab **clean**s your data's **lab**els via state-of-the-art *confident learning* algorithms, published in this [paper](https://jair.org/index.php/jair/article/view/12125) and [blog](https://l7.curtisnorthcutt.com/confident-learning). See some of the datasets cleaned with cleanlab at [labelerrors.com](https://labelerrors.com). This package helps you find data and label issues so you can train reliable ML models. +cleanlab **clean**s your data's **lab**els via state-of-the-art *confident learning* algorithms, published in this [paper](https://jair.org/index.php/jair/article/view/12125) and [blog](https://l7.curtisnorthcutt.com/confident-learning). See some of the datasets cleaned with cleanlab at [labelerrors.com](https://labelerrors.com). This package helps you find label issues and other data issues, so you can train reliable ML models. cleanlab is: 1. **backed by theory** - with [provable guarantees](https://arxiv.org/abs/1911.00068) of exact estimation of noise and label errors, even with imperfect models. 2. **fast** - - Code is parallelized (< 1 second to find label issues in ImageNet with pre-computed predictions). + - Code is parallelized and scalable. 4. **easy-to-use** - - Find label issues or train noise-robust models in one line of code (no hyperparameters by default). + - Find mislabeled data, bad annotators, outliers, or train noise-robust models -- all in one line of code. 6. **general** - - Works with **[any dataset](https://labelerrors.com/)** and **any model**, e.g., TensorFlow, PyTorch, sklearn, XGBoost, Huggingface, etc. + - Works with **[any dataset](https://labelerrors.com/)** (text, image, tabular, audio, ...) 
and **any model** (TensorFlow, PyTorch, JAX, HuggingFace, OpenAI, XGBoost, scikit-learn, ...)
    ![](https://raw.githubusercontent.com/cleanlab/assets/master/cleanlab/label-errors-examples.png) @@ -107,11 +68,11 @@ cleanlab supports Linux, macOS, and Windows and runs on Python 3.7+. - Get started [here](https://docs.cleanlab.ai/)! Install via `pip` or `conda` as described [here](https://docs.cleanlab.ai/). - Developers who install the bleeding-edge from source should refer to [this master branch documentation](https://docs.cleanlab.ai/master/index.html). - +- For help, check out our detailed [FAQ](https://docs.cleanlab.ai/stable/tutorials/faq.html), [Github Issues](https://github.com/cleanlab/cleanlab/issues?q=is%3Aissue), or [Slack](https://cleanlab.ai/slack). We welcome any questions! ## Use cleanlab with any model for most ML tasks -All features of cleanlab work with **any dataset** and **any model**. Yes, any model: scikit-learn, PyTorch, Tensorflow, Keras, JAX, HuggingFace, MXNet, XGBoost, etc. +All features of cleanlab work with **any dataset** and **any model**. Yes, any model: PyTorch, Tensorflow, Keras, JAX, HuggingFace, OpenAI, XGBoost, scikit-learn, etc. If you use a sklearn-compatible classifier, all cleanlab methods work out-of-the-box.
    @@ -121,7 +82,7 @@ It’s also easy to use your favorite non-sklearn-compatible model (click to cleanlab can find label issues from any model's predicted class probabilities if you can produce them yourself. -Some other cleanlab functionality requires your model to be sklearn-compatible. +Some cleanlab functionality may require your model to be sklearn-compatible. There's nothing you need to do if your model already has `.fit()`, `.predict()`, and `.predict_proba()` methods. Otherwise, just wrap your custom model into a Python class that inherits the `sklearn.base.BaseEstimator`: @@ -526,6 +487,28 @@ cleanlab is based on peer-reviewed research. Here are relevant papers to cite if
    +
    ActiveLab: Active learning with data re-labeling (ICLR '23) (click to show bibtex) + + @inproceedings{goh2023activelab, + title={ActiveLab: Active Learning with Re-Labeling by Multiple Annotators}, + author={Goh, Hui Wen and Mueller, Jonas}, + booktitle={ICLR Workshop on Trustworthy ML}, + year={2023} + } + +
    + +
    Incorrect Annotations in Multi-Label Classification (ICLR '23) (click to show bibtex) + + @inproceedings{thyagarajan2023multilabel, + title={Identifying Incorrect Annotations in Multi-Label Classification Data}, + author={Thyagarajan, Aditya and Snorrason, Elías and Northcutt, Curtis and Mueller, Jonas}, + booktitle={ICLR Workshop on Trustworthy ML}, + year={2023} + } + +
    + To understand/cite other cleanlab functionality not described above, check out our [additional publications](https://cleanlab.ai/research/). @@ -538,6 +521,8 @@ To understand/cite other cleanlab functionality not described above, check out o - [NeurIPS 2021 paper: Pervasive Label Errors in Test Sets Destabilize Machine Learning Benchmarks](https://arxiv.org/abs/2103.14749) +- [Release notes for past versions](https://github.com/cleanlab/cleanlab/releases) + - [Cleanlab Studio](https://cleanlab.ai/studio/?utm_source=github&utm_medium=readme&utm_campaign=clostostudio): No-code Data Improvement While this open-source library **finds** data issues, an interface is needed to efficiently **fix** these issues in your dataset. [Cleanlab Studio](https://cleanlab.ai/studio/?utm_source=github&utm_medium=readme&utm_campaign=clostostudio) is a no-code platform to find and fix problems in real-world ML datasets. Studio automatically runs optimized versions of the algorithms from this open-source library on top of AutoML models fit to your data, and presents detected issues in a smart data editing interface. Think of it like a data cleaning assistant that helps you quickly improve the quality of your data (via AI/automation + streamlined UX). @@ -555,14 +540,15 @@ While this open-source library **finds** data issues, an interface is needed to * Have an issue with cleanlab? [Search existing issues](https://github.com/cleanlab/cleanlab/issues?q=is%3Aissue) or [submit a new issue](https://github.com/cleanlab/cleanlab/issues/new). * Need professional help with cleanlab? -Join our [\#help Slack channel](https://cleanlab.ai/slack) and message one of our core developers, Jonas Mueller, or schedule a meeting via email: team@cleanlab.ai +Join our [\#help Slack channel](https://cleanlab.ai/slack) and message us there, or reach out via email: team@cleanlab.ai ## License -Copyright (c) 2017-2023 Cleanlab Inc. +Copyright (c) 2017 Cleanlab Inc. 
cleanlab is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. cleanlab is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See [GNU Affero General Public LICENSE](https://github.com/cleanlab/cleanlab/blob/master/LICENSE) for details. +You can email us to discuss licensing: team@cleanlab.ai From 229718da367f76c6d12a0910216734fb0a089316 Mon Sep 17 00:00:00 2001 From: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> Date: Fri, 24 Mar 2023 10:25:06 -0700 Subject: [PATCH 140/258] formatting+typos --- docs/source/tutorials/faq.ipynb | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/source/tutorials/faq.ipynb b/docs/source/tutorials/faq.ipynb index 6854a1d0c1..fc8c93fa16 100644 --- a/docs/source/tutorials/faq.ipynb +++ b/docs/source/tutorials/faq.ipynb @@ -215,15 +215,15 @@ "id": "bcc97591", "metadata": {}, "source": [ - "### How shoud I handle label errors in train vs test data?\n", + "### How should I handle label errors in train vs. test data?\n", "\n", - "If you do not address label errors in your test data, you may not even know when you have produced a better ML model because the evaluation is too noisy. For the best-trained models and most reliable evaluation of them, you should fix label errors in both your training and testing data.\n", + "If you do not address label errors in your test data, you may not even know when you have produced a better ML model because the evaluation is too noisy. For the best-trained models and most reliable evaluation of them, you should fix label errors in both training and testing data.\n", "\n", - "To do this efficiently, first use cleanlab to automatically find label issues in both sets. 
You can simply merge these two sets into one larger dataset and run cross-validation training + `find_label_issues()` on the merged datataset. Calling the [`CleanLearning.find_label_issues()`](../cleanlab/classification.html) method on your merged dataset does both these steps for you with any scikit-learn compatible classifier you choose.\n", + "To do this efficiently, first use cleanlab to automatically find label issues in both sets. You can simply merge these two sets into one larger dataset and run cross-validation training + `find_label_issues()` on the merged dataset. Calling the [CleanLearning.find_label_issues()](../cleanlab/classification.html) method on your merged dataset does both these steps for you with any scikit-learn compatible classifier you choose.\n", "\n", - "After finding the label issues, be **wary** about auto-correcting the labels for test examples (as cautioned against above). Instead make sure you are only manually fixing labels for your test data. You can use [Cleanlab Studio](https://cleanlab.ai/studio/) to fix labels efficiently.\n", + "After finding label issues, be **wary** about auto-correcting the labels for test examples (as cautioned against above). Instead manually fix the labels for your test data via careful review of the flagged issues. You can use [Cleanlab Studio](https://cleanlab.ai/studio/) to fix labels efficiently.\n", "\n", - "Auto-correcting labels for your training data is fair game, which should improve your ML performance. You can often boost ML performance further by manually fixing the training examples flagged with label issues, as demonstrated in this article:\n", + "Auto-correcting labels for your training data is fair game, which should improve ML performance (if properly evaluated with clean test labels). 
You can boost ML performance further by manually fixing the training examples flagged with label issues, as demonstrated in this article:\n", "\n", "[**Handling Mislabeled Tabular Data to Improve Your XGBoost Model**](https://cleanlab.ai/blog/label-errors-tabular-datasets/)" ] From 7519e5d1f29a7b0f90b1082a0b3333c5877d138e Mon Sep 17 00:00:00 2001 From: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> Date: Mon, 27 Mar 2023 23:25:24 -0700 Subject: [PATCH 141/258] update version for 2.3.1 release (#658) --- cleanlab/version.py | 12 +++++------- docs/source/conf.py | 1 + 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/cleanlab/version.py b/cleanlab/version.py index c3ec42f9af..2103522c75 100644 --- a/cleanlab/version.py +++ b/cleanlab/version.py @@ -17,7 +17,11 @@ __version__ = "2.3.1" -# 2.3.1 - Not yet released, you are using bleeding-edge developer version. See its documentation at: https://docs.cleanlab.ai/master/ +# 2.3.2 - Not yet released, you are using bleeding-edge developer version. See its documentation at: https://docs.cleanlab.ai/master/ + +# ------------------------------------------------ +# | PREVIOUS MAJOR VERSION RELEASE NOTES SUMMARY | +# ------------------------------------------------ # 2.3.0 - Extending cleanlab beyond label errors into a complete library for data-centric AI # @@ -25,12 +29,6 @@ # - Active learning with data re-labeling (ActiveLab) # - KerasWrapperModel and KerasSequentialWrapper to make arbitrary Keras models compatible with scikit-learn # - Computational improvements for detecting label issues (better efficiency and mini-batch estimation that works with lower memory) -# -# See release for a full changelog. 
- -# ------------------------------------------ -# | PREVIOUS VERSION RELEASE NOTES SUMMARY | -# ------------------------------------------ # 2.2.0 - Re-invented algorithms for multi-label classification and support for datasets with missing classes # diff --git a/docs/source/conf.py b/docs/source/conf.py index 2665fc5a31..aa0522a279 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -147,6 +147,7 @@ # Add new tags to RELEASE_VERSIONS before release # fmt: off "RELEASE_VERSIONS": [ + "v2.3.1", "v2.3.0", "v2.2.0", "v2.1.0", From 29a930df374f2d28957312734809d3c55a2f07eb Mon Sep 17 00:00:00 2001 From: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> Date: Wed, 29 Mar 2023 00:42:47 -0500 Subject: [PATCH 142/258] bump git version past stable version (#659) --- cleanlab/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cleanlab/version.py b/cleanlab/version.py index 2103522c75..5ec97902e3 100644 --- a/cleanlab/version.py +++ b/cleanlab/version.py @@ -15,7 +15,7 @@ # along with cleanlab. If not, see <https://www.gnu.org/licenses/>. -__version__ = "2.3.1" +__version__ = "2.3.2" # 2.3.2 - Not yet released, you are using bleeding-edge developer version. See its documentation at: https://docs.cleanlab.ai/master/ From 6e247d554349dfbcca4831b090f90556917f9304 Mon Sep 17 00:00:00 2001 From: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> Date: Tue, 4 Apr 2023 02:44:49 -0700 Subject: [PATCH 143/258] link faq again at bottom of readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2b75e6dab9..19dabd8b3b 100644 --- a/README.md +++ b/README.md @@ -537,7 +537,7 @@ While this open-source library **finds** data issues, an interface is needed to * Have code improvements for cleanlab? See the [development guide](DEVELOPMENT.md). -* Have an issue with cleanlab? 
[Search existing issues](https://github.com/cleanlab/cleanlab/issues?q=is%3Aissue) or [submit a new issue](https://github.com/cleanlab/cleanlab/issues/new). +* Have an issue with cleanlab? Search [our FAQ](https://docs.cleanlab.ai/stable/tutorials/faq.html) and [existing issues](https://github.com/cleanlab/cleanlab/issues?q=is%3Aissue), or [submit a new issue](https://github.com/cleanlab/cleanlab/issues/new). * Need professional help with cleanlab? Join our [\#help Slack channel](https://cleanlab.ai/slack) and message us there, or reach out via email: team@cleanlab.ai From 8cf089e1520c00c0d0a98477d988fb1559ad6f65 Mon Sep 17 00:00:00 2001 From: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> Date: Tue, 4 Apr 2023 03:55:58 -0700 Subject: [PATCH 144/258] add section on practicing data-centric ai to readme (#660) --- README.md | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 19dabd8b3b..de2b61b004 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ cleanlab.dataset.health_summary(labels, confident_joint=cl.confident_joint) Get started with: [documentation](https://docs.cleanlab.ai/), [tutorials](https://docs.cleanlab.ai/stable/tutorials/image.html), [examples](https://github.com/cleanlab/examples), and [blogs](https://cleanlab.ai/blog/). - Learn to run cleanlab on your data in 5 minutes for classification with: [image](https://docs.cleanlab.ai/stable/tutorials/image.html), [text](https://docs.cleanlab.ai/stable/tutorials/text.html), [audio](https://docs.cleanlab.ai/stable/tutorials/audio.html), or [tabular](https://docs.cleanlab.ai/stable/tutorials/tabular.html) data. 
-- Use cleanlab to automatically: [find mislabeled data + train robust models](https://docs.cleanlab.ai/stable/tutorials/indepth_overview.html), [detect outliers](https://docs.cleanlab.ai/stable/tutorials/outliers.html), [estimate consensus + annotator-quality for multi-annotator datasets](https://docs.cleanlab.ai/stable/tutorials/multiannotator.html), [suggest which data is best to (re)label next](https://github.com/cleanlab/examples/blob/master/active_learning_multiannotator/active_learning.ipynb). +- Use cleanlab to automatically: [find mislabeled data + train robust models](https://docs.cleanlab.ai/stable/tutorials/indepth_overview.html), [detect outliers](https://docs.cleanlab.ai/stable/tutorials/outliers.html), [estimate consensus + annotator-quality for multi-annotator datasets](https://docs.cleanlab.ai/stable/tutorials/multiannotator.html), [suggest which data is best to (re)label next (active learning)](https://github.com/cleanlab/examples/blob/master/active_learning_multiannotator/active_learning.ipynb). [![pypi](https://img.shields.io/pypi/v/cleanlab.svg)](https://pypi.org/pypi/cleanlab/) @@ -47,14 +47,10 @@ cleanlab **clean**s your data's **lab**els via state-of-the-art *confident learn cleanlab is: -1. **backed by theory** - - with [provable guarantees](https://arxiv.org/abs/1911.00068) of exact estimation of noise and label errors, even with imperfect models. -2. **fast** - - Code is parallelized and scalable. -4. **easy-to-use** - - Find mislabeled data, bad annotators, outliers, or train noise-robust models -- all in one line of code. -6. **general** - - Works with **[any dataset](https://labelerrors.com/)** (text, image, tabular, audio, ...) and **any model** (TensorFlow, PyTorch, JAX, HuggingFace, OpenAI, XGBoost, scikit-learn, ...) +1. **backed by theory** -- with [provable guarantees](https://arxiv.org/abs/1911.00068) of exact label noise estimation, even with imperfect models. +2. **fast** -- code is parallelized and scalable. +4. 
**easy to use** -- one line of code to find mislabeled data, bad annotators, outliers, or train noise-robust models. +6. **general** -- works with **[any dataset](https://labelerrors.com/)** (text, image, tabular, audio,...) + **any model** (PyTorch, OpenAI, XGBoost,...)
    ![](https://raw.githubusercontent.com/cleanlab/assets/master/cleanlab/label-errors-examples.png) @@ -70,6 +66,15 @@ cleanlab supports Linux, macOS, and Windows and runs on Python 3.7+. - Developers who install the bleeding-edge from source should refer to [this master branch documentation](https://docs.cleanlab.ai/master/index.html). - For help, check out our detailed [FAQ](https://docs.cleanlab.ai/stable/tutorials/faq.html), [Github Issues](https://github.com/cleanlab/cleanlab/issues?q=is%3Aissue), or [Slack](https://cleanlab.ai/slack). We welcome any questions! +**Practicing data-centric AI can look like this:** +1. Train initial ML model on original dataset. +2. Utilize this model to diagnose data issues (via cleanlab methods) and improve the dataset. +3. Train the same model on the improved dataset. +4. Try various modeling techniques to further improve performance. + +Most folks jump from Step 1 → 4, but you may achieve big gains without *any* change to your modeling code by using cleanlab! +Continuously boost performance by iterating Steps 2 → 4 (and try to evaluate with *cleaned* data). + ## Use cleanlab with any model for most ML tasks All features of cleanlab work with **any dataset** and **any model**. Yes, any model: PyTorch, Tensorflow, Keras, JAX, HuggingFace, OpenAI, XGBoost, scikit-learn, etc. @@ -126,13 +131,16 @@ cleanlab is useful across a wide variety of Machine Learning tasks. Specific tas 3. [Token classification](https://docs.cleanlab.ai/stable/tutorials/token_classification.html) (e.g. entity recognition in text) 4. [Classification with data labeled by multiple annotators](https://docs.cleanlab.ai/stable/tutorials/multiannotator.html) 5. [Active learning with multiple annotators](https://github.com/cleanlab/examples/blob/master/active_learning_multiannotator/active_learning.ipynb) (suggest which data to label or re-label to improve model most) -6. [Out of distribution detection](https://docs.cleanlab.ai/stable/tutorials/outliers.html) +6. 
[Outlier and out of distribution detection](https://docs.cleanlab.ai/stable/tutorials/outliers.html) For many other ML tasks, cleanlab can still help you improve your dataset if appropriately applied. ## Cool cleanlab applications +Many practical applications are demonstrated in our [Example Notebooks](https://github.com/cleanlab/examples). \ +After going through those, consider these more unusual use-cases of this package: +
    Reproducing results in Confident Learning paper (click to learn more) @@ -408,7 +416,6 @@ Now that you have `indices_of_label_errors`, you can remove those label issues a
    -Many other practical applications are demonstrated in our [Example Notebooks](https://github.com/cleanlab/examples) ## Citation and related publications From c4648483ca2ba80d61812b7f99f01cd0629ab07d Mon Sep 17 00:00:00 2001 From: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> Date: Wed, 5 Apr 2023 00:59:33 -0700 Subject: [PATCH 145/258] clarify no class is also an option --- docs/source/tutorials/multilabel_classification.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/tutorials/multilabel_classification.ipynb b/docs/source/tutorials/multilabel_classification.ipynb index 6c40844957..b1b002975e 100644 --- a/docs/source/tutorials/multilabel_classification.ipynb +++ b/docs/source/tutorials/multilabel_classification.ipynb @@ -395,7 +395,7 @@ "source": [ "## 2. Format data, labels, and model predictions\n", "\n", - "In multi-label classification, each example in the dataset is labeled as belonging to one **or more** of *K* possible classes. To find label issues, cleanlab requires predicted class probabilities from a trained classifier. \n", + "In multi-label classification, each example in the dataset is labeled as belonging to one **or more** of *K* possible classes (or none of the classes at all). To find label issues, cleanlab requires predicted class probabilities from a trained classifier. \n", "Here we produce out-of-sample `pred_probs` by employing cross-validation to fit a multi-label **RandomForestClassifier** model via sklearn's [OneVsRestClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.multiclass.OneVsRestClassifier.html) framework. `OneVsRestClassifier` offers an easy way to apply any multi-class classifier model from sklearn to multi-label classification tasks. 
It is done for simplicity here, but we advise against this approach as it does not properly model dependencies between classes.\n", "\n", "To instead train a state-of-the-art Pytorch neural network for multi-label classification and produce `pred_probs` on a real image dataset (that properly account for dependencies between classes), see our [example](https://github.com/cleanlab/examples) notebook [\"Train a neural network for multi-label classification on the CelebA dataset\"](https://github.com/cleanlab/examples/blob/master/multilabel_classification/pytorch_network_training.ipynb). " From 64c6cfcdd5f99ee26105e98b444075c650ca567c Mon Sep 17 00:00:00 2001 From: Hui Wen <45724323+huiwengoh@users.noreply.github.com> Date: Thu, 6 Apr 2023 00:21:17 -0400 Subject: [PATCH 146/258] Pass confident joint computed in CleanLearning to filter.find_label_issues (#661) --- cleanlab/classification.py | 7 ++++ tests/test_classification.py | 65 +++++++++++++++++++++++++++++++++++- 2 files changed, 71 insertions(+), 1 deletion(-) diff --git a/cleanlab/classification.py b/cleanlab/classification.py index c419ef5463..7f28042311 100644 --- a/cleanlab/classification.py +++ b/cleanlab/classification.py @@ -839,6 +839,7 @@ def find_label_issues( pred_probs=pred_probs, thresholds=thresholds, ) + # if pulearning == the integer specifying the class without noise. 
if self.num_classes == 2 and self.pulearning is not None: # pragma: no cover # pulearning = 1 (no error in 1 class) implies p(label=1|true_label=0) = 0 @@ -851,6 +852,12 @@ def find_label_issues( self.confident_joint[self.pulearning][1 - self.pulearning] = 0 self.confident_joint[1 - self.pulearning][1 - self.pulearning] = 1 + # Add confident joint to find label issue args if it is not previously specified + if "confident_joint" not in self.find_label_issues_kwargs.keys(): + # however does not add if users specify filter_by="confident_learning", as it will throw a warning + if not self.find_label_issues_kwargs.get("filter_by") == "confident_learning": + self.find_label_issues_kwargs["confident_joint"] = self.confident_joint + labels = labels_to_array(labels) if self.verbose: print("Using predicted probabilities to identify label issues ...") diff --git a/tests/test_classification.py b/tests/test_classification.py index 41c9a47b69..ca99bccd91 100644 --- a/tests/test_classification.py +++ b/tests/test_classification.py @@ -27,7 +27,11 @@ from cleanlab.benchmarking.noise_generation import generate_noise_matrix_from_trace from cleanlab.benchmarking.noise_generation import generate_noisy_labels from cleanlab.internal.latent_algebra import compute_inv_noise_matrix -from cleanlab.count import compute_confident_joint, estimate_cv_predicted_probabilities +from cleanlab.count import ( + compute_confident_joint, + estimate_cv_predicted_probabilities, + get_confident_thresholds, +) from cleanlab.filter import find_label_issues SEED = 1 @@ -776,6 +780,65 @@ def test_cj_in_find_label_issues_kwargs(filter_by, seed): assert num_issues[0] == num_issues[1] +def test_find_label_issues_uses_thresholds(): + X = DATA["X_train"] + labels = DATA["labels"] + pred_probs = estimate_cv_predicted_probabilities(X=X, labels=labels) + + confident_thresholds = get_confident_thresholds(labels=labels, pred_probs=pred_probs) + confident_joint = compute_confident_joint(labels=labels, 
pred_probs=pred_probs) + + # regular find label issues with no args + cl = CleanLearning() + label_issues_reg = cl.find_label_issues(labels=labels, pred_probs=pred_probs) + + # find label issues with specified confident thresholds + cl = CleanLearning() + label_issues_thres = cl.find_label_issues( + labels=labels, pred_probs=pred_probs, thresholds=confident_thresholds + ) + + # find label issues with specified confident joint + cl = CleanLearning( + find_label_issues_kwargs={ + "confident_joint": confident_joint, + } + ) + label_issues_cj = cl.find_label_issues(labels=labels, pred_probs=pred_probs) + + # the labels issues in above three calls should be the same + assert np.sum(label_issues_reg["is_label_issue"]) == np.sum( + label_issues_thres["is_label_issue"] + ) + assert np.sum(label_issues_reg["is_label_issue"]) == np.sum(label_issues_cj["is_label_issue"]) + + # find label issues with different specified confident thresholds + confident_thresholds_alt = np.full(pred_probs.shape[1], 0.25) + cl = CleanLearning() + label_issues_thres_alt = cl.find_label_issues( + labels=labels, pred_probs=pred_probs, thresholds=confident_thresholds_alt + ) + + # find label issues with different specified confident joint + confident_joint_alt = compute_confident_joint( + labels=labels, pred_probs=pred_probs, thresholds=confident_thresholds_alt + ) + cl = CleanLearning( + find_label_issues_kwargs={ + "confident_joint": confident_joint_alt, + } + ) + label_issues_cj_alt = cl.find_label_issues(labels=labels, pred_probs=pred_probs) + + # the number of issues for these 2 alt calls should be same as one another, but different from above 3 + assert np.sum(label_issues_thres_alt["is_label_issue"]) == np.sum( + label_issues_cj_alt["is_label_issue"] + ) + assert np.sum(label_issues_thres_alt["is_label_issue"]) != np.sum( + label_issues_reg["is_label_issue"] + ) + + def test_find_issues_missing_classes(): labels = np.array([0, 0, 2, 2]) pred_probs = np.array( From 
352d904491425d866f51846fbd6df76831177ce5 Mon Sep 17 00:00:00 2001 From: Yiran Shi <60683228+Steven-Yiran@users.noreply.github.com> Date: Thu, 6 Apr 2023 00:29:22 -0400 Subject: [PATCH 147/258] Add Example codeblock to the docstrings of important functions in the dataset module (#662) Co-authored-by: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> --- cleanlab/dataset.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/cleanlab/dataset.py b/cleanlab/dataset.py index 78c9da670f..40d0222b91 100644 --- a/cleanlab/dataset.py +++ b/cleanlab/dataset.py @@ -139,6 +139,26 @@ def find_overlapping_classes( issues via the approach published in `Northcutt et al., 2021 `_. + Examples + -------- + >>> from cleanlab.dataset import find_overlapping_classes + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn.model_selection import cross_val_predict + >>> data, labels = get_data_labels_from_dataset(yourFavoriteDataset) + >>> yourFavoriteModel = LogisticRegression() + >>> num_crossval_folds = 3 + >>> pred_probs = cross_val_predict( + yourFavoriteModel, + data, + labels, + cv=num_crossval_folds, + method="predict_proba", + ) + >>> df = find_overlapping_classes( + labels=labels, + pred_probs=pred_probs, + ) # lists pairs of classes that are often mislabeled as one another + Note ---- The joint distribution of noisy and true labels is asymmetric, and therefore the joint From 6d67175e2af8b9a0bbde2c36421e537a8fb2c27c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?El=C3=ADas=20Snorrason?= Date: Fri, 7 Apr 2023 11:33:45 -0700 Subject: [PATCH 148/258] Extract function for computating ood scores from distances (#664) * extract distance-to-score computation to separate function * move function to a new internal.outlier module --- cleanlab/internal/outlier.py | 68 +++++++++++++++++++++++ cleanlab/outlier.py | 9 +-- docs/source/cleanlab/internal/index.rst | 1 + docs/source/cleanlab/internal/outlier.rst | 8 +++ 4 files changed, 80 insertions(+), 
6 deletions(-) create mode 100644 cleanlab/internal/outlier.py create mode 100644 docs/source/cleanlab/internal/outlier.rst diff --git a/cleanlab/internal/outlier.py b/cleanlab/internal/outlier.py new file mode 100644 index 0000000000..57004a0a21 --- /dev/null +++ b/cleanlab/internal/outlier.py @@ -0,0 +1,68 @@ +# Copyright (C) 2017-2023 Cleanlab Inc. +# This file is part of cleanlab. +# +# cleanlab is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published +# by the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# cleanlab is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with cleanlab. If not, see <https://www.gnu.org/licenses/>. + +""" +Helper functions used internally for outlier detection tasks. +""" + +import numpy as np + + +def transform_distances_to_scores(distances: np.ndarray, k: int, t: int) -> np.ndarray: + """Returns an outlier score for each example based on its average distance to its k nearest neighbors. + + The transformation of a distance, :math:`d` , to a score, :math:`o` , is based on the following formula: + + .. math:: + o = \\exp\\left(-dt\\right) + + where :math:`t` scales the distance to a score in the range [0,1]. + + Parameters + ---------- + distances : np.ndarray + An array of distances of shape ``(N, num_neighbors)``, where N is the number of examples. + Each row contains the distances to each example's `num_neighbors` nearest neighbors. + It is assumed that each row is sorted in ascending order. + + k : int + Number of neighbors used to compute the average distance to each example. 
+ This assumes that the second dimension of distances is k or greater, but it + uses slicing to avoid indexing errors. + + t : int + Controls transformation of distances between examples into similarity scores that lie in [0,1]. + + Returns + ------- + ood_features_scores : np.ndarray + An array of outlier scores of shape ``(N,)`` for N examples. + + Examples + -------- + >>> import numpy as np + >>> from cleanlab.outlier import transform_distances_to_scores + >>> distances = np.array([[0.0, 0.1, 0.25], + ... [0.15, 0.2, 0.3]]) + >>> transform_distances_to_scores(distances, k=2, t=1) + array([0.95122942, 0.83945702]) + """ + # Calculate average distance to k-nearest neighbors + avg_knn_distances = distances[:, :k].mean(axis=1) + + # Map ood_features_scores to range 0-1 with 0 = most concerning + ood_features_scores: np.ndarray = np.exp(-1 * avg_knn_distances * t) + return ood_features_scores diff --git a/cleanlab/outlier.py b/cleanlab/outlier.py index 0ac6ce5d4b..2e7950f34a 100644 --- a/cleanlab/outlier.py +++ b/cleanlab/outlier.py @@ -25,11 +25,12 @@ from cleanlab.count import get_confident_thresholds from sklearn.neighbors import NearestNeighbors from sklearn.exceptions import NotFittedError -from typing import Optional, Union, Tuple, Dict +from typing import Optional, Union, Tuple, Dict, cast from cleanlab.internal.label_quality_utils import ( _subtract_confident_thresholds, get_normalized_entropy, ) +from cleanlab.internal.outlier import transform_distances_to_scores from cleanlab.internal.validation import assert_valid_inputs, labels_to_array from cleanlab.typing import LabelLike @@ -446,11 +447,7 @@ def _get_ood_features_scores( # neighbor of each point is the point itself, at a distance of zero. 
distances, _ = knn.kneighbors(features) - # Calculate average distance to k-nearest neighbors - avg_knn_distances = distances[:, :k].mean(axis=1) - - # Map ood_features_scores to range 0-1 with 0 = most concerning - ood_features_scores: np.ndarray = np.exp(-1 * avg_knn_distances * t) + ood_features_scores = transform_distances_to_scores(distances, cast(int, k), t) return (ood_features_scores, knn) diff --git a/docs/source/cleanlab/internal/index.rst b/docs/source/cleanlab/internal/index.rst index 0601499cb4..6642479281 100644 --- a/docs/source/cleanlab/internal/index.rst +++ b/docs/source/cleanlab/internal/index.rst @@ -17,5 +17,6 @@ internal label_quality_utils multilabel_utils multilabel_scorer + outlier token_classification_utils validation diff --git a/docs/source/cleanlab/internal/outlier.rst b/docs/source/cleanlab/internal/outlier.rst new file mode 100644 index 0000000000..26c516758b --- /dev/null +++ b/docs/source/cleanlab/internal/outlier.rst @@ -0,0 +1,8 @@ +outlier +======= + +.. automodule:: cleanlab.internal.outlier + :autosummary: + :members: + :undoc-members: + :show-inheritance: From 2435a5a27fc7f7274b6bf388c184cc874fb5c142 Mon Sep 17 00:00:00 2001 From: Yiran Shi <60683228+Steven-Yiran@users.noreply.github.com> Date: Fri, 7 Apr 2023 20:42:13 -0400 Subject: [PATCH 149/258] Added code block examples for remaining methods in the cleanlab.dataset module (#663) Co-authored-by: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> --- cleanlab/dataset.py | 62 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 59 insertions(+), 3 deletions(-) diff --git a/cleanlab/dataset.py b/cleanlab/dataset.py index 40d0222b91..f13a267817 100644 --- a/cleanlab/dataset.py +++ b/cleanlab/dataset.py @@ -54,6 +54,25 @@ def rank_classes_by_label_quality( Only provide **exactly one of the above input options**, do not provide a combination. 
+ Examples + -------- + >>> from cleanlab.dataset import rank_classes_by_label_quality + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn.model_selection import cross_val_predict + >>> data, labels = get_data_labels_from_dataset(yourFavoriteDataset) + >>> yourFavoriteModel = LogisticRegression() + >>> pred_probs = cross_val_predict( + yourFavoriteModel, + data, + labels, + cv=3, + method="predict_proba", + ) # generate cross-validation estimates for each input data point + >>> df = rank_classes_by_label_quality( + labels=labels, + pred_probs=pred_probs, + ) # report overall label quality scores summarizing the examples annotated as each class + **Parameters**: For parameter info, see the docstring of :py:func:`find_overlapping_classes `. Returns @@ -146,14 +165,13 @@ def find_overlapping_classes( >>> from sklearn.model_selection import cross_val_predict >>> data, labels = get_data_labels_from_dataset(yourFavoriteDataset) >>> yourFavoriteModel = LogisticRegression() - >>> num_crossval_folds = 3 >>> pred_probs = cross_val_predict( yourFavoriteModel, data, labels, - cv=num_crossval_folds, + cv=3, method="predict_proba", - ) + ) # generate cross-validation estimates for each input data point >>> df = find_overlapping_classes( labels=labels, pred_probs=pred_probs, @@ -317,6 +335,25 @@ def overall_label_health_score( Only provide **exactly one of the above input options**, do not provide a combination. 
+ Examples + -------- + >>> from cleanlab.dataset import overall_label_health_score + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn.model_selection import cross_val_predict + >>> data, labels = get_data_labels_from_dataset(yourFavoriteDataset) + >>> yourFavoriteModel = LogisticRegression() + >>> pred_probs = cross_val_predict( + yourFavoriteModel, + data, + labels, + cv=3, + method="predict_proba", + ) # generate cross-validation estimates for each input data point + >>> score = overall_label_health_score( + labels=labels, + pred_probs=pred_probs, + ) # a score measuring the overall quality of all labels in a dataset. + **Parameters**: For parameter info, see the docstring of :py:func:`find_overlapping_classes `. Returns @@ -372,6 +409,25 @@ def health_summary( Only provide **exactly one of the above input options**, do not provide a combination. + Examples + -------- + >>> from cleanlab.dataset import health_summary + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn.model_selection import cross_val_predict + >>> data, labels = get_data_labels_from_dataset(yourFavoriteDataset) + >>> yourFavoriteModel = LogisticRegression() + >>> pred_probs = cross_val_predict( + yourFavoriteModel, + data, + labels, + cv=3, + method="predict_proba", + ) # generate cross-validation estimates for each input data point + >>> summary = health_summary( + labels=labels, + pred_probs=pred_probs, + ) # dictionary summarizing the overall label quality of the classes in your dataset + **Parameters**: For parameter info, see the docstring of :py:func:`find_overlapping_classes `. 
Returns From 1e653def930889ce36f7b154506df62dd02873d4 Mon Sep 17 00:00:00 2001 From: Hui Wen <45724323+huiwengoh@users.noreply.github.com> Date: Tue, 11 Apr 2023 16:05:02 -0400 Subject: [PATCH 150/258] remove min batch size restriction in LabelInspector (#665) --- cleanlab/experimental/label_issues_batched.py | 2 -- tests/test_filter_count.py | 6 ++++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/cleanlab/experimental/label_issues_batched.py b/cleanlab/experimental/label_issues_batched.py index 5cd395837f..ef11f91b65 100644 --- a/cleanlab/experimental/label_issues_batched.py +++ b/cleanlab/experimental/label_issues_batched.py @@ -744,8 +744,6 @@ def _batch_check(labels: LabelLike, pred_probs: np.ndarray, num_class: int) -> n """ batch_size = pred_probs.shape[0] labels = np.asarray(labels) - if batch_size < 10: - raise ValueError("Please run this with batches containing at least 10 examples.") if len(labels) != batch_size: raise ValueError("labels and pred_probs must have same length") if pred_probs.shape[1] != num_class: diff --git a/tests/test_filter_count.py b/tests/test_filter_count.py index 26c82c6214..e2cbf07bdd 100644 --- a/tests/test_filter_count.py +++ b/tests/test_filter_count.py @@ -921,12 +921,18 @@ def test_batched_label_issues(): batch_size=len(data["labels"]) + 100, n_jobs=4, ) + f5 = find_label_issues_batched( + labels=data["labels"], + pred_probs=data["pred_probs"], + batch_size=1, + ) f_single = find_label_issues_batched( labels=data["labels"], pred_probs=data["pred_probs"], batch_size=len(data["labels"]), n_jobs=1, ) + assert np.all(f4 == f5) assert np.all(f4 == f3) assert np.all(f4 == f2) assert np.all(f_single == f4) From f8c1866c58597d752d57f0b770a485dfc2023ef2 Mon Sep 17 00:00:00 2001 From: Aditya Thyagarajan Date: Fri, 14 Apr 2023 10:20:11 +0530 Subject: [PATCH 151/258] move methods to multilabel_classification module (#657) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit adds new 
dataset summarization methods for multilabel settings --------- Co-authored-by: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> Co-authored-by: Elías Snorrason --- cleanlab/dataset.py | 47 +-- cleanlab/filter.py | 164 +-------- cleanlab/internal/multilabel_scorer.py | 2 +- cleanlab/multilabel_classification.py | 123 ------- .../multilabel_classification/__init__.py | 4 + cleanlab/multilabel_classification/dataset.py | 317 ++++++++++++++++++ cleanlab/multilabel_classification/filter.py | 258 ++++++++++++++ cleanlab/multilabel_classification/rank.py | 191 +++++++++++ .../tutorials/multilabel_classification.ipynb | 35 +- setup.cfg | 1 + tests/test_dataset.py | 10 + tests/test_filter_count.py | 38 ++- tests/test_multilabel_classification.py | 219 +++++++++++- 13 files changed, 1074 insertions(+), 335 deletions(-) delete mode 100644 cleanlab/multilabel_classification.py create mode 100644 cleanlab/multilabel_classification/__init__.py create mode 100644 cleanlab/multilabel_classification/dataset.py create mode 100644 cleanlab/multilabel_classification/filter.py create mode 100644 cleanlab/multilabel_classification/rank.py diff --git a/cleanlab/dataset.py b/cleanlab/dataset.py index f13a267817..cefe4b33e4 100644 --- a/cleanlab/dataset.py +++ b/cleanlab/dataset.py @@ -54,6 +54,9 @@ def rank_classes_by_label_quality( Only provide **exactly one of the above input options**, do not provide a combination. + **Parameters**: For information about the arguments to this method, see the documentation of + :py:func:`find_overlapping_classes `. + Examples -------- >>> from cleanlab.dataset import rank_classes_by_label_quality @@ -73,7 +76,6 @@ def rank_classes_by_label_quality( pred_probs=pred_probs, ) # report overall label quality scores summarizing the examples annotated as each class - **Parameters**: For parameter info, see the docstring of :py:func:`find_overlapping_classes `. 
Returns ------- @@ -95,13 +97,16 @@ def rank_classes_by_label_quality( By default, the DataFrame is ordered by "Label Quality Score", ascending. """ + if multi_label: + raise ValueError( + "For multilabel data, please instead call: multilabel_classification.dataset.overall_multilabel_health_score()" + ) if joint is None: joint = estimate_joint( labels=labels, pred_probs=pred_probs, confident_joint=confident_joint, - multi_label=multi_label, ) if num_examples is None: num_examples = _get_num_examples(labels=labels) @@ -242,12 +247,6 @@ class 0, 1, ..., K-1. `pred_probs` should have been computed using 3 (or The `confident_joint` can be computed using :py:func:`count.compute_confident_joint `. If not provided, it is computed from the given (noisy) `labels` and `pred_probs`. - multi_label : bool, optional - If ``True``, labels should be an iterable (e.g. list) of iterables, containing a - list of labels for each example, instead of just a single label. - The multi-label setting supports classification tasks where an example has 1 or more labels. - Example of a multi-labeled `labels` input: ``[[0,1], [1], [0,2], [0,1,2], [0], [1], ...]``. - Returns ------- overlapping_classes : pd.DataFrame @@ -279,12 +278,16 @@ def _2d_matrix_to_row_column_value_list(matrix): return [(*i, v) for i, v in np.ndenumerate(matrix)] + if multi_label: + raise ValueError( + "For multilabel data, please instead call: multilabel_classification.dataset.common_multilabel_issues()" + ) + if joint is None: joint = estimate_joint( labels=labels, pred_probs=pred_probs, confident_joint=confident_joint, - multi_label=multi_label, ) if num_examples is None: num_examples = _get_num_examples(labels=labels, confident_joint=confident_joint) @@ -335,6 +338,9 @@ def overall_label_health_score( Only provide **exactly one of the above input options**, do not provide a combination. 
+ **Parameters**: For information about the arguments to this method, see the documentation of + :py:func:`find_overlapping_classes `. + Examples -------- >>> from cleanlab.dataset import overall_label_health_score @@ -354,21 +360,22 @@ def overall_label_health_score( pred_probs=pred_probs, ) # a score measuring the overall quality of all labels in a dataset. - **Parameters**: For parameter info, see the docstring of :py:func:`find_overlapping_classes `. - Returns ------- health_score : float A score between 0 and 1, where 1 implies all labels in the dataset are estimated to be correct. A score of 0.5 implies that half of the dataset's labels are estimated to have issues. """ + if multi_label: + raise ValueError( + "For multilabel data, please instead call: multilabel_classification.dataset.overall_multilabel_health_score()" + ) if joint is None: joint = estimate_joint( labels=labels, pred_probs=pred_probs, confident_joint=confident_joint, - multi_label=multi_label, ) if num_examples is None: num_examples = _get_num_examples(labels=labels) @@ -409,6 +416,9 @@ def health_summary( Only provide **exactly one of the above input options**, do not provide a combination. + **Parameters**: For information about the arguments to this method, see the documentation of + :py:func:`find_overlapping_classes `. + Examples -------- >>> from cleanlab.dataset import health_summary @@ -428,7 +438,6 @@ def health_summary( pred_probs=pred_probs, ) # dictionary summarizing the overall label quality of the classes in your dataset - **Parameters**: For parameter info, see the docstring of :py:func:`find_overlapping_classes `. 
Returns ------- @@ -442,12 +451,15 @@ def health_summary( """ from cleanlab.internal.util import smart_display_dataframe + if multi_label: + raise ValueError( + "For multilabel data, please call multilabel_classification.dataset.health_summary" + ) if joint is None: joint = estimate_joint( labels=labels, pred_probs=pred_probs, confident_joint=confident_joint, - multi_label=multi_label, ) if num_examples is None: num_examples = _get_num_examples(labels=labels) @@ -474,7 +486,6 @@ def health_summary( num_examples=num_examples, joint=joint, confident_joint=confident_joint, - multi_label=multi_label, ) if verbose: print("Overall Class Quality and Noise across your dataset (below)") @@ -489,7 +500,6 @@ def health_summary( num_examples=num_examples, joint=joint, confident_joint=confident_joint, - multi_label=multi_label, ) if verbose: print( @@ -508,7 +518,6 @@ def health_summary( num_examples=num_examples, joint=joint, confident_joint=confident_joint, - multi_label=multi_label, verbose=verbose, ) if verbose: @@ -525,9 +534,7 @@ def _get_num_examples(labels=None, confident_joint: Optional[np.ndarray] = None) """Helper method that finds the number of examples from the parameters or throws an error if neither parameter is provided. - Parameters - ---------- - For parameter info, see the docstring of `dataset.find_overlapping_classes` + **Parameters:** For information about the arguments to this method, see the documentation of `dataset.find_overlapping_classes` Returns ------- diff --git a/cleanlab/filter.py b/cleanlab/filter.py index e6f2cc803d..70b2d8d5da 100644 --- a/cleanlab/filter.py +++ b/cleanlab/filter.py @@ -17,10 +17,9 @@ """ Methods to identify which examples have label issues in a classification dataset. The documentation below assumes a dataset with ``N`` examples and ``K`` classes. -This module considers two types of datasets: - -* standard (multi-class) classification where each example is labeled as belonging to exactly one of K classes (e.g. 
``labels = np.array([0,0,1,0,2,1])``) -* multi-label classification where each example can be labeled as belonging to multiple classes (e.g. ``labels = [[1,2],[1],[0],[],...]``) +This module is for standard (multi-class) classification where each example is labeled as belonging to exactly one of K classes (e.g. ``labels = np.array([0,0,1,0,2,1])``). +Some methods here also work for multi-label classification data where each example can be labeled as belonging to multiple classes (e.g. ``labels = [[1,2],[1],[0],[],...]``), +but we encourage using the methods in the ``cleanlab.multilabel_classification`` module instead for such data. """ import numpy as np @@ -28,7 +27,7 @@ import multiprocessing import sys import warnings -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, Dict, Optional, Tuple, List from functools import reduce import platform @@ -43,6 +42,7 @@ ) from cleanlab.internal.multilabel_utils import stack_complement, get_onehot_num_classes, int2onehot from cleanlab.typing import LabelLike +from cleanlab.multilabel_classification.filter import find_multilabel_issues_per_class # tqdm is a package to print time-to-complete when multiprocessing is used. # This package is not necessary, but when installed improves user experience for large datasets. @@ -77,13 +77,13 @@ def find_label_issues( return_indices_ranked_by: Optional[str] = None, rank_by_kwargs: Optional[Dict[str, Any]] = None, filter_by: str = "prune_by_noise_rate", - multi_label: bool = False, frac_noise: float = 1.0, - num_to_remove_per_class: Optional[int] = None, + num_to_remove_per_class: Optional[List[int]] = None, min_examples_per_class=1, confident_joint: Optional[np.ndarray] = None, n_jobs: Optional[int] = None, verbose: bool = False, + multi_label: bool = False, ) -> np.ndarray: """ Identifies potentially bad labels in a classification dataset using confident learning. 
@@ -109,8 +109,6 @@ def find_label_issues( *Format requirements*: for dataset with K classes, each label must be integer in 0, 1, ..., K-1. For a standard (multi-class) classification dataset where each example is labeled with one class, `labels` should be 1D array of shape ``(N,)``, for example: ``labels = [1,0,2,1,1,0...]``. - For a multi-label classification dataset where each example can belong to multiple (or no) classes, - `labels` should be an iterable of iterables (e.g. ``List[List[int]]``) whose i-th element corresponds to list of classes that i-th example belongs to (e.g. ``labels = [[1,2],[1],[0],[],...]``). pred_probs : np.ndarray, optional An array of shape ``(N, K)`` of model-predicted class probabilities, @@ -150,13 +148,6 @@ class 0, 1, ..., K-1. - ``'low_normalized_margin'``: filters the examples with *smallest* normalized margin label quality score. The number of issues returned matches :py:func:`count.num_label_issues `. - ``'low_self_confidence'``: filters the examples with *smallest* self confidence label quality score. The number of issues returned matches :py:func:`count.num_label_issues `. - multi_label : bool, optional - If ``True``, labels should be an iterable (e.g. list) of iterables, containing a - list of class labels for each example, instead of just a single label. - The multi-label setting supports classification tasks where an example can belong to more than 1 class or none of the classes (rather than exactly one class as in standard multi-class classification). - Example of a multi-labeled `labels` input: ``[[0,1], [1], [0,2], [0,1,2], [0], [1], [], ...]``. This says the first example in dataset belongs to both class 0 and class 1, according to its given label. - Each row of `pred_probs` no longer needs to sum to 1 in multi-label settings, since one example can now belong to multiple classes simultaneously. - frac_noise : float, default=1.0 Used to only return the "top" ``frac_noise * num_label_issues``. 
The choice of which "top" label issues to return is dependent on the `filter_by` method used. It works by reducing the @@ -167,7 +158,6 @@ class 0, 1, ..., K-1. When ``frac_noise=1.0``, return all "confident" estimated noise indices (recommended). frac_noise * number_of_mislabeled_examples_in_class_k. - Note: specifying `frac_noise` is not yet supported if `multi_label` is True. num_to_remove_per_class : array_like An iterable of length K, the number of classes. @@ -195,7 +185,6 @@ class 0, 1, ..., K-1. Entry ``(j, k)`` in the matrix is the number of examples confidently counted into the pair of ``(noisy label=j, true label=k)`` classes. The `confident_joint` can be computed using :py:func:`count.compute_confident_joint `. If not provided, it is computed from the given (noisy) `labels` and `pred_probs`. - If `multi_label` is True, `confident_joint` should instead be a one-vs-rest array with shape ``(K, 2, 2)`` as returned by :py:func:`count.compute_confident_joint ` function. n_jobs : optional Number of processing threads used by multiprocessing. Default ``None`` @@ -312,7 +301,10 @@ class 0, 1, ..., K-1. if multi_label: if not isinstance(labels, list): raise TypeError("`labels` must be list when `multi_label=True`.") - + warnings.warn( + "The multi_label argument to filter.find_label_issues() is deprecated and will be removed in future versions. 
Please use `multilabel_classification.filter.find_label_issues()` instead.", + DeprecationWarning, + ) return _find_label_issues_multilabel( labels, pred_probs, @@ -482,7 +474,7 @@ def _find_label_issues_multilabel( rank_by_kwargs={}, filter_by: str = "prune_by_noise_rate", frac_noise: float = 1.0, - num_to_remove_per_class: Optional[int] = None, + num_to_remove_per_class: Optional[List[int]] = None, min_examples_per_class=1, confident_joint: Optional[np.ndarray] = None, n_jobs: Optional[int] = None, @@ -493,7 +485,6 @@ def _find_label_issues_multilabel( This is done via a one-vs-rest reduction for each class and the results are subsequently aggregated across all classes. Here `labels` must be formatted as an iterable of iterables, e.g. ``List[List[int]]``. """ - if filter_by in ["low_normalized_margin", "low_self_confidence"]: num_errors = sum( find_label_issues( @@ -529,7 +520,7 @@ def _find_label_issues_multilabel( return label_issues_mask - per_class_issues = _find_multilabel_issues_per_class( + per_class_issues = find_multilabel_issues_per_class( labels, pred_probs, return_indices_ranked_by, @@ -561,132 +552,6 @@ def _find_label_issues_multilabel( return label_issues_idx[np.argsort(label_quality_scores_issues)] -def _find_multilabel_issues_per_class( - labels: list, - pred_probs: np.ndarray, - return_indices_ranked_by: Optional[str] = None, - rank_by_kwargs={}, - filter_by: str = "prune_by_noise_rate", - frac_noise: float = 1.0, - num_to_remove_per_class: Optional[int] = None, - min_examples_per_class=1, - confident_joint: Optional[np.ndarray] = None, - n_jobs: Optional[int] = None, - verbose: bool = False, -) -> Union[np.ndarray, Tuple[List[np.ndarray], List[Any], List[np.ndarray]]]: - """ - Parameters - ---------- - labels : List[List[int]] - List of noisy labels for multi-label classification where each example can belong to multiple classes (e.g. ``labels = [[1,2],[1],[0],[],...]`` indicates the first example in dataset belongs to both class 1 and class 2. 
- - - pred_probs : np.ndarray - An array of shape ``(N, K)`` of model-predicted probabilities, - ``P(label=k|x)``. Each row of this matrix corresponds - to an example `x` and contains the model-predicted probabilities that - `x` belongs to each possible class, for each of the K classes. The - columns must be ordered such that these probabilities correspond to - class 0, 1, ..., K-1. They need not sum to 1.0 - - - return_indices_ranked_by : {None, 'self_confidence', 'normalized_margin', 'confidence_weighted_entropy'}, default=None - Refer to documentation for this argument in filter.find_label_issues() for details. - - rank_by_kwargs : dict, optional - Refer to documentation for this argument in filter.find_label_issues() for details. - - filter_by : {'prune_by_class', 'prune_by_noise_rate', 'both', 'confident_learning', 'predicted_neq_given', - 'low_normalized_margin', 'low_self_confidence'}, default='prune_by_noise_rate' - Refer to documentation for this argument in filter.find_label_issues() for details. - - frac_noise : float, default=1.0 - Refer to documentation for this argument in filter.find_label_issues() for details. - - num_to_remove_per_class : array_like - Refer to documentation for this argument in filter.find_label_issues() for details. - - min_examples_per_class : int, default=1 - Refer to documentation for this argument in filter.find_label_issues() for details. - - confident_joint : np.ndarray, optional - An array of shape ``(K, 2, 2)`` representing a one-vs-rest formatted confident joint. - Entry ``(c, i, j)`` in this array is the number of examples confidently counted into a ``(class c, noisy label=i, true label=j)`` bin, - where `i, j` are either 0 or 1 to denote whether this example belongs to class `c` or not - (recall examples can belong to multiple classes in multi-label classification). - The `confident_joint` can be computed using :py:func:`count.compute_confident_joint `. 
- If not provided, it is computed from the given (noisy) `labels` and `pred_probs`. - - n_jobs : optional - Refer to documentation for this argument in filter.find_label_issues() for details. - - verbose : optional - If ``True``, prints when multiprocessing happens. - - Returns - ------- - per_class_label_issues : list(np.ndarray) - If `return_indices_ranked_by` left unspecified, returns a list of boolean **masks** for the entire dataset - where ``True`` represents a label issue and ``False`` represents an example that is - accurately labeled with high confidence. - If `return_indices_ranked_by` is specified, returns a list of shorter arrays of **indices** of examples identified to have - label issues (i.e. those indices where the mask would be ``True``), sorting by likelihood that the corresponding label is correct is not supported yet. - - Note - ---- - Obtain the *indices* of label issues in your dataset by setting - `return_indices_ranked_by`. - - """ - y_one, num_classes = get_onehot_num_classes(labels, pred_probs) - if return_indices_ranked_by is None: - bissues = np.zeros(y_one.shape).astype(bool) - else: - label_issues_list = [] - labels_list = [] - pred_probs_list = [] - if confident_joint is not None: - confident_joint_shape = confident_joint.shape - if confident_joint_shape == (num_classes, num_classes): - warnings.warn( - f"The new recommended format for `confident_joint` in multi_label settings is (num_classes,2,2) as output by compute_confident_joint(...,multi_label=True). Your K x K confident_joint in the old format is being ignored." 
- ) - confident_joint = None - elif confident_joint_shape != (num_classes, 2, 2): - raise ValueError("confident_joint should be of shape (num_classes, 2, 2)") - for class_num, (label, pred_prob_for_class) in enumerate(zip(y_one.T, pred_probs.T)): - pred_probs_binary = stack_complement(pred_prob_for_class) - if confident_joint is None: - conf = None - else: - conf = confident_joint[class_num] - binary_label_issues = find_label_issues( - labels=label, - pred_probs=pred_probs_binary, - return_indices_ranked_by=return_indices_ranked_by, - frac_noise=frac_noise, - rank_by_kwargs=rank_by_kwargs, - filter_by=filter_by, - multi_label=False, - num_to_remove_per_class=num_to_remove_per_class, - min_examples_per_class=min_examples_per_class, - confident_joint=conf, - n_jobs=n_jobs, - verbose=verbose, - ) - - if return_indices_ranked_by is None: - bissues[:, class_num] = binary_label_issues - else: - label_issues_list.append(binary_label_issues) - labels_list.append(label) - pred_probs_list.append(pred_probs_binary) - if return_indices_ranked_by is None: - return bissues - else: - return label_issues_list, labels_list, pred_probs_list - - def _keep_at_least_n_per_class( prune_count_matrix: np.ndarray, n: int, *, frac_noise: float = 1.0 ) -> np.ndarray: @@ -897,9 +762,6 @@ class 0, 1, ..., K-1. `pred_probs` should have been computed using 3 (or label issue and ``False`` represents an example that is accurately labeled with high confidence. - Note - ---- - Multi-label classification is not supported in this method. 
""" assert_valid_inputs(X=None, y=labels, pred_probs=pred_probs, multi_label=False) diff --git a/cleanlab/internal/multilabel_scorer.py b/cleanlab/internal/multilabel_scorer.py index dd58312a2f..4ad836d283 100644 --- a/cleanlab/internal/multilabel_scorer.py +++ b/cleanlab/internal/multilabel_scorer.py @@ -488,7 +488,7 @@ def get_class_label_quality_scores( >>> labels = np.array([[0, 1, 0], [1, 0, 1]]) >>> pred_probs = np.array([[0.1, 0.9, 0.7], [0.4, 0.1, 0.6]]) >>> scorer = MultilabelScorer() # Use the default base scorer (SELF_CONFIDENCE) - >>> class_label_quality_scores = scorer.get_class_label_quality_scores(labels, pred_probs) + >>> class_label_quality_scores = scorer.get_label_quality_scores_per_class(labels, pred_probs) >>> class_label_quality_scores array([[0.9, 0.9, 0.3], [0.4, 0.9, 0.6]]) diff --git a/cleanlab/multilabel_classification.py b/cleanlab/multilabel_classification.py deleted file mode 100644 index b49b6203da..0000000000 --- a/cleanlab/multilabel_classification.py +++ /dev/null @@ -1,123 +0,0 @@ -# Copyright (C) 2017-2023 Cleanlab Inc. -# This file is part of cleanlab. -# -# cleanlab is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as published -# by the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# cleanlab is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with cleanlab. If not, see . - -""" -Methods to rank the severity of label issues in multi-label classification datasets. -Here each example can belong to one or more classes, or none of the classes at all. 
-Unlike in standard multi-class classification, predicted class probabilities from model need not sum to 1 for each row in multi-label classification. -""" - -import numpy as np # noqa: F401: Imported for type annotations -import numpy.typing as npt -from typing import List, TypeVar, Dict, Any - -from cleanlab.internal.validation import assert_valid_inputs -from cleanlab.internal.util import get_num_classes -from cleanlab.internal.multilabel_scorer import MultilabelScorer, ClassLabelScorer, Aggregator -from cleanlab.internal.multilabel_utils import int2onehot - - -T = TypeVar("T", bound=npt.NBitBase) - - -def get_label_quality_scores( - labels: List[List[int]], - pred_probs: npt.NDArray["np.floating[T]"], - *, - method: str = "self_confidence", - adjust_pred_probs: bool = False, - aggregator_kwargs: Dict[str, Any] = {"method": "exponential_moving_average", "alpha": 0.8} -) -> npt.NDArray["np.floating[T]"]: - """Computes a label quality score each example in a multi-label classification dataset. - - Scores are between 0 and 1 with lower scores indicating examples whose label more likely contains an error. - For each example, this method internally computes a separate score for each individual class - and then aggregates these per-class scores into an overall label quality score for the example. - - To estimate exactly which examples are mislabeled in a multi-label classification dataset, - you can also use :py:func:`filter.find_label_issues ` with argument ``multi_label=True``. - - Parameters - ---------- - labels : List[List[int]] - Multi-label classification labels for each example, which is allowed to belong to multiple classes. - The i-th element of `labels` corresponds to list of classes that i-th example belongs to (e.g. ``labels = [[1,2],[1],[0],..]``). - - Important - --------- - *Format requirements*: For dataset with K classes, individual class labels must be integers in 0, 1, ..., K-1. 
- - pred_probs : np.ndarray - A 2D array of shape ``(N, K)`` of model-predicted class probabilities ``P(label=k|x)``. - Each row of this matrix corresponds to an example `x` and contains the predicted probabilities - that `x` belongs to each possible class, for each of the K classes. - The columns of this array must be ordered such that these probabilities correspond to class 0, 1, ..., K-1. - In multi-label classification (where classes are not mutually exclusive), the rows of `pred_probs` need not sum to 1. - - Note - ---- - Estimated label quality scores are most accurate when they are computed based on out-of-sample ``pred_probs`` from your model. - To obtain out-of-sample predicted probabilities for every example in your dataset, you can use :ref:`cross-validation `. - This is encouraged to get better results. - - method : {"self_confidence", "normalized_margin", "confidence_weighted_entropy"}, default = "self_confidence" - Method to calculate separate per class annotation scores that are subsequently aggregated to form an overall label quality score. - These scores are separately calculated for each class based on the corresponding column of `pred_probs` in a one-vs-rest manner, - and are standard label quality scores for multi-class classification. - - See also - -------- - :py:func:`rank.get_label_quality_scores ` function for details about each option. - - adjust_pred_probs : bool, default = False - Account for class imbalance in the label-quality scoring by adjusting predicted probabilities - via subtraction of class confident thresholds and renormalization. - Set this to ``True`` if you prefer to account for class-imbalance. - See `Northcutt et al., 2021 `_. - - aggregator_kwargs : dict, default = {"method": "exponential_moving_average", "alpha": 0.8} - A dictionary of hyperparameter values for aggregating per class scores into an overall label quality score for each example. 
- Options for ``"method"`` include: ``"exponential_moving_average"`` or ``"softmin"`` or your own callable function. - See :py:class:`internal.multilabel_scorer.Aggregator ` for details about each option and other possible hyperparameters. - - Returns - ------- - label_quality_scores : np.ndarray - A 1D array of shape ``(N,)`` with a label quality score (between 0 and 1) for each example in the dataset. - Lower scores indicate examples whose label is more likely to contain annotation errors. - - - Examples - -------- - >>> from cleanlab.multilabel_classification import get_label_quality_scores - >>> import numpy as np - >>> labels = [[1], [0,2]] - >>> pred_probs = np.array([[0.1, 0.9, 0.1], [0.4, 0.1, 0.9]]) - >>> scores = get_label_quality_scores(labels, pred_probs) - >>> scores - array([0.9, 0.5]) - """ - - assert_valid_inputs( - X=None, y=labels, pred_probs=pred_probs, multi_label=True, allow_one_class=True - ) - num_classes = get_num_classes(labels=labels, pred_probs=pred_probs, multi_label=True) - binary_labels = int2onehot(labels, K=num_classes) - base_scorer = ClassLabelScorer.from_str(method) - base_scorer_kwargs = {"adjust_pred_probs": adjust_pred_probs} - aggregator = Aggregator(**aggregator_kwargs) - scorer = MultilabelScorer(base_scorer, aggregator) - return scorer(binary_labels, pred_probs, base_scorer_kwargs=base_scorer_kwargs) diff --git a/cleanlab/multilabel_classification/__init__.py b/cleanlab/multilabel_classification/__init__.py new file mode 100644 index 0000000000..fcddecdb5d --- /dev/null +++ b/cleanlab/multilabel_classification/__init__.py @@ -0,0 +1,4 @@ +from .rank import get_label_quality_scores +from . import rank +from . import dataset +from . 
import filter diff --git a/cleanlab/multilabel_classification/dataset.py b/cleanlab/multilabel_classification/dataset.py new file mode 100644 index 0000000000..e3987a5ece --- /dev/null +++ b/cleanlab/multilabel_classification/dataset.py @@ -0,0 +1,317 @@ +import pandas as pd +import numpy as np +from typing import Optional, cast, Dict, Any # noqa: F401 +from cleanlab.multilabel_classification.filter import ( + find_multilabel_issues_per_class, + find_label_issues, +) +from cleanlab.internal.multilabel_utils import get_onehot_num_classes +from collections import defaultdict + + +def common_multilabel_issues( + labels=list, + pred_probs=None, + *, + class_names=None, + confident_joint=None, +) -> pd.DataFrame: + """Summarizes which classes in a multi-label dataset appear most often mislabeled overall. + + Since classes are not mutually exclusive in multi-label classification, this method summarizes the label issues for each class independently of the others. + + Parameters + ---------- + labels : List[List[int]] + List of noisy labels for multi-label classification where each example can belong to multiple classes. + Refer to documentation for this argument in :py:func:`multilabel_classification.filter.find_label_issues ` for further details. + + pred_probs : np.ndarray + An array of shape ``(N, K)`` of model-predicted class probabilities. + Refer to documentation for this argument in :py:func:`multilabel_classification.filter.find_label_issues ` for further details. + + class_names : Iterable[str], optional + A list or other iterable of the string class names. Its order must match the label indices. + If class 0 is 'dog' and class 1 is 'cat', then ``class_names = ['dog', 'cat']``. + If provided, the returned DataFrame will have an extra *Class Name* column with this info. + + confident_joint : np.ndarray, optional + An array of shape ``(K, 2, 2)`` representing a one-vs-rest formatted confident joint. 
+ Refer to documentation for this argument in :py:func:`multilabel_classification.filter.find_label_issues ` for details. + + Returns + ------- + common_multilabel_issues : pd.DataFrame + DataFrame where each row corresponds to a class summarized by the following columns: + + * *Class Name*: The name of the class if class_names is provided. + * *Class Index*: The index of the class. + * *In Given Label*: Whether the Class is originally annotated True or False in the given label. + * *In Suggested Label*: Whether the Class should be True or False in the suggested label (based on model's prediction). + * *Num Examples*: Number of examples flagged as a label issue where this Class is True/False "In Given Label" but cleanlab estimates the annotation should actually be as specified "In Suggested Label". I.e. the number of examples in your dataset where this Class was labeled as True but likely should have been False (or vice versa). + * *Issue Probability*: The *Num Examples* column divided by the total number of examples in the dataset; i.e. the relative overall frequency of each type of label issue in your dataset. + + By default, the rows in this DataFrame are ordered by "Issue Probability" (descending). 
+ """ + + num_examples = _get_num_examples_multilabel(labels=labels, confident_joint=confident_joint) + summary_issue_counts = defaultdict(list) + y_one, num_classes = get_onehot_num_classes(labels, pred_probs) + label_issues_list, labels_list, pred_probs_list = find_multilabel_issues_per_class( + labels=labels, + pred_probs=pred_probs, + confident_joint=confident_joint, + return_indices_ranked_by="self_confidence", + ) + + for class_num, (label, issues_for_class) in enumerate(zip(y_one.T, label_issues_list)): + binary_label_issues = np.zeros(len(label)).astype(bool) + binary_label_issues[issues_for_class] = True + true_but_false_count = sum(np.logical_and(label == 1, binary_label_issues)) + false_but_true_count = sum(np.logical_and(label == 0, binary_label_issues)) + + if class_names is not None: + summary_issue_counts["Class Name"].append(class_names[class_num]) + summary_issue_counts["Class Index"].append(class_num) + summary_issue_counts["In Given Label"].append(True) + summary_issue_counts["In Suggested Label"].append(False) + summary_issue_counts["Num Examples"].append(true_but_false_count) + summary_issue_counts["Issue Probability"].append(true_but_false_count / num_examples) + + if class_names is not None: + summary_issue_counts["Class Name"].append(class_names[class_num]) + summary_issue_counts["Class Index"].append(class_num) + summary_issue_counts["In Given Label"].append(False) + summary_issue_counts["In Suggested Label"].append(True) + summary_issue_counts["Num Examples"].append(false_but_true_count) + summary_issue_counts["Issue Probability"].append(false_but_true_count / num_examples) + return ( + pd.DataFrame.from_dict(summary_issue_counts) + .sort_values(by=["Issue Probability"], ascending=False) + .reset_index(drop=True) + ) + + +def rank_classes_by_multilabel_quality( + labels=None, + pred_probs=None, + *, + class_names=None, + joint=None, + confident_joint=None, +) -> pd.DataFrame: + """ + Returns a DataFrame with three overall label quality 
scores per class for a multi-label dataset. + + These numbers summarize all examples annotated with the class (details listed below under the Returns parameter). + By default, classes are ordered by "Label Quality Score", so the most problematic classes are reported first in the DataFrame. + + Score values are unnormalized and may be very small. What matters is their relative ranking across the classes. + + **Parameters**: For information about the arguments to this method, see the documentation of + :py:func:`common_multilabel_issues `. + + Returns + ------- + overall_label_quality : pd.DataFrame + Pandas DataFrame with one row per class and columns: "Class Index", "Label Issues", + "Inverse Label Issues", "Label Issues", "Inverse Label Noise", "Label Quality Score". + Some entries are overall quality scores between 0 and 1, summarizing how good overall the labels + appear to be for that class (lower values indicate more erroneous labels). + Other entries are estimated counts of annotation errors related to this class. + Here is what each column represents: + + * *Class Name*: The name of the class if class_names is provided. + * *Class Index*: The index of the class in 0, 1, ..., K-1. + * *Label Issues*: Estimated number of examples in the dataset that are labeled as belonging to class k but actually should not belong to this class. + * *Inverse Label Issues*: Estimated number of examples in the dataset that should actually be labeled as class k but did not receive this label. + * *Label Noise*: Estimated proportion of examples in the dataset that are labeled as class k but should not be. For each class k: this is computed by dividing the number of examples with "Label Issues" that were labeled as class k by the total number of examples labeled as class k. + * *Inverse Label Noise*: Estimated proportion of examples in the dataset that should actually be labeled as class k but did not receive this label. 
+ * *Label Quality Score*: Estimated proportion of examples labeled as class k that have been labeled correctly, i.e. ``1 - label_noise``. + + By default, the DataFrame is ordered by "Label Quality Score" (in ascending order), so the classes with the most label issues appear first. + """ + + issues_df = common_multilabel_issues( + labels=labels, pred_probs=pred_probs, class_names=class_names, confident_joint=joint + ) + issues_dict = defaultdict(defaultdict) # type: Dict[str, Any] + num_examples = _get_num_examples_multilabel(labels=labels, confident_joint=confident_joint) + return_columns = [ + "Class Name", + "Class Index", + "Label Issues", + "Inverse Label Issues", + "Label Noise", + "Inverse Label Noise", + "Label Quality Score", + ] + if class_names is None: + return_columns = return_columns[1:] + for class_num, row in issues_df.iterrows(): + if row["In Given Label"]: + if class_names is not None: + issues_dict[row["Class Index"]]["Class Name"] = row["Class Name"] + issues_dict[row["Class Index"]]["Label Issues"] = int( + row["Issue Probability"] * num_examples + ) + issues_dict[row["Class Index"]]["Label Noise"] = row["Issue Probability"] + issues_dict[row["Class Index"]]["Label Quality Score"] = ( + 1 - issues_dict[row["Class Index"]]["Label Noise"] + ) + else: + if class_names is not None: + issues_dict[row["Class Index"]]["Class Name"] = row["Class Name"] + issues_dict[row["Class Index"]]["Inverse Label Issues"] = int( + row["Issue Probability"] * num_examples + ) + issues_dict[row["Class Index"]]["Inverse Label Noise"] = row["Issue Probability"] + + issues_df_dict = defaultdict(list) + for i in issues_dict: + issues_df_dict["Class Index"].append(i) + for j in issues_dict[i]: + issues_df_dict[j].append(issues_dict[i][j]) + return ( + pd.DataFrame.from_dict(issues_df_dict) + .sort_values(by="Label Quality Score", ascending=True) + .reset_index(drop=True) + )[return_columns] + + +def _get_num_examples_multilabel(labels=None, confident_joint: 
Optional[np.ndarray] = None) -> int: + """Helper method that finds the number of examples from the parameters or throws an error + if neither parameter is provided. + + Parameters + ---------- + For parameter info, see the docstring of :py:func:`common_multilabel_issues ` + + Returns + ------- + num_examples : int + The number of examples in the dataset. + + Raises + ------ + ValueError + If `labels` is None.""" + + if labels is None and confident_joint is None: + raise ValueError( + "Error: num_examples is None. You must either provide confident_joint, " + "or provide both num_example and joint as input parameters." + ) + _confident_joint = cast(np.ndarray, confident_joint) + num_examples = len(labels) if labels is not None else cast(int, np.sum(_confident_joint[0])) + return num_examples + + +def overall_multilabel_health_score( + labels=None, + pred_probs=None, + *, + confident_joint=None, +) -> float: + """Returns a single score between 0 and 1 measuring the overall quality of all labels in a multi-label classification dataset. + Intuitively, the score is the average correctness of the given labels across all examples in the + dataset. So a score of 1 suggests your data is perfectly labeled and a score of 0.5 suggests + half of the examples in the dataset may be incorrectly labeled. Thus, a higher + score implies a higher quality dataset. + + **Parameters**: For information about the arguments to this method, see the documentation of + :py:func:`common_multilabel_issues `. + + Returns + ------- + health_score : float + A overall score between 0 and 1, where 1 implies all labels in the dataset are estimated to be correct. + A score of 0.5 implies that half of the dataset's labels are estimated to have issues. 
+ """ + num_examples = _get_num_examples_multilabel(labels=labels) + issues = find_label_issues( + labels=labels, pred_probs=pred_probs, confident_joint=confident_joint + ) + return 1.0 - sum(issues) / num_examples + + +def multilabel_health_summary( + labels=None, + pred_probs=None, + *, + class_names=None, + num_examples=None, + confident_joint=None, + verbose=True, +) -> Dict: + """Prints a health summary of your multi-label datasets including useful statistics like: + + * The classes with the most and least label issues + * Overall data label quality health score statistics for your dataset + + **Parameters**: For information about the arguments to this method, see the documentation of + :py:func:`common_multilabel_issues `. + + Returns + ------- + summary : dict + A dictionary containing keys (see the corresponding functions' documentation to understand the values): + + - ``"overall_label_health_score"``, corresponding to output of :py:func:`overall_multilabel_health_score ` + - ``"classes_by_multilabel_quality"``, corresponding to output of :py:func:`rank_classes_by_multilabel_quality ` + - ``"common_multilabel_issues"``, corresponding to output of :py:func:`common_multilabel_issues ` + """ + from cleanlab.internal.util import smart_display_dataframe + + if num_examples is None: + num_examples = _get_num_examples_multilabel(labels=labels) + + if verbose: + longest_line = f"| for your dataset with {num_examples:,} examples " + print( + "-" * (len(longest_line) - 1) + + "\n" + + f"| Generating a Cleanlab Dataset Health Summary{' ' * (len(longest_line) - 49)}|\n" + + longest_line + + f"| Note, Cleanlab is not a medical doctor... 
yet.{' ' * (len(longest_line) - 51)}|\n" + + "-" * (len(longest_line) - 1) + + "\n", + ) + + df_class_label_quality = rank_classes_by_multilabel_quality( + labels=labels, + pred_probs=pred_probs, + class_names=class_names, + confident_joint=confident_joint, + ) + if verbose: + print("Overall Class Quality and Noise across your dataset (below)") + print("-" * 60, "\n", flush=True) + smart_display_dataframe(df_class_label_quality) + + df_common_issues = common_multilabel_issues( + labels=labels, + pred_probs=pred_probs, + class_names=class_names, + confident_joint=confident_joint, + ) + if verbose: + print( + "\nCommon multilabel issues are" + "\n" + "-" * 83 + "\n", + flush=True, + ) + smart_display_dataframe(df_common_issues) + print() + + health_score = overall_multilabel_health_score( + labels=labels, + pred_probs=pred_probs, + confident_joint=confident_joint, + ) + if verbose: + print("\nGenerated with <3 from Cleanlab.\n") + return { + "overall_multilabel_health_score": health_score, + "classes_by_multilabel_quality": df_class_label_quality, + "common_multilabel_issues": df_common_issues, + } diff --git a/cleanlab/multilabel_classification/filter.py b/cleanlab/multilabel_classification/filter.py new file mode 100644 index 0000000000..c59ea701da --- /dev/null +++ b/cleanlab/multilabel_classification/filter.py @@ -0,0 +1,258 @@ +import warnings +from typing import Optional, Union, Tuple, List, Any +import numpy as np + + +def find_label_issues( + labels: list, + pred_probs: np.ndarray, + return_indices_ranked_by: Optional[str] = None, + rank_by_kwargs={}, + filter_by: str = "prune_by_noise_rate", + frac_noise: float = 1.0, + num_to_remove_per_class: Optional[List[int]] = None, + min_examples_per_class=1, + confident_joint: Optional[np.ndarray] = None, + n_jobs: Optional[int] = None, + verbose: bool = False, +) -> np.ndarray: + """ + Identifies potentially mislabeled examples in a multi-label classification dataset. 
+ An example is flagged as with a label issue if *any* of the classes appear to be incorrectly annotated for this example. + + Parameters + ---------- + labels : List[List[int]] + List of noisy labels for multi-label classification where each example can belong to multiple classes. + This is an iterable of iterables where the i-th element of `labels` corresponds to a list of classes that the i-th example belongs to, + according to the original data annotation (e.g. ``labels = [[1,2],[1],[0],..]``). + This method will return the indices i where the inner list ``labels[i]`` is estimated to have some error. + For a dataset with K classes, each class must be represented as an integer in 0, 1, ..., K-1 within the labels. + + pred_probs : np.ndarray + An array of shape ``(N, K)`` of model-predicted class probabilities. + Each row of this matrix corresponds to an example `x` + and contains the predicted probability that `x` belongs to each possible class, + for each of the K classes (along its columns). + The columns need not sum to 1 but must be ordered such that + these probabilities correspond to class 0, 1, ..., K-1. + + Note + ---- + Estimated label quality scores are most accurate when they are computed based on out-of-sample ``pred_probs`` from your model. + To obtain out-of-sample predicted probabilities for every example in your dataset, you can use :ref:`cross-validation `. + This is encouraged to get better results. + + return_indices_ranked_by : {None, 'self_confidence', 'normalized_margin', 'confidence_weighted_entropy'}, default=None + This function can return a boolean mask (if None) or an array of the example-indices with issues sorted based on the specified ranking method. + Refer to documentation for this argument in :py:func:`filter.find_label_issues ` for details. + + rank_by_kwargs : dict, optional + Optional keyword arguments to pass into scoring functions for ranking by + label quality score (see :py:func:`rank.get_label_quality_scores + `). 
+ + filter_by : {'prune_by_class', 'prune_by_noise_rate', 'both', 'confident_learning', 'predicted_neq_given', + 'low_normalized_margin', 'low_self_confidence'}, default='prune_by_noise_rate' + The specific Confident Learning method to determine precisely which examples have label issues in a dataset. + Refer to documentation for this argument in :py:func:`filter.find_label_issues ` for details. + + frac_noise : float, default=1.0 + This will return the "top" frac_noise * num_label_issues estimated label errors, dependent on the filtering method used, + Refer to documentation for this argument in :py:func:`filter.find_label_issues ` for details. + + num_to_remove_per_class : array_like + An iterable that specifies the number of mislabeled examples to return from each class. + Refer to documentation for this argument in :py:func:`filter.find_label_issues ` for details. + + min_examples_per_class : int, default=1 + The minimum number of examples required per class below which examples from this class will not be flagged as label issues. + Refer to documentation for this argument in :py:func:`filter.find_label_issues ` for details. + + confident_joint : np.ndarray, optional + An array of shape ``(K, 2, 2)`` representing a one-vs-rest formatted confident joint, as is appropriate for multi-label classification tasks. + Entry ``(c, i, j)`` in this array is the number of examples confidently counted into a ``(class c, noisy label=i, true label=j)`` bin, + where `i, j` are either 0 or 1 to denote whether this example belongs to class `c` or not + (recall examples can belong to multiple classes in multi-label classification). + The `confident_joint` can be computed using :py:func:`count.compute_confident_joint ` with multi_label=True. + If not provided, it is computed from the given (noisy) `labels` and `pred_probs`. + + n_jobs : optional + Number of processing threads used by multiprocessing. 
+ Refer to documentation for this argument in :py:func:`filter.find_label_issues ` for details. + + verbose : optional + If ``True``, prints when multiprocessing happens. + + Returns + ------- + label_issues : np.ndarray + If `return_indices_ranked_by` left unspecified, returns a boolean **mask** for the entire dataset + where ``True`` represents an example suffering from some label issue and + ``False`` represents an example that appears accurately labeled. + + If `return_indices_ranked_by` is specified, this method instead returns a list of **indices** of examples identified with + label issues (i.e. those indices where the mask would be ``True``). + Indices are sorted by the likelihood that *all* classes are correctly annotated for the corresponding example. + + Note + ---- + Obtain the *indices* of examples with label issues in your dataset by setting + `return_indices_ranked_by`. + + """ + from cleanlab.filter import _find_label_issues_multilabel + + return _find_label_issues_multilabel( + labels=labels, + pred_probs=pred_probs, + return_indices_ranked_by=return_indices_ranked_by, + rank_by_kwargs=rank_by_kwargs, + filter_by=filter_by, + frac_noise=frac_noise, + num_to_remove_per_class=num_to_remove_per_class, + min_examples_per_class=min_examples_per_class, + confident_joint=confident_joint, + n_jobs=n_jobs, + verbose=verbose, + ) + + +def find_multilabel_issues_per_class( + labels: list, + pred_probs: np.ndarray, + return_indices_ranked_by: Optional[str] = None, + rank_by_kwargs={}, + filter_by: str = "prune_by_noise_rate", + frac_noise: float = 1.0, + num_to_remove_per_class: Optional[List[int]] = None, + min_examples_per_class=1, + confident_joint: Optional[np.ndarray] = None, + n_jobs: Optional[int] = None, + verbose: bool = False, +) -> Union[np.ndarray, Tuple[List[np.ndarray], List[Any], List[np.ndarray]]]: + """ + Identifies potentially bad labels for each example and each class in a multi-label classification dataset. 
+ Whereas :py:func:`find_label_issues ` + estimates which examples have an erroneous annotation for *any* class, this method estimates which specific classes are incorrectly annotated as well. + This method returns a list of size K, the number of classes in the dataset. + + Parameters + ---------- + labels : List[List[int]] + List of noisy labels for multi-label classification where each example can belong to multiple classes. + Refer to documentation for this argument in :py:func:`find_label_issues ` for further details. + This method will identify whether ``labels[i][k]`` appears correct, for every example ``i`` and class ``k``. + + pred_probs : np.ndarray + An array of shape ``(N, K)`` of model-predicted class probabilities. + Refer to documentation for this argument in :py:func:`find_label_issues ` for further details. + + return_indices_ranked_by : {None, 'self_confidence', 'normalized_margin', 'confidence_weighted_entropy'}, default=None + This function can return a boolean mask (if this argument is ``None``) or a sorted array of indices based on the specified ranking method (if not ``None``). + Refer to documentation for this argument in :py:func:`filter.find_label_issues ` for details. + + rank_by_kwargs : dict, optional + Optional keyword arguments to pass into scoring functions for ranking by. + label quality score (see :py:func:`rank.get_label_quality_scores + `). + + filter_by : {'prune_by_class', 'prune_by_noise_rate', 'both', 'confident_learning', 'predicted_neq_given', + 'low_normalized_margin', 'low_self_confidence'}, default='prune_by_noise_rate' + The specific method that can be used to filter or prune examples with label issues from a dataset. + Refer to documentation for this argument in :py:func:`filter.find_label_issues ` for details. 
+ + frac_noise : float, default=1.0 + This will return the "top" frac_noise * num_label_issues estimated label errors, dependent on the filtering method used, + Refer to documentation for this argument in :py:func:`filter.find_label_issues ` for details. + + num_to_remove_per_class : array_like + This parameter is an iterable that specifies the number of mislabeled examples to return from each class. + Refer to documentation for this argument in :py:func:`filter.find_label_issues ` for details. + + min_examples_per_class : int, default=1 + The minimum number of examples required per class to avoid flagging as label issues. + Refer to documentation for this argument in :py:func:`filter.find_label_issues ` for details. + + confident_joint : np.ndarray, optional + An array of shape ``(K, 2, 2)`` representing a one-vs-rest formatted confident joint. + Refer to documentation for this argument in :py:func:`cleanlab.multilabel_classification.filter.find_label_issues ` for details. + + n_jobs : optional + Number of processing threads used by multiprocessing. + Refer to documentation for this argument in :py:func:`filter.find_label_issues ` for details. + + verbose : optional + If ``True``, prints when multiprocessing happens. + + Returns + ------- + per_class_label_issues : list(np.ndarray) + By default, this is a list of length K containing the examples where each class appears incorrectly annotated. + ``per_class_label_issues[k]`` is a Boolean mask of the same length as the dataset, + where ``True`` values indicate examples where class ``k`` appears incorrectly annotated. + + For more details, refer to :py:func:`cleanlab.multilabel_classification.filter.find_label_issues `. + + Otherwise if `return_indices_ranked_by` is not ``None``, then this method returns 3 objects (each of length K, the number of classes): `label_issues_list`, `labels_list`, `pred_probs_list`. 
+ + * *label_issues_list*: an ordered list of indices of examples where class k appears incorrectly annotated, sorted by the likelihood that class k is correctly annotated. + + * *labels_list*: a binary one-hot representation of the original labels, useful if you want to compute label quality scores. + + * *pred_probs_list*: a one-vs-rest representation of the original predicted probabilities of shape ``(N, 2)``, useful if you want to compute label quality scores. + ``pred_probs_list[k][i][0]`` is the estimated probability that example ``i`` belongs to class ``k``, and is equal to: ``1 - ``pred_probs_list[k][i][1]``. + """ + import cleanlab.filter + from cleanlab.internal.multilabel_utils import get_onehot_num_classes, stack_complement + + y_one, num_classes = get_onehot_num_classes(labels, pred_probs) + if return_indices_ranked_by is None: + bissues = np.zeros(y_one.shape).astype(bool) + else: + label_issues_list = [] + labels_list = [] + pred_probs_list = [] + if confident_joint is not None: + confident_joint_shape = confident_joint.shape + if confident_joint_shape == (num_classes, num_classes): + warnings.warn( + f"The new recommended format for `confident_joint` in multi_label settings is (num_classes,2,2) as output by compute_confident_joint(...,multi_label=True). Your K x K confident_joint in the old format is being ignored." 
+ ) + confident_joint = None + elif confident_joint_shape != (num_classes, 2, 2): + raise ValueError("confident_joint should be of shape (num_classes, 2, 2)") + for class_num, (label, pred_prob_for_class) in enumerate(zip(y_one.T, pred_probs.T)): + pred_probs_binary = stack_complement(pred_prob_for_class) + if confident_joint is None: + conf = None + else: + conf = confident_joint[class_num] + if num_to_remove_per_class is not None: + ml_num_to_remove_per_class = [num_to_remove_per_class[class_num], 0] + else: + ml_num_to_remove_per_class = None + binary_label_issues = cleanlab.filter.find_label_issues( + labels=label, + pred_probs=pred_probs_binary, + return_indices_ranked_by=return_indices_ranked_by, + frac_noise=frac_noise, + rank_by_kwargs=rank_by_kwargs, + filter_by=filter_by, + num_to_remove_per_class=ml_num_to_remove_per_class, + min_examples_per_class=min_examples_per_class, + confident_joint=conf, + n_jobs=n_jobs, + verbose=verbose, + ) + + if return_indices_ranked_by is None: + bissues[:, class_num] = binary_label_issues + else: + label_issues_list.append(binary_label_issues) + labels_list.append(label) + pred_probs_list.append(pred_probs_binary) + if return_indices_ranked_by is None: + return bissues + else: + return label_issues_list, labels_list, pred_probs_list diff --git a/cleanlab/multilabel_classification/rank.py b/cleanlab/multilabel_classification/rank.py new file mode 100644 index 0000000000..57c24bc27e --- /dev/null +++ b/cleanlab/multilabel_classification/rank.py @@ -0,0 +1,191 @@ +# Copyright (C) 2017-2023 Cleanlab Inc. +# This file is part of cleanlab. +# +# cleanlab is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published +# by the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+# +# cleanlab is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with cleanlab. If not, see . + +""" +Methods to rank the severity of label issues in multi-label classification datasets. +Here each example can belong to one or more classes, or none of the classes at all. +Unlike in standard multi-class classification, predicted class probabilities from model need not sum to 1 for each row in multi-label classification. +""" + +import numpy as np # noqa: F401: Imported for type annotations +import numpy.typing as npt +from typing import List, TypeVar, Dict, Any, Optional, Tuple + +from cleanlab.internal.validation import assert_valid_inputs +from cleanlab.internal.util import get_num_classes +from cleanlab.internal.multilabel_utils import int2onehot +from cleanlab.internal.multilabel_scorer import MultilabelScorer, ClassLabelScorer, Aggregator + + +T = TypeVar("T", bound=npt.NBitBase) + + +def _labels_to_binary( + labels: List[List[int]], + pred_probs: npt.NDArray["np.floating[T]"], +) -> np.ndarray: + """Validate the inputs to the multilabel scorer. 
Also transform the labels to a binary representation.""" + assert_valid_inputs( + X=None, y=labels, pred_probs=pred_probs, multi_label=True, allow_one_class=True + ) + num_classes = get_num_classes(labels=labels, pred_probs=pred_probs, multi_label=True) + binary_labels = int2onehot(labels, K=num_classes) + return binary_labels + + +def _create_multilabel_scorer( + method: str, + adjust_pred_probs: bool, + aggregator_kwargs: Optional[Dict[str, Any]] = None, +) -> Tuple[MultilabelScorer, Dict]: + """This function acts as a factory that creates a MultilabelScorer.""" + base_scorer = ClassLabelScorer.from_str(method) + base_scorer_kwargs = {"adjust_pred_probs": adjust_pred_probs} + if aggregator_kwargs: + aggregator = Aggregator(**aggregator_kwargs) + scorer = MultilabelScorer(base_scorer, aggregator) + else: + scorer = MultilabelScorer(base_scorer) + return scorer, base_scorer_kwargs + + +def get_label_quality_scores( + labels: List[List[int]], + pred_probs: npt.NDArray["np.floating[T]"], + *, + method: str = "self_confidence", + adjust_pred_probs: bool = False, + aggregator_kwargs: Dict[str, Any] = {"method": "exponential_moving_average", "alpha": 0.8}, +) -> npt.NDArray["np.floating[T]"]: + """Computes a label quality score each example in a multi-label classification dataset. + + Scores are between 0 and 1 with lower scores indicating examples whose label more likely contains an error. + For each example, this method internally computes a separate score for each individual class + and then aggregates these per-class scores into an overall label quality score for the example. + + + Parameters + ---------- + labels : List[List[int]] + List of noisy labels for multi-label classification where each example can belong to multiple classes. + Refer to documentation for this argument in :py:func:`multilabel_classification.filter.find_label_issues ` for further details. + + pred_probs : np.ndarray + An array of shape ``(N, K)`` of model-predicted class probabilities. 
+ Refer to documentation for this argument in :py:func:`multilabel_classification.filter.find_label_issues ` for further details. + + method : {"self_confidence", "normalized_margin", "confidence_weighted_entropy"}, default = "self_confidence" + Method to calculate separate per-class annotation scores for an example that are then aggregated into an overall label quality score for the example. + These scores are separately calculated for each class based on the corresponding column of `pred_probs` in a one-vs-rest manner, + and are standard label quality scores for binary classification (based on whether the class should or should not apply to this example). + + See also + -------- + :py:func:`rank.get_label_quality_scores ` function for details about each option. + + adjust_pred_probs : bool, default = False + Account for class imbalance in the label-quality scoring by adjusting predicted probabilities. + Refer to documentation for this argument in :py:func:`rank.get_label_quality_scores ` for details. + + + aggregator_kwargs : dict, default = {"method": "exponential_moving_average", "alpha": 0.8} + A dictionary of hyperparameter values to use when aggregating per-class scores into an overall label quality score for each example. + Options for ``"method"`` include: ``"exponential_moving_average"`` or ``"softmin"`` or your own callable function. + See :py:class:`internal.multilabel_scorer.Aggregator ` for details about each option and other possible hyperparameters. + + To get a score for each class annotation for each example, use the :py:func:`multilabel_classification.classification.rank.get_label_quality_scores_per_class ` method instead. + + Returns + ------- + label_quality_scores : np.ndarray + A 1D array of shape ``(N,)`` with a label quality score (between 0 and 1) for each example in the dataset. + Lower scores indicate examples whose label is more likely to contain some annotation error (for any of the classes). 
+ + Examples + -------- + >>> from cleanlab.multilabel_classification import get_label_quality_scores + >>> import numpy as np + >>> labels = [[1], [0,2]] + >>> pred_probs = np.array([[0.1, 0.9, 0.1], [0.4, 0.1, 0.9]]) + >>> scores = get_label_quality_scores(labels, pred_probs) + >>> scores + array([0.9, 0.5]) + """ + binary_labels = _labels_to_binary(labels, pred_probs) + scorer, base_scorer_kwargs = _create_multilabel_scorer( + method=method, + adjust_pred_probs=adjust_pred_probs, + aggregator_kwargs=aggregator_kwargs, + ) + return scorer(binary_labels, pred_probs, base_scorer_kwargs=base_scorer_kwargs) + + +def get_label_quality_scores_per_class( + labels: List[List[int]], + pred_probs: npt.NDArray["np.floating[T]"], + *, + method: str = "self_confidence", + adjust_pred_probs: bool = False, +) -> np.ndarray: + """ + Computes a quality score quantifying how likely each individual class annotation is correct in a multi-label classification dataset. + This is similar to :py:func:`get_label_quality_scores ` + but instead returns the per-class results without aggregation. + For a dataset with K classes, each example receives K scores from this method. + Refer to documentation in :py:func:`get_label_quality_scores ` for details. + + Parameters + ---------- + labels : List[List[int]] + List of noisy labels for multi-label classification where each example can belong to multiple classes. + Refer to documentation for this argument in :py:func:`find_label_issues ` for further details. + + pred_probs : np.ndarray + An array of shape ``(N, K)`` of model-predicted class probabilities. + Refer to documentation for this argument in :py:func:`find_label_issues ` for further details. + + method : {"self_confidence", "normalized_margin", "confidence_weighted_entropy"}, default = "self_confidence" + Method to calculate separate per-class annotation scores (that quantify how likely a particular class annotation is correct for a particular example). 
+ Refer to documentation for this argument in :py:func:`get_label_quality_scores ` for further details. + + adjust_pred_probs : bool, default = False + Account for class imbalance in the label-quality scoring by adjusting predicted probabilities. + Refer to documentation for this argument in :py:func:`rank.get_label_quality_scores ` for details. + + Returns + ------- + label_quality_scores : list(np.ndarray) + A list containing K arrays, each of shape (N,). Here K is the number of classes in the dataset and N is the number of examples. + ``label_quality_scores[k][i]`` is a score between 0 and 1 quantifying how likely the annotation for class ``k`` is correct for example ``i``. + + Examples + -------- + >>> from cleanlab.multilabel_classification import get_label_quality_scores + >>> import numpy as np + >>> labels = [[1], [0,2]] + >>> pred_probs = np.array([[0.1, 0.9, 0.1], [0.4, 0.1, 0.9]]) + >>> scores = get_label_quality_scores(labels, pred_probs) + >>> scores + array([0.9, 0.5]) + """ + binary_labels = _labels_to_binary(labels, pred_probs) + scorer, base_scorer_kwargs = _create_multilabel_scorer( + method=method, + adjust_pred_probs=adjust_pred_probs, + ) + return scorer.get_class_label_quality_scores( + labels=binary_labels, pred_probs=pred_probs, base_scorer_kwargs=base_scorer_kwargs + ) diff --git a/docs/source/tutorials/multilabel_classification.ipynb b/docs/source/tutorials/multilabel_classification.ipynb index b1b002975e..49a824c031 100644 --- a/docs/source/tutorials/multilabel_classification.ipynb +++ b/docs/source/tutorials/multilabel_classification.ipynb @@ -19,17 +19,16 @@ "Quickstart\n", "
    \n", " \n", - "cleanlab finds label issues based on two inputs: `labels` formatted as a list of lists of integer class indices that apply to each example in your dataset, and `pred_probs` from a trained multi-label classification model (which do not need to sum to 1 since the classes are not mutually exclusive). Once you have these, run the code below to find label issues in your dataset.\n", + "cleanlab finds label issues based on two inputs: `labels` formatted as a list of lists of integer class indices that apply to each example in your dataset, and `pred_probs` from a trained multi-label classification model (which do not need to sum to 1 since the classes are not mutually exclusive). Once you have these, run the code below to find label issues in your multi-label dataset:\n", "\n", "
    \n", " \n", "```ipython3 \n", - "from cleanlab.filter import find_label_issues\n", + "from cleanlab.multilabel_classification.filter import find_label_issues\n", "\n", "ranked_label_issues = find_label_issues(\n", " labels=labels,\n", " pred_probs=pred_probs,\n", - " multi_label=True,\n", " return_indices_ranked_by=\"self_confidence\",\n", ")\n", "```\n", @@ -105,13 +104,8 @@ "from sklearn.model_selection import StratifiedKFold\n", "import matplotlib.pyplot as plt\n", "\n", - "from cleanlab.filter import find_label_issues\n", - "import cleanlab.internal.multilabel_utils as mlutils\n", - "from cleanlab.internal.multilabel_utils import onehot2int, int2onehot\n", - "from cleanlab.benchmarking.noise_generation import (\n", - " generate_noise_matrix_from_trace,\n", - " generate_noisy_labels,\n", - ")" + "from cleanlab.multilabel_classification.filter import find_label_issues\n", + "from cleanlab.multilabel_classification.rank import get_label_quality_scores" ] }, { @@ -132,6 +126,11 @@ "```ipython3\n", "# Note: This pulldown content is for docs.cleanlab.ai, if running on local Jupyter or Colab, please ignore it.\n", " \n", + "from cleanlab.benchmarking.noise_generation import (\n", + " generate_noise_matrix_from_trace,\n", + " generate_noisy_labels,\n", + ")\n", + "\n", "def make_multilabel_data(\n", " means=[[-5, 3.5], [0, 2], [-3, 6]],\n", " covs=[[[3, -1.5], [-1.5, 1]], [[5, -1.5], [-1.5, 1]], [[3, -1.5], [-1.5, 1]]],\n", @@ -255,6 +254,11 @@ }, "outputs": [], "source": [ + "from cleanlab.benchmarking.noise_generation import (\n", + " generate_noise_matrix_from_trace,\n", + " generate_noisy_labels,\n", + ")\n", + "\n", "def make_multilabel_data(\n", " means=[[-5, 3.5], [0, 2], [-3, 6]],\n", " covs=[[[3, -1.5], [-1.5, 1]], [[5, -1.5], [-1.5, 1]], [[3, -1.5], [-1.5, 1]]],\n", @@ -434,7 +438,7 @@ "\n", "`labels` should be a list of lists, whose *i*-th entry is a list of (integer) class indices that apply to the *i*-th example in the dataset. 
If your classes are represented as string names, you should map these to integer indices. The label for an example that belongs to none of the classes should just be an empty list `[]`.\n", "\n", - "Once you have `pred_probs` and `labels` in the appropriate formats, you can find label issues with cleanlab for any multi-label dataset!\n", + "Once you have `pred_probs` and `labels` appropriately formatted, you can find/analyze label issues in any multi-label dataset via methods from the `cleanlab.multilabel_classification` module!\n", "\n", "Here's what these look like for the first few examples in our synthetic multi-label dataset: " ] @@ -474,7 +478,6 @@ "issues = find_label_issues(\n", " labels=labels,\n", " pred_probs=pred_probs,\n", - " multi_label=True,\n", " return_indices_ranked_by=\"self_confidence\",\n", ")\n", "\n", @@ -486,8 +489,6 @@ "id": "d6af5833", "metadata": {}, "source": [ - "Note we specified the `multi_label` option above to distinguish the task from *multi-class classification* (otherwise assumed as the default task).\n", - "\n", "Let's look at the samples that cleanlab thinks are most likely to be mislabeled. You can see that cleanlab was able to identify most of `true_errors` in our small dataset (despite not having access to this variable, which you won't have in your own applications)." ] }, @@ -518,8 +519,6 @@ "metadata": {}, "outputs": [], "source": [ - "from cleanlab.multilabel_classification import get_label_quality_scores\n", - "\n", "scores = get_label_quality_scores(labels, pred_probs)\n", "\n", "print(f\"Label quality scores of the first 10 examples in dataset:\\n{scores[:10]}\")" @@ -530,6 +529,8 @@ "id": "d65af827-aeda-4b6b-9ae7-b1f0b84700d5", "metadata": {}, "source": [ + "**Note:** For multi-label data, make sure to use the versions of `find_label_issues()` and `get_label_quality_scores()` from the `cleanlab.multilabel_classification` module. 
There exist other versions of these methods for other types of data.\n", + "\n", "### How to format labels given as a one-hot (multi-hot) binary matrix?\n", "\n", "For multi-label classification, cleanlab expects labels to be formatted as a list of lists, where each entry is an integer corresponding to a particular class. Here are some functions you can use to easily convert labels between this format and a binary matrix format commonly used to train multi-label classification models." @@ -591,7 +592,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.15" + "version": "3.9.12" } }, "nbformat": 4, diff --git a/setup.cfg b/setup.cfg index 8888d3b898..2f044be3d5 100644 --- a/setup.cfg +++ b/setup.cfg @@ -6,3 +6,4 @@ per-file-ignores = cleanlab/token_classification/__init__.py: F401 cleanlab/benchmarking/__init__.py: F401 cleanlab/models/__init__.py: F401 + cleanlab/multilabel_classification/__init__.py: F401 diff --git a/tests/test_dataset.py b/tests/test_dataset.py index 1331fe5677..092e5f3587 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -448,6 +448,16 @@ def test_real_datasets(dataset_name): ) +@pytest.mark.parametrize("dataset_name", ["mnist"]) +def test_multilabel_error(dataset_name): + print("\n" + dataset_name.capitalize() + "\n") + class_names = eval(dataset_name) + pred_probs, labels = _get_pred_probs_labels_from_labelerrors_datasets(dataset_name) + # if this runs without issue no all four datasets, the test passes + with pytest.raises(ValueError) as e: + _ = find_overlapping_classes(labels=labels, pred_probs=pred_probs, multi_label=True) + + @pytest.mark.parametrize("asymmetric", [True, False]) @pytest.mark.parametrize("dataset_name", ["mnist", "imdb"]) def test_symmetry_df_size(asymmetric, dataset_name): diff --git a/tests/test_filter_count.py b/tests/test_filter_count.py index e2cbf07bdd..1c126a0bf3 100644 --- a/tests/test_filter_count.py +++ b/tests/test_filter_count.py @@ -14,6 +14,7 
@@ # You should have received a copy of the GNU Affero General Public License # along with cleanlab. If not, see . +import cleanlab.multilabel_classification.dataset from cleanlab import count, filter from cleanlab.count import ( get_confident_thresholds, @@ -496,6 +497,7 @@ def test_pruning_order_method(): @pytest.mark.parametrize("multi_label", [True, False]) +@pytest.mark.parametrize("use_dataset_function", [True, False]) @pytest.mark.parametrize( "filter_by", ["prune_by_noise_rate", "prune_by_class", "both", "confident_learning"] ) @@ -503,17 +505,37 @@ def test_pruning_order_method(): "return_indices_ranked_by", [None, "self_confidence", "normalized_margin", "confidence_weighted_entropy"], ) -def test_find_label_issues_multi_label(multi_label, filter_by, return_indices_ranked_by): +def test_find_label_issues_multi_label( + multi_label, use_dataset_function, filter_by, return_indices_ranked_by +): """Note: argmax_not_equal method is not compatible with multi_label == True""" dataset = multilabel_data if multi_label else data + if multi_label: + if use_dataset_function: + noise_idx = cleanlab.multilabel_classification.filter.find_label_issues( + labels=dataset["labels"], + pred_probs=dataset["pred_probs"], + filter_by=filter_by, + return_indices_ranked_by=return_indices_ranked_by, + ) + else: + with pytest.warns(DeprecationWarning): + noise_idx = filter.find_label_issues( + labels=dataset["labels"], + pred_probs=dataset["pred_probs"], + filter_by=filter_by, + multi_label=multi_label, + return_indices_ranked_by=return_indices_ranked_by, + ) + else: + noise_idx = filter.find_label_issues( + labels=dataset["labels"], + pred_probs=dataset["pred_probs"], + filter_by=filter_by, + multi_label=multi_label, + return_indices_ranked_by=return_indices_ranked_by, + ) - noise_idx = filter.find_label_issues( - labels=dataset["labels"], - pred_probs=dataset["pred_probs"], - filter_by=filter_by, - multi_label=multi_label, - return_indices_ranked_by=return_indices_ranked_by, - ) 
if return_indices_ranked_by is not None: noise_bool = np.zeros(len(dataset["labels"])).astype(bool) noise_bool[noise_idx] = True diff --git a/tests/test_multilabel_classification.py b/tests/test_multilabel_classification.py index ac1d3afb57..452dbc379f 100644 --- a/tests/test_multilabel_classification.py +++ b/tests/test_multilabel_classification.py @@ -25,7 +25,15 @@ from cleanlab.internal import multilabel_scorer as ml_scorer from cleanlab.internal.multilabel_utils import stack_complement, get_onehot_num_classes, onehot2int -from cleanlab import multilabel_classification as multilabel_classfication +from cleanlab import multilabel_classification as ml_classification +from cleanlab.multilabel_classification.dataset import ( + common_multilabel_issues, + rank_classes_by_multilabel_quality, + overall_multilabel_health_score, + multilabel_health_summary, +) +from cleanlab.multilabel_classification.rank import get_label_quality_scores_per_class +from cleanlab.multilabel_classification import filter @pytest.fixture @@ -84,6 +92,41 @@ def pred_probs(): ) +@pytest.fixture +def pred_probs_multilabel(): + return np.array( + [ + [0.9, 0.1, 0.0, 0.4, 0.1], + [0.7, 0.8, 0.2, 0.3, 0.1], + [0.9, 0.8, 0.4, 0.2, 0.1], + [0.1, 0.1, 0.8, 0.3, 0.1], + [0.4, 0.5, 0.1, 0.1, 0.1], + [0.1, 0.1, 0.2, 0.1, 0.1], + [0.8, 0.1, 0.2, 0.1, 0.1], + ] + ) + + +@pytest.fixture +def labels_multilabel(): + return [[0], [0, 1], [0, 1], [2], [0, 2, 3], [], []] + + +@pytest.fixture +def data_multilabel(num_classes=5): + labels = [] + pred_probs = [] + for i in range(0, 100): + q = [0.1] * num_classes + pos = i % num_classes + labels.append([pos]) + if i > 90: + pos = (pos + 2) % num_classes + q[pos] = 0.9 + pred_probs.append(q) + return labels, np.array(pred_probs) + + @pytest.fixture def cv(): return sklearn.model_selection.StratifiedKFold( @@ -102,18 +145,18 @@ def dummy_features(labels): def test_public_label_quality_scores(labels, pred_probs): formatted_labels = onehot2int(labels) assert 
isinstance(formatted_labels, list) - scores1 = multilabel_classfication.get_label_quality_scores(formatted_labels, pred_probs) + scores1 = ml_classification.get_label_quality_scores(formatted_labels, pred_probs) assert len(scores1) == len(labels) assert (scores1 >= 0).all() and (scores1 <= 1).all() - scores2 = multilabel_classfication.get_label_quality_scores( + scores2 = ml_classification.get_label_quality_scores( formatted_labels, pred_probs, method="confidence_weighted_entropy" ) assert not np.isclose(scores1, scores2).all() - scores3 = multilabel_classfication.get_label_quality_scores( + scores3 = ml_classification.get_label_quality_scores( formatted_labels, pred_probs, adjust_pred_probs=True ) assert not np.isclose(scores1, scores3).all() - scores4 = multilabel_classfication.get_label_quality_scores( + scores4 = ml_classification.get_label_quality_scores( formatted_labels, pred_probs, method="normalized_margin", @@ -121,7 +164,7 @@ def test_public_label_quality_scores(labels, pred_probs): aggregator_kwargs={"method": "exponential_moving_average"}, ) assert not np.isclose(scores1, scores4).all() - scores5 = multilabel_classfication.get_label_quality_scores( + scores5 = ml_classification.get_label_quality_scores( formatted_labels, pred_probs, method="normalized_margin", @@ -129,7 +172,7 @@ def test_public_label_quality_scores(labels, pred_probs): aggregator_kwargs={"method": "softmin"}, ) assert not np.isclose(scores4, scores5).all() - scores6 = multilabel_classfication.get_label_quality_scores( + scores6 = ml_classification.get_label_quality_scores( formatted_labels, pred_probs, method="normalized_margin", @@ -137,7 +180,7 @@ def test_public_label_quality_scores(labels, pred_probs): aggregator_kwargs={"method": "softmin", "temperature": 0.002}, ) assert not np.isclose(scores5, scores6).all() - scores7 = multilabel_classfication.get_label_quality_scores( + scores7 = ml_classification.get_label_quality_scores( formatted_labels, pred_probs, 
method="normalized_margin", @@ -147,13 +190,13 @@ def test_public_label_quality_scores(labels, pred_probs): assert np.isclose(scores6, scores7, rtol=1e-3).all() with pytest.raises(ValueError) as e: - _ = multilabel_classfication.get_label_quality_scores( + _ = ml_classification.get_label_quality_scores( formatted_labels, pred_probs, method="badchoice" ) assert "Invalid method name: badchoice" in str(e.value) with pytest.raises(ValueError) as e: - _ = multilabel_classfication.get_label_quality_scores( + _ = ml_classification.get_label_quality_scores( formatted_labels, pred_probs, aggregator_kwargs={"method": "invalid"} ) assert "Invalid aggregation method specified: 'invalid'" in str(e.value) @@ -172,7 +215,7 @@ def base_scores(self): ids=lambda x: x.__name__ if callable(x) else str(x), ) def test_aggregator_callable(self, method): - aggregator = multilabel_classfication.Aggregator(method=method) + aggregator = ml_scorer.Aggregator(method=method) assert callable(aggregator.method), "Aggregator should store a callable method" assert callable(aggregator), "Aggregator should be callable" @@ -189,24 +232,24 @@ def test_aggregator_callable(self, method): ids=["min", "max", "mean", "median", "exponential_moving_average", "softmin"], ) def test_aggregator_score(self, base_scores, method, expected_score): - aggregator = multilabel_classfication.Aggregator(method=method) + aggregator = ml_scorer.Aggregator(method=method) scores = aggregator(base_scores) assert np.isclose(scores, np.array([expected_score]), rtol=1e-3).all() assert scores.shape == (1,) def test_invalid_method(self): with pytest.raises(ValueError) as e: - _ = multilabel_classfication.Aggregator(method="invalid_method") + _ = ml_scorer.Aggregator(method="invalid_method") assert "Invalid aggregation method specified: 'invalid_method'" in str( e.value ), "String constructor has limited options" with pytest.raises(TypeError) as e: - _ = multilabel_classfication.Aggregator(method=1) + _ = 
ml_scorer.Aggregator(method=1) assert "Expected callable method" in str(e.value), "Non-callable methods are not valid" def test_invalid_score(self, base_scores): - aggregator = multilabel_classfication.Aggregator(method=np.min) + aggregator = ml_scorer.Aggregator(method=np.min) with pytest.raises(ValueError) as e: _ = aggregator(base_scores[0]) assert "Expected 2D array" in str(e.value), "Aggregator expects 2D array" @@ -304,6 +347,152 @@ def test_is_multilabel(labels): assert not ml_scorer._is_multilabel(labels[:, 0]) +@pytest.mark.parametrize("class_names", [None, ["Apple", "Cat", "Dog", "Peach", "Bird"]]) +def test_common_multilabel_issues(class_names, pred_probs_multilabel, labels_multilabel): + df = common_multilabel_issues( + labels=labels_multilabel, pred_probs=pred_probs_multilabel, class_names=class_names + ) + expected_issue_probabilities = [ + 0.14285714285714285, + 0.14285714285714285, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + ] + assert len(df) == 10 + assert np.isclose(np.array(expected_issue_probabilities), df["Issue Probability"]).all() + if class_names: + expected_res = [ + "Apple", + "Dog", + "Apple", + "Cat", + "Cat", + "Dog", + "Peach", + "Peach", + "Bird", + "Bird", + ] + assert list(df["Class Name"]) == expected_res + else: + assert "Class Name" not in df.columns + + +@pytest.mark.parametrize("min_examples_per_class", [10, 90]) +def test_multilabel_min_examples_per_class(data_multilabel, min_examples_per_class): + labels, pred_probs = data_multilabel + issues = filter.find_label_issues( + labels=labels, pred_probs=pred_probs, min_examples_per_class=min_examples_per_class + ) + if min_examples_per_class == 10: + assert sum(issues) == 9 + else: + assert sum(issues) == 0 + + +@pytest.mark.parametrize("num_to_remove_per_class", [None, [1, 1, 0, 0, 2], [1, 1, 0, 0, 1]]) +def test_multilabel_num_to_remove_per_class(data_multilabel, num_to_remove_per_class): + labels, pred_probs = data_multilabel + + issues = filter.find_label_issues( 
+ labels=labels, pred_probs=pred_probs, num_to_remove_per_class=num_to_remove_per_class + ) + num_issues = sum(issues) + if num_to_remove_per_class is None: + assert num_issues == 9 + else: + assert num_issues == sum(num_to_remove_per_class) + + +@pytest.mark.parametrize("class_names", [None, ["Apple", "Cat", "Dog", "Peach", "Bird"]]) +def test_rank_classes_by_multilabel_quality(pred_probs_multilabel, labels_multilabel, class_names): + df_ranked = rank_classes_by_multilabel_quality( + pred_probs=pred_probs_multilabel, labels=labels_multilabel, class_names=class_names + ) + expected_Label_Issues = [1, 0, 0, 0, 0] + + expected_Label_Noise = [0.14285714285714285, 0.0, 0.0, 0.0, 0.0] + + expected_Label_Quality_Score = [0.8571428571428572, 1.0, 1.0, 1.0, 1.0] + + expected_Inverse_Label_Issues = [0, 1, 0, 0, 0] + + expected_Inverse_Label_Noise = [0.0, 0.14285714285714285, 0.0, 0.0, 0.0] + assert list(df_ranked["Label Issues"]) == expected_Label_Issues + + assert np.isclose(np.array(expected_Label_Noise), df_ranked["Label Noise"]).all() + assert np.isclose( + np.array(expected_Label_Quality_Score), df_ranked["Label Quality Score"] + ).all() + assert list(df_ranked["Inverse Label Issues"]) == expected_Inverse_Label_Issues + assert np.isclose( + np.array(expected_Inverse_Label_Noise), df_ranked["Inverse Label Noise"] + ).all() + if class_names: + expected_res = [ + "Dog", + "Apple", + "Cat", + "Peach", + "Bird", + ] + assert list(df_ranked["Class Name"]) == expected_res + else: + assert "Class Name" not in df_ranked.columns + + +def test_overall_multilabel_health_score(data_multilabel): + labels, pred_probs = data_multilabel + overall_label_health_score = overall_multilabel_health_score( + pred_probs=pred_probs, labels=labels + ) + assert np.isclose(overall_label_health_score, 0.91) + + +def test_get_class_label_quality_scores(): + pred_probs = np.array( + [ + [0.9, 0.1, 0.0, 0.4, 0.1], + [0.7, 0.8, 0.2, 0.3, 0.1], + [0.9, 0.8, 0.4, 0.2, 0.1], + [0.1, 0.1, 0.8, 0.3, 0.1], + 
[0.4, 0.5, 0.1, 0.1, 0.1], + [0.1, 0.1, 0.2, 0.1, 0.1], + [0.8, 0.1, 0.2, 0.1, 0.1], + ] + ) + labels = [[0], [0, 1], [0, 1], [2], [0, 2, 3], [], []] + scores = get_label_quality_scores_per_class(pred_probs=pred_probs, labels=labels) + expected_res = [ + [0.9, 0.9, 1.0, 0.6, 0.9], + [0.7, 0.8, 0.8, 0.7, 0.9], + [0.9, 0.8, 0.6, 0.8, 0.9], + [0.9, 0.9, 0.8, 0.7, 0.9], + [0.4, 0.5, 0.1, 0.1, 0.9], + [0.9, 0.9, 0.8, 0.9, 0.9], + [0.2, 0.9, 0.8, 0.9, 0.9], + ] + assert np.isclose(scores, np.array(expected_res)).all() + + +def test_health_summary_multilabel(pred_probs_multilabel, labels_multilabel): + health_summary_multilabel = multilabel_health_summary( + pred_probs=pred_probs_multilabel, labels=labels_multilabel + ) + expected_keys = [ + "classes_by_multilabel_quality", + "common_multilabel_issues", + "overall_multilabel_health_score", + ] + assert sorted(health_summary_multilabel.keys()) == expected_keys + + @pytest.mark.parametrize( "input", [ From 0fdf398db2c7be500b3690e4c41869f6468ea44f Mon Sep 17 00:00:00 2001 From: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> Date: Fri, 14 Apr 2023 10:46:09 -0500 Subject: [PATCH 152/258] move int2onehot, onehot2int to top of multilabel tutorial (#666) --- docs/source/tutorials/multilabel_classification.ipynb | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/docs/source/tutorials/multilabel_classification.ipynb b/docs/source/tutorials/multilabel_classification.ipynb index 49a824c031..d37ba8298a 100644 --- a/docs/source/tutorials/multilabel_classification.ipynb +++ b/docs/source/tutorials/multilabel_classification.ipynb @@ -105,7 +105,8 @@ "import matplotlib.pyplot as plt\n", "\n", "from cleanlab.multilabel_classification.filter import find_label_issues\n", - "from cleanlab.multilabel_classification.rank import get_label_quality_scores" + "from cleanlab.multilabel_classification.rank import get_label_quality_scores\n", + "from cleanlab.internal.multilabel_utils import int2onehot, onehot2int" ] }, 
{ @@ -543,8 +544,6 @@ "metadata": {}, "outputs": [], "source": [ - "from cleanlab.internal.multilabel_utils import int2onehot, onehot2int\n", - "\n", "labels_binary_format = int2onehot(labels, K=num_class)\n", "labels_list_format = onehot2int(labels_binary_format)" ] From d45a508ed6a0d96e5984e564b85c7f25965d7b6b Mon Sep 17 00:00:00 2001 From: Ulyana Date: Fri, 14 Apr 2023 20:23:40 -0700 Subject: [PATCH 153/258] Update softmax to be more numerically stable (#667) --- cleanlab/internal/multilabel_scorer.py | 6 ++++-- cleanlab/token_classification/rank.py | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/cleanlab/internal/multilabel_scorer.py b/cleanlab/internal/multilabel_scorer.py index 4ad836d283..4d1cbd4d3c 100644 --- a/cleanlab/internal/multilabel_scorer.py +++ b/cleanlab/internal/multilabel_scorer.py @@ -243,8 +243,10 @@ def softmin( def softmax(scores: np.ndarray) -> np.ndarray: """Softmax function.""" - exp_scores = np.exp(scores / temperature) - return exp_scores / np.sum(exp_scores, axis=axis, keepdims=True) + scores = scores / temperature + scores_max = np.amax(scores, axis=axis, keepdims=True) + exp_scores_shifted = np.exp(scores - scores_max) + return exp_scores_shifted / np.sum(exp_scores_shifted, axis=axis, keepdims=True) return np.einsum("ij,ij->i", s, softmax(1 - s)) diff --git a/cleanlab/token_classification/rank.py b/cleanlab/token_classification/rank.py index 40d379e0f9..be41568aa2 100644 --- a/cleanlab/token_classification/rank.py +++ b/cleanlab/token_classification/rank.py @@ -281,8 +281,10 @@ def _softmin_sentence_score( return np.array([np.mean(scores) for scores in token_scores]) def softmax(scores: np.ndarray) -> np.ndarray: - exp_scores = np.exp(scores / temperature) - return exp_scores / np.sum(exp_scores) + scores = scores / temperature + scores_max = np.amax(scores, axis=0, keepdims=True) + exp_scores_shifted = np.exp(scores - scores_max) + return exp_scores_shifted / np.sum(exp_scores_shifted, axis=0, 
keepdims=True) def fun(scores: np.ndarray) -> float: return np.dot(scores, softmax(1 - np.array(scores))) From ee5ed9bdf0b9567b2d9b28c6b1448e2924b9c4c5 Mon Sep 17 00:00:00 2001 From: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> Date: Mon, 17 Apr 2023 17:38:37 -0500 Subject: [PATCH 154/258] Ensure multilabel docs appear in the documentation (#669) --- cleanlab/dataset.py | 10 +++++---- cleanlab/multilabel_classification/dataset.py | 8 ++++--- cleanlab/multilabel_classification/rank.py | 4 ++-- .../cleanlab/multilabel_classification.rst | 8 ------- .../multilabel_classification/dataset.rst | 8 +++++++ .../multilabel_classification/filter.rst | 8 +++++++ .../multilabel_classification/index.rst | 22 +++++++++++++++++++ .../multilabel_classification/rank.rst | 8 +++++++ docs/source/index.rst | 2 +- 9 files changed, 60 insertions(+), 18 deletions(-) delete mode 100644 docs/source/cleanlab/multilabel_classification.rst create mode 100644 docs/source/cleanlab/multilabel_classification/dataset.rst create mode 100644 docs/source/cleanlab/multilabel_classification/filter.rst create mode 100644 docs/source/cleanlab/multilabel_classification/index.rst create mode 100644 docs/source/cleanlab/multilabel_classification/rank.rst diff --git a/cleanlab/dataset.py b/cleanlab/dataset.py index cefe4b33e4..28fbae31e4 100644 --- a/cleanlab/dataset.py +++ b/cleanlab/dataset.py @@ -402,11 +402,13 @@ def health_summary( multi_label=False, verbose=True, ) -> dict: - """Prints a health summary of your datasets including useful statistics like: + """Prints a health summary of your dataset. - * The classes with the most and least label issues - * Classes that overlap and could potentially be merged - * Overall data label quality health score statistics for your dataset + This summary includes useful statistics like: + + * The classes with the most and least label issues. + * Classes that overlap and could potentially be merged. 
+ * Overall label quality scores, summarizing how accurate the labels appear overall. This method works by providing any one (and only one) of the following inputs: diff --git a/cleanlab/multilabel_classification/dataset.py b/cleanlab/multilabel_classification/dataset.py index e3987a5ece..8fa25ac9e8 100644 --- a/cleanlab/multilabel_classification/dataset.py +++ b/cleanlab/multilabel_classification/dataset.py @@ -244,10 +244,12 @@ def multilabel_health_summary( confident_joint=None, verbose=True, ) -> Dict: - """Prints a health summary of your multi-label datasets including useful statistics like: + """Prints a health summary of your multi-label dataset. - * The classes with the most and least label issues - * Overall data label quality health score statistics for your dataset + This summary includes useful statistics like: + + * The classes with the most and least label issues. + * Overall label quality scores, summarizing how accurate the labels appear across the entire dataset. **Parameters**: For information about the arguments to this method, see the documentation of :py:func:`common_multilabel_issues `. diff --git a/cleanlab/multilabel_classification/rank.py b/cleanlab/multilabel_classification/rank.py index 57c24bc27e..39fa07ff2b 100644 --- a/cleanlab/multilabel_classification/rank.py +++ b/cleanlab/multilabel_classification/rank.py @@ -17,7 +17,7 @@ """ Methods to rank the severity of label issues in multi-label classification datasets. Here each example can belong to one or more classes, or none of the classes at all. -Unlike in standard multi-class classification, predicted class probabilities from model need not sum to 1 for each row in multi-label classification. +Unlike in standard multi-class classification, model-predicted class probabilities need not sum to 1 for each row in multi-label classification. 
""" import numpy as np # noqa: F401: Imported for type annotations @@ -70,7 +70,7 @@ def get_label_quality_scores( adjust_pred_probs: bool = False, aggregator_kwargs: Dict[str, Any] = {"method": "exponential_moving_average", "alpha": 0.8}, ) -> npt.NDArray["np.floating[T]"]: - """Computes a label quality score each example in a multi-label classification dataset. + """Computes a label quality score for each example in a multi-label classification dataset. Scores are between 0 and 1 with lower scores indicating examples whose label more likely contains an error. For each example, this method internally computes a separate score for each individual class diff --git a/docs/source/cleanlab/multilabel_classification.rst b/docs/source/cleanlab/multilabel_classification.rst deleted file mode 100644 index 02c35acebc..0000000000 --- a/docs/source/cleanlab/multilabel_classification.rst +++ /dev/null @@ -1,8 +0,0 @@ -multilabel_classification -========================= - -.. automodule:: cleanlab.multilabel_classification - :autosummary: - :members: - :undoc-members: - :show-inheritance: \ No newline at end of file diff --git a/docs/source/cleanlab/multilabel_classification/dataset.rst b/docs/source/cleanlab/multilabel_classification/dataset.rst new file mode 100644 index 0000000000..b1c2544548 --- /dev/null +++ b/docs/source/cleanlab/multilabel_classification/dataset.rst @@ -0,0 +1,8 @@ +dataset +======= + +.. automodule:: cleanlab.multilabel_classification.dataset + :autosummary: + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/source/cleanlab/multilabel_classification/filter.rst b/docs/source/cleanlab/multilabel_classification/filter.rst new file mode 100644 index 0000000000..2991848414 --- /dev/null +++ b/docs/source/cleanlab/multilabel_classification/filter.rst @@ -0,0 +1,8 @@ +filter +====== + +.. 
automodule:: cleanlab.multilabel_classification.filter + :autosummary: + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/source/cleanlab/multilabel_classification/index.rst b/docs/source/cleanlab/multilabel_classification/index.rst new file mode 100644 index 0000000000..a73dcd25a1 --- /dev/null +++ b/docs/source/cleanlab/multilabel_classification/index.rst @@ -0,0 +1,22 @@ +multilabel_classification +========================= + +Methods to detect data and label issues in multi-label classification datasets. + +In multi-class classification, each example in the dataset belongs to exactly 1 out of K classes (e.g. if classifying animals as: {dog, cat, rat}). + +In multi-label classification, each example in the dataset can belong to 1 or more classes (out of K possible classes), or none of the classes at all (e.g. if classifying animals as: {predator, pet, reptile}). + + + +.. automodule:: cleanlab.multilabel_classification + :autosummary: + :members: + :undoc-members: + :show-inheritance: + +.. toctree:: + + filter + rank + dataset \ No newline at end of file diff --git a/docs/source/cleanlab/multilabel_classification/rank.rst b/docs/source/cleanlab/multilabel_classification/rank.rst new file mode 100644 index 0000000000..4c7b2c35be --- /dev/null +++ b/docs/source/cleanlab/multilabel_classification/rank.rst @@ -0,0 +1,8 @@ +rank +==== + +.. 
automodule:: cleanlab.multilabel_classification.rank + :autosummary: + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/source/index.rst b/docs/source/index.rst index a99ef826a8..c7546f9563 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -151,7 +151,7 @@ Please see our `contributing guidelines Date: Mon, 17 Apr 2023 23:38:49 -0500 Subject: [PATCH 155/258] update fasttext installation requirement (#671) --- docs/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index 070524aa36..f23a4bdec1 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -18,7 +18,7 @@ tensorflow==2.9.1 tensorflow-io==0.26.0 speechbrain==0.5.13 huggingface_hub==0.11.1 -fasttext==0.9.2 +fasttext-wheel==0.9.2 torch==1.13.1 skorch==0.12.1 torchvision==0.14.1 From b22b17914c9d39ced1724afc00c927295c70547f Mon Sep 17 00:00:00 2001 From: "Curtis G. Northcutt" Date: Tue, 18 Apr 2023 15:01:53 -0700 Subject: [PATCH 156/258] add dcai course to resources --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index de2b61b004..e928fee33c 100644 --- a/README.md +++ b/README.md @@ -522,6 +522,8 @@ To understand/cite other cleanlab functionality not described above, check out o ## Other resources +- [Introduction to Data-centric AI (MIT IAP Course 2023)](https://dcai.csail.mit.edu/) + - [Cleanlab Blog](https://cleanlab.ai/blog/) - [Blog post: Introduction to Confident Learning](https://l7.curtisnorthcutt.com/confident-learning) From 64edc95630e609da1130fc08cd7c8de61498ecb4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?El=C3=ADas=20Snorrason?= Date: Tue, 25 Apr 2023 11:26:43 -0500 Subject: [PATCH 157/258] Introduce Datalab (#614) Co-authored-by: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> Co-authored-by: Jesse Cummings Co-authored-by: Jesse Cummings Co-authored-by: Hui Wen 
<45724323+huiwengoh@users.noreply.github.com> --- .gitignore | 1 + .mypy.ini | 9 + DEVELOPMENT.md | 82 ++ cleanlab/__init__.py | 43 + cleanlab/datalab/__init__.py | 0 cleanlab/datalab/data.py | 258 ++++++ cleanlab/datalab/data_issues.py | 279 ++++++ cleanlab/datalab/datalab.py | 519 +++++++++++ cleanlab/datalab/display.py | 61 ++ cleanlab/datalab/examples/__init__.py | 0 cleanlab/datalab/factory.py | 155 ++++ cleanlab/datalab/issue_finder.py | 351 ++++++++ cleanlab/datalab/issue_manager/__init__.py | 5 + cleanlab/datalab/issue_manager/duplicate.py | 222 +++++ .../datalab/issue_manager/issue_manager.py | 342 ++++++++ cleanlab/datalab/issue_manager/label.py | 226 +++++ cleanlab/datalab/issue_manager/noniid.py | 425 +++++++++ cleanlab/datalab/issue_manager/outlier.py | 276 ++++++ cleanlab/datalab/report.py | 110 +++ cleanlab/datalab/serialize.py | 138 +++ docs/requirements.txt | 2 + docs/source/cleanlab/datalab/data.rst | 9 + docs/source/cleanlab/datalab/data_issues.rst | 9 + docs/source/cleanlab/datalab/datalab.rst | 9 + docs/source/cleanlab/datalab/factory.rst | 9 + .../datalab/guide/custom_issue_manager.rst | 227 +++++ docs/source/cleanlab/datalab/guide/index.rst | 19 + docs/source/cleanlab/datalab/index.rst | 39 + docs/source/cleanlab/datalab/issue_finder.rst | 12 + .../datalab/issue_manager/duplicate.rst | 9 + .../cleanlab/datalab/issue_manager/index.rst | 13 + .../datalab/issue_manager/issue_manager.rst | 8 + .../cleanlab/datalab/issue_manager/label.rst | 8 + .../cleanlab/datalab/issue_manager/noniid.rst | 9 + .../datalab/issue_manager/outlier.rst | 9 + .../datalab/optional_dependencies.rst | 11 + docs/source/cleanlab/datalab/report.rst | 12 + docs/source/index.rst | 4 +- docs/source/tutorials/datalab/audio.ipynb | 779 +++++++++++++++++ .../tutorials/datalab/datalab_advanced.ipynb | 817 ++++++++++++++++++ .../datalab/datalab_quickstart.ipynb | 750 ++++++++++++++++ docs/source/tutorials/datalab/image.ipynb | 551 ++++++++++++ 
docs/source/tutorials/datalab/index.rst | 12 + docs/source/tutorials/datalab/tabular.ipynb | 523 +++++++++++ docs/source/tutorials/datalab/text.ipynb | 567 ++++++++++++ docs/source/tutorials/indepth_overview.ipynb | 108 ++- docs/source/tutorials/index.rst | 1 + requirements-dev.txt | 1 + setup.cfg | 1 + setup.py | 13 + tests/datalab/conftest.py | 81 ++ tests/datalab/issue_manager/test_noniid.py | 47 + tests/datalab/test_data.py | 163 ++++ tests/datalab/test_data_issues.py | 46 + tests/datalab/test_datalab.py | 757 ++++++++++++++++ tests/datalab/test_factory.py | 31 + tests/datalab/test_init.py | 16 + tests/datalab/test_issue_finder.py | 83 ++ tests/datalab/test_issue_manager.py | 509 +++++++++++ tests/datalab/test_report.py | 85 ++ 60 files changed, 9841 insertions(+), 20 deletions(-) create mode 100644 cleanlab/datalab/__init__.py create mode 100644 cleanlab/datalab/data.py create mode 100644 cleanlab/datalab/data_issues.py create mode 100644 cleanlab/datalab/datalab.py create mode 100644 cleanlab/datalab/display.py create mode 100644 cleanlab/datalab/examples/__init__.py create mode 100644 cleanlab/datalab/factory.py create mode 100644 cleanlab/datalab/issue_finder.py create mode 100644 cleanlab/datalab/issue_manager/__init__.py create mode 100644 cleanlab/datalab/issue_manager/duplicate.py create mode 100644 cleanlab/datalab/issue_manager/issue_manager.py create mode 100644 cleanlab/datalab/issue_manager/label.py create mode 100644 cleanlab/datalab/issue_manager/noniid.py create mode 100644 cleanlab/datalab/issue_manager/outlier.py create mode 100644 cleanlab/datalab/report.py create mode 100644 cleanlab/datalab/serialize.py create mode 100644 docs/source/cleanlab/datalab/data.rst create mode 100644 docs/source/cleanlab/datalab/data_issues.rst create mode 100644 docs/source/cleanlab/datalab/datalab.rst create mode 100644 docs/source/cleanlab/datalab/factory.rst create mode 100644 docs/source/cleanlab/datalab/guide/custom_issue_manager.rst create mode 100644 
docs/source/cleanlab/datalab/guide/index.rst create mode 100644 docs/source/cleanlab/datalab/index.rst create mode 100644 docs/source/cleanlab/datalab/issue_finder.rst create mode 100644 docs/source/cleanlab/datalab/issue_manager/duplicate.rst create mode 100644 docs/source/cleanlab/datalab/issue_manager/index.rst create mode 100644 docs/source/cleanlab/datalab/issue_manager/issue_manager.rst create mode 100644 docs/source/cleanlab/datalab/issue_manager/label.rst create mode 100644 docs/source/cleanlab/datalab/issue_manager/noniid.rst create mode 100644 docs/source/cleanlab/datalab/issue_manager/outlier.rst create mode 100644 docs/source/cleanlab/datalab/optional_dependencies.rst create mode 100644 docs/source/cleanlab/datalab/report.rst create mode 100644 docs/source/tutorials/datalab/audio.ipynb create mode 100644 docs/source/tutorials/datalab/datalab_advanced.ipynb create mode 100644 docs/source/tutorials/datalab/datalab_quickstart.ipynb create mode 100644 docs/source/tutorials/datalab/image.ipynb create mode 100644 docs/source/tutorials/datalab/index.rst create mode 100644 docs/source/tutorials/datalab/tabular.ipynb create mode 100644 docs/source/tutorials/datalab/text.ipynb create mode 100644 tests/datalab/conftest.py create mode 100644 tests/datalab/issue_manager/test_noniid.py create mode 100644 tests/datalab/test_data.py create mode 100644 tests/datalab/test_data_issues.py create mode 100644 tests/datalab/test_datalab.py create mode 100644 tests/datalab/test_factory.py create mode 100644 tests/datalab/test_init.py create mode 100644 tests/datalab/test_issue_finder.py create mode 100644 tests/datalab/test_issue_manager.py create mode 100644 tests/datalab/test_report.py diff --git a/.gitignore b/.gitignore index 82fd660bf0..e46bc2dfb5 100644 --- a/.gitignore +++ b/.gitignore @@ -118,6 +118,7 @@ venv.bak/ /docs/source/notebooks/*.gz /docs/source/notebooks/spoken_digits /docs/source/notebooks/pretrained_models +/docs/source/tutorials/datalab/datalab-files/ # 
Editor files .vscode/ diff --git a/.mypy.ini b/.mypy.ini index ab7155b9e2..c119d9c9e1 100644 --- a/.mypy.ini +++ b/.mypy.ini @@ -21,3 +21,12 @@ ignore_missing_imports = True [mypy-tqdm.*] ignore_missing_imports = True + +[mypy-matplotlib.*] +ignore_missing_imports = True + +[mypy-datasets.*] +ignore_missing_imports = True + +[mypy-scipy.*] +ignore_missing_imports = True diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index 8b912a591f..59501c6c2e 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -39,6 +39,88 @@ pip install -e . For Macs with Apple silicon: replace `tensorflow` in requirements-dev.txt with: `tensorflow-macos==2.9.2` and `tensorflow-metal==0.5.1` +### Handling optional dependencies + +When designing a class that relies on an optional, domain-specific runtime dependency, it is better to use lazy-importing to avoid forcing users to install the dependency if they do not need it. + +Depending on the coupling of your class to the dependency, you may want to consider importing it at the module-level or as an instance variable of the class or a function that uses the dependency. + +If the dependency is used by many methods in the module or other classes, it is better to import it at the module-level. +On the other hand, if the dependency is only used by a handful of methods, then it's better to import it inside the method. If the dependency is not installed, an ImportError should be raised when the method is called, along with instructions on how to install the dependency. + +Here is an example of a class that lazily imports CuPy and has a sum method (element-wise) that can be used on both CPU and GPU devices. + +Unless an alternative implementation of the sum method is available, an `ImportError` should be raised when the method is called with instructions on how to install the dependency. + +
    Example code + +```python +def lazy_import_cupy(): + try: + import cupy + except ImportError as error: + # If the dependency is required for the class to work, + # replace this block with a raised ImportError containing instructions + print("Warning: cupy is not installed. Please install it with `pip install cupy`.") + cupy = None + return cupy + +class Summation: + def __init__(self): + self.cupy = lazy_import_cupy() + def sum(self, x) -> float: + if self.cupy is None: + return sum(x) + return self.cupy.sum(x) +``` +
    + + +For the build system to recognize the optional dependency, you should add it to the `EXTRAS_REQUIRE` constant in **setup.py**: + +
    Example code + +```python +EXTRAS_REQUIRE = { + ... + "gpu": [ + # Explain why the dependency below is needed, + # e.g. "for performing summation on GPU" + "cupy", + ], +} +``` + + +Or assign to a separate variable and add it to `EXTRAS_REQUIRE` + +```python +GPU_REQUIRES = [ + # Explanation ... + "cupy", +] + +EXTRAS_REQUIRE = { + ... + "gpu": GPU_REQUIRES, +} +``` +
    + + +The package can be installed with the optional dependency (here called `gpu`) via: + +1. PyPI installation + +```shell +pip install cleanlab[gpu] +``` + +2. Editable installation + +```shell +pip install -e .[gpu] +``` ## Testing diff --git a/cleanlab/__init__.py b/cleanlab/__init__.py index 5746a49a21..f0d92a2795 100644 --- a/cleanlab/__init__.py +++ b/cleanlab/__init__.py @@ -9,3 +9,46 @@ from . import outlier from . import token_classification from . import multilabel_classification + + +class DatalabUnavailable: + def __init__(self, message): + self.message = message + + def __getattr__(self, name): + message = self.message + f" (raised when trying to access {name})" + raise ImportError(message) + + def __call__(self, *args, **kwargs): + message = ( + self.message + f" (raised when trying to call with args: {args}, kwargs: {kwargs})" + ) + raise ImportError(message) + + +def _datalab_import_factory(): + try: + from .datalab.datalab import Datalab as _Datalab + + return _Datalab + except ImportError: + return DatalabUnavailable( + "Datalab is not available due to missing dependencies. " + "To install Datalab, run `pip install cleanlab[datalab]`." + ) + + +def _issue_manager_import_factory(): + try: + from .datalab.issue_manager import IssueManager as _IssueManager + + return _IssueManager + except ImportError: + return DatalabUnavailable( + "IssueManager is not available due to missing dependencies for Datalab. " + "To install Datalab, run `pip install cleanlab[datalab]`." + ) + + +Datalab = _datalab_import_factory() +IssueManager = _issue_manager_import_factory() diff --git a/cleanlab/datalab/__init__.py b/cleanlab/datalab/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/cleanlab/datalab/data.py b/cleanlab/datalab/data.py new file mode 100644 index 0000000000..098b9e2438 --- /dev/null +++ b/cleanlab/datalab/data.py @@ -0,0 +1,258 @@ +# Copyright (C) 2017-2023 Cleanlab Inc. +# This file is part of cleanlab.
+# +# cleanlab is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published +# by the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# cleanlab is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with cleanlab. If not, see . +"""Classes and methods for datasets that are loaded into Datalab.""" + +import os +from typing import Any, Callable, Dict, List, Mapping, Tuple, Union, cast, TYPE_CHECKING + +try: + import datasets +except ImportError as error: + raise ImportError( + "Cannot import datasets package. " + "Please install it and try again, or just install cleanlab with " + "all optional dependencies via: `pip install cleanlab[all]`" + ) from error +import numpy as np +import pandas as pd +from datasets.arrow_dataset import Dataset +from datasets import ClassLabel + +from cleanlab.internal.validation import labels_to_array + + +if TYPE_CHECKING: # pragma: no cover + DatasetLike = Union[Dataset, pd.DataFrame, Dict[str, Any], List[Dict[str, Any]], str] + + +class DataFormatError(ValueError): + """Exception raised when the data is not in a supported format.""" + + def __init__(self, data: Any): + self.data = data + message = ( + f"Unsupported data type: {type(data)}\n" + "Supported types: " + "datasets.Dataset, pandas.DataFrame, dict, list, str" + ) + super().__init__(message) + + +class DatasetDictError(ValueError): + """Exception raised when a DatasetDict is passed to Datalab. + + Usually, this means that a dataset identifier was passed to Datalab, but + the dataset is a DatasetDict, which contains multiple splits of the dataset. 
+ + """ + + def __init__(self): + message = ( + "Please pass a single dataset, not a DatasetDict. " + "Try specifying a split, e.g. `dataset = load_dataset('dataset', split='train')` " + "then pass `dataset` to Datalab." + ) + super().__init__(message) + + +class DatasetLoadError(ValueError): + """Exception raised when a dataset cannot be loaded. + + Parameters + ---------- + dataset_type: type + The type of dataset that failed to load. + """ + + def __init__(self, dataset_type: type): + message = f"Failed to load dataset from {dataset_type}.\n" + super().__init__(message) + + +class Data: + """ + Class that holds and validates datasets for Datalab. + + Internally, the data is stored as a datasets.Dataset object and the labels + are integers (ranging from 0 to K-1, where K is the number of classes) stored + in a numpy array. + + Parameters + ---------- + data : + Dataset to be audited by Datalab. + Several formats are supported, which will internally be converted to a Dataset object. + + Supported formats: + - datasets.Dataset + - pandas.DataFrame + - dict + - keys are strings + - values are arrays or lists of equal length + - list + - list of dictionaries with the same keys + - str + - path to a local file + - Text (.txt) + - CSV (.csv) + - JSON (.json) + - or a dataset identifier on the Hugging Face Hub + It checks if the string is a path to a file that exists locally, and if not, + it assumes it is a dataset identifier on the Hugging Face Hub. + + label_name : Union[str, List[str]] + Name of the label column in the dataset. + + Warnings + -------- + Optional dependencies: + + - datasets : + Dataset, DatasetDict and load_dataset are imported from datasets. + This is an optional dependency of cleanlab, but is required for + :py:class:`Datalab ` to work. 
+ """ + + def __init__(self, data: "DatasetLike", label_name: str) -> None: + self._validate_data(data) + self._label_name = label_name + self._data = self._load_data(data) + self._validate_data_and_labels(self._data, self._data[label_name]) + self._data_hash = hash(self._data) + self._labels, self._label_map = _extract_labels(self._data, label_name) + + def _load_data(self, data: "DatasetLike") -> Dataset: + """Checks the type of dataset and uses the correct loader method and + assigns the result to the data attribute.""" + dataset_factory_map: Dict[type, Callable[..., Dataset]] = { + Dataset: lambda x: x, + pd.DataFrame: Dataset.from_pandas, + dict: self._load_dataset_from_dict, + list: self._load_dataset_from_list, + str: self._load_dataset_from_string, + } + if not isinstance(data, tuple(dataset_factory_map.keys())): + raise DataFormatError(data) + return dataset_factory_map[type(data)](data) + + def __len__(self) -> int: + return len(self._data) + + def __eq__(self, other) -> bool: + if isinstance(other, Data): + # Equality checks + hashes = self._data_hash == other._data_hash + labels = np.array_equal(self._labels, other._labels) + label_names = self._label_name == other._label_name + label_maps = self._label_map == other._label_map + return all([hashes, labels, label_names, label_maps]) + return False + + def __hash__(self) -> int: + return self._data_hash + + @property + def class_names(self) -> list: + return list(self._label_map.values()) + + @staticmethod + def _validate_data(data) -> None: + if isinstance(data, datasets.DatasetDict): + raise DatasetDictError() + if not isinstance(data, (Dataset, pd.DataFrame, dict, list, str)): + raise DataFormatError(data) + + @staticmethod + def _validate_data_and_labels(data, labels) -> None: + assert isinstance(labels, (np.ndarray, list)) + assert len(labels) == len(data) + + @staticmethod + def _load_dataset_from_dict(data_dict: Dict[str, Any]) -> Dataset: + try: + return Dataset.from_dict(data_dict) + except 
Exception as error: + raise DatasetLoadError(dict) from error + + @staticmethod + def _load_dataset_from_list(data_list: List[Dict[str, Any]]) -> Dataset: + try: + return Dataset.from_list(data_list) + except Exception as error: + raise DatasetLoadError(list) from error + + @staticmethod + def _load_dataset_from_string(data_string: str) -> Dataset: + if not os.path.exists(data_string): + try: + dataset = datasets.load_dataset(data_string) + return cast(Dataset, dataset) + except Exception as error: + raise DatasetLoadError(str) from error + + factory: Dict[str, Callable[[str], Any]] = { + ".txt": Dataset.from_text, + ".csv": Dataset.from_csv, + ".json": Dataset.from_json, + } + + extension = os.path.splitext(data_string)[1] + if extension not in factory: + raise DatasetLoadError(type(data_string)) + + dataset = factory[extension](data_string) + dataset_cast = cast(Dataset, dataset) + return dataset_cast + + +def _extract_labels(data: Dataset, label_name: str) -> Tuple[np.ndarray, Mapping]: + """ + Picks out labels from the dataset and formats them to be [0, 1, ..., K-1] + where K is the number of classes. Also returns a mapping from the formatted + labels to the original labels in the dataset. + + Note: This function is not meant to be used directly. It is used by + ``cleanlab.data.Data`` to extract the formatted labels from the dataset + and stores them as attributes. + + Parameters + ---------- + label_name : str + Name of the column in the dataset that contains the labels. + + Returns + ------- + formatted_labels : np.ndarray + Labels in the format [0, 1, ..., K-1] where K is the number of classes. + + inverse_map : dict + Mapping from the formatted labels to the original labels in the dataset. 
+ """ + + labels = labels_to_array(data[label_name]) # type: ignore[assignment] + if labels.ndim != 1: + raise ValueError("labels must be 1D numpy array.") + + label_name_feature = data.features[label_name] + if isinstance(label_name_feature, ClassLabel): + label_map = {label: label_name_feature.str2int(label) for label in label_name_feature.names} + formatted_labels = labels + else: + label_map = {label: i for i, label in enumerate(np.unique(labels))} + formatted_labels = np.vectorize(label_map.get)(labels) + inverse_map = {i: label for label, i in label_map.items()} + + return formatted_labels, inverse_map diff --git a/cleanlab/datalab/data_issues.py b/cleanlab/datalab/data_issues.py new file mode 100644 index 0000000000..5ed773a84a --- /dev/null +++ b/cleanlab/datalab/data_issues.py @@ -0,0 +1,279 @@ +# Copyright (C) 2017-2023 Cleanlab Inc. +# This file is part of cleanlab. +# +# cleanlab is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published +# by the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# cleanlab is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with cleanlab. If not, see . +""" +Module for the :py:class:`DataIssues` class, which serves as a central repository for storing +information and statistics about issues found in a dataset. + +It collects information from various +:py:class:`IssueManager ` +instances and keeps track of each issue, a summary for each type of issue, +related information and statistics about the issues. + +The collected information can be accessed using the +:py:meth:`get_info ` method. 
+""" +from __future__ import annotations + +import warnings +from typing import TYPE_CHECKING, Any, Dict, Optional +import numpy as np + +import pandas as pd + +if TYPE_CHECKING: # pragma: no cover + from cleanlab.datalab.data import Data + from cleanlab.datalab.issue_manager import IssueManager + + +class DataIssues: + """ + Class that collects and stores information and statistics on issues found in a dataset. + + Parameters + ---------- + data : + The data object for which the issues are being collected. + + Parameters + ---------- + issues : pd.DataFrame + Stores information about each individual issue found in the data, + on a per-example basis. + issue_summary : pd.DataFrame + Summarizes the overall statistics for each issue type. + info : dict + A dictionary that contains information and statistics about the data and each issue type. + """ + + def __init__(self, data: Data) -> None: + self.issues: pd.DataFrame = pd.DataFrame(index=range(len(data))) + self.issue_summary: pd.DataFrame = pd.DataFrame( + columns=["issue_type", "score", "num_issues"] + ).astype({"score": np.float64, "num_issues": np.int64}) + class_names = data.class_names + self.info: Dict[str, Dict[str, Any]] = { + "statistics": { + "num_examples": len(data), + "class_names": class_names, + "num_classes": len(class_names), + "multi_label": False, + "health_score": None, + }, + } + self._label_map = data._label_map + + @property + def statistics(self) -> Dict[str, Any]: + """Returns the statistics dictionary. + + Shorthand for self.info["statistics"]. + """ + return self.info["statistics"] + + def get_issues(self, issue_name: Optional[str] = None) -> pd.DataFrame: + """ + Use this after finding issues to see which examples suffer from which types of issues. + + Parameters + ---------- + issue_name : str or None + The type of issue to focus on. If `None`, returns full DataFrame summarizing all of the types of issues detected in each example from the dataset. 
+ + Raises + ------ + ValueError + If `issue_name` is not a type of issue previously considered in the audit. + + Returns + ------- + specific_issues : + A DataFrame where each row corresponds to an example from the dataset and columns specify: + whether this example exhibits a particular type of issue and how severely (via a numeric quality score where lower values indicate more severe instances of the issue). + + Additional columns may be present in the DataFrame depending on the type of issue specified. + """ + if issue_name is None: + return self.issues + + columns = [col for col in self.issues.columns if issue_name in col] + if not columns: + raise ValueError(f"No columns found for issue type '{issue_name}'.") + specific_issues = self.issues[columns] + info = self.get_info(issue_name=issue_name) + if issue_name == "label": + specific_issues = specific_issues.assign( + given_label=info["given_label"], predicted_label=info["predicted_label"] + ) + + if issue_name == "outlier": + column_dict = { + k: info.get(k) + for k in ["nearest_neighbor", "distance_to_nearest_neighbor"] + if info.get(k) is not None + } + specific_issues = specific_issues.assign(**column_dict) + + if issue_name == "near_duplicate": + column_dict = { + k: info.get(k) + for k in ["near_duplicate_sets", "distance_to_nearest_neighbor"] + if info.get(k) is not None + } + specific_issues = specific_issues.assign(**column_dict) + return specific_issues + + def get_summary(self, issue_name: Optional[str] = None) -> pd.DataFrame: + """Summarize the issues found in dataset of a particular type, + including how severe this type of issue is overall across the dataset. + + Parameters + ---------- + issue_name : + Name of the issue type to summarize. If `None`, summarizes each of the different issue types previously considered in the audit. 
+ + Returns + ------- + summary : + DataFrame where each row corresponds to a type of issue, and columns quantify: + the number of examples in the dataset estimated to exhibit this type of issue, + and the overall severity of the issue across the dataset (via a numeric quality score where lower values indicate that the issue is overall more severe). + """ + if self.issue_summary.empty: + raise ValueError( + "No issues found in the dataset. " + "Call `find_issues` before calling `get_summary`." + ) + + if issue_name is None: + return self.issue_summary + + row_mask = self.issue_summary["issue_type"] == issue_name + if not any(row_mask): + raise ValueError(f"Issue type {issue_name} not found in the summary.") + return self.issue_summary[row_mask].reset_index(drop=True) + + def get_info(self, issue_name: Optional[str] = None) -> Dict[str, Any]: + """Get the info for the issue_name key. + + This function is used to get the info for a specific issue_name. If the info is not computed yet, it will raise an error. + + Parameters + ---------- + issue_name : + The issue name for which the info is required. + + Returns + ------- + info: + The info for the issue_name. + """ + info = self.info.get(issue_name, None) if issue_name else self.info + if info is None: + raise ValueError( + f"issue_name {issue_name} not found in self.info. These have not been computed yet." + ) + info = info.copy() + if issue_name == "label": + # Labels that are stored as integers may need to be converted to strings. + for key in ["given_label", "predicted_label"]: + labels = info.get(key, None) + if labels is not None: + info[key] = np.vectorize(self._label_map.get)(labels) + + info["class_names"] = self.statistics["class_names"] + return info + + def collect_statistics_from_issue_manager(self, issue_manager: IssueManager) -> None: + """Update the statistics in the info dictionary. + + Parameters + ---------- + statistics : + A dictionary of statistics to add/update in the info dictionary. 
+ + Examples + -------- + + A common use case is to reuse the KNN-graph across multiple issue managers. + To avoid recomputing the KNN-graph for each issue manager, + we can pass it as a statistic to the issue managers. + + >>> from scipy.sparse import csr_matrix + >>> weighted_knn_graph = csr_matrix(...) + >>> issue_manager_that_computes_knn_graph = ... + + """ + key = "statistics" + statistics: Dict[str, Any] = issue_manager.info.pop(key, {}) + if statistics: + self.info[key].update(statistics) + + def collect_results_from_issue_manager(self, issue_manager: IssueManager) -> None: + """ + Collects results from an IssueManager and update the corresponding + attributes of the Datalab object. + + This includes: + - self.issues + - self.issue_summary + - self.info + + Parameters + ---------- + issue_manager : + IssueManager object to collect results from. + """ + overlapping_columns = list(set(self.issues.columns) & set(issue_manager.issues.columns)) + if overlapping_columns: + warnings.warn( + f"Overwriting columns {overlapping_columns} in self.issues with " + f"columns from issue manager {issue_manager}." + ) + self.issues.drop(columns=overlapping_columns, inplace=True) + self.issues = self.issues.join(issue_manager.issues, how="outer") + + if issue_manager.issue_name in self.issue_summary["issue_type"].values: + warnings.warn( + f"Overwriting row in self.issue_summary with " + f"row from issue manager {issue_manager}." 
+ ) + self.issue_summary = self.issue_summary[ + self.issue_summary["issue_type"] != issue_manager.issue_name + ] + issue_column_name: str = f"is_{issue_manager.issue_name}_issue" + num_issues: int = int(issue_manager.issues[issue_column_name].sum()) + self.issue_summary = pd.concat( + [ + self.issue_summary, + issue_manager.summary.assign(num_issues=num_issues), + ], + axis=0, + ignore_index=True, + ) + + if issue_manager.issue_name in self.info: + warnings.warn( + f"Overwriting key {issue_manager.issue_name} in self.info with " + f"key from issue manager {issue_manager}." + ) + self.info[issue_manager.issue_name] = issue_manager.info + + def set_health_score(self) -> None: + """Set the health score for the dataset based on the issue summary. + + Currently, the health score is the mean of the scores for each issue type. + """ + self.info["statistics"]["health_score"] = self.issue_summary["score"].mean() diff --git a/cleanlab/datalab/datalab.py b/cleanlab/datalab/datalab.py new file mode 100644 index 0000000000..7be464ae73 --- /dev/null +++ b/cleanlab/datalab/datalab.py @@ -0,0 +1,519 @@ +# Copyright (C) 2017-2023 Cleanlab Inc. +# This file is part of cleanlab. +# +# cleanlab is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published +# by the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# cleanlab is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with cleanlab. If not, see . +""" +Datalab offers a unified audit to detect all kinds of issues in data and labels. + +.. note:: + .. 
include:: optional_dependencies.rst +""" +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union + +import numpy as np +import numpy.typing as npt +import pandas as pd + +import cleanlab +from cleanlab.datalab.data import Data +from cleanlab.datalab.data_issues import DataIssues +from cleanlab.datalab.display import _Displayer +from cleanlab.datalab.issue_finder import IssueFinder +from cleanlab.datalab.serialize import _Serializer +from cleanlab.datalab.report import Reporter + +if TYPE_CHECKING: # pragma: no cover + from datasets.arrow_dataset import Dataset + from scipy.sparse import csr_matrix + + DatasetLike = Union[Dataset, pd.DataFrame, Dict[str, Any], List[Dict[str, Any]], str] + +__all__ = ["Datalab"] + + +class Datalab: + """ + A single object to automatically detect all kinds of issues in datasets. + This is how we recommend you interface with the cleanlab library if you want to audit the quality of your data. If you have other specific goals, then consider using the other methods across this library. Even then, Datalab may be the easiest way to run specific analyses of your data. Datalab tracks intermediate state (e.g. data statistics) from certain cleanlab functions that can be re-used across other cleanlab functions for better efficiency. + + Parameters + ---------- + data : Union[Dataset, pd.DataFrame, dict, list, str] + Dataset-like object that can be converted to a Hugging Face Dataset object. + + It should contain the labels for all examples, identified by a + `label_name` column in the Dataset object. + + Supported formats: + - datasets.Dataset + - pandas.DataFrame + - dict (keys are strings, values are arrays/lists of length ``N``) + - list (list of dictionaries that each have the same keys) + - str + + - path to a local file: Text (.txt), CSV (.csv), JSON (.json) + - or a dataset identifier on the Hugging Face Hub + + label_name : str + The name of the label column in the dataset. 
+ + verbosity : int, optional + The higher the verbosity level, the more information + Datalab prints when auditing a dataset. + Valid values are 0 through 4. Default is 1. + + Examples + -------- + >>> import datasets + >>> from cleanlab import Datalab + >>> data = datasets.load_dataset("glue", "sst2", split="train") + >>> datalab = Datalab(data, label_name="label") + """ + + def __init__( + self, + data: "DatasetLike", + label_name: str, + verbosity: int = 1, + ) -> None: + self._data = Data(data, label_name) + self.data = self._data._data + self._labels, self._label_map = self._data._labels, self._data._label_map + self._data_hash = self._data._data_hash + self.label_name = self._data._label_name + self.data_issues = DataIssues(self._data) + self.cleanlab_version = cleanlab.version.__version__ + self.verbosity = verbosity + + def __repr__(self) -> str: + return _Displayer(data_issues=self.data_issues).__repr__() + + def __str__(self) -> str: + return _Displayer(data_issues=self.data_issues).__str__() + + @property + def labels(self) -> np.ndarray: + """Labels of the dataset, in a [0, 1, ..., K-1] format.""" + return self._labels + + @property + def class_names(self) -> List[str]: + """Names of the classes in the dataset.""" + return self._data.class_names + + def find_issues( + self, + *, + pred_probs: Optional[np.ndarray] = None, + features: Optional[npt.NDArray] = None, + knn_graph: Optional[csr_matrix] = None, + issue_types: Optional[Dict[str, Any]] = None, + ) -> None: + """ + Checks the dataset for all sorts of common issues in real-world data (in both labels and feature values). + + You can use Datalab to find issues in your data, utilizing *any* model you have already trained. + This method only interacts with your model via its predictions or embeddings (and other functions thereof). + The more of these inputs you provide, the more types of issues Datalab can detect in your dataset/labels. 
+ If you provide a subset of these inputs, Datalab will output what insights it can based on the limited information from your model. + + Note + ---- + This method acts as a wrapper around the :py:meth:`IssueFinder.find_issues ` method, + where the core logic for issue detection is implemented. + + Note + ---- + The issues are saved in the ``self.issues`` attribute, but are not returned. + + Parameters + ---------- + pred_probs : + Out-of-sample predicted class probabilities made by the model for every example in the dataset. + To best detect label issues, provide this input obtained from the most accurate model you can produce. + + If provided, this must be a 2D array with shape (num_examples, K) where K is the number of classes in the dataset. + + features : Optional[np.ndarray] + Feature embeddings (vector representations) of every example in the dataset. + + If provided, this must be a 2D array with shape (num_examples, num_features). + + knn_graph : + Sparse matrix representing distances between examples in the dataset in a k nearest neighbor graph. + + If provided, this must be a square CSR matrix with shape (num_examples, num_examples) and (k*num_examples) non-zero entries (k is the number of nearest neighbors considered for each example) + evenly distributed across the rows. + The non-zero entries must be the distances between the corresponding examples. Self-distances must be omitted + (i.e. the diagonal must be all zeros and the k nearest neighbors of each example must not include itself). + + For any duplicated examples i,j whose distance is 0, there should be an *explicit* zero stored in the matrix, i.e. ``knn_graph[i,j] = 0``. + + If both `knn_graph` and `features` are provided, the `knn_graph` will take precendence. + If `knn_graph` is not provided, it is constructed based on the provided `features`. + If neither `knn_graph` nor `features` are provided, certain issue types like (near) duplicates will not be considered. 
+ + issue_types : + Collection specifying which types of issues to consider in audit and any non-default parameter settings to use. + If unspecified, a default set of issue types and recommended parameter settings is considered. + + This is a dictionary of dictionaries, where the keys are the issue types of interest + and the values are dictionaries of parameter values that control how each type of issue is detected (only for advanced users). + More specifically, the values are constructor keyword arguments passed to the corresponding ``IssueManager``, + which is responsible for detecting the particular issue type. + + .. seealso:: + :py:class:`IssueManager ` + + Examples + -------- + + Here are some ways to provide inputs to :py:meth:`find_issues`: + + - Passing ``pred_probs``: + + .. code-block:: python + + >>> from sklearn.linear_model import LogisticRegression + >>> import numpy as np + >>> from cleanlab import Datalab + >>> X = np.array([[0, 1], [1, 1], [2, 2], [2, 0]]) + >>> y = np.array([0, 1, 1, 0]) + >>> clf = LogisticRegression(random_state=0).fit(X, y) + >>> pred_probs = clf.predict_proba(X) + >>> lab = Datalab(data={"X": X, "y": y}, label_name="y") + >>> lab.find_issues(pred_probs=pred_probs) + + + - Passing ``features``: + + .. code-block:: python + + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn.neighbors import NearestNeighbors + >>> import numpy as np + >>> from cleanlab import Datalab + >>> X = np.array([[0, 1], [1, 1], [2, 2], [2, 0]]) + >>> y = np.array([0, 1, 1, 0]) + >>> lab = Datalab(data={"X": X, "y": y}, label_name="y") + >>> lab.find_issues(features=X) + + .. note:: + + You can pass both ``pred_probs`` and ``features`` to :py:meth:`find_issues` for a more comprehensive audit. + + - Passing a ``knn_graph``: + + .. 
code-block:: python + + >>> from sklearn.neighbors import NearestNeighbors + >>> import numpy as np + >>> from cleanlab import Datalab + >>> X = np.array([[0, 1], [1, 1], [2, 2], [2, 0]]) + >>> y = np.array([0, 1, 1, 0]) + >>> nbrs = NearestNeighbors(n_neighbors=2, metric="euclidean").fit(X) + >>> knn_graph = nbrs.kneighbors_graph(mode="distance") + >>> knn_graph # Pass this to Datalab + <4x4 sparse matrix of type '' + with 8 stored elements in Compressed Sparse Row format> + >>> knn_graph.toarray() # DO NOT PASS knn_graph.toarray() to Datalab, only pass the sparse matrix itself + array([[0. , 1. , 2.23606798, 0. ], + [1. , 0. , 1.41421356, 0. ], + [0. , 1.41421356, 0. , 2. ], + [0. , 1.41421356, 2. , 0. ]]) + >>> lab = Datalab(data={"X": X, "y": y}, label_name="y") + >>> lab.find_issues(knn_graph=knn_graph) + + - Configuring issue types: + + Suppose you want to only consider label issues. Just pass a dictionary with the key "label" and an empty dictionary as the value (to use default label issue parameters). + + .. code-block:: python + + >>> issue_types = {"label": {}} + >>> # lab.find_issues(pred_probs=pred_probs, issue_types=issue_types) + + If you are advanced user who wants greater control, you can pass keyword arguments to the issue manager that handles the label issues. + For example, if you want to pass the keyword argument "clean_learning_kwargs" + to the constructor of the :py:class:`LabelIssueManager `, you would pass: + + + .. code-block:: python + + >>> issue_types = { + ... "label": { + ... "clean_learning_kwargs": { + ... "prune_method": "prune_by_noise_rate", + ... }, + ... }, + ... 
} + >>> # lab.find_issues(pred_probs=pred_probs, issue_types=issue_types) + + """ + issue_finder = IssueFinder(datalab=self, verbosity=self.verbosity) + issue_finder.find_issues( + pred_probs=pred_probs, + features=features, + knn_graph=knn_graph, + issue_types=issue_types, + ) + + def report( + self, + *, + num_examples: int = 5, + verbosity: Optional[int] = None, + include_description: bool = True, + ) -> None: + """Prints informative summary of all issues. + + Parameters + ---------- + num_examples : + Number of examples to show for each type of issue. + The report shows the top `num_examples` instances in the dataset that suffer the most from each type of issue. + + verbosity : + Higher verbosity levels add more information to the report. + + include_description : + Whether or not to include a description of each issue type in the report. + Consider setting this to ``False`` once you're familiar with how each issue type is defined. + + See Also + -------- + For advanced usage, see documentation for the :py:class:`Reporter ` class. + """ + if verbosity is None: + verbosity = self.verbosity + reporter = Reporter( + data_issues=self.data_issues, + verbosity=verbosity, + include_description=include_description, + ) + print(reporter.get_report(num_examples=num_examples)) + + @property + def issues(self) -> pd.DataFrame: + """Issues found in each example from the dataset.""" + return self.data_issues.issues + + @issues.setter + def issues(self, issues: pd.DataFrame) -> None: + self.data_issues.issues = issues + + @property + def issue_summary(self) -> pd.DataFrame: + """Summary of issues found in the dataset and the overall severity of each type of issue. + + This is a wrapper around the ``DataIssues.issue_summary`` attribute. 
+ + Examples + ------- + + If checks for "label" and "outlier" issues were run, + then the issue summary will look something like this: + + >>> datalab.issue_summary + issue_type score + outlier 0.123 + label 0.456 + """ + return self.data_issues.issue_summary + + @issue_summary.setter + def issue_summary(self, issue_summary: pd.DataFrame) -> None: + self.data_issues.issue_summary = issue_summary + + @property + def info(self) -> Dict[str, Dict[str, Any]]: + """Information and statistics about the dataset issues found. + + This is a wrapper around the ``DataIssues.info`` attribute. + + Examples + ------- + + If checks for "label" and "outlier" issues were run, + then the info will look something like this: + + >>> datalab.info + { + "label": { + "given_labels": [0, 1, 0, 1, 1, 1, 1, 1, 0, 1, ...], + "predicted_label": [0, 0, 0, 1, 0, 1, 0, 1, 0, 1, ...], + ..., + }, + "outlier": { + "nearest_neighbor": [3, 7, 1, 2, 8, 4, 5, 9, 6, 0, ...], + "distance_to_nearest_neighbor": [0.123, 0.789, 0.456, ...], + ..., + }, + } + """ + return self.data_issues.info + + @info.setter + def info(self, info: Dict[str, Dict[str, Any]]) -> None: + self.data_issues.info = info + + def get_issues(self, issue_name: Optional[str] = None) -> pd.DataFrame: + """ + Use this after finding issues to see which examples suffer from which types of issues. + + NOTE + ---- + This is a wrapper around the :py:meth:`DataIssues.get_issues ` method. + + Parameters + ---------- + issue_name : str or None + The type of issue to focus on. If `None`, returns full DataFrame summarizing all of the types of issues detected in each example from the dataset. + + Raises + ------ + ValueError + If `issue_name` is not a type of issue previously considered in the audit. 
+ + Returns + ------- + specific_issues : + A DataFrame where each row corresponds to an example from the dataset and columns specify: + whether this example exhibits a particular type of issue and how severely (via a numeric quality score where lower values indicate more severe instances of the issue). + + Additional columns may be present in the DataFrame depending on the type of issue specified. + """ + return self.data_issues.get_issues(issue_name=issue_name) + + def get_summary(self, issue_name: Optional[str] = None) -> pd.DataFrame: + """Summarize the issues found in dataset of a particular type, + including how severe this type of issue is overall across the dataset. + + NOTE + ---- + This is a wrapper around the + :py:meth:`DataIssues.get_summary ` method. + + Parameters + ---------- + issue_name : + Name of the issue type to summarize. If `None`, summarizes each of the different issue types previously considered in the audit. + + Returns + ------- + summary : + DataFrame where each row corresponds to a type of issue, and columns quantify: + the number of examples in the dataset estimated to exhibit this type of issue, + and the overall severity of the issue across the dataset (via a numeric quality score where lower values indicate that the issue is overall more severe). + """ + return self.data_issues.get_summary(issue_name=issue_name) + + def get_info(self, issue_name: Optional[str] = None) -> Dict[str, Any]: + """Get the info for the issue_name key. + + This function is used to get the info for a specific issue_name. If the info is not computed yet, it will raise an error. + + NOTE + ---- + This is a wrapper around the + :py:meth:`DataIssues.get_info ` method. + + Parameters + ---------- + issue_name : + The issue name for which the info is required. + + Returns + ------- + info: + The info for the issue_name. 
+ """ + return self.data_issues.get_info(issue_name) + + @staticmethod + def list_possible_issue_types() -> List[str]: + """Returns a list of all registered issue types. + + Any issue type that is not in this list cannot be used in the :py:meth:`find_issues` method. + + Note + ---- + This method is a wrapper around :py:meth:`IssueFinder.list_possible_issue_types `. + + See Also + -------- + :py:class:`REGISTRY ` : All available issue types and their corresponding issue managers can be found here. + """ + return IssueFinder.list_possible_issue_types() + + @staticmethod + def list_default_issue_types() -> List[str]: + """Returns a list of the issue types that are run by default + when :py:meth:`find_issues` is called without specifying `issue_types`. + + Note + ---- + This method is a wrapper around :py:meth:`IssueFinder.list_default_issue_types `. + + See Also + -------- + :py:class:`REGISTRY ` : All available issue types and their corresponding issue managers can be found here. + """ + return IssueFinder.list_default_issue_types() + + def save(self, path: str, force: bool = False) -> None: + """Saves this Datalab object to file (all files are in folder at `path/`). + We do not guarantee saved Datalab can be loaded from future versions of cleanlab. + + Parameters + ---------- + path : + Folder in which all information about this Datalab should be saved. + + force : + If ``True``, overwrites any existing files in the folder at `path`. Use this with caution! + + Note + ---- + You have to save the Dataset yourself separately if you want it saved to file. + """ + _Serializer.serialize(path=path, datalab=self, force=force) + save_message = f"Saved Datalab to folder: {path}" + print(save_message) + + @staticmethod + def load(path: str, data: Optional[Dataset] = None) -> "Datalab": + """Loads Datalab object from a previously saved folder. + + Parameters + ---------- + `path` : + Path to the folder previously specified in ``Datalab.save()``. 
+ + `data` : + The dataset used to originally construct the Datalab. + Remember the dataset is not saved as part of the Datalab, + you must save/load the data separately. + + Returns + ------- + `datalab` : + A Datalab object that is identical to the one originally saved. + """ + datalab = _Serializer.deserialize(path=path, data=data) + load_message = f"Datalab loaded from folder: {path}" + print(load_message) + return datalab diff --git a/cleanlab/datalab/display.py b/cleanlab/datalab/display.py new file mode 100644 index 0000000000..520b261516 --- /dev/null +++ b/cleanlab/datalab/display.py @@ -0,0 +1,61 @@ +# Copyright (C) 2017-2023 Cleanlab Inc. +# This file is part of cleanlab. +# +# cleanlab is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published +# by the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# cleanlab is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with cleanlab. If not, see . +""" +Module that handles the string representation of Datalab objects. 
+""" + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: # pragma: no cover + from cleanlab.datalab.data_issues import DataIssues + + +class _Displayer: + def __init__(self, data_issues: "DataIssues") -> None: + self.data_issues = data_issues + + def __repr__(self) -> str: + """What is displayed in console if user executes: >>> datalab""" + checks_run = not self.data_issues.issues.empty + display_str = f"checks_run={checks_run}" + num_examples = self.data_issues.get_info("statistics")["num_examples"] + if num_examples is not None: + display_str += f", num_examples={num_examples}" + num_classes = self.data_issues.get_info("statistics")["num_classes"] + if num_classes is not None: + display_str += f", num_classes={num_classes}" + if checks_run: + issues_identified = self.data_issues.issue_summary["num_issues"].sum() + display_str += f", issues_identified={issues_identified}" + return f"Datalab({display_str})" + + def __str__(self) -> str: + """What is displayed if user executes: print(datalab)""" + checks_run = not self.data_issues.issues.empty + num_examples = self.data_issues.get_info("statistics").get("num_examples") + num_classes = self.data_issues.get_info("statistics").get("num_classes") + + issues_identified = ( + self.data_issues.issue_summary["num_issues"].sum() if checks_run else "Not checked" + ) + info_list = [ + f"Checks run: {'Yes' if checks_run else 'No'}", + f"Number of examples: {num_examples if num_examples is not None else 'Unknown'}", + f"Number of classes: {num_classes if num_classes is not None else 'Unknown'}", + f"Issues identified: {issues_identified}", + ] + + return "Datalab:\n" + "\n".join(info_list) diff --git a/cleanlab/datalab/examples/__init__.py b/cleanlab/datalab/examples/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/cleanlab/datalab/factory.py b/cleanlab/datalab/factory.py new file mode 100644 index 0000000000..0cbe8f5262 --- /dev/null +++ b/cleanlab/datalab/factory.py @@ -0,0 +1,155 @@ +# Copyright 
(C) 2017-2023 Cleanlab Inc. +# This file is part of cleanlab. +# +# cleanlab is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published +# by the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# cleanlab is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with cleanlab. If not, see . +"""The factory module provides a factory class for constructing concrete issue managers +and a decorator for registering new issue managers. + +This module provides the :py:meth:`register` decorator for users to register new subclasses of +:py:class:`IssueManager ` +in the registry. Each IssueManager detects some particular type of issue in a dataset. + + +Note +---- + +The :class:`REGISTRY` variable is used by the factory class to keep track +of registered issue managers. +The factory class is used as an implementation detail by +:py:class:`Datalab `, +which provides a simplified API for constructing concrete issue managers. +:py:class:`Datalab ` is intended to be used by users +and provides detailed documentation on how to use the API. + +Warning +------- +Neither the :class:`REGISTRY` variable nor the factory class should be used directly by users. 
+""" +from __future__ import annotations + +from typing import Dict, List, Type + +from cleanlab.datalab.issue_manager import ( + IssueManager, + LabelIssueManager, + NearDuplicateIssueManager, + OutlierIssueManager, + NonIIDIssueManager, +) + + +REGISTRY: Dict[str, Type[IssueManager]] = { + "outlier": OutlierIssueManager, + "label": LabelIssueManager, + "near_duplicate": NearDuplicateIssueManager, + "non_iid": NonIIDIssueManager, +} +"""Registry of issue managers that can be constructed from a string +and used in the Datalab class. + +:meta hide-value: + +Currently, the following issue managers are registered by default: + +- ``"outlier"``: :py:class:`OutlierIssueManager ` +- ``"label"``: :py:class:`LabelIssueManager ` +- ``"near_duplicate"``: :py:class:`NearDuplicateIssueManager ` +- ``"non_iid"``: :py:class:`NonIIDIssueManager ` + +Warning +------- +This variable should not be used directly by users. +""" + + +# Construct concrete issue manager with a from_str method +class _IssueManagerFactory: + """Factory class for constructing concrete issue managers.""" + + @classmethod + def from_str(cls, issue_type: str) -> Type[IssueManager]: + """Constructs a concrete issue manager class from a string.""" + if isinstance(issue_type, list): + raise ValueError( + "issue_type must be a string, not a list. Try using from_list instead." + ) + if issue_type not in REGISTRY: + raise ValueError(f"Invalid issue type: {issue_type}") + return REGISTRY[issue_type] + + @classmethod + def from_list(cls, issue_types: List[str]) -> List[Type[IssueManager]]: + """Constructs a list of concrete issue manager classes from a list of strings.""" + return [cls.from_str(issue_type) for issue_type in issue_types] + + +def register(cls: Type[IssueManager]) -> Type[IssueManager]: + """Registers the issue manager factory. + + Parameters + ---------- + cls : + A subclass of + :py:class:`IssueManager `. + + Returns + ------- + cls : + The same class that was passed in. 
+ + Example + ------- + + When defining a new subclass of + :py:class:`IssueManager `, + you can register it like so: + + .. code-block:: python + + from cleanlab import IssueManager + from cleanlab.datalab.factory import register + + @register + class MyIssueManager(IssueManager): + issue_name: str = "my_issue" + def find_issues(self, **kwargs): + # Some logic to find issues + pass + + or in a function call: + + .. code-block:: python + + from cleanlab import IssueManager + from cleanlab.datalab.factory import register + + class MyIssueManager(IssueManager): + issue_name: str = "my_issue" + def find_issues(self, **kwargs): + # Some logic to find issues + pass + + register(MyIssueManager) + """ + name: str = str(cls.issue_name) + if name in REGISTRY: + # Warn user that they are overwriting an existing issue manager + print( + f"Warning: Overwriting existing issue manager {name} with {cls}. " + "This may cause unexpected behavior." + ) + if not issubclass(cls, IssueManager): + raise ValueError(f"Class {cls} must be a subclass of IssueManager") + REGISTRY[name] = cls + return cls diff --git a/cleanlab/datalab/issue_finder.py b/cleanlab/datalab/issue_finder.py new file mode 100644 index 0000000000..c3de568ecd --- /dev/null +++ b/cleanlab/datalab/issue_finder.py @@ -0,0 +1,351 @@ +# Copyright (C) 2017-2023 Cleanlab Inc. +# This file is part of cleanlab. +# +# cleanlab is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published +# by the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# cleanlab is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with cleanlab. If not, see . 
+""" +Module for the :class:`IssueFinder` class, which is responsible for configuring, +creating and running issue managers. + +It determines which types of issues to look for, instatiates the IssueManagers +via a factory, run the issue managers +(:py:meth:`IssueManager.find_issues `), +and collects the results to :py:class:`DataIssues `. + +.. note:: + + This module is not intended to be used directly. Instead, use the public-facing + :py:meth:`Datalab.find_issues ` method. +""" + +from typing import Any, List, Optional, Dict, TYPE_CHECKING +import warnings + +import numpy as np +import numpy.typing as npt +from scipy.sparse import csr_matrix + +from cleanlab.datalab.factory import _IssueManagerFactory, REGISTRY + +if TYPE_CHECKING: # pragma: no cover + from cleanlab.datalab.datalab import Datalab + + +class IssueFinder: + """ + The IssueFinder class is responsible for managing the process of identifying + issues in the dataset by handling the creation and execution of relevant + IssueManagers. It serves as a coordinator or helper class for the Datalab class + to encapsulate the specific behavior of the issue finding process. + + At a high level, the IssueFinder is responsible for: + + - Determining which types of issues to look for. + - Instantiating the appropriate IssueManagers using a factory. + - Running the IssueManagers' `find_issues` methods. + - Collecting the results into a DataIssues instance. + + Parameters + ---------- + datalab : Datalab + The Datalab instance associated with this IssueFinder. + + verbosity : int + Controls the verbosity of the output during the issue finding process. + + Note + ---- + This class is not intended to be used directly. Instead, use the + `Datalab.find_issues` method which internally utilizes an IssueFinder instance. 
+ """ + + def __init__(self, datalab: "Datalab", verbosity=1): + self.datalab = datalab + self.verbosity = verbosity + + def find_issues( + self, + *, + pred_probs: Optional[np.ndarray] = None, + features: Optional[npt.NDArray] = None, + knn_graph: Optional[csr_matrix] = None, + issue_types: Optional[Dict[str, Any]] = None, + ) -> None: + """ + Checks the dataset for all sorts of common issues in real-world data (in both labels and feature values). + + You can use Datalab to find issues in your data, utilizing *any* model you have already trained. + This method only interacts with your model via its predictions or embeddings (and other functions thereof). + The more of these inputs you provide, the more types of issues Datalab can detect in your dataset/labels. + If you provide a subset of these inputs, Datalab will output what insights it can based on the limited information from your model. + + Note + ---- + This method is not intended to be used directly. Instead, use the + :py:meth:`Datalab.find_issues ` method. + + Note + ---- + The issues are saved in the ``self.datalab.data_issues.issues`` attribute, but are not returned. + + Parameters + ---------- + pred_probs : + Out-of-sample predicted class probabilities made by the model for every example in the dataset. + To best detect label issues, provide this input obtained from the most accurate model you can produce. + + If provided, this must be a 2D array with shape (num_examples, K) where K is the number of classes in the dataset. + + features : Optional[np.ndarray] + Feature embeddings (vector representations) of every example in the dataset. + + If provided, this must be a 2D array with shape (num_examples, num_features). + + knn_graph : + Sparse matrix representing distances between examples in the dataset in a k nearest neighbor graph. 
+ + If provided, this must be a square CSR matrix with shape (num_examples, num_examples) and (k*num_examples) non-zero entries (k is the number of nearest neighbors considered for each example) + evenly distributed across the rows. + The non-zero entries must be the distances between the corresponding examples. Self-distances must be omitted + (i.e. the diagonal must be all zeros and the k nearest neighbors of each example must not include itself). + + For any duplicated examples i,j whose distance is 0, there should be an *explicit* zero stored in the matrix, i.e. ``knn_graph[i,j] = 0``. + + If both `knn_graph` and `features` are provided, the `knn_graph` will take precendence. + If `knn_graph` is not provided, it is constructed based on the provided `features`. + If neither `knn_graph` nor `features` are provided, certain issue types like (near) duplicates will not be considered. + + issue_types : + Collection specifying which types of issues to consider in audit and any non-default parameter settings to use. + If unspecified, a default set of issue types and recommended parameter settings is considered. + + This is a dictionary of dictionaries, where the keys are the issue types of interest + and the values are dictionaries of parameter values that control how each type of issue is detected (only for advanced users). + More specifically, the values are constructor keyword arguments passed to the corresponding ``IssueManager``, + which is responsible for detecting the particular issue type. + + .. seealso:: + :py:class:`IssueManager ` + """ + + if issue_types is not None and not issue_types: + warnings.warn( + "No issue types were specified. " "No issues will be found in the dataset." + ) + return None + + if issue_types is not None and not issue_types: + warnings.warn( + "No issue types were specified. " "No issues will be found in the dataset." 
+ ) + return None + + issue_types_copy = self.get_available_issue_types( + pred_probs=pred_probs, + features=features, + knn_graph=knn_graph, + issue_types=issue_types, + ) + + new_issue_managers = [ + factory(datalab=self.datalab, **issue_types_copy.get(factory.issue_name, {})) + for factory in _IssueManagerFactory.from_list(list(issue_types_copy.keys())) + ] + + if not new_issue_managers: + no_args_passed = all(arg is None for arg in [pred_probs, features, knn_graph]) + if no_args_passed: + warnings.warn("No arguments were passed to find_issues.") + warnings.warn("No issue check performed.") + return None + + failed_managers = [] + data_issues = self.datalab.data_issues + for issue_manager, arg_dict in zip(new_issue_managers, issue_types_copy.values()): + try: + if self.verbosity: + print(f"Finding {issue_manager.issue_name} issues ...") + issue_manager.find_issues(**arg_dict) + data_issues.collect_statistics_from_issue_manager(issue_manager) + data_issues.collect_results_from_issue_manager(issue_manager) + except Exception as e: + print(f"Error in {issue_manager.issue_name}: {e}") + failed_managers.append(issue_manager) + + if self.verbosity: + print( + f"Audit complete. {data_issues.issue_summary['num_issues'].sum()} issues found in the dataset." + ) + if failed_managers: + print(f"Failed to check for these issue types: {failed_managers}") + + data_issues.set_health_score() + + def _resolve_required_args(self, pred_probs, features, knn_graph): + """Resolves the required arguments for each issue type. + + This is a helper function that filters out any issue manager + that does not have the required arguments. + + This does not consider custom hyperparameters for each issue type. + + + Parameters + ---------- + pred_probs : + Out-of-sample predicted probabilities made on the data. + + features : + Name of column containing precomputed embeddings. + + Returns + ------- + args_dict : + Dictionary of required arguments for each issue type, if available. 
+ """ + args_dict = { + "label": {"pred_probs": pred_probs}, + "outlier": {"pred_probs": pred_probs, "features": features, "knn_graph": knn_graph}, + "near_duplicate": {"features": features, "knn_graph": knn_graph}, + "non_iid": {"features": features, "knn_graph": knn_graph}, + } + + args_dict = { + k: {k2: v2 for k2, v2 in v.items() if v2 is not None} for k, v in args_dict.items() if v + } + + # Prefer `knn_graph` over `features` if both are provided. + for v in args_dict.values(): + if "knn_graph" in v and "features" in v: + warnings.warn( + "Both `features` and `knn_graph` were provided. " + "Most issue managers will likely prefer using `knn_graph` " + "instead of `features` for efficiency." + ) + + args_dict = {k: v for k, v in args_dict.items() if v} + + return args_dict + + def _set_issue_types( + self, + issue_types: Optional[Dict[str, Any]], + required_defaults_dict: Dict[str, Any], + ) -> Dict[str, Any]: + """Set necessary configuration for each IssueManager in a dictionary. + + While each IssueManager defines default values for its arguments, + the Datalab class needs to organize the calls to each IssueManager + with different arguments, some of which may be user-provided. + + Parameters + ---------- + issue_types : + Dictionary of issue types and argument configuration for their respective IssueManagers. + If None, then the `required_defaults_dict` is used. + + required_defaults_dict : + Dictionary of default parameter configuration for each issue type. + + Returns + ------- + issue_types_copy : + Dictionary of issue types and their parameter configuration. + The input `issue_types` is copied and updated with the necessary default values. + """ + if issue_types is not None: + issue_types_copy = issue_types.copy() + self._check_missing_args(required_defaults_dict, issue_types_copy) + else: + issue_types_copy = required_defaults_dict.copy() + # Check that all required arguments are provided. 
+ self._validate_issue_types_dict(issue_types_copy, required_defaults_dict) + + # Remove None values from argument list, rely on default values in IssueManager + for key, value in issue_types_copy.items(): + issue_types_copy[key] = {k: v for k, v in value.items() if v is not None} + return issue_types_copy + + @staticmethod + def _check_missing_args(required_defaults_dict, issue_types): + for key, issue_type_value in issue_types.items(): + missing_args = set(required_defaults_dict.get(key, {})) - set(issue_type_value.keys()) + # Impute missing arguments with default values. + missing_dict = { + missing_arg: required_defaults_dict[key][missing_arg] + for missing_arg in missing_args + } + issue_types[key].update(missing_dict) + + @staticmethod + def _validate_issue_types_dict( + issue_types: Dict[str, Any], required_defaults_dict: Dict[str, Any] + ) -> None: + missing_required_args_dict = {} + for issue_name, required_args in required_defaults_dict.items(): + if issue_name in issue_types: + missing_args = set(required_args.keys()) - set(issue_types[issue_name].keys()) + if missing_args: + missing_required_args_dict[issue_name] = missing_args + if any(missing_required_args_dict.values()): + error_message = "" + for issue_name, missing_required_args in missing_required_args_dict.items(): + error_message += f"Required argument {missing_required_args} for issue type {issue_name} was not provided.\n" + raise ValueError(error_message) + + @staticmethod + def list_possible_issue_types() -> List[str]: + """Returns a list of all registered issue types. + + Any issue type that is not in this list cannot be used in the :py:meth:`find_issues` method. + + See Also + -------- + :py:class:`REGISTRY ` : All available issue types and their corresponding issue managers can be found here. 
+ """ + return list(REGISTRY.keys()) + + @staticmethod + def list_default_issue_types() -> List[str]: + """Returns a list of the issue types that are run by default + when :py:meth:`find_issues` is called without specifying `issue_types`. + + See Also + -------- + :py:class:`REGISTRY ` : All available issue types and their corresponding issue managers can be found here. + """ + return ["label", "outlier", "near_duplicate"] + + def get_available_issue_types(self, **kwargs): + """Returns a dictionary of issue types that can be used in :py:meth:`Datalab.find_issues + ` method.""" + + pred_probs = kwargs.get("pred_probs", None) + features = kwargs.get("features", None) + knn_graph = kwargs.get("knn_graph", None) + issue_types = kwargs.get("issue_types", None) + + # Determine which parameters are required for each issue type + required_args_per_issue_type = self._resolve_required_args(pred_probs, features, knn_graph) + + issue_types_copy = self._set_issue_types(issue_types, required_args_per_issue_type) + + if issue_types is None: + # Only run default issue types if no issue types are specified + issue_types_copy = { + issue: issue_types_copy[issue] + for issue in self.list_default_issue_types() + if issue in issue_types_copy + } + + return issue_types_copy diff --git a/cleanlab/datalab/issue_manager/__init__.py b/cleanlab/datalab/issue_manager/__init__.py new file mode 100644 index 0000000000..0352a6a03a --- /dev/null +++ b/cleanlab/datalab/issue_manager/__init__.py @@ -0,0 +1,5 @@ +from .issue_manager import IssueManager # isort:skip +from .duplicate import NearDuplicateIssueManager +from .label import LabelIssueManager +from .outlier import OutlierIssueManager +from .noniid import NonIIDIssueManager diff --git a/cleanlab/datalab/issue_manager/duplicate.py b/cleanlab/datalab/issue_manager/duplicate.py new file mode 100644 index 0000000000..dcfdbde2a6 --- /dev/null +++ b/cleanlab/datalab/issue_manager/duplicate.py @@ -0,0 +1,222 @@ +# Copyright (C) 2017-2023 Cleanlab 
Inc. +# This file is part of cleanlab. +# +# cleanlab is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published +# by the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# cleanlab is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with cleanlab. If not, see . +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, ClassVar, Dict, List, Optional, Union +import warnings + +import numpy as np +import numpy.typing as npt +import pandas as pd +from scipy.sparse import csr_matrix +from sklearn.neighbors import NearestNeighbors +from sklearn.utils.validation import check_is_fitted + +from cleanlab.datalab.issue_manager import IssueManager + +if TYPE_CHECKING: # pragma: no cover + from cleanlab.datalab.datalab import Datalab + + +class NearDuplicateIssueManager(IssueManager): + """Manages issues related to near-duplicate examples.""" + + description: ClassVar[ + str + ] = """A (near) duplicate issue refers to two or more examples in + a dataset that are extremely similar to each other, relative + to the rest of the dataset. The examples flagged with this issue + may be exactly duplicated, or lie atypically close together when + represented as vectors (i.e. feature embeddings). 
+ """ + issue_name: ClassVar[str] = "near_duplicate" + verbosity_levels = { + 0: [], + 1: ["threshold"], + 2: [], + } + + def __init__( + self, + datalab: Datalab, + metric: Optional[str] = None, + threshold: float = 0.13, + k: int = 10, + **_, + ): + super().__init__(datalab) + self.metric = metric + self.threshold = self._set_threshold(threshold) + self.k = k + self.near_duplicate_sets: List[List[int]] = [] + + def find_issues( + self, + features: Optional[npt.NDArray] = None, + **kwargs, + ) -> None: + knn_graph = self._process_knn_graph_from_inputs(kwargs) + old_knn_metric = self.datalab.get_info("statistics").get("knn_metric") + metric_changes = self.metric and self.metric != old_knn_metric + + if knn_graph is None or metric_changes: + if features is None: + raise ValueError( + "If a knn_graph is not provided, features must be provided to fit a new knn." + ) + if self.metric is None: + self.metric = "cosine" if features.shape[1] > 3 else "euclidean" + knn = NearestNeighbors(n_neighbors=self.k, metric=self.metric) + + if self.metric and self.metric != knn.metric: + warnings.warn( + f"Metric {self.metric} does not match metric {knn.metric} used to fit knn. " + "Most likely an existing NearestNeighbors object was passed in, but a different " + "metric was specified." 
+ ) + self.metric = knn.metric + + try: + check_is_fitted(knn) + except: + knn.fit(features) + + knn_graph = knn.kneighbors_graph(mode="distance") + N = knn_graph.shape[0] + nn_distances = knn_graph.data.reshape(N, -1)[:, 0] + scores = np.tanh(nn_distances) + is_issue_column = nn_distances < self.threshold * np.median(nn_distances) + + self.issues = pd.DataFrame( + { + f"is_{self.issue_name}_issue": is_issue_column, + self.issue_score_key: scores, + }, + ) + + self.near_duplicate_sets = self._neighbors_within_radius(knn_graph, self.threshold) + + self.summary = self.make_summary(score=scores.mean()) + self.info = self.collect_info(knn_graph=knn_graph) + + @staticmethod + def _neighbors_within_radius(knn_graph: csr_matrix, radius: float): + """Returns a list of lists of indices of near-duplicate examples. + + Each list of indices represents a set of near-duplicate examples. + + If the list is empty for a given example, then that example is not + a near-duplicate of any other example. + """ + + N = knn_graph.shape[0] + distances = knn_graph.data.reshape(N, -1) + # Create a mask for the threshold + mask = distances < radius + + # Update the indptr to reflect the new number of neighbors + indptr = np.zeros(knn_graph.indptr.shape, dtype=knn_graph.indptr.dtype) + indptr[1:] = np.cumsum(mask.sum(axis=1)) + + # Filter the knn_graph based on the threshold + indices = knn_graph.indices[mask.ravel()] + near_duplicate_sets = [indices[indptr[i] : indptr[i + 1]] for i in range(N)] + + return near_duplicate_sets + + def _process_knn_graph_from_inputs(self, kwargs: Dict[str, Any]) -> Union[csr_matrix, None]: + """Determine if a knn_graph is provided in the kwargs or if one is already stored in the associated Datalab instance.""" + knn_graph_kwargs: Optional[csr_matrix] = kwargs.get("knn_graph", None) + knn_graph_stats = self.datalab.get_info("statistics").get("weighted_knn_graph", None) + + knn_graph: Optional[csr_matrix] = None + if knn_graph_kwargs is not None: + knn_graph = 
knn_graph_kwargs + elif knn_graph_stats is not None: + knn_graph = knn_graph_stats + + if isinstance(knn_graph, csr_matrix) and kwargs.get("k", 0) > ( + knn_graph.nnz // knn_graph.shape[0] + ): + # If the provided knn graph is insufficient, then we need to recompute the knn graph + # with the provided features + knn_graph = None + return knn_graph + + def collect_info(self, knn_graph: csr_matrix) -> dict: + issues_dict = { + "average_near_duplicate_score": self.issues[self.issue_score_key].mean(), + "near_duplicate_sets": self.near_duplicate_sets, + } + + params_dict = { + "metric": self.metric, + "k": self.k, + "threshold": self.threshold, + } + + N = knn_graph.shape[0] + dists = knn_graph.data.reshape(N, -1)[:, 0] + nn_ids = knn_graph.indices.reshape(N, -1)[:, 0] + + knn_info_dict = { + "nearest_neighbor": nn_ids.tolist(), + "distance_to_nearest_neighbor": dists.tolist(), + } + + statistics_dict = self._build_statistics_dictionary(knn_graph=knn_graph) + + info_dict = { + **issues_dict, + **params_dict, + **knn_info_dict, + **statistics_dict, + } + return info_dict + + def _build_statistics_dictionary(self, knn_graph: csr_matrix) -> Dict[str, Dict[str, Any]]: + statistics_dict: Dict[str, Dict[str, Any]] = {"statistics": {}} + + # Add the knn graph as a statistic if necessary + graph_key = "weighted_knn_graph" + old_knn_graph = self.datalab.get_info("statistics").get(graph_key, None) + old_graph_exists = old_knn_graph is not None + prefer_new_graph = ( + not old_graph_exists + or knn_graph.nnz > old_knn_graph.nnz + or self.metric != self.datalab.get_info("statistics").get("knn_metric", None) + ) + if prefer_new_graph: + statistics_dict["statistics"][graph_key] = knn_graph + if self.metric is not None: + statistics_dict["statistics"]["knn_metric"] = self.metric + + return statistics_dict + + def _set_threshold( + self, + threshold: float, + ) -> float: + """Computes nearest-neighbors thresholding for near-duplicate detection.""" + if threshold < 0: + warnings.warn( 
+ f"Computed threshold {threshold} is less than 0. " + "Setting threshold to 0." + "This may indicate that either the only a few examples are in the dataset, " + "or the data is heavily skewed." + ) + threshold = 0 + return threshold diff --git a/cleanlab/datalab/issue_manager/issue_manager.py b/cleanlab/datalab/issue_manager/issue_manager.py new file mode 100644 index 0000000000..4048090f21 --- /dev/null +++ b/cleanlab/datalab/issue_manager/issue_manager.py @@ -0,0 +1,342 @@ +# Copyright (C) 2017-2023 Cleanlab Inc. +# This file is part of cleanlab. +# +# cleanlab is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published +# by the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# cleanlab is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with cleanlab. If not, see . +from __future__ import annotations + +from abc import ABC, ABCMeta, abstractmethod +from itertools import chain +from typing import TYPE_CHECKING, Any, ClassVar, Dict, List, Optional, Set, Tuple, Type, TypeVar +import json + +import numpy as np +import pandas as pd + +if TYPE_CHECKING: # pragma: no cover + from cleanlab.datalab.datalab import Datalab + + +T = TypeVar("T", bound="IssueManager") +TM = TypeVar("TM", bound="IssueManagerMeta") + + +class IssueManagerMeta(ABCMeta): + """Metaclass for IssueManager that adds issue_score_key to the class. 
+ + :meta private: + """ + + issue_name: ClassVar[str] + issue_score_key: ClassVar[str] + verbosity_levels: ClassVar[Dict[int, List[str]]] = { + 0: [], + 1: [], + 2: [], + 3: [], + } + + def __new__( + meta: Type[TM], + name: str, + bases: Tuple[Type[Any], ...], + class_dict: Dict[str, Any], + ) -> TM: # Classes that inherit from ABC don't need to be modified + if ABC in bases: + return super().__new__(meta, name, bases, class_dict) + + # Ensure that the verbosity levels don't have keys other than those in ["issue", "info"] + verbosity_levels = class_dict.get("verbosity_levels", meta.verbosity_levels) + for level, level_list in verbosity_levels.items(): + if not isinstance(level_list, list): + raise ValueError( + f"Verbosity levels must be lists. " + f"Got {level_list} in {name}.verbosity_levels" + ) + prohibited_keys = [key for key in level_list if not isinstance(key, str)] + if prohibited_keys: + raise ValueError( + f"Verbosity levels must be lists of strings. " + f"Got {prohibited_keys} in {name}.verbosity_levels[{level}]" + ) + + # Concrete classes need to have an issue_name attribute + if "issue_name" not in class_dict: + raise TypeError("IssueManagers need an issue_name class variable") + + # Add issue_score_key to class + class_dict["issue_score_key"] = f"{class_dict['issue_name']}_score" + return super().__new__(meta, name, bases, class_dict) + + +class IssueManager(ABC, metaclass=IssueManagerMeta): + """Base class for managing data issues of a particular type in a Datalab. + + For each example in a dataset, the IssueManager for a particular type of issue should compute: + - A numeric severity score between 0 and 1, + with values near 0 indicating severe instances of the issue. + - A boolean `is_issue` value, which is True + if we believe this example suffers from the issue in question. + `is_issue` may be determined by thresholding the severity score + (with an a priori determined reasonable threshold value), + or via some other means (e.g. 
Confident Learning for flagging label issues). + + The IssueManager should also report: + - A global value between 0 and 1 summarizing how severe this issue is in the dataset overall + (e.g. the average severity across all examples in dataset + or count of examples where `is_issue=True`). + - Other interesting `info` about the issue and examples in the dataset, + and statistics estimated from current dataset that may be reused + to score this issue in future data. + For example, `info` for label issues could contain the: + confident_thresholds, confident_joint, predicted label for each example, etc. + Another example is for (near)-duplicate detection issue, where `info` could contain: + which set of examples in the dataset are all (nearly) identical. + + Implementing a new IssueManager: + - Define the `issue_name` class attribute, e.g. "label", "duplicate", "outlier", etc. + - Implement the abstract methods `find_issues` and `collect_info`. + - `find_issues` is responsible for computing computing the `issues` and `summary` dataframes. + - `collect_info` is responsible for computing the `info` dict. It is called by `find_issues`, + once the manager has set the `issues` and `summary` dataframes as instance attributes. + """ + + description: ClassVar[str] = "" + """Short text that summarizes the type of issues handled by this IssueManager. + + :meta hide-value: + """ + issue_name: ClassVar[str] + """Returns a key that is used to store issue summary results about the assigned Lab.""" + issue_score_key: ClassVar[str] + """Returns a key that is used to store issue score results about the assigned Lab.""" + verbosity_levels: ClassVar[Dict[int, List[str]]] = { + 0: [], + 1: [], + 2: [], + 3: [], + } + """A dictionary of verbosity levels and their corresponding dictionaries of + report items to print. + + :meta hide-value: + + Example + ------- + + >>> verbosity_levels = { + ... 0: [], + ... 1: ["some_info_key"], + ... 2: ["additional_info_key"], + ... 
} + """ + + def __init__(self, datalab: Datalab, **_): + self.datalab = datalab + self.info: Dict[str, Any] = {} + self.issues: pd.DataFrame = pd.DataFrame() + self.summary: pd.DataFrame = pd.DataFrame() + + def __repr__(self): + class_name = self.__class__.__name__ + return class_name + + @classmethod + def __init_subclass__(cls): + required_class_variables = [ + "issue_name", + ] + for var in required_class_variables: + if not hasattr(cls, var): + raise NotImplementedError(f"Class {cls.__name__} must define class variable {var}") + + @abstractmethod + def find_issues(self, *args, **kwargs) -> None: + """Finds occurrences of this particular issue in the dataset. + + Computes the `issues` and `summary` dataframes. Calls `collect_info` to compute the `info` dict. + """ + raise NotImplementedError + + def collect_info(self, *args, **kwargs) -> dict: + """Collects data for the info attribute of the Datalab. + + NOTE + ---- + This method is called by :py:meth:`find_issues` after :py:meth:`find_issues` has set the `issues` and `summary` dataframes + as instance attributes. + """ + raise NotImplementedError + + @classmethod + def make_summary(cls, score: float) -> pd.DataFrame: + """Construct a summary dataframe. + + Parameters + ---------- + score : + The overall score for this issue. + + Returns + ------- + summary : + A summary dataframe. + """ + return pd.DataFrame( + { + "issue_type": [cls.issue_name], + "score": [score], + }, + ) + + @classmethod + def report( + cls, + issues: pd.DataFrame, + summary: pd.DataFrame, + info: Dict[str, Any], + num_examples: int = 5, + verbosity: int = 0, + include_description: bool = False, + info_to_omit: Optional[List[str]] = None, + ) -> str: + """Compose a report of the issues found by this IssueManager. + + Parameters + ---------- + issues : + An issues dataframe. + + Example + ------- + >>> import pandas as pd + >>> issues = pd.DataFrame( + ... { + ... "is_X_issue": [True, False, True], + ... "X_score": [0.2, 0.9, 0.4], + ... 
}, + ... ) + + summary : + The summary dataframe. + + Example + ------- + >>> summary = pd.DataFrame( + ... { + ... "issue_type": ["X"], + ... "score": [0.5], + ... }, + ... ) + + info : + The info dict. + + Example + ------- + >>> info = { + ... "A": "val_A", + ... "B": ["val_B1", "val_B2"], + ... } + + num_examples : + The number of examples to print. + + verbosity : + The verbosity level of the report. + + include_description : + Whether to include a description of the issue in the report. + + Returns + ------- + report_str : + A string containing the report. + """ + + max_verbosity = max(cls.verbosity_levels.keys()) + top_level = max_verbosity + 1 + if verbosity not in list(cls.verbosity_levels.keys()) + [top_level]: + raise ValueError( + f"Verbosity level {verbosity} not supported. " + f"Supported levels: {cls.verbosity_levels.keys()}" + f"Use verbosity={top_level} to print all info." + ) + if issues.empty: + print(f"No issues found") + + topk_ids = issues.sort_values(by=cls.issue_score_key, ascending=True).index[:num_examples] + + score = summary["score"].loc[0] + report_str = f"{' ' + cls.issue_name + ' issues ':-^60}\n\n" + + if include_description and cls.description: + description = cls.description + if verbosity == 0: + description = description.split("\n\n", maxsplit=1)[0] + report_str += "About this issue:\n\t" + description + "\n\n" + report_str += ( + f"Number of examples with this issue: {issues[f'is_{cls.issue_name}_issue'].sum()}\n" + f"Overall dataset quality in terms of this issue: : {score:.4f}\n\n" + ) + + info_to_print: Set[str] = set() + _info_to_omit = set(issues.columns).union(info_to_omit or []) + verbosity_levels_values = chain.from_iterable( + list(cls.verbosity_levels.values())[: verbosity + 1] + ) + info_to_print.update(set(verbosity_levels_values) - _info_to_omit) + if verbosity == top_level: + info_to_print.update(set(info.keys()) - _info_to_omit) + + report_str += "Examples representing most severe instances of this issue:\n" + 
report_str += issues.loc[topk_ids].to_string() + + def truncate(s, max_len=4) -> str: + if hasattr(s, "shape") or hasattr(s, "ndim"): + s = np.array(s) + if s.ndim > 1: + description = f"array of shape {s.shape}\n" + with np.printoptions(threshold=max_len): + if s.ndim == 2: + description += f"{s}" + if s.ndim > 2: + description += f"{s}" + return description + s = s.tolist() + + if isinstance(s, list): + if all([isinstance(s_, list) for s_ in s]): + return truncate(np.array(s, dtype=object), max_len=max_len) + if len(s) > max_len: + s = s[:max_len] + ["..."] + return str(s) + + if info_to_print: + info_to_print_dict = {key: info[key] for key in info_to_print} + # Print the info dict, truncating arrays to 4 elements, + report_str += f"\n\nAdditional Information: " + for key, value in info_to_print_dict.items(): + if key == "statistics": + continue + if isinstance(value, dict): + report_str += f"\n{key}:\n{json.dumps(value, indent=4)}" + elif isinstance(value, pd.DataFrame): + max_rows = 5 + df_str = value.head(max_rows).to_string() + if len(value) > max_rows: + df_str += f"\n... (total {len(value)} rows)" + report_str += f"\n{key}:\n{df_str}" + else: + report_str += f"\n{key}: {truncate(value)}" + return report_str diff --git a/cleanlab/datalab/issue_manager/label.py b/cleanlab/datalab/issue_manager/label.py new file mode 100644 index 0000000000..e6783f48f5 --- /dev/null +++ b/cleanlab/datalab/issue_manager/label.py @@ -0,0 +1,226 @@ +# Copyright (C) 2017-2023 Cleanlab Inc. +# This file is part of cleanlab. +# +# cleanlab is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published +# by the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# cleanlab is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with cleanlab. If not, see . +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, ClassVar, Dict, Optional + +import numpy as np + +from cleanlab.classification import CleanLearning +from cleanlab.datalab.issue_manager import IssueManager +from cleanlab.internal.validation import assert_valid_inputs + +if TYPE_CHECKING: # pragma: no cover + import pandas as pd + + from cleanlab.datalab.datalab import Datalab + + +class LabelIssueManager(IssueManager): + """Manages label issues in a Datalab. + + Parameters + ---------- + datalab : + A Datalab instance. + + clean_learning_kwargs : + Keyword arguments to pass to the :py:meth:`CleanLearning ` constructor. + + health_summary_parameters : + Keyword arguments to pass to the :py:meth:`health_summary ` function. + """ + + description: ClassVar[ + str + ] = """Examples whose given label is estimated to be potentially incorrect + (e.g. due to annotation error) are flagged as having label issues. 
+ """ + + issue_name: ClassVar[str] = "label" + verbosity_levels = { + 0: [], + 1: [], + 2: [], + 3: ["classes_by_label_quality", "overlapping_classes"], + } + + def __init__( + self, + datalab: Datalab, + clean_learning_kwargs: Optional[Dict[str, Any]] = None, + health_summary_parameters: Optional[Dict[str, Any]] = None, + **_, + ): + super().__init__(datalab) + self.cl = CleanLearning(**(clean_learning_kwargs or {})) + self.health_summary_parameters: Dict[str, Any] = health_summary_parameters or {} + self._reset() + + @staticmethod + def _process_find_label_issues_kwargs(kwargs: Dict[str, Any]) -> Dict[str, Any]: + """Searches for keyword arguments that are meant for the + CleanLearning.find_label_issues method call + + Examples + -------- + >>> from cleanlab.datalab.issue_manager.label import LabelIssueManager + >>> LabelIssueManager._process_clean_learning_kwargs(thresholds=[0.1, 0.9]) + {'thresholds': [0.1, 0.9]} + """ + accepted_kwargs = [ + "thresholds", + "noise_matrix", + "inverse_noise_matrix", + "save_space", + "clf_kwargs", + "validation_func", + ] + return {k: v for k, v in kwargs.items() if k in accepted_kwargs and v is not None} + + def _reset(self) -> None: + """Reset the attributes of this manager based on the available datalab info + and the keyword arguments stored as instance attributes. + + This allows the builder to use pre-computed info from the datalab to speed up + some computations in the :py:meth:`find_issues` method. 
+ """ + if not self.health_summary_parameters: + statistics_dict = self.datalab.get_info("statistics") + self.health_summary_parameters = { + "labels": self.datalab._labels, + "asymmetric": statistics_dict.get("asymmetric", None), + "class_names": list(self.datalab._label_map.values()), + "num_examples": statistics_dict.get("num_examples"), + "joint": statistics_dict.get("joint", None), + "confident_joint": statistics_dict.get("confident_joint", None), + "multi_label": statistics_dict.get("multi_label", None), + "verbose": False, + } + self.health_summary_parameters = { + k: v for k, v in self.health_summary_parameters.items() if v is not None + } + + def find_issues( + self, + pred_probs: np.ndarray, + health_summary_kwargs: Optional[Dict[str, Any]] = None, + **kwargs, + ) -> None: + self.health_summary_parameters.update({"pred_probs": pred_probs}) + # Find examples with label issues + self.issues = self.cl.find_label_issues( + labels=self.datalab._labels, + pred_probs=pred_probs, + **self._process_find_label_issues_kwargs(kwargs), + ) + self.issues.rename(columns={"label_quality": self.issue_score_key}, inplace=True) + + summary_dict = self.get_health_summary( + pred_probs=pred_probs, **(health_summary_kwargs or {}) + ) + + # Get a summarized dataframe of the label issues + self.summary = self.make_summary(score=summary_dict["overall_label_health_score"]) + + # Collect info about the label issues + self.info = self.collect_info(issues=self.issues, summary_dict=summary_dict) + + # Drop columns from issues that are in the info + self.issues = self.issues.drop(columns=["given_label", "predicted_label"]) + + def get_health_summary(self, pred_probs, **kwargs) -> dict: + """Returns a short summary of the health of this Lab.""" + from cleanlab.dataset import health_summary + + # Validate input + self._validate_pred_probs(pred_probs) + + summary_kwargs = self._get_summary_parameters(pred_probs, **kwargs) + summary = health_summary(**summary_kwargs) + return summary + + 
def _get_summary_parameters(self, pred_probs, **kwargs) -> Dict[str, Any]:
    """Collects a set of input parameters for the health summary function based on
    any info available in the datalab.

    The most specific pre-computed statistic wins: a stored ``confident_joint``
    is used if present, otherwise a stored ``joint`` together with
    ``num_examples``, otherwise the raw ``pred_probs`` and labels.

    Parameters
    ----------
    pred_probs :
        The predicted probabilities for each example.

    kwargs :
        Keyword arguments to pass to the health summary function. Only
        ``asymmetric`` and ``verbose`` are read here; they override the values
        stored in ``self.health_summary_parameters``.

    Returns
    -------
    summary_parameters :
        A dictionary of parameters to pass to the health summary function.
    """
    stored = self.health_summary_parameters

    if "confident_joint" in stored:
        summary_parameters = {"confident_joint": stored["confident_joint"]}
    elif all(key in stored for key in ("joint", "num_examples")):
        summary_parameters = {key: stored[key] for key in ("joint", "num_examples")}
    else:
        summary_parameters = {
            "pred_probs": pred_probs,
            "labels": self.datalab._labels,
        }

    # User-supplied health_summary_parameters are not guaranteed to carry
    # class names, so only forward them when they are actually available.
    if "class_names" in stored:
        summary_parameters["class_names"] = stored["class_names"]

    for key in ("asymmetric", "verbose"):
        # Start with the health_summary_parameters, then override with kwargs
        if key in stored:
            summary_parameters[key] = stored[key]
        if key in kwargs:
            summary_parameters[key] = kwargs[key]
    return summary_parameters
def simplified_kolmogorov_smirnov_test(
    neighbor_histogram: npt.NDArray[np.float64],
    non_neighbor_histogram: npt.NDArray[np.float64],
) -> float:
    """Return the Kolmogorov-Smirnov statistic between two histograms.

    The statistic is the largest absolute gap between the empirical
    cumulative distribution functions (ECDFs) of the two groups.

    Parameters
    ----------
    neighbor_histogram :
        Histogram data for the nearest neighbor group.

    non_neighbor_histogram :
        Histogram data for the non-neighbor group.

    Returns
    -------
    statistic :
        The KS statistic between the two ECDFs.

    Note
    ----
    - Both input arrays should have the same length.
    - Each histogram holds the count or frequency of values per bin and
      should be normalized so that it sums to one.

    Each ECDF is obtained as the running sum of its histogram; the result is
    the maximum absolute difference between the two running sums.
    """
    # Difference of the two step functions, evaluated bin by bin.
    ecdf_gap = np.cumsum(neighbor_histogram) - np.cumsum(non_neighbor_histogram)
    return np.max(np.abs(ecdf_gap))
def find_issues(self, features: Optional[npt.NDArray] = None, **kwargs) -> None:
    """Test whether the ordering of the dataset looks non-IID.

    Either re-uses a k-nearest-neighbor graph (passed via ``kwargs`` or
    already stored in the Datalab statistics) or fits a new
    ``NearestNeighbors`` model on ``features``. The neighbor indices are then
    turned into index distances, a permutation test yields a p-value, and
    per-example scores are computed.

    Parameters
    ----------
    features :
        Feature array used to fit a new knn when no usable knn graph is
        available or when the requested metric differs from the stored one.

    kwargs :
        May contain ``knn_graph`` and ``k``; see
        ``_process_knn_graph_from_inputs``.

    Raises
    ------
    ValueError
        If a new knn has to be fitted but ``features`` is ``None``.
    """
    from sklearn.exceptions import NotFittedError

    knn_graph = self._process_knn_graph_from_inputs(kwargs)
    old_knn_metric = self.datalab.get_info("statistics").get("knn_metric")
    metric_changes = self.metric and self.metric != old_knn_metric

    # Stays None when an existing graph is re-used; previously the name was
    # left unbound in that case and the collect_info call below crashed.
    knn = None

    if knn_graph is None or metric_changes:
        if features is None:
            raise ValueError(
                "If a knn_graph is not provided, features must be provided to fit a new knn."
            )

        if self.metric is None:
            self.metric = "cosine" if features.shape[1] > 3 else "euclidean"
        knn = NearestNeighbors(n_neighbors=self.k, metric=self.metric)

        if self.metric and self.metric != knn.metric:
            warnings.warn(
                f"Metric {self.metric} does not match metric {knn.metric} used to fit knn. "
                "Most likely an existing NearestNeighbors object was passed in, but a different "
                "metric was specified."
            )
        self.metric = knn.metric

        try:
            check_is_fitted(knn)
        except NotFittedError:
            knn.fit(features)

        self.neighbor_index_choices = self._get_neighbors(knn=knn)
    else:
        self.neighbor_index_choices = self._get_neighbors(knn_graph=knn_graph)

    self.num_neighbors = self.k

    # Distance, in dataset order, between every example and each of its neighbors.
    indices = np.arange(self.N)
    self.neighbor_index_distances = np.abs(indices.reshape(-1, 1) - self.neighbor_index_choices)

    self.statistics = self._get_statistics(self.neighbor_index_distances)

    self.p_value = self._permutation_test(num_permutations=self.num_permutations)

    scores = self._score_dataset()
    # An example is flagged when its score falls below 70% of the median score.
    score_median_threshold = np.median(scores) * 0.7
    self.issues = pd.DataFrame(
        {
            f"is_{self.issue_name}_issue": scores < score_median_threshold,
            self.issue_score_key: scores,
        },
    )

    self.summary = self.make_summary(score=self.p_value)

    if knn is not None:
        # A knn was fitted in this call: derive the recorded graph from it so
        # that the stored graph and the stored metric always belong together.
        self.info = self.collect_info(knn=knn)
    else:
        self.info = self.collect_info(knn_graph=knn_graph)
def _build_statistics_dictionary(self, knn_graph: csr_matrix) -> Dict[str, Dict[str, Any]]:
    """Build the ``{"statistics": {...}}`` entry to merge into the info dict.

    The knn graph (and the metric, when set) is included only if the new
    graph should replace whatever the Datalab already stores: nothing is
    stored yet, the new graph has more non-zero entries, or the metric
    differs from the stored one.
    """
    graph_key = "weighted_knn_graph"
    collected: Dict[str, Dict[str, Any]] = {"statistics": {}}

    stored_graph = self.datalab.get_info("statistics").get(graph_key, None)

    # Evaluate the three replacement criteria lazily, in order.
    replace_graph = knn_graph is not None and stored_graph is None
    if not replace_graph:
        replace_graph = knn_graph.nnz > stored_graph.nnz
    if not replace_graph:
        stored_metric = self.datalab.get_info("statistics").get("knn_metric", None)
        replace_graph = self.metric != stored_metric

    if not replace_graph:
        return collected

    collected["statistics"][graph_key] = knn_graph
    if self.metric is not None:
        collected["statistics"]["knn_metric"] = self.metric
    return collected
def _score_dataset(self) -> npt.NDArray[np.float64]:
    """This function computes a variant of the KS statistic for each
    datapoint. Rather than computing the maximum difference
    between the CDF of the neighbor distances (foreground
    distribution) and the CDF of the all index distances
    (background distribution), we compute the absolute difference
    in area-under-the-curve of the two CDFs.

    The foreground distribution is computed by sampling the
    neighbor distances from the KNN graph, but the background
    distribution is computed analytically. The background CDF for
    a datapoint i can be split up into three parts. Let d = min(i,
    N - i - 1).

    1. For 0 < j <= d, the slope of the CDF is 2 / (N - 1) since
    there are two datapoints in the dataset that are distance j
    from datapoint i. We call this threshold the 'double distance
    threshold'.

    2. For d < j <= N - d - 1, the slope of the CDF is
    1 / (N - 1) since there is only one datapoint in the dataset
    that is distance j from datapoint i.

    3. For j > N - d - 1, the slope of the CDF is 0 and is
    constant at 1.0 since there are no datapoints in the dataset
    that are distance j from datapoint i.

    We compute the area differences on each of the k intervals for
    which the foreground CDF is constant which allows for the
    possibility that the background CDF may intersect the
    foreground CDF on this interval. We do not account for these
    cases when computing absolute AUC difference.

    Our algorithm is simple: sort the k sampled neighbor
    distances. Then, for each of the k neighbor distances sampled,
    compute the AUC for each CDF up to that point. Then, subtract
    from each area the previous area in the sorted order to get
    the AUC of the CDF on the interval between those two
    points. Subtract the background interval AUCs from the
    foreground interval AUCs, take the absolute value, and
    sum. The algorithm is vectorized such that this statistic is
    computed for each of the N datapoints simultaneously.

    The statistics are then normalized per datapoint and mapped to
    [0, 1] via ``tanh(-x) + 1``.

    Returns
    -------
    scores :
        One score per datapoint (length N).
    """
    N = self.N

    sorted_neighbors = np.sort(self.neighbor_index_distances, axis=1)

    # find the maximum distance that occurs with double probability
    # (after the two lines below, row i holds d = min(i, N - 1 - i))
    middle_idx = np.floor((N - 1) / 2).astype(int)
    double_distances = np.arange(N).reshape(N, 1)
    double_distances[double_distances > middle_idx] -= N - 1
    double_distances = np.abs(double_distances)

    # append the largest possible index distance (N - 1) as a final column
    sorted_neighbors = np.hstack([sorted_neighbors, np.ones((N, 1)) * (N - 1)]).astype(int)

    # the set of distances that are less than the double distance threshold
    set_beginning = sorted_neighbors <= double_distances
    # the set of distances that are greater than the double distance threshold but have nonzero probability
    set_middle = (sorted_neighbors > double_distances) & (
        sorted_neighbors <= (N - double_distances - 1)
    )
    # the set of distances that occur with 0 probability
    set_end = sorted_neighbors > (N - double_distances - 1)

    # previous sorted distance in each row (0 for the first column)
    shifted_neighbors = np.zeros(sorted_neighbors.shape)
    shifted_neighbors[:, 1:] = sorted_neighbors[:, :-1]
    diffs = sorted_neighbors - shifted_neighbors  # the distances between the sorted indices

    # closed-form background AUC accumulated over parts 1 and 2 of the CDF
    area_beginning = (double_distances**2) / (N - 1)
    length = N - 2 * double_distances - 1
    a = 2 * double_distances / (N - 1)
    area_middle = 0.5 * (a + 1) * length

    # compute the area under the CDF for each of the indices in sorted_neighbors
    background_area = np.zeros(diffs.shape)
    # NOTE(review): this zero initialisation is overwritten further down before being read.
    background_diffs = np.zeros(diffs.shape)
    background_area[set_beginning] = ((sorted_neighbors**2) / (N - 1))[set_beginning]
    background_area[set_middle] = (
        area_beginning
        + 0.5
        * (
            (sorted_neighbors + 3 * double_distances)
            * (sorted_neighbors - double_distances)
            / (N - 1)
        )
    )[set_middle]
    background_area[set_end] = (
        area_beginning + area_middle + (sorted_neighbors - (N - double_distances - 1) * 1.0)
    )[set_end]

    # compute the area under the CDF between indices in sorted_neighbors
    shifted_background = np.zeros(background_area.shape)
    shifted_background[:, 1:] = background_area[:, :-1]
    background_diffs = background_area - shifted_background

    # compute the foreground CDF and AUC between indices in sorted_neighbors
    foreground_cdf = np.arange(sorted_neighbors.shape[1]) / (sorted_neighbors.shape[1] - 1)
    foreground_diffs = foreground_cdf.reshape(1, -1) * diffs

    # compute the differences between foreground and background area intervals
    area_diffs = np.abs(foreground_diffs - background_diffs)
    stats = np.sum(area_diffs, axis=1)

    # normalize scores by the index and transform to [0, 1]
    # NOTE(review): the normalizer is max(i, N - i); an earlier version of this
    # docstring described it as the maximum possible distance (N - d - 1), which
    # is one smaller -- confirm which is intended.
    indices = np.arange(N)
    reverse = N - indices
    normalizer = np.where(indices > reverse, indices, reverse)

    scores = stats / normalizer
    scores = np.tanh(-1 * scores) + 1
    return scores
def _get_statistics(
    self,
    neighbor_index_distances,
) -> dict[str, float]:
    """Compute the KS statistic of the neighbor index distances.

    The foreground is the empirical CDF of the flattened, sorted index
    distances (with ``N - 1`` appended as a final value). The background
    distribution over distances ``1 .. N - 1`` is built once and cached on
    ``self.background_distribution``.

    Returns
    -------
    statistics :
        Dictionary with a single key ``"ks"``: the largest absolute gap
        between the foreground and background CDFs.
    """
    n_examples = self.N

    flat_distances = neighbor_index_distances.flatten()
    # Append the largest index distance, then force integer dtype for indexing.
    ordered = np.hstack([np.sort(flat_distances), np.ones((1)) * (n_examples - 1)]).astype(int)

    if self.background_distribution is None:
        pair_count = n_examples * (n_examples - 1) / 2
        self.background_distribution = (n_examples - np.arange(1, n_examples)) / pair_count

    background_cdf = np.cumsum(cast(np.ndarray, self.background_distribution))

    n_points = ordered.shape[0]
    foreground_cdf = np.arange(n_points) / (n_points - 1)

    # Distance j lives at position j - 1 of the background CDF.
    gap = np.abs(foreground_cdf - background_cdf[ordered - 1])
    return {"ks": np.max(gap)}
+# +# You should have received a copy of the GNU Affero General Public License +# along with cleanlab. If not, see . +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, ClassVar, Dict, Optional, Tuple, Union, cast + +from scipy.sparse import csr_matrix +from scipy.stats import iqr +import numpy as np +import numpy.typing as npt +import pandas as pd + +from cleanlab.datalab.issue_manager import IssueManager +from cleanlab.outlier import OutOfDistribution, transform_distances_to_scores + +if TYPE_CHECKING: # pragma: no cover + from sklearn.neighbors import NearestNeighbors + from cleanlab.datalab.datalab import Datalab + + +class OutlierIssueManager(IssueManager): + """Manages issues related to out-of-distribution examples.""" + + description: ClassVar[ + str + ] = """Examples that are very different from the rest of the dataset + (i.e. potentially out-of-distribution or rare/anomalous instances). + """ + issue_name: ClassVar[str] = "outlier" + verbosity_levels = { + 0: [], + 1: [], + 2: ["average_ood_score"], + 3: [], + } + + DEFAULT_THRESHOLDS = { + "features": 0.37037, + "pred_probs": 0.13, + } + """Default thresholds for outlier detection. + + If outlier detection is performed on the features, an example whose average + distance to their k nearest neighbors is greater than + Q3_avg_dist + (1 / threshold - 1) * IQR_avg_dist is considered an outlier. + + If outlier detection is performed on the predicted probabilities, an example + whose average score is lower than threshold * median_outlier_score is + considered an outlier. 
def find_issues(
    self,
    features: Optional[npt.NDArray] = None,
    pred_probs: Optional[np.ndarray] = None,
    **kwargs,
) -> None:
    """Score every example for outlierness and populate ``issues``, ``summary`` and ``info``.

    Scores come from, in order of preference: a usable knn graph (from
    ``kwargs`` or the Datalab statistics), ``features``, or ``pred_probs``.

    Parameters
    ----------
    features :
        Feature array, scored via ``_score_with_features``.

    pred_probs :
        Predicted probabilities, scored via ``_score_with_pred_probs``.

    kwargs :
        May contain ``knn_graph`` and ``k``; also forwarded to the scoring helpers.

    Raises
    ------
    ValueError
        If no usable knn graph, features, or pred_probs are available, or if a
        negative threshold is used with ``pred_probs``.
    """
    knn_graph = self._process_knn_graph_from_inputs(kwargs)
    distances: Optional[np.ndarray] = None

    if knn_graph is not None:
        N = knn_graph.shape[0]
        k = knn_graph.nnz // N
        t = cast(int, self.ood.params["t"])
        distances = knn_graph.data.reshape(-1, k)
        assert isinstance(distances, np.ndarray)
        scores = transform_distances_to_scores(distances, k=k, t=t)
    elif features is not None:
        scores = self._score_with_features(features, **kwargs)
    elif pred_probs is not None:
        scores = self._score_with_pred_probs(pred_probs, **kwargs)
    else:
        if kwargs.get("knn_graph", None) is not None:
            raise ValueError(
                "knn_graph is provided, but not sufficiently large to compute the scores based on the provided hyperparameters."
            )
        raise ValueError("Either features or pred_probs must be provided.")

    if features is not None or knn_graph is not None:
        if knn_graph is None:
            assert (
                features is not None
            ), "features must be provided so that we can compute the knn graph."
            knn_graph = self._process_knn_graph_from_features(kwargs)
            distances = knn_graph.data.reshape(knn_graph.shape[0], -1)

        assert isinstance(distances, np.ndarray)
        # Threshold based on the average distance to the nearest neighbors.
        (
            self.threshold,
            is_issue_column,
        ) = self._compute_threshold_and_issue_column_from_distances(distances, self.threshold)

    else:
        assert pred_probs is not None
        # Threshold based on pred_probs, very small scores are outliers
        if self.threshold is None:
            self.threshold = self.DEFAULT_THRESHOLDS["pred_probs"]
        if not 0 <= self.threshold:
            raise ValueError(f"threshold must be non-negative, but got {self.threshold}.")
        is_issue_column = scores < self.threshold * np.median(scores)

    self.issues = pd.DataFrame(
        {
            f"is_{self.issue_name}_issue": is_issue_column,
            self.issue_score_key: scores,
        },
    )

    self.summary = self.make_summary(score=scores.mean())

    self.info = self.collect_info(knn_graph=knn_graph)
def _process_knn_graph_from_features(self, kwargs: Dict) -> csr_matrix:
    """Return the knn graph to use after scoring with features.

    The graph stored in the Datalab statistics is returned unless a ``knn``
    entry is present in ``kwargs`` or the knn held in ``self.ood.params`` uses
    more neighbors than the stored graph has per row. In that case the graph
    is taken from that knn and its metric is remembered on ``self._metric``.
    """
    stored_graph = self.datalab.get_info("statistics").get("weighted_knn_graph", None)

    # Neighbors per row of the stored graph; 0 when nothing is stored so that
    # any knn object counts as "larger".
    stored_k: int = 0 if stored_graph is None else stored_graph.nnz // stored_graph.shape[0]

    knn: NearestNeighbors = self.ood.params["knn"]  # type: ignore
    knn_in_kwargs = kwargs.get("knn", None) is not None
    if not knn_in_kwargs and not knn.n_neighbors > stored_k:  # type: ignore[union-attr]
        return stored_graph

    # The stored graph is missing or too small (or a knn was passed
    # explicitly): rebuild the graph from the knn object.
    assert knn == self.ood.params["knn"]  # type: ignore[union-attr]
    rebuilt_graph = knn.kneighbors_graph(mode="distance")  # type: ignore[union-attr]
    self._metric = knn.metric  # type: ignore[union-attr]
    return rebuilt_graph
def _build_statistics_dictionary(
    self, *, knn_graph: Optional[csr_matrix]
) -> Dict[str, Dict[str, Any]]:
    """Build the ``{"statistics": {...}}`` entry to merge into the info dict.

    The graph and metric are recorded only when the new graph is preferred
    over the stored one: nothing is stored yet, the new graph is a
    ``csr_matrix`` with more non-zero entries, or ``self._metric`` differs
    from the stored metric.
    """
    graph_key = "weighted_knn_graph"
    result: Dict[str, Dict[str, Any]] = {"statistics": {}}

    stored_graph = self.datalab.get_info("statistics").get(graph_key, None)

    # Evaluate the replacement criteria lazily, in order.
    replace = stored_graph is None
    if not replace and isinstance(knn_graph, csr_matrix):
        replace = knn_graph.nnz > stored_graph.nnz
    if not replace:
        replace = self._metric != self.datalab.get_info("statistics").get("knn_metric", None)

    if not replace:
        return result

    entries = result["statistics"]
    if knn_graph is not None:
        entries[graph_key] = knn_graph
    if self._metric is not None:
        entries["knn_metric"] = self._metric
    return result
class Reporter:
    """Class that generates a report about the issues stored in a :py:class:`DataIssues` object.

    Parameters
    ----------
    data_issues :
        The :py:class:`DataIssues` object containing the issues to report on. This is usually
        generated by the :py:class:`Datalab` class, stored in the :py:attr:`data_issues` attribute,
        and then passed to the :py:class:`Reporter` class to generate a report.

    verbosity :
        The default verbosity of the report to generate. Each :py:class:`IssueManager`
        specifies the available verbosity levels and what additional information
        is included at each level.

    include_description :
        Whether to include the description of each issue type in the report. The description
        is included by default, but can be excluded by setting this parameter to ``False``.

    Note
    ----
    This class is not intended to be used directly. It is normally handed the
    :py:class:`DataIssues` object owned by a :py:class:`Datalab` instance rather
    than being constructed by user code.
    """

    def __init__(
        self, data_issues: "DataIssues", verbosity: int = 1, include_description: bool = True
    ):
        self.data_issues = data_issues
        self.verbosity = verbosity
        self.include_description = include_description

    def get_report(self, num_examples: int) -> str:
        """Constructs a report about identified issues in the data.

        The report starts with the issue summary table, sorted by the number
        of issues in descending order, followed by one section per issue type
        in that same order.

        Parameters
        ----------
        num_examples :
            The number of examples to include in the report for each issue type.


        Returns
        -------
        report_str :
            A string containing the report.

        Examples
        --------
        >>> from cleanlab.datalab.report import Reporter
        >>> reporter = Reporter(data_issues=data_issues, include_description=False)
        >>> report_str = reporter.get_report(num_examples=5)
        >>> print(report_str)
        """
        issue_summary = self.data_issues.issue_summary
        issue_summary_sorted = issue_summary.sort_values(by="num_issues", ascending=False)
        header = self._write_summary(summary=issue_summary_sorted)

        # One report section per issue type, produced by the matching issue manager class.
        issue_reports = [
            _IssueManagerFactory.from_str(issue_type=key).report(
                issues=self.data_issues.get_issues(issue_name=key),
                summary=self.data_issues.get_summary(issue_name=key),
                info=self.data_issues.get_info(issue_name=key),
                num_examples=num_examples,
                verbosity=self.verbosity,
                include_description=self.include_description,
            )
            for key in issue_summary_sorted["issue_type"].tolist()
        ]

        return header + "\n\n\n".join(issue_reports)

    def _write_summary(self, summary: pd.DataFrame) -> str:
        """Render the summary table (without its index) between a heading and a note on scores."""
        return (
            "Here is a summary of the different kinds of issues found in the data:\n\n"
            + summary.to_string(index=False)
            + "\n\n"
            + "(Note: A lower score indicates a more severe issue across all examples in the dataset.)\n\n\n"
        )
class _Serializer:
    """Saves a ``Datalab`` object to a folder on disk and loads it back.

    A saved folder contains the pickled object (``OBJECT_FILENAME``), CSV copies
    of the issue tables (``ISSUES_FILENAME`` and ``ISSUE_SUMMARY_FILENAME``) and
    the dataset itself in the ``DATA_DIRNAME`` sub-folder.
    """

    @staticmethod
    def _save_data_issues(path: str, datalab: Datalab) -> None:
        """Write the issues and issue-summary tables as CSV files inside ``path``.

        Both tables are written without their index column.
        """
        issues_path = os.path.join(path, ISSUES_FILENAME)
        datalab.data_issues.issues.to_csv(issues_path, index=False)

        issue_summary_path = os.path.join(path, ISSUE_SUMMARY_FILENAME)
        datalab.data_issues.issue_summary.to_csv(issue_summary_path, index=False)

    @staticmethod
    def _save_data(path: str, datalab: Datalab) -> None:
        """Save the dataset to the ``DATA_DIRNAME`` sub-folder of ``path``.

        Delegates to ``datalab.data.save_to_disk``.
        """
        data_path = os.path.join(path, DATA_DIRNAME)
        datalab.data.save_to_disk(data_path)

    @staticmethod
    def _validate_version(datalab: Datalab) -> None:
        """Warn if ``datalab`` was saved by a different cleanlab version.

        Only a warning is issued; loading continues regardless.
        """
        current_version = cleanlab.__version__  # type: ignore[attr-defined]
        datalab_version = datalab.cleanlab_version
        if current_version != datalab_version:
            warnings.warn(
                f"Saved Datalab was created using different version of cleanlab "
                f"({datalab_version}) than current version ({current_version}). "
                f"Things may be broken!"
            )

    @classmethod
    def serialize(cls, path: str, datalab: Datalab, force: bool) -> None:
        """Serializes the datalab object to disk.

        Parameters
        ----------
        path : str
            Folder to save the datalab object to. It is created, together with
            any missing parent folders, if it does not exist yet.

        datalab : Datalab
            The datalab object to save.

        force : bool
            If True, will overwrite existing files at the specified path.

        Raises
        ------
        FileExistsError
            If ``path`` already exists and ``force`` is False.
        """
        if os.path.exists(path):
            if not force:
                raise FileExistsError("Please specify a new path or set force=True")
            print(f"WARNING: Existing files will be overwritten by newly saved files at: {path}")
        else:
            # makedirs rather than mkdir: a nested target such as "runs/exp1/lab"
            # must not fail just because its parent folders are missing.
            os.makedirs(path)

        # Save the datalab object to disk.
        with open(os.path.join(path, OBJECT_FILENAME), "wb") as f:
            pickle.dump(datalab, f)

        # Save the issue tables next to the pickle as plain CSV files.
        cls._save_data_issues(path=path, datalab=datalab)

        # Save the dataset to disk.
        cls._save_data(path=path, datalab=datalab)

    @classmethod
    def deserialize(cls, path: str, data: Optional[Dataset] = None) -> Datalab:
        """Deserializes the datalab object from disk.

        Parameters
        ----------
        path : str
            Folder previously written by :py:meth:`serialize`.

        data : Dataset, optional
            Dataset to attach to the loaded object. When given, it must hash
            equal to the data stored in the saved object and have as many rows
            as the saved object has labels.

        Returns
        -------
        datalab : Datalab
            The restored object.

        Raises
        ------
        ValueError
            If ``path`` does not exist, or if ``data`` is given but its hash or
            length does not match the saved object.
        """

        if not os.path.exists(path):
            raise ValueError(f"No folder found at specified path: {path}")

        # SECURITY: unpickling executes arbitrary code embedded in the file.
        # Only load folders that come from a trusted source.
        with open(os.path.join(path, OBJECT_FILENAME), "rb") as f:
            datalab: Datalab = pickle.load(f)

        cls._validate_version(datalab)

        # Fall back to the CSV copies only when the unpickled object lacks the tables.
        issues_path = os.path.join(path, ISSUES_FILENAME)
        if not hasattr(datalab.data_issues, "issues") and os.path.exists(issues_path):
            datalab.data_issues.issues = pd.read_csv(issues_path)

        issue_summary_path = os.path.join(path, ISSUE_SUMMARY_FILENAME)
        if not hasattr(datalab.data_issues, "issue_summary") and os.path.exists(issue_summary_path):
            datalab.data_issues.issue_summary = pd.read_csv(issue_summary_path)

        if data is not None:
            # NOTE(review): this compares the hash of the raw dataset with the hash of
            # the stored Data wrapper; presumably Data.__hash__ is defined to make the
            # two comparable - confirm against cleanlab.datalab.data.
            if hash(data) != hash(datalab._data):
                raise ValueError(
                    "Data has been modified since Lab was saved. "
                    "Cannot load Lab with modified data."
                )

            if len(data) != len(datalab.labels):
                raise ValueError(
                    f"Length of data ({len(data)}) does not match length of labels ({len(datalab.labels)})"
                )

            datalab._data = Data(data, datalab.label_name)
            datalab.data = datalab._data._data

        return datalab
automodule:: cleanlab.datalab.data_issues + :autosummary: + :members: + :undoc-members: + :show-inheritance: + :ignore-module-all: \ No newline at end of file diff --git a/docs/source/cleanlab/datalab/datalab.rst b/docs/source/cleanlab/datalab/datalab.rst new file mode 100644 index 0000000000..8a38a27f95 --- /dev/null +++ b/docs/source/cleanlab/datalab/datalab.rst @@ -0,0 +1,9 @@ +datalab +======= + +.. automodule:: cleanlab.datalab.datalab + :autosummary: + :members: + :undoc-members: + :show-inheritance: + :ignore-module-all: \ No newline at end of file diff --git a/docs/source/cleanlab/datalab/factory.rst b/docs/source/cleanlab/datalab/factory.rst new file mode 100644 index 0000000000..d65d1eac5d --- /dev/null +++ b/docs/source/cleanlab/datalab/factory.rst @@ -0,0 +1,9 @@ +factory +======= + +.. automodule:: cleanlab.datalab.factory + :autosummary: + :members: + :undoc-members: + :show-inheritance: + :ignore-module-all: \ No newline at end of file diff --git a/docs/source/cleanlab/datalab/guide/custom_issue_manager.rst b/docs/source/cleanlab/datalab/guide/custom_issue_manager.rst new file mode 100644 index 0000000000..d28dccf292 --- /dev/null +++ b/docs/source/cleanlab/datalab/guide/custom_issue_manager.rst @@ -0,0 +1,227 @@ +.. _issue_manager_creating_your_own: + +Creating Your Own Issues Manager +================================ + + + +This guide walks through the process of creating your own +:py:class:`IssueManager <cleanlab.datalab.issue_manager.issue_manager.IssueManager>` +to detect a custom-defined type of issue alongside the pre-defined issue types in +:py:class:`Datalab <cleanlab.datalab.datalab.Datalab>`. + +.. seealso:: + + - :py:meth:`register <cleanlab.datalab.factory.register>`: + You can either use this function at runtime to register a new issue manager: + + .. code-block:: python + + from cleanlab.datalab.factory import register + register(MyIssueManager) + + or add as a decorator to the class definition: + + .. code-block:: python + + @register + class MyIssueManager(IssueManager): + ... 
+ +Prerequisites +------------- + +As a starting point for this guide, we'll import the necessary things for the next section and create a dummy dataset. + +.. note:: + + .. include:: ../optional_dependencies.rst + +.. code-block:: python + + + import numpy as np + import pandas as pd + from cleanlab import IssueManager + + # Create a dummy dataset + N = 20 + data = pd.DataFrame( + { + "text": [f"example {i}" for i in range(N)], + "label": np.random.randint(0, 2, N), + }, + ) + + +Implementing IssueManagers +-------------------------- + +.. _basic_issue_manager: + +Basic Issue Check +~~~~~~~~~~~~~~~~~ + + +To create a basic issue manager, inherit from the +:py:class:`IssueManager ` class, +assign a name to the class as the class-variable, `issue_name`, and implement the +:py:meth:`find_issues ` method. + +The :py:meth:`find_issues ` +method should mark each example in the dataset as an issue or not with a boolean array. +It should also provide a score for each example in the dataset that quantifies the quality of the example +with regards to the issue. + +.. code-block:: python + + class Basic(IssueManager): + # Assign a name to the issue + issue_name = "basic" + def find_issues(self, **kwargs) -> None: + # Compute scores for each example + scores = np.random.rand(len(self.datalab.data)) + + # Construct a dataframe where examples are marked for issues + # and the score for each example is included. + self.issues = pd.DataFrame( + { + f"is_{self.issue_name}_issue" : scores < 0.1, + self.issue_score_key : scores, + }, + ) + + # Score the dataset as a whole based on this issue type + self.summary = self.get_summary(score = scores.mean()) + + +.. _intermediate_issue_manager: + +Intermediate Issue Check +~~~~~~~~~~~~~~~~~~~~~~~~ + + +To create an intermediate issue: + +- Perform the same steps as in the :ref:`basic issue check ` section. +- Populate the `info` attribute with a dictionary of information about the identified issues. 
+ +The information can be included in a report generated by :py:class:`Datalab `, +if you add any of the keys to the `verbosity_levels` class-attribute. +Optionally, you can also add a description of the type of issue this issue manager handles to the `description` class-attribute. + +.. code-block:: python + + class Intermediate(IssueManager): + issue_name = "intermediate" + # Add a dictionary of information to include in the report + verbosity_levels = { + 0: [], + 1: ["std"], + 2: ["raw_scores"], + } + # Add a description of the issue + description = "Intermediate issues are a bit more involved than basic issues." + def find_issues(self, *, intermediate_arg: int, **kwargs) -> None: + N = len(self.datalab.data) + raw_scores = np.random.rand(N) + std = raw_scores.std() + threshold = min(0, raw_scores.mean() - std) + sin_filter = np.sin(intermediate_arg * np.arange(N) / N) + kernel = sin_filter ** 2 + scores = kernel * raw_scores + self.issues = pd.DataFrame( + { + f"is_{self.issue_name}_issue" : scores < threshold, + self.issue_score_key : scores, + }, + ) + self.summary = self.get_summary(score = scores.mean()) + + # Useful information that will be available in the Datalab instance + self.info = { + "std": std, + "raw_scores": raw_scores, + "kernel": kernel, + } + +Advanced Issue Check +~~~~~~~~~~~~~~~~~~~~ + +.. note:: + + WIP: This section is a work in progress. + + + +Use with Datalab +---------------- + +We can create a +:py:class:`Datalab ` +instance and run issue checks with the custom issue managers we created like so: + + +.. 
code-block:: python + + from cleanlab.datalab.factory import register + from cleanlab import Datalab + + + # Register the issue manager + for issue_manager in [Basic, Intermediate]: + register(issue_manager) + + # Instantiate a datalab instance + datalab = Datalab(data, label_name="label") + + # Run the issue check + issue_types = {"basic": {}, "intermediate": {"intermediate_arg": 2}} + datalab.find_issues(issue_types=issue_types) + + # Print report + datalab.report(verbosity=0) + + +The report will look something like this: + +.. code-block:: text + + Here is a summary of the different kinds of issues found in the data: + + issue_type score num_issues + basic 0.477762 2 + intermediate 0.286455 0 + + (Note: A lower score indicates a more severe issue across all examples in the dataset.) + + + ------------------------------------------- basic issues ------------------------------------------- + + Number of examples with this issue: 2 + Overall dataset quality in terms of this issue: : 0.4778 + + Examples representing most severe instances of this issue: + is_basic_issue basic_score + 13 True 0.003042 + 8 True 0.058117 + 11 False 0.121908 + 15 False 0.169312 + 17 False 0.229044 + + + --------------------------------------- intermediate issues ---------------------------------------- + + About this issue: + Intermediate issues are a bit more involved than basic issues. 
+ + Number of examples with this issue: 0 + Overall dataset quality in terms of this issue: : 0.2865 + + Examples representing most severe instances of this issue: + is_intermediate_issue intermediate_score kernel + 0 False 0.000000 0.0 + 1 False 0.007059 0.009967 + 3 False 0.010995 0.087332 + 2 False 0.016296 0.03947 + 11 False 0.019459 0.794251 diff --git a/docs/source/cleanlab/datalab/guide/index.rst b/docs/source/cleanlab/datalab/guide/index.rst new file mode 100644 index 0000000000..769b8cc3f7 --- /dev/null +++ b/docs/source/cleanlab/datalab/guide/index.rst @@ -0,0 +1,19 @@ +Datalab guides +============== + +This page contains a list of guides for using Datalab. + +.. note:: + + .. include:: ../optional_dependencies.rst + + +Developer guides +---------------- + +These guides are for developers who want to contribute to Datalab. + +.. toctree:: + :maxdepth: 3 + + custom_issue_manager \ No newline at end of file diff --git a/docs/source/cleanlab/datalab/index.rst b/docs/source/cleanlab/datalab/index.rst new file mode 100644 index 0000000000..9f1b04dfba --- /dev/null +++ b/docs/source/cleanlab/datalab/index.rst @@ -0,0 +1,39 @@ +datalab +======= + +.. warning:: + Methods in this ``datalab`` module are bleeding edge and may have sharp edges. They are not guaranteed to be stable between different ``cleanlab`` versions. + +.. automodule:: cleanlab.datalab + :autosummary: + :members: + :undoc-members: + :show-inheritance: + +Getting Started +--------------- + +.. include:: optional_dependencies.rst + +Guides +------ + +.. toctree:: + :maxdepth: 2 + + guide/index + + +API Reference +------------- + +.. 
toctree:: + :maxdepth: 2 + + datalab + data + data_issues + issue_finder + factory + issue_manager/index + report diff --git a/docs/source/cleanlab/datalab/issue_finder.rst b/docs/source/cleanlab/datalab/issue_finder.rst new file mode 100644 index 0000000000..d5540dd90b --- /dev/null +++ b/docs/source/cleanlab/datalab/issue_finder.rst @@ -0,0 +1,12 @@ +issue_finder +============ + +.. note:: This module is not intended to be used directly by users. It is used by the :mod:`cleanlab.datalab.datalab` module. + Specifically, it is used by the :py:meth:`Datalab.find_issues ` method. + +.. automodule:: cleanlab.datalab.issue_finder + :autosummary: + :members: + :undoc-members: + :show-inheritance: + :ignore-module-all: \ No newline at end of file diff --git a/docs/source/cleanlab/datalab/issue_manager/duplicate.rst b/docs/source/cleanlab/datalab/issue_manager/duplicate.rst new file mode 100644 index 0000000000..37d6bc8c9b --- /dev/null +++ b/docs/source/cleanlab/datalab/issue_manager/duplicate.rst @@ -0,0 +1,9 @@ +duplicate +========= + + +.. automodule:: cleanlab.datalab.issue_manager.duplicate + :autosummary: + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/cleanlab/datalab/issue_manager/index.rst b/docs/source/cleanlab/datalab/issue_manager/index.rst new file mode 100644 index 0000000000..92815b8c9c --- /dev/null +++ b/docs/source/cleanlab/datalab/issue_manager/index.rst @@ -0,0 +1,13 @@ +issue_manager +============= + +.. warning:: + Methods in this ``issue_manager`` module are bleeding edge and may have sharp edges. They are not guaranteed to be stable between different ``cleanlab`` versions. + + +.. 
toctree:: + Base issue_manager module + label + outlier + duplicate + noniid \ No newline at end of file diff --git a/docs/source/cleanlab/datalab/issue_manager/issue_manager.rst b/docs/source/cleanlab/datalab/issue_manager/issue_manager.rst new file mode 100644 index 0000000000..a0974cf86a --- /dev/null +++ b/docs/source/cleanlab/datalab/issue_manager/issue_manager.rst @@ -0,0 +1,8 @@ +issue_manager +============= + +.. automodule:: cleanlab.datalab.issue_manager.issue_manager + :autosummary: + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/cleanlab/datalab/issue_manager/label.rst b/docs/source/cleanlab/datalab/issue_manager/label.rst new file mode 100644 index 0000000000..334bce732f --- /dev/null +++ b/docs/source/cleanlab/datalab/issue_manager/label.rst @@ -0,0 +1,8 @@ +label +===== + +.. automodule:: cleanlab.datalab.issue_manager.label + :autosummary: + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/cleanlab/datalab/issue_manager/noniid.rst b/docs/source/cleanlab/datalab/issue_manager/noniid.rst new file mode 100644 index 0000000000..c93df82679 --- /dev/null +++ b/docs/source/cleanlab/datalab/issue_manager/noniid.rst @@ -0,0 +1,9 @@ +noniid +======= + + +.. automodule:: cleanlab.datalab.issue_manager.noniid + :autosummary: + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/cleanlab/datalab/issue_manager/outlier.rst b/docs/source/cleanlab/datalab/issue_manager/outlier.rst new file mode 100644 index 0000000000..c2ff3ab62c --- /dev/null +++ b/docs/source/cleanlab/datalab/issue_manager/outlier.rst @@ -0,0 +1,9 @@ +outlier +======= + + +.. 
automodule:: cleanlab.datalab.issue_manager.outlier + :autosummary: + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/cleanlab/datalab/optional_dependencies.rst b/docs/source/cleanlab/datalab/optional_dependencies.rst new file mode 100644 index 0000000000..84378be950 --- /dev/null +++ b/docs/source/cleanlab/datalab/optional_dependencies.rst @@ -0,0 +1,11 @@ +This package has additional dependencies that are not required for the core ``cleanlab`` package. To install them, run: + +.. code-block:: console + + $ pip install cleanlab[datalab] + +For the developmental version of the package, install from source: + +.. code-block:: console + + $ pip install git+https://github.com/cleanlab/cleanlab.git#egg=cleanlab[datalab] \ No newline at end of file diff --git a/docs/source/cleanlab/datalab/report.rst b/docs/source/cleanlab/datalab/report.rst new file mode 100644 index 0000000000..3f5f84d4d6 --- /dev/null +++ b/docs/source/cleanlab/datalab/report.rst @@ -0,0 +1,12 @@ +report +====== + +.. note:: This module is not intended to be used directly by users. It is used by the :mod:`cleanlab.datalab.datalab` module. + Specifically, it is used by the :py:meth:`Datalab.report ` method. + +.. automodule:: cleanlab.datalab.report + :autosummary: + :members: + :undoc-members: + :show-inheritance: + :ignore-module-all: \ No newline at end of file diff --git a/docs/source/index.rst b/docs/source/index.rst index c7546f9563..78ba8aee75 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -126,6 +126,7 @@ Please see our `contributing guidelines Workflows of Data-Centric AI Image Classification (pytorch) Text Classification (tensorflow) @@ -151,8 +152,9 @@ Please see our `contributing guidelines \n", + "Quickstart\n", + "
    \n", + " \n", + "Already have a `model`? Run cross-validation to get out-of-sample `pred_probs`, and then run the code below to audit your dataset and identify any potential issues.\n", + "\n", + "\n", + "
    \n", + " \n", + "```python\n", + "\n", + "from cleanlab import Datalab\n", + "\n", + "lab = Datalab(data=your_dataset, label_name=\"column_name_of_labels\")\n", + "lab.find_issues(pred_probs=your_pred_probs, issue_types={\"label\":{}})\n", + "\n", + "lab.get_issues(\"label\")\n", + " \n", + "```\n", + " \n", + "
    \n", + "
    " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "eqsqBq3PiUHA" + }, + "source": [ + "## 1. Install dependencies and import them\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "i7nT-U9qc8MS" + }, + "source": [ + "You can use `pip` to install all packages required for this tutorial as follows:\n", + "\n", + "```ipython3\n", + "!pip install speechbrain tensorflow sklearn tensorflow_io\n", + "!pip install cleanlab[datalab]\n", + "# Make sure to install the version corresponding to this tutorial\n", + "# E.g. if viewing master branch documentation:\n", + "# !pip install git+https://github.com/cleanlab/cleanlab.git\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "nbsphinx": "hidden" + }, + "outputs": [], + "source": [ + "# Package installation (hidden on docs website).\n", + "# Package versions used: tensorflow==2.9.1 speechbrain==0.5.12 tensorflow-io==0.26.0 torch==1.11.0 torchaudio==0.11.0\n", + "\n", + "dependencies = [\"cleanlab\", \"sklearn\", \"speechbrain==0.5.12\", \"tensorflow==2.9.1\", \"tensorflow_io==0.26.0\", \"huggingface_hub==0.7.0\", \"datasets\"]\n", + "\n", + "# Supress outputs that may appear if tensorflow happens to be improperly installed: \n", + "import os \n", + "os.environ[\"TF_CPP_MIN_LOG_LEVEL\"] = \"3\" \n", + "\n", + "if \"google.colab\" in str(get_ipython()): # Check if it's running in Google Colab\n", + " %pip install cleanlab # for colab\n", + " cmd = ' '.join([dep for dep in dependencies if dep != \"cleanlab\"])\n", + " %pip install $cmd\n", + "else:\n", + " missing_dependencies = []\n", + " for dependency in dependencies:\n", + " try:\n", + " __import__(dependency)\n", + " except ImportError:\n", + " missing_dependencies.append(dependency)\n", + "\n", + " if len(missing_dependencies) > 0:\n", + " print(\"Missing required dependencies:\")\n", + " print(*missing_dependencies, sep=\", \")\n", + " print(\"\\nPlease install them before running the rest of 
this notebook.\") " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "x-oboEbRdhf6" + }, + "source": [ + "Let's import some of the packages needed throughout this tutorial.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "LaEiwXUiVHCS" + }, + "outputs": [], + "source": [ + "import os\n", + "import pandas as pd\n", + "import numpy as np\n", + "import random\n", + "import tensorflow as tf\n", + "import torch\n", + "\n", + "from cleanlab import Datalab\n", + "\n", + "SEED = 456 # ensure reproducibility" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "nbsphinx": "hidden" + }, + "outputs": [], + "source": [ + "# This (optional) cell is hidden from docs.cleanlab.ai \n", + "\n", + "def set_seed(seed=0):\n", + " \"\"\"Ensure reproducibility.\"\"\"\n", + " np.random.seed(seed)\n", + " torch.manual_seed(seed)\n", + " torch.backends.cudnn.deterministic = True\n", + " torch.backends.cudnn.benchmark = False\n", + " torch.cuda.manual_seed_all(seed)\n", + "\n", + "\n", + "set_seed(SEED)\n", + "pd.options.display.max_colwidth = 500\n", + "tf.get_logger().setLevel('FATAL') # suppress more TF logs" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SOen_sxQidLC" + }, + "source": [ + "## 2. Load the data\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "uHVskN2eeNj6" + }, + "source": [ + "We must first fetch the dataset. 
To run the below command, you'll need to have `wget` installed; alternatively you can manually navigate to the link in your browser and download from there.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "GRDPEg7-VOQe", + "outputId": "cb886220-e86e-4a77-9f3a-d7844c37c3a6" + }, + "outputs": [], + "source": [ + "%%capture\n", + "\n", + "!wget https://github.com/Jakobovski/free-spoken-digit-dataset/archive/v1.0.9.tar.gz\n", + "!mkdir spoken_digits\n", + "!tar -xf v1.0.9.tar.gz -C spoken_digits" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tRvNnyB0e_IE" + }, + "source": [ + "The audio data are .wav files in the `recordings/` folder. Note that the label for each audio clip (i.e. digit from 0 to 9) is indicated in the prefix of the file name (e.g. `6_nicolas_32.wav` has the label 6). If instead applying cleanlab to your own dataset, its classes should be represented as integer indices 0, 1, ..., num_classes - 1." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "FDA5sGZwUSur", + "outputId": "0cedc509-63fd-4dc3-d32f-4b537dfe3895" + }, + "outputs": [], + "source": [ + "DATA_PATH = \"spoken_digits/free-spoken-digit-dataset-1.0.9/recordings/\"\n", + "\n", + "# Get list of .wav file names\n", + "# os.listdir order is nondeterministic, so for reproducibility,\n", + "# we sort first and then do a deterministic shuffle\n", + "file_names = sorted(i for i in os.listdir(DATA_PATH) if i.endswith(\".wav\"))\n", + "random.Random(SEED).shuffle(file_names)\n", + "\n", + "file_paths = [os.path.join(DATA_PATH, name) for name in file_names]\n", + "\n", + "# Check out first 3 files\n", + "file_paths[:3]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Xi2592bVhSab" + }, + "source": [ + "Let's listen to some example audio clips from the dataset. 
We introduce a `display_example` function to process the .wav file so we can listen to it in this notebook (can skip these details)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
    See the implementation of `display_example` **(click to expand)**\n", + "\n", + "```python\n", + "# Note: This pulldown content is for docs.cleanlab.ai, if running on local Jupyter or Colab, please ignore it.\n", + "\n", + "import tensorflow_io as tfio\n", + "from pathlib import Path\n", + "from IPython import display\n", + "\n", + "# Utility function for loading audio files and making sure the sample rate is correct.\n", + "@tf.function\n", + "def load_wav_16k_mono(filename):\n", + " \"\"\"Load a WAV file, convert it to a float tensor, resample to 16 kHz single-channel audio.\"\"\"\n", + " file_contents = tf.io.read_file(filename)\n", + " wav, sample_rate = tf.audio.decode_wav(file_contents, desired_channels=1)\n", + " wav = tf.squeeze(wav, axis=-1)\n", + " sample_rate = tf.cast(sample_rate, dtype=tf.int64)\n", + " wav = tfio.audio.resample(wav, rate_in=sample_rate, rate_out=16000)\n", + " return wav\n", + "\n", + "\n", + "def display_example(wav_file_name, audio_rate=16000):\n", + " \"\"\"Allows us to listen to any wav file and displays its given label in the dataset.\"\"\"\n", + " wav_file_example = load_wav_16k_mono(wav_file_name)\n", + " label = Path(wav_file_name).parts[-1].split(\"_\")[0]\n", + " print(f\"Given label for this example: {label}\")\n", + " display.display(display.Audio(wav_file_example, rate=audio_rate))\n", + "```\n", + "\n", + "
    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "nbsphinx": "hidden" + }, + "outputs": [], + "source": [ + "import tensorflow_io as tfio\n", + "from pathlib import Path\n", + "from IPython import display\n", + "\n", + "# Utility function for loading audio files and making sure the sample rate is correct.\n", + "@tf.function\n", + "def load_wav_16k_mono(filename):\n", + " \"\"\"Load a WAV file, convert it to a float tensor, resample to 16 kHz single-channel audio.\"\"\"\n", + " file_contents = tf.io.read_file(filename)\n", + " wav, sample_rate = tf.audio.decode_wav(file_contents, desired_channels=1)\n", + " wav = tf.squeeze(wav, axis=-1)\n", + " sample_rate = tf.cast(sample_rate, dtype=tf.int64)\n", + " wav = tfio.audio.resample(wav, rate_in=sample_rate, rate_out=16000)\n", + " return wav\n", + "\n", + "\n", + "def display_example(wav_file_name, audio_rate=16000):\n", + " \"\"\"Allows us to listen to any wav file and displays its given label in the dataset.\"\"\"\n", + " wav_file_example = load_wav_16k_mono(wav_file_name)\n", + " label = Path(wav_file_name).parts[-1].split(\"_\")[0]\n", + " print(f\"Given label for this example: {label}\")\n", + " display.display(display.Audio(wav_file_example, rate=audio_rate))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2bLlDRI6hzon" + }, + "source": [ + "Click the play button below to listen to this example .wav file. 
Feel free to change the `wav_file_name_example` variable below to listen to other audio clips in the dataset.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 92 + }, + "id": "dLBvUZLlII5w", + "outputId": "c6a4917f-4a82-4a89-9193-415072e45550" + }, + "outputs": [], + "source": [ + "wav_file_name_example = \"spoken_digits/free-spoken-digit-dataset-1.0.9/recordings/7_jackson_43.wav\" # change this to hear other examples\n", + "display_example(wav_file_name_example)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-QvbZA7yHwkh" + }, + "source": [ + "## 3. Use pre-trained SpeechBrain model to featurize audio\n", + "\n", + "The [SpeechBrain](https://github.com/speechbrain/speechbrain) package offers many Pytorch neural networks that have been pretrained for speech recognition tasks. Here we instantiate an audio feature extractor using SpeechBrain's `EncoderClassifier`. We'll use the \"spkrec-xvect-voxceleb\" network which has been pre-trained on the [VoxCeleb](https://www.robots.ox.ac.uk/~vgg/data/voxceleb/) speech dataset.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "vL9lkiKsHvKr" + }, + "outputs": [], + "source": [ + "%%capture\n", + "\n", + "from speechbrain.pretrained import EncoderClassifier\n", + "\n", + "feature_extractor = EncoderClassifier.from_hparams(\n", + " \"speechbrain/spkrec-xvect-voxceleb\",\n", + " # run_opts={\"device\":\"cuda\"} # Uncomment this to run on GPU if you have one (optional)\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vXlE6IK4ibcr" + }, + "source": [ + "Next, we run the audio clips through the pre-trained model to extract vector features (aka embeddings).\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 143 + }, + "id": "obQYDKdLiUU6", + "outputId": 
"4e923d5c-2cf4-4a5c-827b-0a4fea9d87e4" + }, + "outputs": [], + "source": [ + "# Create dataframe with .wav file names\n", + "df = pd.DataFrame(file_paths, columns=[\"wav_audio_file_path\"])\n", + "df[\"label\"] = df.wav_audio_file_path.map(lambda x: int(Path(x).parts[-1].split(\"_\")[0]))\n", + "df.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "I8JqhOZgi94g" + }, + "outputs": [], + "source": [ + "import torchaudio\n", + "\n", + "def extract_audio_embeddings(model, wav_audio_file_path: str) -> tuple:\n", + " \"\"\"Feature extractor that embeds audio into a vector.\"\"\"\n", + " signal, fs = torchaudio.load(wav_audio_file_path) # Reformat audio signal into a tensor\n", + " embeddings = model.encode_batch(\n", + " signal\n", + " ) # Pass tensor through pretrained neural net and extract representation\n", + " return embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2FSQ2GR9R_YA" + }, + "outputs": [], + "source": [ + "# Extract audio embeddings\n", + "embeddings_list = []\n", + "for i, file_name in enumerate(df.wav_audio_file_path): # for each .wav file name\n", + " embeddings = extract_audio_embeddings(feature_extractor, file_name)\n", + " embeddings_list.append(embeddings.cpu().numpy())\n", + "\n", + "embeddings_array = np.squeeze(np.array(embeddings_list))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dELkcdXgjTn_" + }, + "source": [ + "Now we have our features in a 2D numpy array. Each row in the array corresponds to an audio clip. 
We're now able to represent each audio clip as a 512-dimensional feature vector!\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "kAkY31IVXyr8", + "outputId": "fd70d8d6-2f11-48d5-ae9c-a8c97d453632" + }, + "outputs": [], + "source": [ + "print(embeddings_array)\n", + "print(\"Shape of array: \", embeddings_array.shape)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "o4RBcaARmfVG" + }, + "source": [ + "## 4. Fit linear model and compute out-of-sample predicted probabilities\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "y9BIVyI9kHa4" + }, + "source": [ + "A typical way to leverage pretrained networks for a particular classification task is to add a linear output layer and fine-tune the network parameters on the new data. However this can be computationally intensive. Alternatively, we can freeze the pretrained weights of the network and only train the output layer without having to rely on GPU(s). Here we do this conveniently by fitting a scikit-learn linear model on top of the extracted network embeddings.\n", + "\n", + "To identify label issues, cleanlab requires a probabilistic prediction from your model for every datapoint that should be considered. However these predictions will be _overfit_ (and thus unreliable) for datapoints the model was previously trained on. cleanlab is intended to only be used with **out-of-sample** predicted probabilities, i.e. on datapoints held-out from the model during the training.\n", + "\n", + "K-fold cross-validation is a straightforward way to produce out-of-sample predicted probabilities for every datapoint in the dataset, by training K copies of our model on different data subsets and using each copy to predict on the subset of data it did not see during training. 
An additional benefit of cross-validation is that it provides more reliable evaluation of our model than a single training/validation split. We can obtain cross-validated out-of-sample predicted probabilities from any classifier via the [cross_val_predict](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_predict.html) wrapper provided in scikit-learn.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "i_drkY9YOcw4" + }, + "outputs": [], + "source": [ + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.model_selection import cross_val_predict\n", + "\n", + "model = LogisticRegression(C=0.01, max_iter=1000, tol=1e-1, random_state=SEED)\n", + "\n", + "num_crossval_folds = 5 # can decrease this value to reduce runtime, or increase it to get better results\n", + "pred_probs = cross_val_predict(\n", + " estimator=model, X=embeddings_array, y=df.label.values, cv=num_crossval_folds, method=\"predict_proba\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FW1yI9Ryrfkj" + }, + "source": [ + "For each audio clip, the corresponding predicted probabilities in `pred_probs` are produced by a copy of our `LogisticRegression` model that has never been trained on this audio clip. Hence we call these predictions _out-of-sample_. 
An additional benefit of cross-validation is that it provides more reliable evaluation of our model than a single training/validation split.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "_b-AQeoXOc7q", + "outputId": "15ae534a-f517-4906-b177-ca91931a8954" + }, + "outputs": [], + "source": [ + "from sklearn.metrics import accuracy_score\n", + "\n", + "predicted_labels = pred_probs.argmax(axis=1)\n", + "cv_accuracy = accuracy_score(df.label.values, predicted_labels)\n", + "print(f\"Cross-validated estimate of accuracy on held-out data: {cv_accuracy}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SPz8WBwIlxUE" + }, + "source": [ + "## 5. Use cleanlab to find label issues\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "laui-jXMm6qR" + }, + "source": [ + "Based on the given labels, out-of-sample predicted probabilities and features, cleanlab can quickly help us identify label issues in our dataset. For a dataset with N examples from K classes, the labels should be a 1D array of length N and predicted probabilities should be a 2D (N x K) array. \n", + "\n", + "Here, we use cleanlab to find potential label errors in our data. `Datalab` has several ways of loading the data. In this case, we can just pass the DataFrame created above to instantiate the object. We will then pass in the predicted probabilities to the `find_issues()` method so that Datalab can use them to find potential label errors in our data." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "lab = Datalab(df, label_name=\"label\")\n", + "lab.find_issues(pred_probs=pred_probs, issue_types={\"label\":{}})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can view the results of running Datalab by calling the `report` method:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "lab.report()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We observe from the report that cleanlab has found some label issues in our dataset. Let us investigate these examples further.\n", + "\n", + "We can view more details about the label quality for each example using the `get_issues` method, specifying `label` as the issue type." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "label_issues = lab.get_issues(\"label\")\n", + "label_issues.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This method returns a dataframe containing a label quality score for each example. These numeric scores lie between 0 and 1, where lower scores indicate examples more likely to be mislabeled. 
The dataframe also contains a boolean column specifying whether or not each example is identified to have a label issue (indicating it is likely mislabeled).\n", + "\n", + "We can then filter for the examples that have been identified as a label error:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "identified_label_issues = label_issues[label_issues[\"is_label_issue\"] == True]\n", + "lowest_quality_labels = identified_label_issues.sort_values(\"label_score\").index\n", + "\n", + "print(f\"Here are indices of the most likely errors: \\n {lowest_quality_labels.values}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "iI07jQ0BnTgt" + }, + "source": [ + "These examples flagged by cleanlab are those worth inspecting more closely." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 237 + }, + "id": "FQwRHgbclpsO", + "outputId": "fee5c335-c00e-4fcc-f22b-718705e93182" + }, + "outputs": [], + "source": [ + "df.iloc[lowest_quality_labels]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "PsDmd5WDnZJG" + }, + "source": [ + "Let's listen to some audio clips below of label issues that were identified in this list.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "p9jLn3Lp85rU" + }, + "source": [ + "In this example, the given label is **6** but it sounds like **8**.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 92 + }, + "id": "ff1NFVlDoysO", + "outputId": "8141a036-44c1-4349-c338-880432513e37" + }, + "outputs": [], + "source": [ + "wav_file_name_example = \"spoken_digits/free-spoken-digit-dataset-1.0.9/recordings/6_yweweler_14.wav\"\n", + "display_example(wav_file_name_example)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "HwokyN0bfVsn" + }, + 
"source": [ + "In the three examples below, the given label is **6** but they sound quite ambiguous.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 92 + }, + "id": "GZgovGkdiaiP", + "outputId": "d76b2ccf-8be2-4f3a-df4c-2c5c99150db7" + }, + "outputs": [], + "source": [ + "wav_file_name_example = \"spoken_digits/free-spoken-digit-dataset-1.0.9/recordings/6_yweweler_36.wav\"\n", + "display_example(wav_file_name_example)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 92 + }, + "id": "lfa2eHbMwG8R", + "outputId": "6627ebe2-d439-4bf5-e2cb-44f6278ae86c" + }, + "outputs": [], + "source": [ + "wav_file_name_example = \"spoken_digits/free-spoken-digit-dataset-1.0.9/recordings/6_yweweler_35.wav\"\n", + "display_example(wav_file_name_example)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "wav_file_name_example = \"spoken_digits/free-spoken-digit-dataset-1.0.9/recordings/6_nicolas_8.wav\"\n", + "display_example(wav_file_name_example)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-rf8iSngtV83" + }, + "source": [ + "You can see that even widely-used datasets like Spoken Digit contain problematic labels. Never blindly trust your data! 
You should always check it for potential issues, many of which can be easily identified by cleanlab.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "nbsphinx": "hidden" + }, + "outputs": [], + "source": [ + "# Note: This cell is only for docs.cleanlab.ai, if running on local Jupyter or Colab, please ignore it.\n", + "\n", + "highlighted_indices = [1946, 516, 469, 2132] # verify these examples were found in find_label_issues\n", + "if not all(x in lowest_quality_labels for x in highlighted_indices):\n", + " raise Exception(\"Some highlighted examples are missing from label_issues_indices.\")" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [], + "name": "audio_quickstart_tutorial_deterministic.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/docs/source/tutorials/datalab/datalab_advanced.ipynb b/docs/source/tutorials/datalab/datalab_advanced.ipynb new file mode 100644 index 0000000000..f89244f54d --- /dev/null +++ b/docs/source/tutorials/datalab/datalab_advanced.ipynb @@ -0,0 +1,817 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Datalab: Advanced workflows to audit your data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`Datalab` helps you identify various issues in your machine learning datasets that may negatively impact the performance of your machine learning model if not addressed. 
By default, `Datalab` can help you identify noisy labels, outliers, (near) duplicates, and other types of problems that commonly occur in real-world data.\n", + "\n", + "`Datalab` performs these checks by utilizing the (probabilistic) predictions from *any* ML model that has already been trained or its learned representations of the data. Underneath the hood, this class calls all the appropriate cleanlab methods for your dataset and provided model outputs, in order to best audit the data and alert you of important issues. This makes it easy to apply many functionalities of this library all within a single line of code. \n", + "\n", + "**This tutorial will demonstrate some advanced functionalities of Datalab including:**\n", + "\n", + "- Incremental issue search\n", + "- Specifying nondefault arguments to issue checks\n", + "- Save and load Datalab objects\n", + "- Adding a custom IssueManager\n", + "\n", + "If you are new to `Datalab`, check out this [quickstart tutorial](datalab_quickstart.html) for a 5-min introduction!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
    \n", + "Quickstart\n", + "
    \n", + " \n", + "Already have (out-of-sample) `pred_probs` from a model trained on an existing set of labels? Maybe you have some `features` as well? Run the code below to examine your dataset for multiple types of issues.\n", + "\n", + "
    \n", + " \n", + "```ipython3 \n", + "from cleanlab import Datalab\n", + "\n", + "lab = Datalab(data=your_dataset, label_name=\"column_name_of_labels\")\n", + "lab.find_issues(features=your_feature_matrix, pred_probs=your_pred_probs)\n", + "\n", + "lab.report()\n", + "```\n", + " \n", + "
    \n", + "
    " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Install and import required dependencies" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`Datalab` has additional dependencies that are not included in the standard installation of cleanlab.\n", + "\n", + "You can use pip to install all packages required for this tutorial as follows:\n", + "\n", + "```\n", + "!pip install matplotlib \n", + "!pip install cleanlab[datalab]\n", + "\n", + "# Make sure to install the version corresponding to this tutorial\n", + "# E.g. if viewing master branch documentation:\n", + "# !pip install git+https://github.com/cleanlab/cleanlab.git\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "nbsphinx": "hidden" + }, + "outputs": [], + "source": [ + "# Package installation (hidden on docs website).\n", + "dependencies = [\"cleanlab\", \"matplotlib\", \"datasets\"] # TODO: make sure this list is updated\n", + "\n", + "if \"google.colab\" in str(get_ipython()): # Check if it's running in Google Colab\n", + " %pip install cleanlab # for colab\n", + " cmd = ' '.join([dep for dep in dependencies if dep != \"cleanlab\"])\n", + " %pip install $cmd\n", + "else:\n", + " missing_dependencies = []\n", + " for dependency in dependencies:\n", + " try:\n", + " __import__(dependency)\n", + " except ImportError:\n", + " missing_dependencies.append(dependency)\n", + "\n", + " if len(missing_dependencies) > 0:\n", + " print(\"Missing required dependencies:\")\n", + " print(*missing_dependencies, sep=\", \")\n", + " print(\"\\nPlease install them before running the rest of this notebook.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# TODO: remove cell before merging to master\n", + "# !pip install git+https://github.com/cleanlab/cleanlab.git#egg=cleanlab[datalab]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.model_selection import cross_val_predict\n", + "\n", + "from cleanlab import Datalab" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create and load the data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We'll load a toy classification dataset for this tutorial. The dataset has two numerical features and a label column with three classes." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
    See the code for data generation. **(click to expand)**\n", + " \n", + "```ipython3\n", + "# Note: This pulldown content is for docs.cleanlab.ai, if running on local Jupyter or Colab, please ignore it.\n", + "\n", + "from sklearn.model_selection import train_test_split\n", + "from cleanlab.benchmarking.noise_generation import (\n", + " generate_noise_matrix_from_trace,\n", + " generate_noisy_labels,\n", + ")\n", + "\n", + "SEED = 123\n", + "np.random.seed(SEED)\n", + "\n", + "BINS = {\n", + " \"low\": [-np.inf, 3.3],\n", + " \"mid\": [3.3, 6.6],\n", + " \"high\": [6.6, +np.inf],\n", + "}\n", + "\n", + "BINS_MAP = {\n", + " \"low\": 0,\n", + " \"mid\": 1,\n", + " \"high\": 2,\n", + "}\n", + "\n", + "\n", + "def create_data():\n", + "\n", + " X = np.random.rand(250, 2) * 5\n", + " y = np.sum(X, axis=1)\n", + " # Map y to bins based on the BINS dict\n", + " y_bin = np.array([k for y_i in y for k, v in BINS.items() if v[0] <= y_i < v[1]])\n", + " y_bin_idx = np.array([BINS_MAP[k] for k in y_bin])\n", + "\n", + " # Split into train and test\n", + " X_train, X_test, y_train, y_test, y_train_idx, y_test_idx = train_test_split(\n", + " X, y_bin, y_bin_idx, test_size=0.5, random_state=SEED\n", + " )\n", + "\n", + " # Add several (5) out-of-distribution points. 
Sliding them along the decision boundaries\n", + " # to make them look like they are out-of-frame\n", + " X_out = np.array(\n", + " [\n", + " [-1.5, 3.0],\n", + " [-1.75, 6.5],\n", + " [1.5, 7.2],\n", + " [2.5, -2.0],\n", + " [5.5, 7.0],\n", + " ]\n", + " )\n", + " # Add a near duplicate point to the last outlier, with some tiny noise added\n", + " near_duplicate = X_out[-1:] + np.random.rand(1, 2) * 1e-6\n", + " X_out = np.concatenate([X_out, near_duplicate])\n", + "\n", + " y_out = np.sum(X_out, axis=1)\n", + " y_out_bin = np.array([k for y_i in y_out for k, v in BINS.items() if v[0] <= y_i < v[1]])\n", + " y_out_bin_idx = np.array([BINS_MAP[k] for k in y_out_bin])\n", + "\n", + " # Add to train\n", + " X_train = np.concatenate([X_train, X_out])\n", + " y_train = np.concatenate([y_train, y_out])\n", + " y_train_idx = np.concatenate([y_train_idx, y_out_bin_idx])\n", + "\n", + " # Add an exact duplicate example to the training set\n", + " exact_duplicate_idx = np.random.randint(0, len(X_train))\n", + " X_duplicate = X_train[exact_duplicate_idx, None]\n", + " y_duplicate = y_train[exact_duplicate_idx, None]\n", + " y_duplicate_idx = y_train_idx[exact_duplicate_idx, None]\n", + "\n", + " # Add to train\n", + " X_train = np.concatenate([X_train, X_duplicate])\n", + " y_train = np.concatenate([y_train, y_duplicate])\n", + " y_train_idx = np.concatenate([y_train_idx, y_duplicate_idx])\n", + "\n", + " py = np.bincount(y_train_idx) / float(len(y_train_idx))\n", + " m = len(BINS)\n", + "\n", + " noise_matrix = generate_noise_matrix_from_trace(\n", + " m,\n", + " trace=0.9 * m,\n", + " py=py,\n", + " valid_noise_matrix=True,\n", + " seed=SEED,\n", + " )\n", + "\n", + " noisy_labels_idx = generate_noisy_labels(y_train_idx, noise_matrix)\n", + " noisy_labels = np.array([list(BINS_MAP.keys())[i] for i in noisy_labels_idx])\n", + "\n", + " return X_train, y_train_idx, noisy_labels, noisy_labels_idx, X_out, X_duplicate\n", + "```\n", + "\n", + "
    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "nbsphinx": "hidden" + }, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "from cleanlab.benchmarking.noise_generation import (\n", + " generate_noise_matrix_from_trace,\n", + " generate_noisy_labels,\n", + ")\n", + "\n", + "SEED = 123\n", + "np.random.seed(SEED)\n", + "\n", + "BINS = {\n", + " \"low\": [-np.inf, 3.3],\n", + " \"mid\": [3.3, 6.6],\n", + " \"high\": [6.6, +np.inf],\n", + "}\n", + "\n", + "BINS_MAP = {\n", + " \"low\": 0,\n", + " \"mid\": 1,\n", + " \"high\": 2,\n", + "}\n", + "\n", + "\n", + "def create_data():\n", + "\n", + " X = np.random.rand(250, 2) * 5\n", + " y = np.sum(X, axis=1)\n", + " # Map y to bins based on the BINS dict\n", + " y_bin = np.array([k for y_i in y for k, v in BINS.items() if v[0] <= y_i < v[1]])\n", + " y_bin_idx = np.array([BINS_MAP[k] for k in y_bin])\n", + "\n", + " # Split into train and test\n", + " X_train, X_test, y_train, y_test, y_train_idx, y_test_idx = train_test_split(\n", + " X, y_bin, y_bin_idx, test_size=0.5, random_state=SEED\n", + " )\n", + "\n", + " # Add several (5) out-of-distribution points. 
Sliding them along the decision boundaries\n", + " # to make them look like they are out-of-frame\n", + " X_out = np.array(\n", + " [\n", + " [-1.5, 3.0],\n", + " [-1.75, 6.5],\n", + " [1.5, 7.2],\n", + " [2.5, -2.0],\n", + " [5.5, 7.0],\n", + " ]\n", + " )\n", + " # Add a near duplicate point to the last outlier, with some tiny noise added\n", + " near_duplicate = X_out[-1:] + np.random.rand(1, 2) * 1e-6\n", + " X_out = np.concatenate([X_out, near_duplicate])\n", + "\n", + " y_out = np.sum(X_out, axis=1)\n", + " y_out_bin = np.array([k for y_i in y_out for k, v in BINS.items() if v[0] <= y_i < v[1]])\n", + " y_out_bin_idx = np.array([BINS_MAP[k] for k in y_out_bin])\n", + "\n", + " # Add to train\n", + " X_train = np.concatenate([X_train, X_out])\n", + " y_train = np.concatenate([y_train, y_out])\n", + " y_train_idx = np.concatenate([y_train_idx, y_out_bin_idx])\n", + "\n", + " # Add an exact duplicate example to the training set\n", + " exact_duplicate_idx = np.random.randint(0, len(X_train))\n", + " X_duplicate = X_train[exact_duplicate_idx, None]\n", + " y_duplicate = y_train[exact_duplicate_idx, None]\n", + " y_duplicate_idx = y_train_idx[exact_duplicate_idx, None]\n", + "\n", + " # Add to train\n", + " X_train = np.concatenate([X_train, X_duplicate])\n", + " y_train = np.concatenate([y_train, y_duplicate])\n", + " y_train_idx = np.concatenate([y_train_idx, y_duplicate_idx])\n", + "\n", + " py = np.bincount(y_train_idx) / float(len(y_train_idx))\n", + " m = len(BINS)\n", + "\n", + " noise_matrix = generate_noise_matrix_from_trace(\n", + " m,\n", + " trace=0.9 * m,\n", + " py=py,\n", + " valid_noise_matrix=True,\n", + " seed=SEED,\n", + " )\n", + "\n", + " noisy_labels_idx = generate_noisy_labels(y_train_idx, noise_matrix)\n", + " noisy_labels = np.array([list(BINS_MAP.keys())[i] for i in noisy_labels_idx])\n", + "\n", + " return X_train, y_train_idx, noisy_labels, noisy_labels_idx, X_out, X_duplicate" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "metadata": {}, + "outputs": [], + "source": [ + "X_train, y_train_idx, noisy_labels, noisy_labels_idx, X_out, X_duplicate = create_data()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We make a scatter plot of the features, with a color corresponding to the observed labels. Incorrect given labels are highlighted in red if they do not match the true label, outliers are highlighted with a black cross, and duplicates are highlighted with a cyan cross." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
    See the code to visualize the data. **(click to expand)**\n", + " \n", + "```ipython3\n", + "# Note: This pulldown content is for docs.cleanlab.ai, if running on local Jupyter or Colab, please ignore it.\n", + "\n", + "import matplotlib.pyplot as plt\n", + "\n", + "def plot_data(X_train, y_train_idx, noisy_labels_idx, X_out, X_duplicate):\n", + " # Plot data with clean labels and noisy labels, use BINS_MAP for the legend\n", + " fig, ax = plt.subplots(figsize=(8, 6))\n", + " \n", + " low = ax.scatter(X_train[noisy_labels_idx == 0, 0], X_train[noisy_labels_idx == 0, 1], label=\"low\")\n", + " mid = ax.scatter(X_train[noisy_labels_idx == 1, 0], X_train[noisy_labels_idx == 1, 1], label=\"mid\")\n", + " high = ax.scatter(X_train[noisy_labels_idx == 2, 0], X_train[noisy_labels_idx == 2, 1], label=\"high\")\n", + " \n", + " ax.set_title(\"Noisy labels\")\n", + " ax.set_xlabel(r\"$x_1$\")\n", + " ax.set_ylabel(r\"$x_2$\")\n", + "\n", + " # Plot true boundaries (x+y=3.3, x+y=6.6)\n", + " ax.set_xlim(-3.5, 8.5)\n", + " ax.set_ylim(-3.5, 8.5)\n", + " ax.plot([-0.7, 4.0], [4.0, -0.7], color=\"k\", linestyle=\"--\", alpha=0.5)\n", + " ax.plot([-0.7, 7.3], [7.3, -0.7], color=\"k\", linestyle=\"--\", alpha=0.5)\n", + "\n", + " # Draw red circles around the points that are misclassified (i.e. 
the points that are in the wrong bin)\n", + " for i, (X, y) in enumerate(zip([X_train, X_train], [y_train_idx, noisy_labels_idx])):\n", + " for j, (k, v) in enumerate(BINS_MAP.items()):\n", + " label_err = ax.scatter(\n", + " X[(y == v) & (y != y_train_idx), 0],\n", + " X[(y == v) & (y != y_train_idx), 1],\n", + " s=180,\n", + " marker=\"o\",\n", + " facecolor=\"none\",\n", + " edgecolors=\"red\",\n", + " linewidths=2.5,\n", + " alpha=0.5,\n", + " label=\"Label error\",\n", + " )\n", + "\n", + "\n", + " outlier = ax.scatter(X_out[:, 0], X_out[:, 1], color=\"k\", marker=\"x\", s=100, linewidth=2, label=\"Outlier\")\n", + "\n", + " # Plot the exact duplicate\n", + " dups = ax.scatter(\n", + " X_duplicate[:, 0],\n", + " X_duplicate[:, 1],\n", + " color=\"c\",\n", + " marker=\"x\",\n", + " s=100,\n", + " linewidth=2,\n", + " label=\"Duplicates\",\n", + " )\n", + " \n", + " first_legend = ax.legend(handles=[low, mid, high], loc=[0.785, 0.8], title=\"Given Class Label\", alignment=\"left\", title_fontproperties={\"weight\":\"semibold\"})\n", + " second_legend = ax.legend(handles=[label_err, outlier, dups], loc=[0.785, 0.6], title=\"Type of Issue\", alignment=\"left\", title_fontproperties={\"weight\":\"semibold\"})\n", + " \n", + " ax = plt.gca().add_artist(first_legend)\n", + " ax = plt.gca().add_artist(second_legend)\n", + " plt.tight_layout()\n", + "```\n", + " \n", + "
    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "nbsphinx": "hidden" + }, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "def plot_data(X_train, y_train_idx, noisy_labels_idx, X_out, X_duplicate):\n", + " # Plot data with clean labels and noisy labels, use BINS_MAP for the legend\n", + " fig, ax = plt.subplots(figsize=(8, 6))\n", + " \n", + " low = ax.scatter(X_train[noisy_labels_idx == 0, 0], X_train[noisy_labels_idx == 0, 1], label=\"low\")\n", + " mid = ax.scatter(X_train[noisy_labels_idx == 1, 0], X_train[noisy_labels_idx == 1, 1], label=\"mid\")\n", + " high = ax.scatter(X_train[noisy_labels_idx == 2, 0], X_train[noisy_labels_idx == 2, 1], label=\"high\")\n", + " \n", + " ax.set_title(\"Noisy labels\")\n", + " ax.set_xlabel(r\"$x_1$\")\n", + " ax.set_ylabel(r\"$x_2$\")\n", + "\n", + " # Plot true boundaries (x+y=3.3, x+y=6.6)\n", + " ax.set_xlim(-3.5, 8.5)\n", + " ax.set_ylim(-3.5, 8.5)\n", + " ax.plot([-0.7, 4.0], [4.0, -0.7], color=\"k\", linestyle=\"--\", alpha=0.5)\n", + " ax.plot([-0.7, 7.3], [7.3, -0.7], color=\"k\", linestyle=\"--\", alpha=0.5)\n", + "\n", + " # Draw red circles around the points that are misclassified (i.e. 
the points that are in the wrong bin)\n", + " for i, (X, y) in enumerate(zip([X_train, X_train], [y_train_idx, noisy_labels_idx])):\n", + " for j, (k, v) in enumerate(BINS_MAP.items()):\n", + " label_err = ax.scatter(\n", + " X[(y == v) & (y != y_train_idx), 0],\n", + " X[(y == v) & (y != y_train_idx), 1],\n", + " s=180,\n", + " marker=\"o\",\n", + " facecolor=\"none\",\n", + " edgecolors=\"red\",\n", + " linewidths=2.5,\n", + " alpha=0.5,\n", + " label=\"Label error\",\n", + " )\n", + "\n", + "\n", + " outlier = ax.scatter(X_out[:, 0], X_out[:, 1], color=\"k\", marker=\"x\", s=100, linewidth=2, label=\"Outlier\")\n", + "\n", + " # Plot the exact duplicate\n", + " dups = ax.scatter(\n", + " X_duplicate[:, 0],\n", + " X_duplicate[:, 1],\n", + " color=\"c\",\n", + " marker=\"x\",\n", + " s=100,\n", + " linewidth=2,\n", + " label=\"Duplicates\",\n", + " )\n", + " \n", + " first_legend = ax.legend(handles=[low, mid, high], loc=[0.785, 0.8], title=\"Given Class Label\", alignment=\"left\", title_fontproperties={\"weight\":\"semibold\"})\n", + " second_legend = ax.legend(handles=[label_err, outlier, dups], loc=[0.785, 0.6], title=\"Type of Issue\", alignment=\"left\", title_fontproperties={\"weight\":\"semibold\"})\n", + " \n", + " ax = plt.gca().add_artist(first_legend)\n", + " ax = plt.gca().add_artist(second_legend)\n", + " plt.tight_layout()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plot_data(X_train, y_train_idx, noisy_labels_idx, X_out, X_duplicate)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In real-world scenarios, you won't know the true labels or the distribution of the features, so we won't use these in this tutorial, except for evaluation purposes." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Get out-of-sample predicted probabilities from a classifier" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To detect certain types of issues in classification data (e.g. label errors), `Datalab` relies on predicted class probabilities from a trained model. Ideally, the prediction for each example should be out-of-sample (to avoid overfitting), coming from a copy of the model that was not trained on this example. \n", + "\n", + "This tutorial uses a simple logistic regression model \n", + "and the `cross_val_predict()` function from scikit-learn to generate out-of-sample predicted class probabilities for every example in the training set. You can replace this with *any* other classifier model and train it with cross-validation to get out-of-sample predictions." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model = LogisticRegression()\n", + "pred_probs = cross_val_predict(\n", + " estimator=model, X=X_train, y=noisy_labels, cv=5, method=\"predict_proba\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Instantiate Datalab object" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here we instantiate the Datalab object that will be used in the remainder in the tutorial by passing in the data created above.\n", + "\n", + "`Datalab` has several ways of loading the data. In this case, we'll simply wrap the training features and noisy labels in a dictionary so that we can pass it to `Datalab`.\n", + "\n", + "Other supported data formats for `Datalab` include: [HuggingFace Datasets](https://huggingface.co/docs/datasets/index) and [pandas DataFrame](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html). `Datalab` works across most data modalities (image, text, tabular, audio, etc). 
It is intended to find issues that commonly occur in datasets for which you have trained a supervised ML model, regardless of the type of data.\n", + "\n", + "Currently, pandas DataFrames that contain categorical columns might cause some issues when instantiating the `Datalab` object, so it is recommended to ensure that your DataFrame does not contain any categorical columns, or use other data formats (eg. python dictionary, HuggingFace Datasets) to pass in your data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data = {\"X\": X_train, \"y\": noisy_labels}\n", + "\n", + "lab = Datalab(data, label_name=\"y\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **Functionality 1**: Incremental issue search " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can call `find_issues` multiple times on a `Datalab` object to detect issues one type at a time.\n", + "\n", + "This is done via the `issue_types` argument which accepts a dictionary of issue types and any corresponding keyword arguments to specify nondefault keyword arguments to use for detecting each type of issues. In this first call, we only want to detect label issues, which are detected solely based on `pred_probs`, hence there is no need for us to pass in `features` here." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "lab.find_issues(pred_probs=pred_probs, issue_types={\"label\": {}}) \n", + "lab.report()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can check for additional types of issues with the same `Datalab`. Here, we would like to detect outliers and near duplicates which both utilize the features of the data.\n", + "\n", + "Notice that this second call to `find_issues()` updates the output of `report()`, we can see the existing label issues detected alongside the new issues." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "lab.find_issues(features=data[\"X\"], issue_types={\"outlier\": {}, \"near_duplicate\": {}})\n", + "lab.report()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **Functionality 2**: Specifying nondefault arguments" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can also overwrite previously-executed checks for a type of issue. Here we re-run the detection of outliers, but specify that different non-default settings should be used (in this case, the number of neighbors `k` compared against to determine which datapoints are outliers). \n", + "The results from this new detection will replace the original outlier detection results in the updated `Datalab`. You could similarly specify non-default settings for other issue types in the first call to `Datalab.find_issues()`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "lab.find_issues(features=data[\"X\"], issue_types={\"outlier\": {\"k\": 30}})\n", + "lab.report()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can also increase the verbosity of the `report` to see additional information about the data issues and control how many top-ranked examples are shown for each issue." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "lab.report(num_examples=10, verbosity=2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notice how the number of flagged outlier issues has changed after specifying different settings to use for outlier detection." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **Functionality 3**: Save and load Datalab objects" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A `Datalab` can be saved to a folder at a specified path. In a future Python process, this path can be used to load the `Datalab` from file back into memory. Your dataset is not saved as part of this process, so you'll need to save/load it separately to keep working with it." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "path = \"datalab-files\"\n", + "lab.save(path, force=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can load a `Datalab` object we have on file and view the previously detected issues." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "new_lab = Datalab.load(path)\n", + "new_lab.report()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **Functionality 4**: Adding a custom IssueManager" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`Datalab` detects pre-defined types of issues for you in one line of code: `find_issues()`. 
What if you want to check for other custom types of issues along with these pre-defined types, all within the same line of code?\n", + "\n", + "All issue types in `Datalab` are subclasses of cleanlab's `IssueManager` class.\n", + "To register a custom issue type for use with `Datalab`, simply also make it a subclass of `IssueManager`.\n", + "\n", + "The necessary members to implement in the subclass are:\n", + "\n", + "- A class variable called `issue_name` that acts as a unique identifier for the type of issue.\n", + "- An instance method called `find_issues` that:\n", + " - Computes a quality score for each example in the dataset (between 0-1), in terms of how *unlikely* it is to be an issue.\n", + " - Flags each example as an issue or not (may be based on thresholding the quality scores).\n", + " - Combine these in a dataframe that is assigned to an `issues` attribute of the `IssueManager`.\n", + " - Define a summary score for the overall quality of entire dataset, in terms of this type of issue. Set this score as part of the `summary` attribute of the `IssueManager`.\n", + " \n", + "To demonstrate this, we create an arbitrary issue type that checks the divisibility of an example's index in the dataset by 13." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from cleanlab.datalab.issue_manager import IssueManager\n", + "from cleanlab.datalab.factory import register\n", + "\n", + "\n", + "def scoring_function(idx: int, div: int = 13) -> float:\n", + " if idx == 0:\n", + " # Zero excluded from the divisibility check, gets the highest score\n", + " return 1\n", + " rem = idx % div\n", + " inv_scale = idx // div\n", + " if rem == 0:\n", + " return 0.5 * (1 - np.exp(-0.1*(inv_scale-1)))\n", + " else:\n", + " return 1 - 0.49 * (1 - np.exp(-inv_scale**0.5))*rem/div\n", + "\n", + "\n", + "@register # register this issue type for use with Datalab\n", + "class SuperstitionIssueManager(IssueManager):\n", + " \"\"\"A custom issue manager that keeps track of issue indices that\n", + " are divisible by 13.\n", + " \"\"\"\n", + " description: str = \"Examples with indices that are divisible by 13 may be unlucky.\" # Optional\n", + " issue_name: str = \"superstition\"\n", + "\n", + " def find_issues(self, div=13, **_) -> None:\n", + " ids = self.datalab.issues.index.to_series()\n", + " issues_mask = ids.apply(lambda idx: idx % div == 0 and idx != 0)\n", + " scores = ids.apply(lambda idx: scoring_function(idx, div))\n", + " self.issues = pd.DataFrame(\n", + " {\n", + " f\"is_{self.issue_name}_issue\": issues_mask,\n", + " self.issue_score_key: scores,\n", + " },\n", + " )\n", + " summary_score = 1 - sum(issues_mask) / len(issues_mask)\n", + " self.summary = self.make_summary(score = summary_score)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once registered, this `IssueManager` will perform custom issue checks when `find_issues` is called on a `Datalab` instance.\n", + "\n", + "As our `Datalab` instance here already has results from the outlier and near duplicate checks, we perform the custom issue check separately." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "lab.find_issues(issue_types={\"superstition\": {}})\n", + "lab.report()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + }, + "vscode": { + "interpreter": { + "hash": "d4d1e4263499bec80672ea0156c357c1ee493ec2b1c70f0acce89fc37c4a6abe" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/source/tutorials/datalab/datalab_quickstart.ipynb b/docs/source/tutorials/datalab/datalab_quickstart.ipynb new file mode 100644 index 0000000000..da1d62e3eb --- /dev/null +++ b/docs/source/tutorials/datalab/datalab_quickstart.ipynb @@ -0,0 +1,750 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Datalab: A unified audit to detect all kinds of issues in data and labels" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`Datalab` helps you identify various issues in your machine learning datasets, such as noisy labels, outliers, (near) duplicates, and other types of problems that commonly occur in real-world data that may negatively impact the performance of your machine learning model if not addressed. 
`Datalab` utilizes *any* ML model you have already trained for your data to diagnose these issues, it only requires access to either: (probabilistic) predictions from your model or its learned representations of the data.\n", + "\n", + "\n", + "**Overview of what we'll do in this tutorial:**\n", + "\n", + "- Compute out-of-sample predicted probabilities for a sample dataset using cross-validation.\n", + "- Use `Datalab` to identify issues such as noisy labels, outliers, (near) duplicates, and other types of problems \n", + "- View the issue summaries and other information about our sample dataset\n", + "\n", + "You can easily replace our demo dataset with your own image/text/tabular/audio/etc dataset, and then run the same code to discover what sort of issues lurk within it!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
    \n", + "Quickstart\n", + "
    \n", + " \n", + "Already have (out-of-sample) `pred_probs` from a model trained on an existing set of labels? Maybe you have some `features` as well? Run the code below to examine your dataset for multiple types of issues.\n", + "\n", + "
    \n", + " \n", + "```ipython3 \n", + "from cleanlab import Datalab\n", + "\n", + "lab = Datalab(data=your_dataset, label_name=\"column_name_of_labels\")\n", + "lab.find_issues(features=your_feature_matrix, pred_probs=your_pred_probs)\n", + "\n", + "lab.report()\n", + "```\n", + " \n", + "
    \n", + "
    " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Install and import required dependencies" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`Datalab` has additional dependencies that are not included in the standard installation of cleanlab.\n", + "\n", + "You can use pip to install all packages required for this tutorial as follows:\n", + "\n", + "```\n", + "!pip install matplotlib \n", + "!pip install cleanlab[datalab]\n", + "\n", + "# Make sure to install the version corresponding to this tutorial\n", + "# E.g. if viewing master branch documentation:\n", + "# !pip install git+https://github.com/cleanlab/cleanlab.git\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "nbsphinx": "hidden" + }, + "outputs": [], + "source": [ + "# Package installation (hidden on docs website).\n", + "dependencies = [\"cleanlab\", \"matplotlib\", \"datasets\"] # TODO: make sure this list is updated\n", + "\n", + "if \"google.colab\" in str(get_ipython()): # Check if it's running in Google Colab\n", + " %pip install cleanlab # for colab\n", + " cmd = ' '.join([dep for dep in dependencies if dep != \"cleanlab\"])\n", + " %pip install $cmd\n", + "else:\n", + " missing_dependencies = []\n", + " for dependency in dependencies:\n", + " try:\n", + " __import__(dependency)\n", + " except ImportError:\n", + " missing_dependencies.append(dependency)\n", + "\n", + " if len(missing_dependencies) > 0:\n", + " print(\"Missing required dependencies:\")\n", + " print(*missing_dependencies, sep=\", \")\n", + " print(\"\\nPlease install them before running the rest of this notebook.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# TODO: remove cell before merging to master\n", + "# !pip install git+https://github.com/cleanlab/cleanlab.git#egg=cleanlab[datalab]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.model_selection import cross_val_predict\n", + "\n", + "from cleanlab import Datalab" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Create and load the data (can skip these details)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We'll load a toy classification dataset for this tutorial. The dataset has two numerical features and a label column with three possible classes. Each example is classified as either: *low*, *mid* or *high*." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
    See the code for data generation. **(click to expand)**\n", + " \n", + "```ipython3\n", + "# Note: This pulldown content is for docs.cleanlab.ai, if running on local Jupyter or Colab, please ignore it.\n", + "\n", + "from sklearn.model_selection import train_test_split\n", + "from cleanlab.benchmarking.noise_generation import (\n", + " generate_noise_matrix_from_trace,\n", + " generate_noisy_labels,\n", + ")\n", + "\n", + "SEED = 123\n", + "np.random.seed(SEED)\n", + "\n", + "BINS = {\n", + " \"low\": [-np.inf, 3.3],\n", + " \"mid\": [3.3, 6.6],\n", + " \"high\": [6.6, +np.inf],\n", + "}\n", + "\n", + "BINS_MAP = {\n", + " \"low\": 0,\n", + " \"mid\": 1,\n", + " \"high\": 2,\n", + "}\n", + "\n", + "\n", + "def create_data():\n", + "\n", + " X = np.random.rand(250, 2) * 5\n", + " y = np.sum(X, axis=1)\n", + " # Map y to bins based on the BINS dict\n", + " y_bin = np.array([k for y_i in y for k, v in BINS.items() if v[0] <= y_i < v[1]])\n", + " y_bin_idx = np.array([BINS_MAP[k] for k in y_bin])\n", + "\n", + " # Split into train and test\n", + " X_train, X_test, y_train, y_test, y_train_idx, y_test_idx = train_test_split(\n", + " X, y_bin, y_bin_idx, test_size=0.5, random_state=SEED\n", + " )\n", + "\n", + " # Add several (5) out-of-distribution points. 
Sliding them along the decision boundaries\n", + " # to make them look like they are out-of-frame\n", + " X_out = np.array(\n", + " [\n", + " [-1.5, 3.0],\n", + " [-1.75, 6.5],\n", + " [1.5, 7.2],\n", + " [2.5, -2.0],\n", + " [5.5, 7.0],\n", + " ]\n", + " )\n", + " # Add a near duplicate point to the last outlier, with some tiny noise added\n", + " near_duplicate = X_out[-1:] + np.random.rand(1, 2) * 1e-6\n", + " X_out = np.concatenate([X_out, near_duplicate])\n", + "\n", + " y_out = np.sum(X_out, axis=1)\n", + " y_out_bin = np.array([k for y_i in y_out for k, v in BINS.items() if v[0] <= y_i < v[1]])\n", + " y_out_bin_idx = np.array([BINS_MAP[k] for k in y_out_bin])\n", + "\n", + " # Add to train\n", + " X_train = np.concatenate([X_train, X_out])\n", + " y_train = np.concatenate([y_train, y_out])\n", + " y_train_idx = np.concatenate([y_train_idx, y_out_bin_idx])\n", + "\n", + " # Add an exact duplicate example to the training set\n", + " exact_duplicate_idx = np.random.randint(0, len(X_train))\n", + " X_duplicate = X_train[exact_duplicate_idx, None]\n", + " y_duplicate = y_train[exact_duplicate_idx, None]\n", + " y_duplicate_idx = y_train_idx[exact_duplicate_idx, None]\n", + "\n", + " # Add to train\n", + " X_train = np.concatenate([X_train, X_duplicate])\n", + " y_train = np.concatenate([y_train, y_duplicate])\n", + " y_train_idx = np.concatenate([y_train_idx, y_duplicate_idx])\n", + "\n", + " py = np.bincount(y_train_idx) / float(len(y_train_idx))\n", + " m = len(BINS)\n", + "\n", + " noise_matrix = generate_noise_matrix_from_trace(\n", + " m,\n", + " trace=0.9 * m,\n", + " py=py,\n", + " valid_noise_matrix=True,\n", + " seed=SEED,\n", + " )\n", + "\n", + " noisy_labels_idx = generate_noisy_labels(y_train_idx, noise_matrix)\n", + " noisy_labels = np.array([list(BINS_MAP.keys())[i] for i in noisy_labels_idx])\n", + "\n", + " return X_train, y_train_idx, noisy_labels, noisy_labels_idx, X_out, X_duplicate\n", + "```\n", + "\n", + "
    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "nbsphinx": "hidden" + }, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "from cleanlab.benchmarking.noise_generation import (\n", + " generate_noise_matrix_from_trace,\n", + " generate_noisy_labels,\n", + ")\n", + "\n", + "SEED = 123\n", + "np.random.seed(SEED)\n", + "\n", + "BINS = {\n", + " \"low\": [-np.inf, 3.3],\n", + " \"mid\": [3.3, 6.6],\n", + " \"high\": [6.6, +np.inf],\n", + "}\n", + "\n", + "BINS_MAP = {\n", + " \"low\": 0,\n", + " \"mid\": 1,\n", + " \"high\": 2,\n", + "}\n", + "\n", + "\n", + "def create_data():\n", + "\n", + " X = np.random.rand(250, 2) * 5\n", + " y = np.sum(X, axis=1)\n", + " # Map y to bins based on the BINS dict\n", + " y_bin = np.array([k for y_i in y for k, v in BINS.items() if v[0] <= y_i < v[1]])\n", + " y_bin_idx = np.array([BINS_MAP[k] for k in y_bin])\n", + "\n", + " # Split into train and test\n", + " X_train, X_test, y_train, y_test, y_train_idx, y_test_idx = train_test_split(\n", + " X, y_bin, y_bin_idx, test_size=0.5, random_state=SEED\n", + " )\n", + "\n", + " # Add several (5) out-of-distribution points. 
Sliding them along the decision boundaries\n", + " # to make them look like they are out-of-frame\n", + " X_out = np.array(\n", + " [\n", + " [-1.5, 3.0],\n", + " [-1.75, 6.5],\n", + " [1.5, 7.2],\n", + " [2.5, -2.0],\n", + " [5.5, 7.0],\n", + " ]\n", + " )\n", + " # Add a near duplicate point to the last outlier, with some tiny noise added\n", + " near_duplicate = X_out[-1:] + np.random.rand(1, 2) * 1e-6\n", + " X_out = np.concatenate([X_out, near_duplicate])\n", + "\n", + " y_out = np.sum(X_out, axis=1)\n", + " y_out_bin = np.array([k for y_i in y_out for k, v in BINS.items() if v[0] <= y_i < v[1]])\n", + " y_out_bin_idx = np.array([BINS_MAP[k] for k in y_out_bin])\n", + "\n", + " # Add to train\n", + " X_train = np.concatenate([X_train, X_out])\n", + " y_train = np.concatenate([y_train, y_out])\n", + " y_train_idx = np.concatenate([y_train_idx, y_out_bin_idx])\n", + "\n", + " # Add an exact duplicate example to the training set\n", + " exact_duplicate_idx = np.random.randint(0, len(X_train))\n", + " X_duplicate = X_train[exact_duplicate_idx, None]\n", + " y_duplicate = y_train[exact_duplicate_idx, None]\n", + " y_duplicate_idx = y_train_idx[exact_duplicate_idx, None]\n", + "\n", + " # Add to train\n", + " X_train = np.concatenate([X_train, X_duplicate])\n", + " y_train = np.concatenate([y_train, y_duplicate])\n", + " y_train_idx = np.concatenate([y_train_idx, y_duplicate_idx])\n", + "\n", + " py = np.bincount(y_train_idx) / float(len(y_train_idx))\n", + " m = len(BINS)\n", + "\n", + " noise_matrix = generate_noise_matrix_from_trace(\n", + " m,\n", + " trace=0.9 * m,\n", + " py=py,\n", + " valid_noise_matrix=True,\n", + " seed=SEED,\n", + " )\n", + "\n", + " noisy_labels_idx = generate_noisy_labels(y_train_idx, noise_matrix)\n", + " noisy_labels = np.array([list(BINS_MAP.keys())[i] for i in noisy_labels_idx])\n", + "\n", + " return X_train, y_train_idx, noisy_labels, noisy_labels_idx, X_out, X_duplicate" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "metadata": {}, + "outputs": [], + "source": [ + "X_train, y_train_idx, noisy_labels, noisy_labels_idx, X_out, X_duplicate = create_data()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We make a scatter plot of the features, with a color corresponding to the observed labels. Incorrect given labels are highlighted in red if they do not match the true label, outliers highlighted with a black cross, and duplicates highlighted with a cyan cross." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
    See the code to visualize the data. **(click to expand)**\n", + " \n", + "```ipython3\n", + "# Note: This pulldown content is for docs.cleanlab.ai, if running on local Jupyter or Colab, please ignore it.\n", + "\n", + "import matplotlib.pyplot as plt\n", + "\n", + "def plot_data(X_train, y_train_idx, noisy_labels_idx, X_out, X_duplicate):\n", + " # Plot data with clean labels and noisy labels, use BINS_MAP for the legend\n", + " fig, ax = plt.subplots(figsize=(8, 6))\n", + " \n", + " low = ax.scatter(X_train[noisy_labels_idx == 0, 0], X_train[noisy_labels_idx == 0, 1], label=\"low\")\n", + " mid = ax.scatter(X_train[noisy_labels_idx == 1, 0], X_train[noisy_labels_idx == 1, 1], label=\"mid\")\n", + " high = ax.scatter(X_train[noisy_labels_idx == 2, 0], X_train[noisy_labels_idx == 2, 1], label=\"high\")\n", + " \n", + " ax.set_title(\"Noisy labels\")\n", + " ax.set_xlabel(r\"$x_1$\")\n", + " ax.set_ylabel(r\"$x_2$\")\n", + "\n", + " # Plot true boundaries (x+y=3.3, x+y=6.6)\n", + " ax.set_xlim(-3.5, 8.5)\n", + " ax.set_ylim(-3.5, 8.5)\n", + " ax.plot([-0.7, 4.0], [4.0, -0.7], color=\"k\", linestyle=\"--\", alpha=0.5)\n", + " ax.plot([-0.7, 7.3], [7.3, -0.7], color=\"k\", linestyle=\"--\", alpha=0.5)\n", + "\n", + " # Draw red circles around the points that are misclassified (i.e. 
the points that are in the wrong bin)\n", + " for i, (X, y) in enumerate(zip([X_train, X_train], [y_train_idx, noisy_labels_idx])):\n", + " for j, (k, v) in enumerate(BINS_MAP.items()):\n", + " label_err = ax.scatter(\n", + " X[(y == v) & (y != y_train_idx), 0],\n", + " X[(y == v) & (y != y_train_idx), 1],\n", + " s=180,\n", + " marker=\"o\",\n", + " facecolor=\"none\",\n", + " edgecolors=\"red\",\n", + " linewidths=2.5,\n", + " alpha=0.5,\n", + " label=\"Label error\",\n", + " )\n", + "\n", + "\n", + " outlier = ax.scatter(X_out[:, 0], X_out[:, 1], color=\"k\", marker=\"x\", s=100, linewidth=2, label=\"Outlier\")\n", + "\n", + " # Plot the exact duplicate\n", + " dups = ax.scatter(\n", + " X_duplicate[:, 0],\n", + " X_duplicate[:, 1],\n", + " color=\"c\",\n", + " marker=\"x\",\n", + " s=100,\n", + " linewidth=2,\n", + " label=\"Duplicates\",\n", + " )\n", + " \n", + " first_legend = ax.legend(handles=[low, mid, high], loc=[0.785, 0.8], title=\"Given Class Label\", alignment=\"left\", title_fontproperties={\"weight\":\"semibold\"})\n", + " second_legend = ax.legend(handles=[label_err, outlier, dups], loc=[0.785, 0.6], title=\"Type of Issue\", alignment=\"left\", title_fontproperties={\"weight\":\"semibold\"})\n", + " \n", + " ax = plt.gca().add_artist(first_legend)\n", + " ax = plt.gca().add_artist(second_legend)\n", + " plt.tight_layout()\n", + "```\n", + " \n", + "
    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "nbsphinx": "hidden" + }, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "def plot_data(X_train, y_train_idx, noisy_labels_idx, X_out, X_duplicate):\n", + " # Plot data with clean labels and noisy labels, use BINS_MAP for the legend\n", + " fig, ax = plt.subplots(figsize=(8, 6))\n", + " \n", + " low = ax.scatter(X_train[noisy_labels_idx == 0, 0], X_train[noisy_labels_idx == 0, 1], label=\"low\")\n", + " mid = ax.scatter(X_train[noisy_labels_idx == 1, 0], X_train[noisy_labels_idx == 1, 1], label=\"mid\")\n", + " high = ax.scatter(X_train[noisy_labels_idx == 2, 0], X_train[noisy_labels_idx == 2, 1], label=\"high\")\n", + " \n", + " ax.set_title(\"Noisy labels\")\n", + " ax.set_xlabel(r\"$x_1$\")\n", + " ax.set_ylabel(r\"$x_2$\")\n", + "\n", + " # Plot true boundaries (x+y=3.3, x+y=6.6)\n", + " ax.set_xlim(-3.5, 8.5)\n", + " ax.set_ylim(-3.5, 8.5)\n", + " ax.plot([-0.7, 4.0], [4.0, -0.7], color=\"k\", linestyle=\"--\", alpha=0.5)\n", + " ax.plot([-0.7, 7.3], [7.3, -0.7], color=\"k\", linestyle=\"--\", alpha=0.5)\n", + "\n", + " # Draw red circles around the points that are misclassified (i.e. 
the points that are in the wrong bin)\n", + " for i, (X, y) in enumerate(zip([X_train, X_train], [y_train_idx, noisy_labels_idx])):\n", + " for j, (k, v) in enumerate(BINS_MAP.items()):\n", + " label_err = ax.scatter(\n", + " X[(y == v) & (y != y_train_idx), 0],\n", + " X[(y == v) & (y != y_train_idx), 1],\n", + " s=180,\n", + " marker=\"o\",\n", + " facecolor=\"none\",\n", + " edgecolors=\"red\",\n", + " linewidths=2.5,\n", + " alpha=0.5,\n", + " label=\"Label error\",\n", + " )\n", + "\n", + "\n", + " outlier = ax.scatter(X_out[:, 0], X_out[:, 1], color=\"k\", marker=\"x\", s=100, linewidth=2, label=\"Outlier\")\n", + "\n", + " # Plot the exact duplicate\n", + " dups = ax.scatter(\n", + " X_duplicate[:, 0],\n", + " X_duplicate[:, 1],\n", + " color=\"c\",\n", + " marker=\"x\",\n", + " s=100,\n", + " linewidth=2,\n", + " label=\"Duplicates\",\n", + " )\n", + " \n", + " first_legend = ax.legend(handles=[low, mid, high], loc=[0.785, 0.8], title=\"Given Class Label\", alignment=\"left\", title_fontproperties={\"weight\":\"semibold\"})\n", + " second_legend = ax.legend(handles=[label_err, outlier, dups], loc=[0.785, 0.6], title=\"Type of Issue\", alignment=\"left\", title_fontproperties={\"weight\":\"semibold\"})\n", + " \n", + " ax = plt.gca().add_artist(first_legend)\n", + " ax = plt.gca().add_artist(second_legend)\n", + " plt.tight_layout()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plot_data(X_train, y_train_idx, noisy_labels_idx, X_out, X_duplicate)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In real-world scenarios, you won't know the true labels or the distribution of the features, so we won't use these in this tutorial, except for evaluation purposes.\n", + "\n", + "\n", + "\n", + "`Datalab` has several ways of loading the data.\n", + "In this case, we'll simply wrap the training features and noisy labels in a dictionary so that we can pass it to `Datalab`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data = {\"X\": X_train, \"y\": noisy_labels}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Other supported data formats for `Datalab` include: [HuggingFace Datasets](https://huggingface.co/docs/datasets/index) and [pandas DataFrame](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html). `Datalab` works across most data modalities (image, text, tabular, audio, etc). It is intended to find issues that commonly occur in datasets for which you have trained a supervised ML model, regardless of the type of data.\n", + "\n", + "Currently, pandas DataFrames that contain categorical columns might cause some issues when instantiating the `Datalab` object, so it is recommended to ensure that your DataFrame does not contain any categorical columns, or use other data formats (e.g. Python dictionary, HuggingFace Datasets) to pass in your data." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Get out-of-sample predicted probabilities from a classifier" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To detect certain types of issues in classification data (e.g. label errors), `Datalab` relies on predicted class probabilities from a trained model. Ideally, the prediction for each example should be out-of-sample (to avoid overfitting), coming from a copy of the model that was not trained on this example. \n", + "\n", + "This tutorial uses a simple logistic regression model \n", + "and the `cross_val_predict()` function from scikit-learn to generate out-of-sample predicted class probabilities for every example in the training set. You can replace this with *any* other classifier model and train it with cross-validation to get out-of-sample predictions." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model = LogisticRegression()\n", + "pred_probs = cross_val_predict(\n", + " estimator=model, X=data[\"X\"], y=data[\"y\"], cv=5, method=\"predict_proba\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. Use Datalab to find issues in the dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We create a `Datalab` object from the dataset, also providing the name of the label column in the dataset. Only instantiate one `Datalab` object per dataset, and note that only classification datasets are supported for now.\n", + "\n", + "All that is needed to audit your data is to call `find_issues()`.\n", + "This method accepts various inputs like: predicted class probabilities, numeric feature representations of the data. The more information you provide here, the more thoroughly `Datalab` will audit your data! Note that `features` should be some numeric representation of each example, either obtained through preprocessing transformation of your raw data or embeddings from a (pre)trained model. In this case, our data is already entirely numeric so we just provide the features directly." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "lab = Datalab(data, label_name=\"y\")\n", + "lab.find_issues(pred_probs=pred_probs, features=data[\"X\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's review the results of this audit using `report()`.\n", + "This provides a high-level summary of each type of issue found in the dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "lab.report()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. 
Learn more about the issues in your dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "There are several methods to get more details about a particular issue.\n", + "\n", + "The `get_summary()` method fetches summary statistics regarding how severe each type of issue is overall across the whole dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "lab.get_summary()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can also only request the summary for a particular type of issue." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "lab.get_summary(\"label\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `get_issues()` method returns information for each individual example about: whether or not it is plagued by this issue, as well as a quality score for how severe this issue appears to be. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "lab.get_issues().head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Similar to above, we can pass the type of issue as an argument to `get_issues()` to get the information for that particular issue.\n", + "\n", + "Lower scores indicate more severe instances of the issue, so you can sort by these values to see the most concerning examples in your dataset for each type of issue. Here we show an example of how to get the examples that have been identified as having the most severe label issues." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "examples_w_issue = (\n", + " lab.get_issues(\"label\")\n", + " .query(\"is_label_issue\")\n", + " .sort_values(\"label_score\")\n", + ")\n", + "\n", + "examples_w_issue.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Looking at the labels for some of these top-ranked examples, we find their given label was indeed incorrect!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Additional information (statistics, intermediate results, etc) related to a particular issue type can be accessed via `get_info(issue_name)`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "lab.get_info(\"label\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Lastly, you can get all sorts of information regarding your dataset using the `get_info()` method (with no arguments passed). This will not be printed in this tutorial as the return is a large dictionary but feel free to check it out yourself!" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`Datalab` makes it very easy to check your datasets for all sorts of issues that are important to deal with for training robust models. The inputs it uses to detect issues can come from *any* model you have trained (the better your model, the more accurate the issue detection will be).\n", + "\n", + "For more information, check out this [examples notebook](https://github.com/cleanlab/examples/blob/master/datalab_image_classification/datalab.ipynb) and the [advanced Datalab tutorial](datalab_advanced.html)." 
+ ] + } + ], + "metadata": { + "celltoolbar": "Edit Metadata", + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + }, + "vscode": { + "interpreter": { + "hash": "d4d1e4263499bec80672ea0156c357c1ee493ec2b1c70f0acce89fc37c4a6abe" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/source/tutorials/datalab/image.ipynb b/docs/source/tutorials/datalab/image.ipynb new file mode 100644 index 0000000000..4f928b3b43 --- /dev/null +++ b/docs/source/tutorials/datalab/image.ipynb @@ -0,0 +1,551 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Image Classification with PyTorch and Datalab\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This 5-minute quickstart tutorial demonstrates how to find potential label errors in image classification data. Here we use the MNIST dataset containing 70,000 images of handwritten digits from 0 to 9.\n", + "\n", + "**Overview of what we'll do in this tutorial:**\n", + "\n", + "- Build a simple [PyTorch](https://pytorch.org/) neural net and wrap it with [skorch](https://skorch.readthedocs.io/) to make it scikit-learn compatible.\n", + "\n", + "- Use this model to compute out-of-sample predicted probabilities, `pred_probs`, via cross-validation.\n", + "\n", + "- Use these predictions to estimate which images in the dataset are mislabeled via cleanlab's `Datalab` class.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
    \n", + "Quickstart\n", + "
    \n", + " \n", + "Already have a `model`? Run cross-validation to get out-of-sample `pred_probs` and then the code below to find any potential label errors in your dataset.\n", + "\n", + "\n", + "
    \n", + " \n", + "```python\n", + "from cleanlab import Datalab\n", + "\n", + "lab = Datalab(data=your_dataset, label_name=\"column_name_of_labels\")\n", + "lab.find_issues(pred_probs=your_pred_probs, issue_types={\"label\":{}})\n", + "\n", + "lab.get_issues(\"label\")\n", + "```\n", + " \n", + "
    \n", + "
    " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Install and import required dependencies\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can use `pip` to install all packages required for this tutorial as follows:\n", + "\n", + "```ipython3\n", + "!pip install matplotlib torch torchvision skorch datasets\n", + "!pip install cleanlab[datalab]\n", + "# Make sure to install the version corresponding to this tutorial\n", + "# E.g. if viewing master branch documentation:\n", + "# !pip install git+https://github.com/cleanlab/cleanlab.git\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "nbsphinx": "hidden" + }, + "outputs": [], + "source": [ + "# Package installation (this cell is hidden from docs.cleanlab.ai).\n", + "# If running on Colab, may want to use GPU (select: Runtime > Change runtime type > Hardware accelerator > GPU)\n", + "# Package versions used: matplotlib==3.5.1 torch==1.11.0 skorch==0.11.0\n", + "\n", + "dependencies = [\"cleanlab\", \"matplotlib\", \"torch\", \"torchvision\", \"skorch\", \"datasets\"]\n", + "\n", + "if \"google.colab\" in str(get_ipython()): # Check if it's running in Google Colab\n", + " %pip install cleanlab # for colab\n", + " cmd = ' '.join([dep for dep in dependencies if dep != \"cleanlab\"])\n", + " %pip install $cmd\n", + "else:\n", + " missing_dependencies = []\n", + " for dependency in dependencies:\n", + " try:\n", + " __import__(dependency)\n", + " except ImportError:\n", + " missing_dependencies.append(dependency)\n", + "\n", + " if len(missing_dependencies) > 0:\n", + " print(\"Missing required dependencies:\")\n", + " print(*missing_dependencies, sep=\", \")\n", + " print(\"\\nPlease install them before running the rest of this notebook.\")\n", + "\n", + "# Suppress benign warnings: \n", + "import warnings \n", + "warnings.filterwarnings(\"ignore\", \"Lazy modules are a new feature.*\")" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "from torch import nn\n", + "from sklearn.datasets import fetch_openml\n", + "from sklearn.model_selection import cross_val_predict\n", + "from sklearn.metrics import accuracy_score\n", + "from skorch import NeuralNetClassifier" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "nbsphinx": "hidden" + }, + "outputs": [], + "source": [ + "# This (optional) cell is hidden from docs.cleanlab.ai \n", + "\n", + "import numpy as np \n", + "\n", + "SEED = 123 # for reproducibility \n", + "np.random.seed(SEED)\n", + "torch.manual_seed(SEED)\n", + "torch.backends.cudnn.deterministic = True\n", + "torch.backends.cudnn.benchmark = False\n", + "torch.cuda.manual_seed_all(SEED)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Fetch and scale the MNIST dataset\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mnist = fetch_openml(\"mnist_784\") # Fetch the MNIST dataset\n", + "\n", + "X = mnist.data.astype(\"float32\").to_numpy() # 2D array (images are flattened into 1D)\n", + "X /= 255.0 # Scale the features to the [0, 1] range\n", + "X = X.reshape(len(X), 1, 28, 28) # reshape into [N, C, H, W] for PyTorch\n", + "\n", + "labels = mnist.target.astype(\"int64\").to_numpy() # 1D array of given labels" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
    \n", + "Bringing Your Own Data (BYOD)?\n", + "\n", + "Assign your data's features to variable `X` and its labels to variable `labels` instead, and continue with the rest of the tutorial.\n", + "\n", + "
    " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Define a classification model\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here, we define a simple neural network with PyTorch.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class ClassifierModule(nn.Module):\n", + " def __init__(self):\n", + " super().__init__()\n", + "\n", + " self.cnn = nn.Sequential(\n", + " nn.Conv2d(1, 6, 3),\n", + " nn.ReLU(),\n", + " nn.BatchNorm2d(6),\n", + " nn.MaxPool2d(kernel_size=2, stride=2),\n", + " nn.Conv2d(6, 16, 3),\n", + " nn.ReLU(),\n", + " nn.BatchNorm2d(16),\n", + " nn.MaxPool2d(kernel_size=2, stride=2),\n", + " )\n", + " self.out = nn.Sequential(\n", + " nn.Flatten(),\n", + " nn.LazyLinear(128),\n", + " nn.ReLU(),\n", + " nn.Linear(128, 10),\n", + " nn.Softmax(dim=-1),\n", + " )\n", + "\n", + " def forward(self, X):\n", + " X = self.cnn(X)\n", + " X = self.out(X)\n", + " return X" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. Ensure your classifier is scikit-learn compatible\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As some cleanlab features require scikit-learn compatibility, we adapt the above PyTorch neural net accordingly. [skorch](https://skorch.readthedocs.io) is a convenient package that helps with this. Alternatively, you can also easily wrap an arbitrary model to be scikit-learn compatible as demonstrated [here](https://github.com/cleanlab/cleanlab#use-cleanlab-with-any-model-for-most-ml-tasks)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model_skorch = NeuralNetClassifier(ClassifierModule)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. 
Compute out-of-sample predicted probabilities\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If we'd like cleanlab to identify potential label errors in the whole dataset and not just the training set, we can consider using the entire dataset when computing the out-of-sample predicted probabilities, `pred_probs`, via cross-validation.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "num_crossval_folds = 3 # for efficiency; values like 5 or 10 will generally work better\n", + "pred_probs = cross_val_predict(\n", + " model_skorch,\n", + " X,\n", + " labels,\n", + " cv=num_crossval_folds,\n", + " method=\"predict_proba\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "An additional benefit of cross-validation is that it facilitates more reliable evaluation of our model than a single training/validation split." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "predicted_labels = pred_probs.argmax(axis=1)\n", + "acc = accuracy_score(labels, predicted_labels)\n", + "print(f\"Cross-validated estimate of accuracy on held-out data: {acc}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6. Use cleanlab to find label issues\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Based on the given labels and out-of-sample predicted probabilities, cleanlab can quickly help us identify label issues in our dataset. \n", + "\n", + "Here, we use cleanlab's `Datalab` to find potential label errors in our data. `Datalab` has several ways of loading the data. In this case, we’ll simply wrap the training features and noisy labels in a dictionary. 
We can instantiate our `Datalab` object with this dictionary, pass in the predicted probabilities we obtained above, and use the `issue_types` argument to specify that we want to look for label errors." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from cleanlab import Datalab\n", + "\n", + "data = {\"X\": X, \"y\": labels}\n", + "\n", + "lab = Datalab(data, label_name=\"y\")\n", + "lab.find_issues(pred_probs=pred_probs, issue_types={\"label\":{}})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After the audit is complete, we can view the results and information regarding the labels using Datalab's `get_issues` method." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "issue_results = lab.get_issues(\"label\")\n", + "issue_results.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The dataframe above contains a label quality score for each example. These numeric scores lie between 0 and 1, where lower scores indicate examples more likely to be mislabeled. It contains a boolean column specifying whether or not each example is identified to have a label issue (indicating it is likely mislabeled).\n", + "\n", + "We can sort the results obtained by label score to find the indices of the 15 most likely mislabeled examples in our dataset." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ranked_label_issues = issue_results.sort_values(\"label_score\").index\n", + "\n", + "print(f\"Top 15 most likely label errors: \\n {ranked_label_issues.values[:15]}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`ranked_label_issues` is a list of indices ranked by the label score of each example, the top indices in the list corresponding to examples that are worth inspecting more closely. To help visualize specific examples, we define a `plot_examples` function (can skip these details)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
    See the implementation of `plot_examples` **(click to expand)**\n", + "\n", + "```python\n", + "# Note: This pulldown content is for docs.cleanlab.ai, if running on local Jupyter or Colab, please ignore it.\n", + "\n", + "import matplotlib.pyplot as plt\n", + "\n", + "def plot_examples(id_iter, nrows=1, ncols=1):\n", + " for count, id in enumerate(id_iter):\n", + " plt.subplot(nrows, ncols, count + 1)\n", + " plt.imshow(X[id].reshape(28, 28), cmap=\"gray\")\n", + " plt.title(f\"id: {id} \\n label: {labels[id]}\")\n", + " plt.axis(\"off\")\n", + "\n", + " plt.tight_layout(h_pad=2.0)\n", + "```\n", + "
    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "nbsphinx": "hidden" + }, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "def plot_examples(id_iter, nrows=1, ncols=1):\n", + " for count, id in enumerate(id_iter):\n", + " plt.subplot(nrows, ncols, count + 1)\n", + " plt.imshow(X[id].reshape(28, 28), cmap=\"gray\")\n", + " plt.title(f\"id: {id} \\n label: {labels[id]}\")\n", + " plt.axis(\"off\")\n", + "\n", + " plt.tight_layout(h_pad=2.0)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's look at the top 15 examples cleanlab thinks are most likely to be incorrectly labeled. We can see a few label errors and odd edge cases. Feel free to change the values below to display more/fewer examples." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plot_examples(ranked_label_issues[range(15)], 3, 5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's zoom into some specific examples from the above set:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Given label is **4** but looks more like a **7**:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plot_examples([59915])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Given label is **4** but also looks like **9**:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plot_examples([24798])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A very odd looking **5**:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plot_examples([59701])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Given label is **3** but could be a **7**:" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plot_examples([50340])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "cleanlab has shortlisted the most likely label errors to speed up your data cleaning process. With this list, you can decide whether to fix label issues or prune some of these examples from the dataset. \n", + "\n", + "You can see that even widely-used datasets like MNIST contain problematic labels. Never blindly trust your data! You should always check it for potential issues, many of which can be easily identified by cleanlab.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "nbsphinx": "hidden" + }, + "outputs": [], + "source": [ + "# Note: This cell is only for docs.cleanlab.ai, if running on local Jupyter or Colab, please ignore it.\n", + "\n", + "highlighted_indices = [59915, 24798, 59701, 50340] # verify these examples were found by cleanlab\n", + "if not all(x in ranked_label_issues for x in highlighted_indices):\n", + " raise Exception(\"Some highlighted examples are missing from ranked_label_issues.\")" + ] + } + ], + "metadata": { + "interpreter": { + "hash": "ced20e3e49bb4fa4ce8ad38f8f2535b7fc4c39b2b89554502b5dbdad1ad67eda" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/source/tutorials/datalab/index.rst b/docs/source/tutorials/datalab/index.rst new file mode 100644 index 0000000000..87c3921263 --- /dev/null +++ b/docs/source/tutorials/datalab/index.rst @@ -0,0 +1,12 @@ +Datalab Tutorials +================= + +.. 
toctree:: + :maxdepth: 1 + + Detecting Common Data Issues with Datalab + Perform Advanced Data Auditing with Datalab + Text Classification + Tabular Classification + Image Classification Label Errors + Audio Classification Label Errors