diff --git a/.github/get_min_dependencies.py b/.github/get_min_dependencies.py
new file mode 100644
index 0000000000..72dce32f17
--- /dev/null
+++ b/.github/get_min_dependencies.py
@@ -0,0 +1,15 @@
+"""This script fetches minimum dependencies of cleanlab package and writes them to the file requirements-min.txt"""
+import json
+
+
+if __name__ == "__main__":
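+    # deps.json is produced by running `pipdeptree -j` beforehand (see ci.yml)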
+    with open("./deps.json", "r") as f:
+        deps = json.load(f)
+
+    for package in deps:
+        if package["package"]["package_name"] == "cleanlab":
+            for dep in package["dependencies"]:
+                req_version = dep["required_version"]
+                with open("requirements-min.txt", "a") as f:
+                    if req_version.startswith(">="):
+                        f.write(f"{dep['package_name']}=={req_version[2:]}\n")
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 0896e2a082..e586395a36 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -5,6 +5,53 @@ on:
schedule:
- cron: '0 12 * * 1'
jobs:
+  test37:
+    name: "Test: Python 3.7 on ${{ matrix.os }}"
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os:
+          - ubuntu-latest
+          - macos-latest
+    steps:
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v4
+        with:
+          python-version: 3.7.16
+      - name: Install cleanlab
+        run: pip install -e .
+      - name: Install development dependencies
+        run: pip install -r requirements-dev.txt
+      - name: Install fasttext for non-Windows machines
+        if: matrix.os != 'windows-latest'
+        run: |
+          pip install fasttext
+      - name: Test with coverage
+        run: pytest --verbose --cov=cleanlab/ --cov-config .coveragerc --cov-report=xml
+        env:
+          TEST_FASTTEXT: true
+      - uses: codecov/codecov-action@v3
+  test37-windows:
+    name: "Test: Python 3.7 on windows"
+    runs-on: windows-latest
+    steps:
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v4
+        with:
+          python-version: 3.7.9
+      - name: Install cleanlab
+        run: pip install -e .
+      - name: Install development dependencies
+        run: pip install -r requirements-dev.txt
+      - name: Overwrite tensorflow version on Windows
+        run: |
+          pip uninstall -y tensorflow
+          pip install tensorflow-cpu
+      - name: Test with coverage
+        run: pytest --verbose --cov=cleanlab/ --cov-config .coveragerc --cov-report=xml
+        env:
+          TEST_FASTTEXT: true
+      - uses: codecov/codecov-action@v3
test:
name: "Test: Python ${{ matrix.python }} on ${{ matrix.os }}"
runs-on: ${{ matrix.os }}
@@ -15,7 +62,6 @@ jobs:
- macos-latest
- windows-latest
python:
- - "3.7"
- "3.8"
- "3.9"
- "3.10"
@@ -26,10 +72,12 @@ jobs:
python-version: ${{ matrix.python }}
- name: Install cleanlab
run: pip install -e .
- - name: Check cleanlab runs without optional dependencies
- run: python3 -c "import cleanlab"
- name: Install development dependencies
run: pip install -r requirements-dev.txt
+      - name: Install fasttext for non-Windows machines
+        if: matrix.os != 'windows-latest'
+        run: |
+          pip install fasttext
- name: Overwrite tensorflow version on Windows
if: matrix.os == 'windows-latest'
run: |
@@ -37,7 +85,32 @@ jobs:
pip install tensorflow-cpu
- name: Test with coverage
run: pytest --verbose --cov=cleanlab/ --cov-config .coveragerc --cov-report=xml
- - uses: codecov/codecov-action@v2
+        env:
+          TEST_FASTTEXT: true
+      - uses: codecov/codecov-action@v3
+  test-without-extras-min-versions:
+    name: Test without optional dependencies and with minimum compatible versions of dependencies
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v4
+        with:
+          python-version: '3.7'
+      - name: Install cleanlab
+        run: |
+          python -m pip install --upgrade pip
+          pip install .
+      - name: Install test dependencies
+        run: |
+          pip install pytest pytest-lazy-fixture pipdeptree
+          pipdeptree -j > deps.json
+      - name: Install minimum versions
+        run: |
+          python ./.github/get_min_dependencies.py
+          pip install -r requirements-min.txt
+      - name: Run tests
+        run: |
+          pytest tests/test_multilabel_classification.py tests/test_multiannotator.py tests/test_filter_count.py
typecheck:
name: Type check
runs-on: ubuntu-latest
diff --git a/.github/workflows/gh-pages.yaml b/.github/workflows/gh-pages.yaml
index a1f2f84fa7..ae0ce12e53 100644
--- a/.github/workflows/gh-pages.yaml
+++ b/.github/workflows/gh-pages.yaml
@@ -28,12 +28,12 @@ jobs:
sudo tar xzvf pandoc-2.19.2-linux-amd64.tar.gz --strip-components 1 -C /usr/local
- name: Setup Python
- uses: actions/setup-python@v2
+ uses: actions/setup-python@v4
with:
python-version: "3.10"
- name: Setup Node
- uses: actions/setup-node@v2
+ uses: actions/setup-node@v3
with:
node-version: "16"
@@ -45,7 +45,7 @@ jobs:
run: echo "::set-output name=dir::$(pip cache dir)"
- name: Cache dependencies
- uses: actions/cache@v2
+ uses: actions/cache@v3
with:
path: ${{ steps.pip-cache.outputs.dir }}
key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }}
diff --git a/.github/workflows/links.yml b/.github/workflows/links.yml
index fba7abc18c..488bea1b45 100644
--- a/.github/workflows/links.yml
+++ b/.github/workflows/links.yml
@@ -12,15 +12,18 @@ jobs:
- run: >-
sudo apt-get install -y
pandoc
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v3
- run: |
find . -name '*.html' -delete
- run: |
find . -name '*.md' -exec pandoc -i {} -o {}.html \;
- - uses: anishathalye/proof-html@v1
+ - uses: anishathalye/proof-html@v2
with:
directory: .
+ check_html: false
check_favicon: false
- empty_alt_ignore: true
- url_ignore_re: |
- ^https:\/\/docs\.github\.com\/
+ ignore_missing_alt: true
+ tokens: |
+ {"https://github.com": "${{ secrets.GITHUB_TOKEN }}"}
+ swap_urls: |
+ {"^\.\/\(.*\).md": "\\1.md.html"}
diff --git a/.gitignore b/.gitignore
index 16915dde28..e46bc2dfb5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -118,6 +118,8 @@ venv.bak/
/docs/source/notebooks/*.gz
/docs/source/notebooks/spoken_digits
/docs/source/notebooks/pretrained_models
+/docs/source/tutorials/datalab/datalab-files/
-# VS Code
+# Editor files
.vscode/
+.idea/
diff --git a/.mypy.ini b/.mypy.ini
index ab7155b9e2..c119d9c9e1 100644
--- a/.mypy.ini
+++ b/.mypy.ini
@@ -21,3 +21,12 @@ ignore_missing_imports = True
[mypy-tqdm.*]
ignore_missing_imports = True
+
+[mypy-matplotlib.*]
+ignore_missing_imports = True
+
+[mypy-datasets.*]
+ignore_missing_imports = True
+
+[mypy-scipy.*]
+ignore_missing_imports = True
diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md
index 80063408ba..d88a2e3f1e 100644
--- a/DEVELOPMENT.md
+++ b/DEVELOPMENT.md
@@ -13,9 +13,9 @@ and [venv](https://docs.python.org/3/library/venv.html). You can
the tools and choose what is right for you. Here, we'll explain how to get set
up with venv, which is built in to Python 3.
-```console
-$ python3 -m venv ./ENV # create a new virtual environment in the directory ENV
-$ source ./ENV/bin/activate # switch to using the virtual environment
+```shell
+python3 -m venv ./ENV # create a new virtual environment in the directory ENV
+source ./ENV/bin/activate # switch to using the virtual environment
```
You only need to create the virtual environment once, but you will need to
@@ -27,37 +27,125 @@ virtual environment rather than your system Python installation.
Run the following commands in the repository's root directory.
-1. Install development requirements with `pip install -r requirements-dev.txt`
+1. Install development requirements
+```shell
+pip install -r requirements-dev.txt
+```
-2. Install cleanlab as an editable package with `pip install -e .`
+2. Install cleanlab as an editable package
+```shell
+pip install -e .
+```
For Macs with Apple silicon: replace `tensorflow` in requirements-dev.txt with: `tensorflow-macos==2.9.2` and `tensorflow-metal==0.5.1`
+### Handling optional dependencies
+
+When designing a class that relies on an optional, domain-specific runtime dependency, it is better to use lazy importing, so that users who do not need the dependency are not forced to install it.
+
+Depending on how tightly your class is coupled to the dependency, you may want to import it at the module level, or lazily inside the class (e.g. in its constructor) or inside the function that uses it.
+
+If the dependency is used by many methods in the module or by other classes, it is better to import it at the module level.
+On the other hand, if the dependency is only used by a handful of methods, then it's better to import it inside those methods. If the dependency is not installed, an ImportError should be raised when the method is called, along with instructions on how to install the dependency.
+
+Here is an example of a class that lazily imports CuPy and provides a `sum` method that can be used on both CPU and GPU devices.
+
+Unless an alternative implementation of the `sum` method is available, an `ImportError` should be raised when the method is called, with instructions on how to install the dependency.
+
+
+```python
+def lazy_import_cupy():
+    try:
+        import cupy
+    except ImportError as error:
+        # If the dependency is required for the class to work,
+        # replace this block with a raised ImportError containing instructions
+        print("Warning: cupy is not installed. Please install it with `pip install cupy`.")
+        cupy = None
+    return cupy
+
+
+class Summation:
+    def __init__(self):
+        self.cupy = lazy_import_cupy()
+
+    def sum(self, x) -> float:
+        if self.cupy is None:
+            return sum(x)
+        return self.cupy.sum(x)
+```
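+
+A quick usage sketch (hypothetical, assuming the `Summation` class above is defined):
+
+```python
+summer = Summation()          # prints a warning and uses the built-in sum() when cupy is missing
+print(summer.sum([1, 2, 3]))  # -> 6 (CPU fallback path)
+# When cupy is installed, pass a cupy.ndarray instead, e.g. summer.sum(cupy.arange(4))
+```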
+
+
+
+For the build system to recognize the optional dependency, you should add it to the `EXTRAS_REQUIRE` constant in **setup.py**:
+
+
+```python
+EXTRAS_REQUIRE = {
+    ...
+    "gpu": [
+        # Explain why the dependency below is needed,
+        # e.g. "for performing summation on GPU"
+        "cupy",
+    ],
+}
+```
+
+
+Or assign the list to a separate variable and add it to `EXTRAS_REQUIRE`:
+
+```python
+GPU_REQUIRES = [
+    # Explanation ...
+    "cupy",
+]
+
+EXTRAS_REQUIRE = {
+    ...
+    "gpu": GPU_REQUIRES,
+}
+```
+
+
+
+The package can be installed with the optional dependency (here called `gpu`) via:
+
+1. PyPI installation
+
+```shell
+pip install -r "cleanlab[gpu]"
+```
+
+2. Editable installation
+
+```shell
+pip install -e ".[gpu]"
+```
## Testing
**Run all the tests:**
-```console
-$ pytest
+```shell
+pytest
```
**Run a specific file or test:**
-```
-$ pytest -k
+```shell
+pytest -k
```
**Run with verbose output:**
-```
-$ pytest --verbose
+```shell
+pytest --verbose
```
**Run with code coverage:**
-```
-$ pytest --cov=cleanlab/ --cov-config .coveragerc --cov-report=html
+```shell
+pytest --cov=cleanlab/ --cov-config .coveragerc --cov-report=html
```
The coverage report will be available in `coverage_html_report/index.html`,
@@ -69,13 +157,13 @@ Cleanlab uses [mypy](https://mypy.readthedocs.io/en/stable/) typing. Type checki
**Check typing in all files:**
-```
-$ mypy cleanlab
+```shell
+mypy cleanlab
```
The above is just a simplified command for demonstration, do NOT run this for testing your own type annotations!
Our CI adds a few additional flags to the `mypy` command it uses in the file:
-**.github/workflows/ci.yml**.
+**.github/workflows/ci.yml**.
To exactly match the `mypy` command that is executed in CI, copy these flags, and also ensure your version of `mypy` and related packages like `pandas-stubs` match the latest released versions (used in our CI).
### Examples
@@ -84,7 +172,7 @@ You can check that the [examples](https://github.com/cleanlab/examples) still
work with changes you make to cleanlab by manually running the notebooks.
You can also run all example notebooks as follows:
-```console
+```shell
git clone https://github.com/cleanlab/examples.git
```
@@ -93,27 +181,29 @@ E.g. you can edit this line to point to your local version of cleanlab as a rela
Finally execute the bash script:
-```console
+```shell
examples/run_all_notebooks.sh
```
## How to style new code contributions
-cleanlab follows the [Black](https://black.readthedocs.io/) code style. This is
+cleanlab follows the [Black](https://black.readthedocs.io/) code style (see [pyproject.toml](pyproject.toml)). This is
enforced by CI, so please format your code by invoking `black` before submitting a pull request.
-Generally aim to follow the [PEP-8 coding style](https://peps.python.org/pep-0008/).
+Generally aim to follow the [PEP-8 coding style](https://peps.python.org/pep-0008/).
Please do not use wildcard `import *` in any files, instead you should always import the specific functions that you need from a module.
+All cleanlab code should have a maximum line length of 100 characters.
+
### Pre-commit hook
This repo uses the [pre-commit framework](https://pre-commit.com/) to easily
set up code style checks that run automatically whenever you make a commit.
You can install the git hook scripts with:
-```console
-$ pre-commit install
+```shell
+pre-commit install
```
### EditorConfig
@@ -138,7 +228,7 @@ endings match the project style.
## Documentation
You can build the docs from your local cleanlab version by following [these
-instructions](docs/README.md#build-the-cleanlab-docs-locally).
+instructions](./docs/README.md#build-the-cleanlab-docs-locally).
If editing existing docs or adding new tutorials, please first read through our [guidelines](https://github.com/cleanlab/cleanlab/tree/master/docs#tips-for-editing-docstutorials).
@@ -203,10 +293,20 @@ Try to adhere to this standardized terminology unless you have good reason not t
Use relative linking to connect information between docs and jupyter notebooks, and make sure links will remain valid in the future as new cleanlab versions are released! Sphinx/html works with relative paths so try to specify relative paths if necessary. For specific situations:
-- Link another function from within a source code docstring: ``:py:func:`function_name ` ``
-- Link another class from within a source code docstring: ``:py:class:`class_name ` ``
-- Link a tutorial notebook from within a source code docstring: ``:ref:`notebook_name ` ``
-- Link a function from within a tutorial notebook: `[function_name](../cleanlab/file.rst#cleanlab.file.function_name)`
+- Link another function or class from within a source code docstring:
+  - If you just want to specify the function/class name (i.e. the function/class is unique throughout our library): `` `~cleanlab.file.function_or_class_name` ``.
+
+    This uses [Sphinx's](https://www.sphinx-doc.org/en/master/usage/configuration.html#confval-default_role) `default_role = "py:obj"` setting, so the leading tilde shortens the link to only display `function_or_class_name`.
+  - If you want to additionally specify the module which the function belongs to:
+    - `` :py:func:`file.function_name ` `` for functions
+    - ``:py:class:`file.class_name ` `` for classes
+
+    Here you have more control over the displayed text, e.g. to show the module name. When referring to a function that is also defined in other modules, always use this option to be explicit about which module you are referencing.
+- Link a tutorial (rst file) from within a source code docstring or rst file: ``:ref:`tutorial_name ` ``
+- Link a tutorial notebook (ipynb file) from within a source code docstring or rst file: `` `notebook_name `_ `` . (If the notebook is not in the same folder as the source code, use a relative path)
+- Link a function from within a tutorial notebook: `[function_name](../cleanlab/file.html#cleanlab.file.function_name)`
+
+  Links from master-branch tutorials will reference master-branch functions; similarly, links from stable-branch tutorials will reference stable-branch functions, since we are using relative paths.
- Link a specific section of a notebook from within the notebook: `[section title](#section-title)`
- Link a different tutorial notebook from within a tutorial notebook: `[another notebook](another_notebook.html)`. (Note this only works when the other notebook is in same folder as this notebook, otherwise may need to try relative path)
- Link another specific section of different notebook from within a tutorial notebook: `[another notebook section title](another_notebook.html#another-notebook-section-title)`
diff --git a/README.md b/README.md
index 429c8f5393..2b44160e0a 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,13 @@
-
+
+
+
+
-cleanlab automatically finds and fixes errors in any ML dataset. This data-centric AI package facilitates **machine learning with messy, real-world data** by providing **clean lab**els during training.
+cleanlab helps you **clean** data and **lab**els by automatically detecting issues in an ML dataset. To facilitate **machine learning with messy, real-world data**, this data-centric AI package uses your *existing* models to estimate dataset problems that can be fixed to train even *better* models.
```python
-# cleanlab works with **any classifier**. Yup, you can use sklearn/PyTorch/TensorFlow/XGBoost/etc.
+# cleanlab works with **any classifier**. Yup, you can use PyTorch/TensorFlow/OpenAI/XGBoost/etc.
cl = cleanlab.classification.CleanLearning(sklearn.YourFavoriteClassifier())
# cleanlab finds data and label issues in **any dataset**... in ONE line of code!
@@ -16,13 +19,14 @@ cl.fit(data, labels)
# cleanlab estimates the predictions you would have gotten if you had trained with *no* label issues.
cl.predict(test_data)
-# A true data-centric AI package, cleanlab quantifies class-level issues and overall data quality, for any dataset.
+# A universal data-centric AI tool, cleanlab quantifies class-level issues and overall data quality, for any dataset.
cleanlab.dataset.health_summary(labels, confident_joint=cl.confident_joint)
```
Get started with: [documentation](https://docs.cleanlab.ai/), [tutorials](https://docs.cleanlab.ai/stable/tutorials/image.html), [examples](https://github.com/cleanlab/examples), and [blogs](https://cleanlab.ai/blog/).
- - Learn to run cleanlab on your data in 5 minutes for classification with: [image](https://docs.cleanlab.ai/stable/tutorials/image.html), [text](https://docs.cleanlab.ai/stable/tutorials/text.html), [audio](https://docs.cleanlab.ai/stable/tutorials/audio.html), or [tabular](https://docs.cleanlab.ai/stable/tutorials/tabular.html) data.
+ - Learn to run cleanlab on your data in 5 minutes for classification with: [image](https://docs.cleanlab.ai/stable/tutorials/datalab/image.html), [text](https://docs.cleanlab.ai/stable/tutorials/datalab/text.html), [audio](https://docs.cleanlab.ai/stable/tutorials/datalab/audio.html), or [tabular](https://docs.cleanlab.ai/stable/tutorials/datalab/tabular.html) data.
+- Use cleanlab to automatically: [detect data issues (outliers, duplicates, label errors, etc)](https://docs.cleanlab.ai/stable/tutorials/datalab/datalab_quickstart.html), [train robust models](https://docs.cleanlab.ai/stable/tutorials/indepth_overview.html), [infer consensus + annotator-quality for multi-annotator data](https://docs.cleanlab.ai/stable/tutorials/multiannotator.html), [suggest data to (re)label next (active learning)](https://github.com/cleanlab/examples/blob/master/active_learning_multiannotator/active_learning.ipynb).
[](https://pypi.org/pypi/cleanlab/)
@@ -33,63 +37,27 @@ Get started with: [documentation](https://docs.cleanlab.ai/), [tutorials](https:
[](https://docs.cleanlab.ai/)
[](https://cleanlab.ai/slack)
[](https://twitter.com/CleanlabAI)
-[](https://cleanlab.ai/studio)
-
------
-
-News! (2022) -- cleanlab made accessible for everybody, not just ML researchers (click to learn more)
-
-
-- Nov 2022 📖 cleanlab 2.2.0 released! Added better algorithms for: label issues in multi-label classification, data with some classes absent, and estimating the number of label errors in a dataset.
-- Sep 2022 📖 cleanlab 2.1.0 released! Added support for: data labeled by multiple annotators in cleanlab.multiannotator, token classification with text data in cleanlab.token_classification, out-of-distribution detection in cleanlab.outlier, and CleanLearning with non-numpy-array data (e.g. pandas dataframes, tensorflow/pytorch datasets, etc) in cleanlab.classification.CleanLearning.
-- April 2022 📖 cleanlab 2.0.0 released! Lays foundations for this library to grow into a general-purpose data-centric AI toolkit.
-- March 2022 📖 Documentation migrated to new website: docs.cleanlab.ai with quickstart tutorials for image/text/audio/tabular data.
-- Feb 2022 💻 APIs simplified to make cleanlab accessible for everybody, not just ML researchers
-- Long-time cleanlab user? Here's how to migrate to cleanlab versions >= 2.0.0.
-
-
-
+[](https://cleanlab.ai/studio/?utm_source=github&utm_medium=readme&utm_campaign=clostostudio)
-News! (2021) -- cleanlab finds pervasive label errors in the most common ML datasets (click to learn more)
-
-
-- Dec 2021 🎉 NeurIPS published the label errors paper (Northcutt, Athalye, & Mueller, 2021).
-- Apr 2021 🎉 Journal of AI Research published the confident learning paper (Northcutt, Jiang, & Chuang, 2021).
-- Mar 2021 😲 cleanlab used to find and fix label issues in 10 of the most common ML benchmark datasets, published in: NeurIPS 2021. Along with the paper (Northcutt, Athalye, & Mueller, 2021), the authors launched labelerrors.com where you can view the label issues in these datasets.
-
-
-
-News! (2020) -- cleanlab supports all OS, achieves state-of-the-art performance (click to learn more)
-
-
-- Dec 2020 🎉 cleanlab supports NeurIPS workshop paper (Northcutt, Athalye, & Lin, 2020).
-- Dec 2020 🤖 cleanlab supports PU learning.
-- Feb 2020 🤖 cleanlab now natively supports Mac, Linux, and Windows.
-- Feb 2020 🤖 cleanlab now supports Co-Teaching (Han et al., 2018).
-- Jan 2020 🎉 cleanlab achieves state-of-the-art on CIFAR-10 with noisy labels. Code to reproduce: examples/cifar10. This is a great place to see how to use cleanlab on real datasets (with predicted probabilities from trained model already precomputed for you).
-
+
+
+
+
+ Examples of various issues in a Cat/Dog dataset automatically detected by cleanlab (with 1 line of code).
-
-
-Release notes for past versions are [here](https://github.com/cleanlab/cleanlab/releases).
-Details behind updates are explained in our [blog](https://cleanlab.ai/blog/) and [research papers](https://cleanlab.ai/research/).
## So fresh, so cleanlab
-cleanlab **clean**s your data's **lab**els via state-of-the-art *confident learning* algorithms, published in this [paper](https://jair.org/index.php/jair/article/view/12125) and [blog](https://l7.curtisnorthcutt.com/confident-learning). See some of the datasets cleaned with cleanlab at [labelerrors.com](https://labelerrors.com). This package helps you find data and label issues so you can train reliable ML models.
+cleanlab **clean**s your data's **lab**els via state-of-the-art *confident learning* algorithms, published in this [paper](https://jair.org/index.php/jair/article/view/12125) and [blog](https://l7.curtisnorthcutt.com/confident-learning). See some of the datasets cleaned with cleanlab at [labelerrors.com](https://labelerrors.com). This data-centric AI tool helps you find data and label issues, so you can train reliable ML models.
cleanlab is:
-1. **backed by theory**
- - with [provable guarantees](https://arxiv.org/abs/1911.00068) of exact estimation of noise and label errors, even with imperfect models.
-2. **fast**
- - Code is parallelized (< 1 second to find label issues in ImageNet with pre-computed predictions).
-4. **easy-to-use**
- - Find label issues or train noise-robust models in one line of code (no hyperparameters by default).
-6. **general**
- - Works with **[any dataset](https://labelerrors.com/)** and **any model**, e.g., TensorFlow, PyTorch, sklearn, XGBoost, Huggingface, etc.
+1. **backed by theory** -- with [provable guarantees](https://arxiv.org/abs/1911.00068) of exact label noise estimation, even with imperfect models.
+2. **fast** -- code is parallelized and scalable.
+3. **easy to use** -- one line of code to find mislabeled data, bad annotators, outliers, or train noise-robust models.
+4. **general** -- works with **[any dataset](https://labelerrors.com/)** (text, image, tabular, audio,...) + **any model** (PyTorch, OpenAI, XGBoost,...)

@@ -103,11 +71,23 @@ cleanlab supports Linux, macOS, and Windows and runs on Python 3.7+.
- Get started [here](https://docs.cleanlab.ai/)! Install via `pip` or `conda` as described [here](https://docs.cleanlab.ai/).
- Developers who install the bleeding-edge from source should refer to [this master branch documentation](https://docs.cleanlab.ai/master/index.html).
+- For help, check out our detailed [FAQ](https://docs.cleanlab.ai/stable/tutorials/faq.html), [Github Issues](https://github.com/cleanlab/cleanlab/issues?q=is%3Aissue), or [Slack](https://cleanlab.ai/slack). We welcome any questions!
+
+**Practicing data-centric AI can look like this:**
+1. Train initial ML model on original dataset.
+2. Utilize this model to diagnose data issues (via cleanlab methods) and improve the dataset.
+3. Train the same model on the improved dataset.
+4. Try various modeling techniques to further improve performance.
+
+Most folks jump from Step 1 → 4, but you may achieve big gains without *any* change to your modeling code by using cleanlab!
+Continuously boost performance by iterating Steps 2 → 4 (and try to evaluate with *cleaned* data).
+
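+For example, here is a minimal sketch of Steps 1-3 on a toy scikit-learn dataset (`cleanlab.filter.find_label_issues` is the real cleanlab function; the toy data and model are just illustrative):
+
+```python
+import numpy as np
+from sklearn.datasets import make_classification
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import cross_val_predict
+from cleanlab.filter import find_label_issues
+
+# Toy dataset with a few labels flipped on purpose (stand-in for your real, messy data)
+X, labels = make_classification(n_samples=500, n_classes=3, n_informative=4, random_state=0)
+labels[:10] = (labels[:10] + 1) % 3
+
+# Step 1: train an initial model and get out-of-sample predicted probabilities
+pred_probs = cross_val_predict(LogisticRegression(max_iter=1000), X, labels, method="predict_proba")
+
+# Step 2: let cleanlab flag the examples most likely to be mislabeled
+issue_indices = find_label_issues(labels, pred_probs, return_indices_ranked_by="self_confidence")
+
+# Step 3: drop the flagged examples and retrain the same model on the cleaned dataset
+keep = np.setdiff1d(np.arange(len(labels)), issue_indices)
+clean_model = LogisticRegression(max_iter=1000).fit(X[keep], labels[keep])
+```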
+
## Use cleanlab with any model for most ML tasks
-All features of cleanlab work with **any dataset** and **any model**. Yes, any model: scikit-learn, PyTorch, Tensorflow, Keras, JAX, HuggingFace, MXNet, XGBoost, etc.
+All features of cleanlab work with **any dataset** and **any model**. Yes, any model: PyTorch, Tensorflow, Keras, JAX, HuggingFace, OpenAI, XGBoost, scikit-learn, etc.
If you use a sklearn-compatible classifier, all cleanlab methods work out-of-the-box.
@@ -117,7 +97,7 @@ It’s also easy to use your favorite non-sklearn-compatible model (click to
cleanlab can find label issues from any model's predicted class probabilities if you can produce them yourself.
-Some other cleanlab functionality requires your model to be sklearn-compatible.
+Some cleanlab functionality may require your model to be sklearn-compatible.
There's nothing you need to do if your model already has `.fit()`, `.predict()`, and `.predict_proba()` methods.
Otherwise, just wrap your custom model into a Python class that inherits the `sklearn.base.BaseEstimator`:
@@ -150,300 +130,23 @@ cl.predict(test_data)
More details are provided in documentation of [cleanlab.classification.CleanLearning](https://docs.cleanlab.ai/stable/cleanlab/classification.html).
-Note, some libraries exist to give you sklearn-compatibility for free. For PyTorch, check out the [skorch](https://skorch.readthedocs.io/) Python library which will wrap your PyTorch model into a sklearn-compatible model ([example](https://docs.cleanlab.ai/stable/tutorials/image.html)). For TensorFlow/Keras, check out [SciKeras](https://www.adriangb.com/scikeras/) ([example](https://docs.cleanlab.ai/stable/tutorials/text.html)) or [our own Keras wrapper](https://docs.cleanlab.ai/stable/cleanlab/experimental/keras.html). Many libraries also already offer a special scikit-learn API, for example: [XGBoost](https://xgboost.readthedocs.io/en/stable/python/python_api.html#module-xgboost.sklearn) or [LightGBM](https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMClassifier.html).
+Note, some libraries exist to give you sklearn-compatibility for free. For PyTorch, check out the [skorch](https://skorch.readthedocs.io/) Python library which will wrap your PyTorch model into a sklearn-compatible model ([example](https://docs.cleanlab.ai/stable/tutorials/image.html)). For TensorFlow/Keras, check out our [Keras wrapper](https://docs.cleanlab.ai/stable/cleanlab/models/keras.html). Many libraries also already offer a special scikit-learn API, for example: [XGBoost](https://xgboost.readthedocs.io/en/stable/python/python_api.html#module-xgboost.sklearn) or [LightGBM](https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMClassifier.html).
-cleanlab is useful across a wide variety of Machine Learning tasks. Specific tasks this package offers dedicated functionality for include:
+cleanlab is useful across a wide variety of Machine Learning tasks. Specific tasks this data-centric AI solution offers dedicated functionality for include:
1. [Binary and multi-class classification](https://docs.cleanlab.ai/stable/tutorials/indepth_overview.html)
2. [Multi-label classification](https://docs.cleanlab.ai/stable/tutorials/multilabel_classification.html) (e.g. image/document tagging)
3. [Token classification](https://docs.cleanlab.ai/stable/tutorials/token_classification.html) (e.g. entity recognition in text)
4. [Classification with data labeled by multiple annotators](https://docs.cleanlab.ai/stable/tutorials/multiannotator.html)
-5. [Out of distribution detection](https://docs.cleanlab.ai/stable/tutorials/outliers.html)
+5. [Active learning with multiple annotators](https://github.com/cleanlab/examples/blob/master/active_learning_multiannotator/active_learning.ipynb) (suggests which data to label or re-label next to best improve your model)
+6. [Outlier and out of distribution detection](https://docs.cleanlab.ai/stable/tutorials/outliers.html)
For many other ML tasks, cleanlab can still help you improve your dataset if appropriately applied.
+Many practical applications are demonstrated in our [Example Notebooks](https://github.com/cleanlab/examples).
-## Cool cleanlab applications
-
-
-Reproducing results in Confident Learning paper
-(click to learn more)
-
-
-
-For additional details, check out the: [confidentlearning-reproduce repository](https://github.com/cgnorthcutt/confidentlearning-reproduce).
-
-### State of the Art Learning with Noisy Labels in CIFAR
-
-A step-by-step guide to reproduce these results is available [here](https://github.com/cleanlab/examples/tree/master/contrib/v1/cifar10). This guide is also a good tutorial for using cleanlab on any large dataset. You'll need to `git clone`
-[confidentlearning-reproduce](https://github.com/cgnorthcutt/confidentlearning-reproduce) which contains the data and files needed to reproduce the CIFAR-10 results.
-
-
-
-Comparison of confident learning (CL), as implemented in cleanlab, versus seven recent methods for learning with noisy labels in CIFAR-10. Highlighted cells show CL robustness to sparsity. The five CL methods estimate label issues, remove them, then train on the cleaned data using [Co-Teaching](https://github.com/cleanlab/cleanlab/blob/master/cleanlab/experimental/coteaching.py).
-
-Observe how cleanlab (i.e. the CL method) is robust to large sparsity in label noise whereas prior art tends to reduce in performance for increased sparsity, as shown by the red highlighted regions. This is important because real-world label noise is often sparse, e.g. a tiger is likely to be mislabeled as a lion, but not as most other classes like airplane, bathtub, and microwave.
-
-### Find label issues in ImageNet
-
-Use cleanlab to identify \~100,000 label errors in the 2012 ILSVRC ImageNet training dataset: [examples/imagenet](https://github.com/cleanlab/examples/tree/master/contrib/v1/imagenet).
-
-
-
-Label issues in ImageNet train set found via cleanlab. Label Errors are boxed in red. Ontological issues in green. Multi-label images in blue.
-
-### Find Label Errors in MNIST
-
-Use cleanlab to identify \~50 label errors in the MNIST dataset: [examples/mnist](https://github.com/cleanlab/examples/tree/master/contrib/v1/mnist).
-
-
-
-Top 24 least-confident labels in the original MNIST **train** dataset, algorithmically identified via cleanlab. Examples are ordered left-right, top-down by increasing self-confidence (predicted probability that the **given** label is correct), denoted **conf** in teal. The most-likely correct label (with largest predicted probability) is in green. Overt label errors highlighted in red.
-
-
-
-
-
-Learning with noisy labels across 4 data distributions and 9 classifiers
-(click to learn more)
-
-
-
-cleanlab is a general tool that can learn with noisy labels regardless of dataset distribution or classifier type: [examples/classifier\_comparison](https://github.com/cleanlab/examples/blob/master/classifier_comparison/classifier_comparison.ipynb).
-
-
-
-Each sub-figure above depicts the decision boundary learned using [cleanlab.classification.CleanLearning](https://github.com/cleanlab/cleanlab/blob/master/cleanlab/classification.py#L141) in the presence of extreme (\~35%) label errors (circled in green). Label noise is class-conditional (not uniformly random). Columns are organized by the classifier used, except the left-most column which depicts the ground-truth data distribution. Rows are organized by dataset.
-
-Each sub-figure depicts accuracy scores on a test set (with correct non-noisy labels) as decimal values:
-
-* LEFT (in black): The classifier test accuracy trained with perfect labels (no label errors).
-* MIDDLE (in blue): The classifier test accuracy trained with noisy labels using cleanlab.
-* RIGHT (in white): The baseline classifier test accuracy trained with noisy labels.
-
-As an example, the table below is the noise matrix (noisy channel) *P(s | y)
-characterizing the label noise for the first dataset row in the figure. *s* represents the observed noisy labels and *y* represents the latent, true labels. The trace of this matrix is 2.6. A trace of 4 implies no label noise. A cell in this matrix is read like: "Around 38% of true underlying '3' labels were randomly flipped to '2' labels in the
-observed dataset."
-
-| `p(label︱y)` | y=0 | y=1 | y=2 | y=3 |
-|--------------|------|------|------|------|
-| label=0 | 0.55 | 0.01 | 0.07 | 0.06 |
-| label=1 | 0.22 | 0.87 | 0.24 | 0.02 |
-| label=2 | 0.12 | 0.04 | 0.64 | 0.38 |
-| label=3 | 0.11 | 0.08 | 0.05 | 0.54 |
-
-
-
-
-
-ML research using cleanlab
-(click to learn more)
-
-
-
-Researchers may find some components of this package useful for evaluating algorithms for ML with noisy labels. For additional details/notation, refer to [the Confident Learning paper](https://jair.org/index.php/jair/article/view/12125).
-
-### Methods to Standardize Research with Noisy Labels
-
-cleanlab supports a number of functions to generate noise for benchmarking and standardization in research. This next example shows how to generate valid, class-conditional, uniformly random noisy channel matrices:
-
-``` python
-# Generate a valid (necessary conditions for learnability are met) noise matrix for any trace > 1
-from cleanlab.benchmarking.noise_generation import generate_noise_matrix_from_trace
-noise_matrix=generate_noise_matrix_from_trace(
- K=number_of_classes,
- trace=float_value_greater_than_1_and_leq_K,
- py=prior_of_y_actual_labels_which_is_just_an_array_of_length_K,
- frac_zero_noise_rates=float_from_0_to_1_controlling_sparsity,
-)
-
-# Check if a noise matrix is valid (necessary conditions for learnability are met)
-from cleanlab.benchmarking.noise_generation import noise_matrix_is_valid
-is_valid=noise_matrix_is_valid(
- noise_matrix,
- prior_of_y_which_is_just_an_array_of_length_K,
-)
-```
-
-For a given noise matrix, this example shows how to generate noisy labels. Methods can be seeded for reproducibility.
-
-``` python
-# Generate noisy labels using the noise_marix. Guarantees exact amount of noise in labels.
-from cleanlab.benchmarking.noise_generation import generate_noisy_labels
-s_noisy_labels = generate_noisy_labels(y_hidden_actual_labels, noise_matrix)
-
-# This package is a full of other useful methods for learning with noisy labels.
-# The tutorial stops here, but you don't have to. Inspect method docstrings for full docs.
-```
-
-
-
-
-
-cleanlab for advanced users
-(click to learn more)
-
-
-
-Many methods and their default parameters are not covered here. Check out the [documentation for the master branch version](https://docs.cleanlab.ai/master/) for the full suite of features supported by the cleanlab API.
-
-## Use any custom model's predicted probabilities to find label errors in 1 line of code
-
-pred_probs (num_examples x num_classes matrix of predicted probabilities) should already be computed on your own, with any classifier. For best results, pred_probs should be obtained in a holdout/out-of-sample manner (e.g. via cross-validation).
-* cleanlab can do this for you via [`cleanlab.count.estimate_cv_predicted_probabilities`](https://docs.cleanlab.ai/master/cleanlab/count.html)]
-* Tutorial with more info: [[here](https://docs.cleanlab.ai/stable/tutorials/pred_probs_cross_val.html)]
-* Examples how to compute pred_probs with: [[CNN image classifier (PyTorch)](https://docs.cleanlab.ai/stable/tutorials/image.html)], [[NN text classifier (TensorFlow)](https://docs.cleanlab.ai/stable/tutorials/text.html)]
-
-```python
-# label issues are ordered by likelihood of being an error. First index is most likely error.
-from cleanlab.filter import find_label_issues
-
-ordered_label_issues = find_label_issues( # One line of code!
- labels=numpy_array_of_noisy_labels,
- pred_probs=numpy_array_of_predicted_probabilities,
- return_indices_ranked_by='normalized_margin', # Orders label issues
- )
-```
-
-Pre-computed **out-of-sample** predicted probabilities for CIFAR-10 train set are available: [here](https://github.com/cleanlab/examples/tree/master/contrib/v1/cifar10#pre-computed-psx-for-every-noise--sparsity-condition).
-
-## Fully characterize label noise and uncertainty in your dataset.
-
-*s* denotes a random variable that represents the observed, noisy label and *y* denotes a random variable representing the hidden, actual labels. Both *s* and *y* take any of the m classes as values. The cleanlab package supports different levels of granularity for computation depending on the needs of the user. Because of this, we support multiple alternatives, all no more than a few lines, to estimate these latent distribution arrays, enabling the user to reduce computation time by only computing what they need to compute, as seen in the examples below.
-
-Throughout these examples, you’ll see a variable called *confident\_joint*. The confident joint is an m x m matrix (m is the number of classes) that counts, for every observed, noisy class, the number of examples that confidently belong to every latent, hidden class. It counts the number of examples that we are confident are labeled correctly or incorrectly for every pair of observed and unobserved classes. The confident joint is an unnormalized estimate of the complete-information latent joint distribution, *Ps,y*.
-
-The label flipping rates are denoted *P(s | y)*, the inverse rates are *P(y | s)*, and the latent prior of the unobserved, true labels, *p(y)*.
-
-Most of the methods in the **cleanlab** package start by first estimating the *confident\_joint*. You can learn more about this in the [confident learning paper](https://arxiv.org/abs/1911.00068).
-
-### Option 1: Compute the confident joint and predicted probs first. Stop if that’s all you need.
-
-``` python
-from cleanlab.count import estimate_latent
-from cleanlab.count import estimate_confident_joint_and_cv_pred_proba
-
-# Compute the confident joint and the n x m predicted probabilities matrix (pred_probs),
-# for n examples, m classes. Stop here if all you need is the confident joint.
-confident_joint, pred_probs = estimate_confident_joint_and_cv_pred_proba(
- X=X_train,
- labels=train_labels_with_errors,
- clf=logreg(), # default, you can use any classifier
-)
-
-# Estimate latent distributions: p(y) as est_py, P(s|y) as est_nm, and P(y|s) as est_inv
-est_py, est_nm, est_inv = estimate_latent(
- confident_joint,
- labels=train_labels_with_errors,
-)
-```
-
-### Option 2: Estimate the latent distribution matrices in a single line of code.
-
-``` python
-from cleanlab.count import estimate_py_noise_matrices_and_cv_pred_proba
-est_py, est_nm, est_inv, confident_joint, pred_probs = estimate_py_noise_matrices_and_cv_pred_proba(
- X=X_train,
- labels=train_labels_with_errors,
-)
-```
-
-### Option 3: Skip computing the predicted probabilities if you already have them.
-
-``` python
-# Already have pred_probs? (n x m matrix of predicted probabilities)
-# For example, you might get them from a pre-trained model (like resnet on ImageNet)
-# With the cleanlab package, you estimate directly with pred_probs.
-from cleanlab.count import estimate_py_and_noise_matrices_from_probabilities
-est_py, est_nm, est_inv, confident_joint = estimate_py_and_noise_matrices_from_probabilities(
- labels=train_labels_with_errors,
- pred_probs=pred_probs,
-)
-```
-
-## Completely characterize label noise in a dataset:
-
-The joint probability distribution of noisy and true labels, *P(s,y)*, completely characterizes label noise with a class-conditional *m x m* matrix.
-
-``` python
-from cleanlab.count import estimate_joint
-joint = estimate_joint(
- labels=noisy_labels,
- pred_probs=probabilities,
- confident_joint=None, # Provide if you have it already
-)
-```
-
-
-
-
-
-Positive-Unlabeled Learning
-(click to learn more)
-
-
-
-Positive-Unlabeled (PU) learning (in which your data only contains a few positively labeled examples with the rest unlabeled) is just a special case of [CleanLearning](https://github.com/cleanlab/cleanlab/blob/master/cleanlab/classification.py#L141) when one of the classes has no error. `P` stands for the positive class and **is assumed to have zero label errors** and `U` stands for unlabeled data, but in practice, we just assume the `U` class is a noisy negative class that actually contains some positive examples. Thus, the goal of PU learning is to (1) estimate the proportion of negatively labeled examples that actually belong to the positive class (see`fraction\_noise\_in\_unlabeled\_class` in the last example), (2) find the errors (see last example), and (3) train on clean data (see first example below). cleanlab does all three, taking into account that there are no label errors in whichever class you specify as positive.
-
-There are two ways to use cleanlab for PU learning. We'll look at each here.
-
-Method 1. If you are using the cleanlab classifier [CleanLearning()](https://github.com/cleanlab/cleanlab/blob/master/cleanlab/classification.py#L141), and your dataset has exactly two classes (positive = 1, and negative = 0), PU
-learning is supported directly in cleanlab. You can perform PU learning like this:
-
-``` python
-from cleanlab.classification import CleanLearning
-from sklearn.linear_model import LogisticRegression
-# Wrap around any classifier. Yup, you can use sklearn/pyTorch/TensorFlow/FastText/etc.
-pu_class = 0 # Should be 0 or 1. Label of class with NO ERRORS. (e.g., P class in PU)
-cl = CleanLearning(clf=LogisticRegression(), pulearning=pu_class)
-cl.fit(X=X_train_data, labels=train_noisy_labels)
-# Estimate the predictions you would have gotten by training with *no* label errors.
-predicted_test_labels = cl.predict(X_test)
-```
-
-Method 2. However, you might be using a more complicated classifier that doesn't work well with [CleanLearning](https://github.com/cleanlab/cleanlab/blob/master/cleanlab/classification.py#L141) (see this example for CIFAR-10). Or you might have 3 or more classes. Here's how to use cleanlab for PU learning in this situation. To let cleanlab know which class has no error (in standard PU learning, this is the P class), you need to set the threshold for that class to 1 (1 means the probability that the labels of that class are correct is 1, i.e. that class has no
-error). Here's the code:
-
-``` python
-import numpy as np
-# K is the number of classes in your dataset
-# pred_probs are the cross-validated predicted probabilities.
-# s is the array/list/iterable of noisy labels
-# pu_class is a 0-based integer for the class that has no label errors.
-thresholds = np.asarray([np.mean(pred_probs[:, k][s == k]) for k in range(K)])
-thresholds[pu_class] = 1.0
-```
-
-Now you can use cleanlab however you were before. Just be sure to pass in `thresholds` as a parameter wherever it applies. For example:
-
-``` python
-# Uncertainty quantification (characterize the label noise
-# by estimating the joint distribution of noisy and true labels)
-cj = compute_confident_joint(s, pred_probs, thresholds=thresholds, )
-# Now the noise (cj) has been estimated taking into account that some class(es) have no error.
-# We can use cj to find label errors like this:
-indices_of_label_issues = find_label_issues(s, pred_probs, confident_joint=cj, )
-
-# In addition to label issues, cleanlab can find the fraction of noise in the unlabeled class.
-# First we need the inv_noise_matrix which contains P(y|s) (proportion of mislabeling).
-_, _, inv_noise_matrix = estimate_latent(confident_joint=cj, labels=s, )
-# Because inv_noise_matrix contains P(y|s), p (y = anything | labels = pu_class) should be 0
-# because the prob(true label is something else | example is in pu_class) is 0.
-# What's more interesting is p(y = anything | s is not put_class), or in the binary case
-# this translates to p(y = pu_class | s = 1 - pu_class) because pu_class is 0 or 1.
-# So, to find the fraction_noise_in_unlabeled_class, for binary, you just compute:
-fraction_noise_in_unlabeled_class = inv_noise_matrix[pu_class][1 - pu_class]
-```
-
-Now that you have `indices_of_label_errors`, you can remove those label issues and train on clean data (or only remove some of the label issues and iteratively use confident learning / cleanlab to improve results).
-
-
-
-
-Many other practical applications are demonstrated in our [Example Notebooks](https://github.com/cleanlab/examples)
-
## Citation and related publications
cleanlab is based on peer-reviewed research. Here are relevant papers to cite if you use this package:
@@ -513,7 +216,7 @@ cleanlab is based on peer-reviewed research. Here are relevant papers to cite if
CROWDLAB for data with multiple annotators (NeurIPS '22) (click to show bibtex)
@inproceedings{goh2022crowdlab,
- title={Utilizing supervised models to infer consensus labels and their quality from data with multiple annotators},
+ title={CROWDLAB: Supervised learning to infer consensus labels and quality scores for data with multiple annotators},
author={Goh, Hui Wen and Tkachenko, Ulyana and Mueller, Jonas},
booktitle={NeurIPS Human in the Loop Learning Workshop},
year={2022}
@@ -521,21 +224,75 @@ cleanlab is based on peer-reviewed research. Here are relevant papers to cite if
+ ActiveLab: Active learning with data re-labeling (ICLR '23) (click to show bibtex)
+
+ @inproceedings{goh2023activelab,
+ title={ActiveLab: Active Learning with Re-Labeling by Multiple Annotators},
+ author={Goh, Hui Wen and Mueller, Jonas},
+ booktitle={ICLR Workshop on Trustworthy ML},
+ year={2023}
+ }
+
+
+
+ Incorrect Annotations in Multi-Label Classification (ICLR '23) (click to show bibtex)
+
+ @inproceedings{thyagarajan2023multilabel,
+ title={Identifying Incorrect Annotations in Multi-Label Classification Data},
+ author={Thyagarajan, Aditya and Snorrason, Elías and Northcutt, Curtis and Mueller, Jonas},
+ booktitle={ICLR Workshop on Trustworthy ML},
+ year={2023}
+ }
+
+
+
+ Detecting Dataset Drift and Non-IID Sampling (ICML '23) (click to show bibtex)
+
+ @inproceedings{cummings2023drift,
+ title={Detecting Dataset Drift and Non-IID Sampling via k-Nearest Neighbors},
+ author={Cummings, Jesse and Snorrason, Elías and Mueller, Jonas},
+ booktitle={ICML Workshop on Data-centric Machine Learning Research},
+ year={2023}
+ }
+
+
+
+ Detecting Errors in Numerical Data (ICML '23) (click to show bibtex)
+
+ @inproceedings{zhou2023errors,
+ title={Detecting Errors in Numerical Data via any Regression Model},
+ author={Zhou, Hang and Mueller, Jonas and Kumar, Mayank and Wang, Jane-Ling and Lei, Jing},
+ booktitle={ICML Workshop on Data-centric Machine Learning Research},
+ year={2023}
+ }
+
+
+
To understand/cite other cleanlab functionality not described above, check out our [additional publications](https://cleanlab.ai/research/).
## Other resources
+- [Example Notebooks demonstrating practical applications of this package](https://github.com/cleanlab/examples)
+
- [Cleanlab Blog](https://cleanlab.ai/blog/)
- [Blog post: Introduction to Confident Learning](https://l7.curtisnorthcutt.com/confident-learning)
- [NeurIPS 2021 paper: Pervasive Label Errors in Test Sets Destabilize Machine Learning Benchmarks](https://arxiv.org/abs/2103.14749)
-- [Cleanlab Studio](https://cleanlab.ai/studio): No-code Data Improvement
+- [Introduction to Data-centric AI (MIT IAP Course 2023)](https://dcai.csail.mit.edu/)
+
+- [Release notes for past versions](https://github.com/cleanlab/cleanlab/releases)
-While this open-source library **finds** data issues, an interface is needed to efficiently **fix** these issues in your dataset. [Cleanlab Studio](https://cleanlab.ai/studio) is a no-code platform to find and fix problems in real-world ML datasets. Studio automatically runs optimized versions of the algorithms from this open-source library on top of AutoML models fit to your data, and presents detected issues in a smart data editing interface. Think of it like a data cleaning assistant that helps you quickly improve the quality of your data (via AI/automation + streamlined UX).
+- [Cleanlab Studio](https://cleanlab.ai/studio/?utm_source=github&utm_medium=readme&utm_campaign=clostostudio): *No-code Data Improvement*
+
+While this open-source library **finds** data issues, its utility depends on you having a decent existing ML model and an interface to efficiently **fix** these issues in your dataset. Providing all these pieces, [Cleanlab Studio](https://cleanlab.ai/studio/?utm_source=github&utm_medium=readme&utm_campaign=clostostudio) is a no-code platform to **find and fix** problems in real-world ML datasets. Studio automatically runs optimized versions of the algorithms from this open-source library on top of AutoML & Foundation models fit to your data, and presents detected issues in a smart data editing interface. It's a data cleaning assistant to quickly turn unreliable data into reliable models/insights (via AI/automation + streamlined UX). [Try it for free!](https://cleanlab.typeform.com/to/NLnU1XZF?typeform-source=cleanlab.ai)
+
+
+
+
## Join our community
@@ -543,21 +300,26 @@ While this open-source library **finds** data issues, an interface is needed to
* Have ideas for the future of cleanlab? How are you using cleanlab? [Join the discussion](https://github.com/cleanlab/cleanlab/discussions) and check out [our active/planned Projects and what we could use your help with](https://github.com/cleanlab/cleanlab/projects).
-* Interested in contributing? See the [contributing guide](CONTRIBUTING.md) and [ideas on useful contributions](https://github.com/cleanlab/cleanlab/wiki#ideas-for-contributing-to-cleanlab). We welcome your help building a standard open-source library for data-centric AI!
+* Interested in contributing? See the [contributing guide](CONTRIBUTING.md) and [ideas on useful contributions](https://github.com/cleanlab/cleanlab/wiki#ideas-for-contributing-to-cleanlab). We welcome your help building a standard open-source platform for data-centric AI!
* Have code improvements for cleanlab? See the [development guide](DEVELOPMENT.md).
-* Have an issue with cleanlab? [Search existing issues](https://github.com/cleanlab/cleanlab/issues?q=is%3Aissue) or [submit a new issue](https://github.com/cleanlab/cleanlab/issues/new).
+* Have an issue with cleanlab? Search [our FAQ](https://docs.cleanlab.ai/stable/tutorials/faq.html) and [existing issues](https://github.com/cleanlab/cleanlab/issues?q=is%3Aissue), or [submit a new issue](https://github.com/cleanlab/cleanlab/issues/new).
* Need professional help with cleanlab?
-Join our [\#help Slack channel](https://cleanlab.ai/slack) and message one of our core developers, Jonas Mueller, or schedule a meeting via email: team@cleanlab.ai
+Join our [\#help Slack channel](https://cleanlab.ai/slack) and message us there, or reach out via email: team@cleanlab.ai
## License
-Copyright (c) 2017-2022 Cleanlab Inc.
+Copyright (c) 2017 Cleanlab Inc.
cleanlab is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
cleanlab is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See [GNU Affero General Public LICENSE](https://github.com/cleanlab/cleanlab/blob/master/LICENSE) for details.
+You can email us to discuss licensing: team@cleanlab.ai
+
+### Commercial licensing
+
+Commercial licensing is available for teams and enterprises that want to use cleanlab in production workflows, but are unable to open-source their code. Please contact us [here](mailto:sales@cleanlab.ai).
diff --git a/cleanlab/__init__.py b/cleanlab/__init__.py
index 5746a49a21..7ec7a55e50 100644
--- a/cleanlab/__init__.py
+++ b/cleanlab/__init__.py
@@ -9,3 +9,49 @@
from . import outlier
from . import token_classification
from . import multilabel_classification
+from . import object_detection
+from . import regression
+from . import segmentation
+
+
+class DatalabUnavailable:
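+    """Stand-in for an unavailable import: raises ImportError on attribute access or call."""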
+    def __init__(self, message):
+        self.message = message
+
+    def __getattr__(self, name):
+        message = self.message + f" (raised when trying to access {name})"
+        raise ImportError(message)
+
+    def __call__(self, *args, **kwargs):
+        message = (
+            self.message + f" (raised when trying to call with args: {args}, kwargs: {kwargs})"
+        )
+        raise ImportError(message)
+
+
+def _datalab_import_factory():
+    try:
+        from .datalab.datalab import Datalab as _Datalab
+
+        return _Datalab
+    except ImportError:
+        return DatalabUnavailable(
+            "Datalab is not available due to missing dependencies. "
+            "To install Datalab, run `pip install 'cleanlab[datalab]'`."
+        )
+
+
+def _issue_manager_import_factory():
+    try:
+        from .datalab.issue_manager import IssueManager as _IssueManager
+
+        return _IssueManager
+    except ImportError:
+        return DatalabUnavailable(
+            "IssueManager is not available due to missing dependencies for Datalab. "
+            "To install Datalab, run `pip install 'cleanlab[datalab]'`."
+        )
+
+
+Datalab = _datalab_import_factory()
+IssueManager = _issue_manager_import_factory()
diff --git a/cleanlab/benchmarking/noise_generation.py b/cleanlab/benchmarking/noise_generation.py
index 47937c4453..63c1352bbb 100644
--- a/cleanlab/benchmarking/noise_generation.py
+++ b/cleanlab/benchmarking/noise_generation.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2017-2022 Cleanlab Inc.
+# Copyright (C) 2017-2023 Cleanlab Inc.
# This file is part of cleanlab.
#
# cleanlab is free software: you can redistribute it and/or modify
@@ -26,6 +26,7 @@
import numpy as np
from cleanlab.internal.util import value_counts
+from cleanlab.internal.constants import FLOATING_POINT_COMPARISON
def noise_matrix_is_valid(noise_matrix, py, *, verbose=False) -> bool:
@@ -65,7 +66,7 @@ def noise_matrix_is_valid(noise_matrix, py, *, verbose=False) -> bool:
joint_noise = np.multiply(noise_matrix, py) # / float(N)
# Check that joint_probs is valid probability matrix
- if not (abs(joint_noise.sum() - 1.0) < 1e-6):
+ if not (abs(joint_noise.sum() - 1.0) < FLOATING_POINT_COMPARISON):
return False
# Check that noise_matrix is a valid matrix
@@ -386,11 +387,9 @@ def generate_n_rand_probabilities_that_sum_to_m(
An array of probabilities.
"""
- epsilon = 1e-6 # Imprecision allowed for inequalities with floats
-
if n == 0:
return np.array([])
- if (max_prob + epsilon) < m / float(n):
+ if (max_prob + FLOATING_POINT_COMPARISON) < m / float(n):
raise ValueError(
"max_prob must be greater or equal to m / n, but "
+ "max_prob = "
@@ -402,7 +401,7 @@ def generate_n_rand_probabilities_that_sum_to_m(
+ ", m / n = "
+ str(m / float(n))
)
- if min_prob > (m + epsilon) / float(n):
+ if min_prob > (m + FLOATING_POINT_COMPARISON) / float(n):
raise ValueError(
"min_prob must be less or equal to m / n, but "
+ "max_prob = "
@@ -422,7 +421,7 @@ def generate_n_rand_probabilities_that_sum_to_m(
min_val = min(result)
max_val = max(result)
- while max_val > (max_prob + epsilon):
+ while max_val > (max_prob + FLOATING_POINT_COMPARISON):
new_min = min_val + (max_val - max_prob)
# This adjustment prevents the new max from always being max_prob.
adjustment = (max_prob - new_min) * np.random.rand()
@@ -433,7 +432,7 @@ def generate_n_rand_probabilities_that_sum_to_m(
min_val = min(result)
max_val = max(result)
- while min_val < (min_prob - epsilon):
+ while min_val < (min_prob - FLOATING_POINT_COMPARISON):
min_val = min(result)
max_val = max(result)
new_max = max_val - (min_prob - min_val)
diff --git a/cleanlab/classification.py b/cleanlab/classification.py
index 3ce2362bcd..63456aac8f 100644
--- a/cleanlab/classification.py
+++ b/cleanlab/classification.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2017-2022 Cleanlab Inc.
+# Copyright (C) 2017-2023 Cleanlab Inc.
# This file is part of cleanlab.
#
# cleanlab is free software: you can redistribute it and/or modify
@@ -120,7 +120,10 @@ def score(self, X, y, sample_weight=None):
import pandas as pd
import inspect
import warnings
-from typing import TypeVar, Optional
+from typing import Optional, TYPE_CHECKING
+
+if TYPE_CHECKING: # pragma: no cover
+ from typing_extensions import Self
from cleanlab.rank import get_label_quality_scores
from cleanlab import filter
@@ -147,9 +150,6 @@ def score(self, X, y, sample_weight=None):
)
-TCleanLearning = TypeVar("TCleanLearning", bound="CleanLearning") # self type for the class
-
-
class CleanLearning(BaseEstimator): # Inherits sklearn classifier
"""
CleanLearning = Machine Learning with cleaned data (even when training on messy, error-ridden data).
@@ -204,8 +204,9 @@ class CleanLearning(BaseEstimator): # Inherits sklearn classifier
find_label_issues_kwargs : dict, optional
Keyword arguments to pass into :py:func:`filter.find_label_issues
- `. Options that may especially impact
- accuracy include: `filter_by`, `frac_noise`, `min_examples_per_class`.
+ `. Particularly useful options include:
+ `filter_by`, `frac_noise`, `min_examples_per_class` (which all impact ML accuracy),
+ `n_jobs` (set this to 1 to disable multi-processing if it's causing issues).
label_quality_scores_kwargs : dict, optional
Keyword arguments to pass into :py:func:`rank.get_label_quality_scores
@@ -229,7 +230,6 @@ def __init__(
label_quality_scores_kwargs={},
verbose=False,
):
-
if clf is None:
# Use logistic regression if no classifier is provided.
clf = LogReg(multi_class="auto", solver="lbfgs")
@@ -266,7 +266,7 @@ def __init__(
self.clf_final_kwargs = None
def fit(
- self: TCleanLearning,
+ self,
X,
labels=None,
*,
@@ -280,7 +280,7 @@ def fit(
clf_final_kwargs={},
validation_func=None,
y=None,
- ) -> TCleanLearning:
+ ) -> "Self":
"""
Train the model `clf` with error-prone, noisy labels as if
the model had been instead trained on a dataset with the correct labels.
@@ -645,7 +645,6 @@ def score(self, X, y, sample_weight=None) -> float:
"""
if hasattr(self.clf, "score"):
-
# Check if sample_weight in clf.score()
if "sample_weight" in inspect.getfullargspec(self.clf.score).args:
return self.clf.score(X, y, sample_weight=sample_weight)
@@ -841,6 +840,7 @@ def find_label_issues(
pred_probs=pred_probs,
thresholds=thresholds,
)
+
# if pulearning == the integer specifying the class without noise.
if self.num_classes == 2 and self.pulearning is not None: # pragma: no cover
# pulearning = 1 (no error in 1 class) implies p(label=1|true_label=0) = 0
@@ -853,6 +853,12 @@ def find_label_issues(
self.confident_joint[self.pulearning][1 - self.pulearning] = 0
self.confident_joint[1 - self.pulearning][1 - self.pulearning] = 1
+ # Add the confident joint to find_label_issues_kwargs if it was not previously specified
+ if "confident_joint" not in self.find_label_issues_kwargs.keys():
+ # however, skip this if the user specified filter_by="confident_learning", since that combination triggers a warning
+ if not self.find_label_issues_kwargs.get("filter_by") == "confident_learning":
+ self.find_label_issues_kwargs["confident_joint"] = self.confident_joint
+
labels = labels_to_array(labels)
if self.verbose:
print("Using predicted probabilities to identify label issues ...")
@@ -927,9 +933,6 @@ def save_space(self):
self.label_issues_mask = None
self.find_label_issues_kwargs = None
self.label_quality_scores_kwargs = None
- self.label_issues_df = None
- self.label_issues_mask = None
- self.sample_weight = None
self.confident_joint = None
self.py = None
self.ps = None
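A hedged usage sketch of the `find_label_issues_kwargs` options called out in the updated docstring (the synthetic data and classifier below are illustrative only):

.. code-block:: python

    import numpy as np
    from sklearn.linear_model import LogisticRegression
    from cleanlab.classification import CleanLearning

    rng = np.random.default_rng(0)
    X = rng.normal(size=(200, 2))
    labels = (X[:, 0] + rng.normal(scale=0.5, size=200) > 0).astype(int)

    # n_jobs=1 disables multiprocessing in find_label_issues if it causes problems;
    # the other keys control how aggressively examples are flagged.
    cl = CleanLearning(
        clf=LogisticRegression(),
        find_label_issues_kwargs={"min_examples_per_class": 5, "n_jobs": 1},
    )
    cl.fit(X, labels)
    label_issues = cl.get_label_issues()  # per-example DataFrame of flagged issues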
diff --git a/cleanlab/count.py b/cleanlab/count.py
index 65db814bfb..d9b22feedb 100644
--- a/cleanlab/count.py
+++ b/cleanlab/count.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2017-2022 Cleanlab Inc.
+# Copyright (C) 2017-2023 Cleanlab Inc.
# This file is part of cleanlab.
#
# cleanlab is free software: you can redistribute it and/or modify
@@ -38,6 +38,8 @@
from cleanlab.typing import LabelLike
from cleanlab.internal.multilabel_utils import stack_complement, get_onehot_num_classes
+from cleanlab.internal.constants import TINY_VALUE, CONFIDENT_THRESHOLDS_LOWER_BOUND
+
from cleanlab.internal.util import (
value_counts_fill_missing_classes,
clip_values,
@@ -49,7 +51,6 @@
get_unique_classes,
is_torch_dataset,
is_tensorflow_dataset,
- TINY_VALUE,
)
from cleanlab.internal.latent_algebra import (
compute_inv_noise_matrix,
@@ -87,21 +88,28 @@ def num_label_issues(
Array of estimated class label error statistics used for identifying label issues,
in the same format expected by the :py:func:`filter.find_label_issues ` function.
The `confident_joint` can be computed using :py:func:`count.compute_confident_joint `.
- If not provided, it is internally computed from the given (noisy) `labels` and `pred_probs`.
+ It is internally computed from the given (noisy) `labels` and `pred_probs`.
estimation_method :
Method for estimating the number of label issues in dataset by counting the examples in the off-diagonal of the `confident_joint` ``P(label=i, true_label=j)``.
- - ``'off_diagonal'``: Counts the number of examples in the off-diagonal of the `confident_joint`. Returns the same value as ``sum(find_label_issues(filter_by='confident_learning'))``
- - ``'off_diagonal_calibrated'``: Calibrates confident joint estimate ``P(label=i, true_label=j)`` such that
- ``np.sum(cj) == len(labels)`` and ``np.sum(cj, axis = 1) == np.bincount(labels)`` before counting the number
- of examples in the off-diagonal. Number will always be equal to or greater than
- ``estimate_issues='off_diagonal'``. You can use this value as the cutoff threshold used with ranking/scoring
- functions from :py:mod:`cleanlab.rank` with `num_label_issues` over ``estimation_method='off_diagonal'`` in
- two cases:
- 1. As we add more label and data quality scoring functions in :py:mod:`cleanlab.rank`, this approach will always work.
- 2. If you have a custom score to rank your data by label quality and you just need to know the cut-off of likely label issues.
-
- TL;DR: use this method to get the most accurate estimate of number of label issues when you don't need the indices of the label issues.
+
+ * ``'off_diagonal'``: Counts the number of examples in the off-diagonal of the `confident_joint`. Returns the same value as ``sum(find_label_issues(filter_by='confident_learning'))``
+
+ * ``'off_diagonal_calibrated'``: Calibrates confident joint estimate ``P(label=i, true_label=j)`` such that
+ ``np.sum(cj) == len(labels)`` and ``np.sum(cj, axis = 1) == np.bincount(labels)`` before counting the number
+ of examples in the off-diagonal. The resulting count will always be greater than or equal to that from
+ ``estimation_method='off_diagonal'``. You can use this value as the cutoff threshold used with ranking/scoring
+ functions from :py:mod:`cleanlab.rank` with `num_label_issues` over ``estimation_method='off_diagonal'`` in
+ two cases:
+
+ #. As we add more label and data quality scoring functions in :py:mod:`cleanlab.rank`, this approach will always work.
+ #. If you have a custom score to rank your data by label quality and you just need to know the cut-off of likely label issues.
+
+ * ``'off_diagonal_custom'``: Counts the number of examples in the off-diagonal of a provided `confident_joint` matrix.
+
+ TL;DR: Use this method to get the most accurate estimate of the number of label issues when you don't need the indices of the label issues.
+
+ Note: ``'off_diagonal'`` may sometimes underestimate issues for data with few classes, so consider using ``'off_diagonal_calibrated'`` instead if your data has < 4 classes.
multi_label : bool, optional
Set ``False`` if your dataset is for regular (multi-class) classification, where each example belongs to exactly one class.
@@ -113,7 +121,15 @@ def num_label_issues(
num_issues :
The estimated number of examples with label issues in the dataset.
"""
- valid_methods = ["off_diagonal", "off_diagonal_calibrated"]
+ valid_methods = ["off_diagonal", "off_diagonal_calibrated", "off_diagonal_custom"]
+ if isinstance(confident_joint, np.ndarray) and estimation_method != "off_diagonal_custom":
+ warn_str = (
+ "The supplied `confident_joint` is ignored as `confident_joint` is recomuputed internally using "
+ "the supplied `labels` and `pred_probs`. If you still want to use custom `confident_joint` call function "
+ "with `estimation_method='off_diagonal_custom'`."
+ )
+ warnings.warn(warn_str)
+
if multi_label:
return _num_label_issues_multilabel(
labels=labels,
@@ -123,29 +139,54 @@ def num_label_issues(
labels = labels_to_array(labels)
assert_valid_inputs(X=None, y=labels, pred_probs=pred_probs)
- if confident_joint is None:
- # Original non-calibrated counts of confidently correctly and incorrectly labeled examples.
- computed_confident_joint = compute_confident_joint(
- labels=labels, pred_probs=pred_probs, calibrate=False
+ if estimation_method == "off_diagonal":
+ _, cl_error_indices = compute_confident_joint(
+ labels=labels,
+ pred_probs=pred_probs,
+ calibrate=False,
+ return_indices_of_off_diagonals=True,
)
- else:
- computed_confident_joint = confident_joint
- assert isinstance(computed_confident_joint, np.ndarray)
+ label_issues_mask = np.zeros(len(labels), dtype=bool)
+ for idx in cl_error_indices:
+ label_issues_mask[idx] = True
- if estimation_method == "off_diagonal":
- num_issues: int = np.sum(computed_confident_joint) - np.trace(computed_confident_joint)
+ # Remove label issues if given label == model prediction
+ pred = pred_probs.argmax(axis=1)
+ for i, pred_label in enumerate(pred):
+ if pred_label == labels[i]:
+ label_issues_mask[i] = False
+ num_issues = np.sum(label_issues_mask)
elif estimation_method == "off_diagonal_calibrated":
+ calculated_confident_joint = compute_confident_joint(
+ labels=labels,
+ pred_probs=pred_probs,
+ calibrate=True,
+ )
+ assert isinstance(calculated_confident_joint, np.ndarray)
# Estimate_joint calibrates the row sums to match the prior distribution of given labels and normalizes to sum to 1
- joint = estimate_joint(labels, pred_probs, confident_joint=computed_confident_joint)
+ joint = estimate_joint(labels, pred_probs, confident_joint=calculated_confident_joint)
frac_issues = 1.0 - joint.trace()
num_issues = np.rint(frac_issues * len(labels)).astype(int)
+ elif estimation_method == "off_diagonal_custom":
+ if not isinstance(confident_joint, np.ndarray):
+ raise ValueError(
+ f"""
+ No `confident_joint` provided. For 'estimation_method' = {estimation_method} you need to provide a pre-calculated
+ `confident_joint` matrix. Use a different `estimation_method` if you want the `confident_joint` matrix to
+ be calculated for you.
+ """
+ )
+ else:
+ joint = estimate_joint(labels, pred_probs, confident_joint=confident_joint)
+ frac_issues = 1.0 - joint.trace()
+ num_issues = np.rint(frac_issues * len(labels)).astype(int)
else:
raise ValueError(
f"""
- {estimation_method} is not a valid estimation method!
- Please choose a valid estimation method: {valid_methods}
- """
+ {estimation_method} is not a valid estimation method!
+ Please choose a valid estimation method: {valid_methods}
+ """
)
return num_issues
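A sketch of the three estimation methods handled above, including the new ``'off_diagonal_custom'`` path (toy arrays, purely illustrative):

.. code-block:: python

    import numpy as np
    from cleanlab.count import compute_confident_joint, num_label_issues

    labels = np.array([0, 0, 1, 1, 1, 0])
    pred_probs = np.array(
        [[0.9, 0.1], [0.8, 0.2], [0.2, 0.8], [0.3, 0.7], [0.9, 0.1], [0.1, 0.9]]
    )

    n_default = num_label_issues(labels, pred_probs)  # 'off_diagonal'
    n_calibrated = num_label_issues(labels, pred_probs, estimation_method="off_diagonal_calibrated")

    # 'off_diagonal_custom' is the only method that uses a user-supplied confident_joint;
    # supplying one with any other method now just triggers the warning above.
    cj = compute_confident_joint(labels, pred_probs)
    n_custom = num_label_issues(
        labels, pred_probs, confident_joint=cj, estimation_method="off_diagonal_custom"
    )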
@@ -169,12 +210,19 @@ def _num_label_issues_multilabel(
-------
num_issues : int
The estimated number of examples with label issues in the multi-label dataset.
+
+ Note: We set the `filter_by` method to 'confident_learning' to match the non-multilabel case
+ (analogous to the 'off_diagonal' estimation method).
"""
from cleanlab.filter import find_label_issues
issues_idx = find_label_issues(
- labels=labels, pred_probs=pred_probs, confident_joint=confident_joint, multi_label=True
+ labels=labels,
+ pred_probs=pred_probs,
+ confident_joint=confident_joint,
+ multi_label=True,
+ filter_by="confident_learning", # specified to match num_label_issues
)
return sum(issues_idx)
@@ -341,7 +389,12 @@ def estimate_joint(
multi_label=multi_label,
)
else:
- calibrated_cj = calibrate_confident_joint(confident_joint, labels, multi_label=multi_label)
+ if labels is not None:
+ calibrated_cj = calibrate_confident_joint(
+ confident_joint, labels, multi_label=multi_label
+ )
+ else:
+ calibrated_cj = confident_joint
assert isinstance(calibrated_cj, np.ndarray)
if multi_label:
@@ -1402,7 +1455,10 @@ def get_confident_thresholds(
np.mean(pred_probs[:, k][labels == k]) if k in unique_classes else BIG_VALUE
for k in all_classes
]
- return np.array(confident_thresholds)
+ confident_thresholds = np.clip(
+ confident_thresholds, a_min=CONFIDENT_THRESHOLDS_LOWER_BOUND, a_max=None
+ )
+ return confident_thresholds
def _get_confident_thresholds_multilabel(
diff --git a/cleanlab/datalab/__init__.py b/cleanlab/datalab/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/cleanlab/datalab/data.py b/cleanlab/datalab/data.py
new file mode 100644
index 0000000000..e04a84c1a6
--- /dev/null
+++ b/cleanlab/datalab/data.py
@@ -0,0 +1,313 @@
+# Copyright (C) 2017-2023 Cleanlab Inc.
+# This file is part of cleanlab.
+#
+# cleanlab is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published
+# by the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# cleanlab is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with cleanlab. If not, see <https://www.gnu.org/licenses/>.
+"""Classes and methods for datasets that are loaded into Datalab."""
+
+import os
+from typing import Any, Callable, Dict, List, Mapping, Optional, Tuple, Union, cast, TYPE_CHECKING
+
+try:
+ import datasets
+except ImportError as error:
+ raise ImportError(
+ "Cannot import datasets package. "
+ "Please install it and try again, or just install cleanlab with "
+ "all optional dependencies via: `pip install 'cleanlab[all]'`"
+ ) from error
+import numpy as np
+import pandas as pd
+from datasets.arrow_dataset import Dataset
+from datasets import ClassLabel
+
+from cleanlab.internal.validation import labels_to_array
+
+
+if TYPE_CHECKING: # pragma: no cover
+ DatasetLike = Union[Dataset, pd.DataFrame, Dict[str, Any], List[Dict[str, Any]], str]
+
+
+class DataFormatError(ValueError):
+ """Exception raised when the data is not in a supported format."""
+
+ def __init__(self, data: Any):
+ self.data = data
+ message = (
+ f"Unsupported data type: {type(data)}\n"
+ "Supported types: "
+ "datasets.Dataset, pandas.DataFrame, dict, list, str"
+ )
+ super().__init__(message)
+
+
+class DatasetDictError(ValueError):
+ """Exception raised when a DatasetDict is passed to Datalab.
+
+ Usually, this means that a dataset identifier was passed to Datalab, but
+ the dataset is a DatasetDict, which contains multiple splits of the dataset.
+
+ """
+
+ def __init__(self):
+ message = (
+ "Please pass a single dataset, not a DatasetDict. "
+ "Try specifying a split, e.g. `dataset = load_dataset('dataset', split='train')` "
+ "then pass `dataset` to Datalab."
+ )
+ super().__init__(message)
+
+
+class DatasetLoadError(ValueError):
+ """Exception raised when a dataset cannot be loaded.
+
+ Parameters
+ ----------
+ dataset_type: type
+ The type of dataset that failed to load.
+ """
+
+ def __init__(self, dataset_type: type):
+ message = f"Failed to load dataset from {dataset_type}.\n"
+ super().__init__(message)
+
+
+class Data:
+ """
+ Class that holds and validates datasets for Datalab.
+
+ Internally, the data is stored as a datasets.Dataset object and the labels
+ are integers (ranging from 0 to K-1, where K is the number of classes) stored
+ in a numpy array.
+
+ Parameters
+ ----------
+ data :
+ Dataset to be audited by Datalab.
+ Several formats are supported, which will internally be converted to a Dataset object.
+
+ Supported formats:
+ - datasets.Dataset
+ - pandas.DataFrame
+ - dict
+ - keys are strings
+ - values are arrays or lists of equal length
+ - list
+ - list of dictionaries with the same keys
+ - str
+ - path to a local file
+ - Text (.txt)
+ - CSV (.csv)
+ - JSON (.json)
+ - or a dataset identifier on the Hugging Face Hub
+ It checks if the string is a path to a file that exists locally, and if not,
+ it assumes it is a dataset identifier on the Hugging Face Hub.
+
+ label_name : Union[str, List[str]]
+ Name of the label column in the dataset.
+
+ Warnings
+ --------
+ Optional dependencies:
+
+ - datasets :
+ Dataset, DatasetDict and load_dataset are imported from datasets.
+ This is an optional dependency of cleanlab, but is required for
+ :py:class:`Datalab ` to work.
+ """
+
+ def __init__(self, data: "DatasetLike", label_name: Optional[str] = None) -> None:
+ self._validate_data(data)
+ self._data = self._load_data(data)
+ self._data_hash = hash(self._data)
+ self.labels = Label(data=self._data, label_name=label_name)
+
+ def _load_data(self, data: "DatasetLike") -> Dataset:
+ """Checks the type of dataset and uses the correct loader method and
+ assigns the result to the data attribute."""
+ dataset_factory_map: Dict[type, Callable[..., Dataset]] = {
+ Dataset: lambda x: x,
+ pd.DataFrame: Dataset.from_pandas,
+ dict: self._load_dataset_from_dict,
+ list: self._load_dataset_from_list,
+ str: self._load_dataset_from_string,
+ }
+ if not isinstance(data, tuple(dataset_factory_map.keys())):
+ raise DataFormatError(data)
+ return dataset_factory_map[type(data)](data)
+
+ def __len__(self) -> int:
+ return len(self._data)
+
+ def __eq__(self, other) -> bool:
+ if isinstance(other, Data):
+ # Equality checks
+ hashes_are_equal = self._data_hash == other._data_hash
+ labels_are_equal = self.labels == other.labels
+ return all([hashes_are_equal, labels_are_equal])
+ return False
+
+ def __hash__(self) -> int:
+ return self._data_hash
+
+ @property
+ def class_names(self) -> List[str]:
+ return self.labels.class_names
+
+ @property
+ def has_labels(self) -> bool:
+ """Check if labels are available."""
+ return self.labels.is_available
+
+ @staticmethod
+ def _validate_data(data) -> None:
+ if isinstance(data, datasets.DatasetDict):
+ raise DatasetDictError()
+ if not isinstance(data, (Dataset, pd.DataFrame, dict, list, str)):
+ raise DataFormatError(data)
+
+ @staticmethod
+ def _load_dataset_from_dict(data_dict: Dict[str, Any]) -> Dataset:
+ try:
+ return Dataset.from_dict(data_dict)
+ except Exception as error:
+ raise DatasetLoadError(dict) from error
+
+ @staticmethod
+ def _load_dataset_from_list(data_list: List[Dict[str, Any]]) -> Dataset:
+ try:
+ return Dataset.from_list(data_list)
+ except Exception as error:
+ raise DatasetLoadError(list) from error
+
+ @staticmethod
+ def _load_dataset_from_string(data_string: str) -> Dataset:
+ if not os.path.exists(data_string):
+ try:
+ dataset = datasets.load_dataset(data_string)
+ return cast(Dataset, dataset)
+ except Exception as error:
+ raise DatasetLoadError(str) from error
+
+ factory: Dict[str, Callable[[str], Any]] = {
+ ".txt": Dataset.from_text,
+ ".csv": Dataset.from_csv,
+ ".json": Dataset.from_json,
+ }
+
+ extension = os.path.splitext(data_string)[1]
+ if extension not in factory:
+ raise DatasetLoadError(type(data_string))
+
+ dataset = factory[extension](data_string)
+ dataset_cast = cast(Dataset, dataset)
+ return dataset_cast
+
+
+class Label:
+ """
+ Class to represent labels in a dataset.
+
+ Parameters
+ ----------
+ data : datasets.Dataset
+ The dataset from which labels are extracted.
+
+ label_name : str, optional
+ Name of the label column in the dataset. If ``None``, no labels are extracted.
+ """
+
+ def __init__(self, *, data: Dataset, label_name: Optional[str] = None) -> None:
+ self._data = data
+ self.label_name = label_name
+ self.labels = labels_to_array([])
+ self.label_map: Mapping[str, Any] = {}
+ if label_name is not None:
+ self.labels, self.label_map = _extract_labels(data, label_name)
+ self._validate_labels()
+
+ def __len__(self) -> int:
+ if self.labels is None:
+ return 0
+ return len(self.labels)
+
+ def __eq__(self, __value: object) -> bool:
+ if isinstance(__value, Label):
+ labels_are_equal = np.array_equal(self.labels, __value.labels)
+ names_are_equal = self.label_name == __value.label_name
+ maps_are_equal = self.label_map == __value.label_map
+ return all([labels_are_equal, names_are_equal, maps_are_equal])
+ return False
+
+ def __getitem__(self, __index: Union[int, slice, np.ndarray]) -> np.ndarray:
+ return self.labels[__index]
+
+ def __bool__(self) -> bool:
+ return self.is_available
+
+ @property
+ def class_names(self) -> List[str]:
+ """A list of class names that are present in the dataset.
+
+ Without labels, this will return an empty list.
+ """
+ return list(self.label_map.values())
+
+ @property
+ def is_available(self) -> bool:
+ """Check if labels are available."""
+ empty_labels = self.labels is None or len(self.labels) == 0
+ empty_label_map = self.label_map is None or len(self.label_map) == 0
+ return not (empty_labels or empty_label_map)
+
+ def _validate_labels(self) -> None:
+ if self.label_name not in self._data.column_names:
+ raise ValueError(f"Label column '{self.label_name}' not found in dataset.")
+ labels = self._data[self.label_name]
+ assert isinstance(labels, (np.ndarray, list))
+ assert len(labels) == len(self._data)
+
+
+def _extract_labels(data: Dataset, label_name: str) -> Tuple[np.ndarray, Mapping]:
+ """
+ Picks out labels from the dataset and formats them to be [0, 1, ..., K-1]
+ where K is the number of classes. Also returns a mapping from the formatted
+ labels to the original labels in the dataset.
+
+ Note: This function is not meant to be used directly. It is used by
+ ``cleanlab.datalab.data.Data`` to extract the formatted labels from the dataset
+ and store them as attributes.
+
+ Parameters
+ ----------
+ data : datasets.Dataset
+ Dataset from which to extract the labels.
+
+ label_name : str
+ Name of the column in the dataset that contains the labels.
+
+ Returns
+ -------
+ formatted_labels : np.ndarray
+ Labels in the format [0, 1, ..., K-1] where K is the number of classes.
+
+ inverse_map : dict
+ Mapping from the formatted labels to the original labels in the dataset.
+ """
+
+ labels = labels_to_array(data[label_name]) # type: ignore[assignment]
+ if labels.ndim != 1:
+ raise ValueError("labels must be 1D numpy array.")
+
+ label_name_feature = data.features[label_name]
+ if isinstance(label_name_feature, ClassLabel):
+ label_map = {label: label_name_feature.str2int(label) for label in label_name_feature.names}
+ formatted_labels = labels
+ else:
+ label_map = {label: i for i, label in enumerate(np.unique(labels))}
+ formatted_labels = np.vectorize(label_map.get)(labels)
+ inverse_map = {i: label for label, i in label_map.items()}
+
+ return formatted_labels, inverse_map
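The mapping performed in the non-``ClassLabel`` branch above can be summarized with a small standalone sketch (toy labels, no cleanlab imports needed):

.. code-block:: python

    import numpy as np

    labels = np.array(["cat", "dog", "dog", "bird"])

    # Map each distinct label to an integer code in [0, ..., K-1] ...
    label_map = {label: i for i, label in enumerate(np.unique(labels))}
    formatted_labels = np.vectorize(label_map.get)(labels)      # array([1, 2, 2, 0])

    # ... and keep the inverse map so results can be reported with the original names.
    inverse_map = {i: label for label, i in label_map.items()}  # {0: 'bird', 1: 'cat', 2: 'dog'}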
diff --git a/cleanlab/datalab/data_issues.py b/cleanlab/datalab/data_issues.py
new file mode 100644
index 0000000000..89c1663a98
--- /dev/null
+++ b/cleanlab/datalab/data_issues.py
@@ -0,0 +1,291 @@
+# Copyright (C) 2017-2023 Cleanlab Inc.
+# This file is part of cleanlab.
+#
+# cleanlab is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published
+# by the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# cleanlab is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with cleanlab. If not, see <https://www.gnu.org/licenses/>.
+"""
+Module for the :py:class:`DataIssues` class, which serves as a central repository for storing
+information and statistics about issues found in a dataset.
+
+It collects information from various
+:py:class:`IssueManager `
+instances and keeps track of each issue, a summary for each type of issue,
+related information and statistics about the issues.
+
+The collected information can be accessed using the
+:py:meth:`get_info ` method.
+"""
+from __future__ import annotations
+
+import warnings
+from typing import TYPE_CHECKING, Any, Dict, Optional
+import numpy as np
+
+import pandas as pd
+
+if TYPE_CHECKING: # pragma: no cover
+ from cleanlab.datalab.data import Data
+ from cleanlab.datalab.issue_manager import IssueManager
+
+
+class DataIssues:
+ """
+ Class that collects and stores information and statistics on issues found in a dataset.
+
+ Parameters
+ ----------
+ data :
+ The data object for which the issues are being collected.
+
+ Attributes
+ ----------
+ issues : pd.DataFrame
+ Stores information about each individual issue found in the data,
+ on a per-example basis.
+ issue_summary : pd.DataFrame
+ Summarizes the overall statistics for each issue type.
+ info : dict
+ A dictionary that contains information and statistics about the data and each issue type.
+ """
+
+ def __init__(self, data: Data) -> None:
+ self.issues: pd.DataFrame = pd.DataFrame(index=range(len(data)))
+ self.issue_summary: pd.DataFrame = pd.DataFrame(
+ columns=["issue_type", "score", "num_issues"]
+ ).astype({"score": np.float64, "num_issues": np.int64})
+ self.info: Dict[str, Dict[str, Any]] = {
+ "statistics": get_data_statistics(data),
+ }
+ self._label_map = data.labels.label_map
+
+ @property
+ def statistics(self) -> Dict[str, Any]:
+ """Returns the statistics dictionary.
+
+ Shorthand for self.info["statistics"].
+ """
+ return self.info["statistics"]
+
+ def get_issues(self, issue_name: Optional[str] = None) -> pd.DataFrame:
+ """
+ Use this after finding issues to see which examples suffer from which types of issues.
+
+ Parameters
+ ----------
+ issue_name : str or None
+ The type of issue to focus on. If `None`, returns full DataFrame summarizing all of the types of issues detected in each example from the dataset.
+
+ Raises
+ ------
+ ValueError
+ If `issue_name` is not a type of issue previously considered in the audit.
+
+ Returns
+ -------
+ specific_issues :
+ A DataFrame where each row corresponds to an example from the dataset and columns specify:
+ whether this example exhibits a particular type of issue and how severely (via a numeric quality score where lower values indicate more severe instances of the issue).
+
+ Additional columns may be present in the DataFrame depending on the type of issue specified.
+ """
+ if issue_name is None:
+ return self.issues
+
+ columns = [col for col in self.issues.columns if issue_name in col]
+ if not columns:
+ raise ValueError(f"No columns found for issue type '{issue_name}'.")
+ specific_issues = self.issues[columns]
+ info = self.get_info(issue_name=issue_name)
+ if issue_name == "label":
+ specific_issues = specific_issues.assign(
+ given_label=info["given_label"], predicted_label=info["predicted_label"]
+ )
+
+ if issue_name == "near_duplicate":
+ column_dict = {
+ k: info.get(k)
+ for k in ["near_duplicate_sets", "distance_to_nearest_neighbor"]
+ if info.get(k) is not None
+ }
+ specific_issues = specific_issues.assign(**column_dict)
+ return specific_issues
+
+ def get_issue_summary(self, issue_name: Optional[str] = None) -> pd.DataFrame:
+ """Summarize the issues found in dataset of a particular type,
+ including how severe this type of issue is overall across the dataset.
+
+ Parameters
+ ----------
+ issue_name :
+ Name of the issue type to summarize. If `None`, summarizes each of the different issue types previously considered in the audit.
+
+ Returns
+ -------
+ issue_summary :
+ DataFrame where each row corresponds to a type of issue, and columns quantify:
+ the number of examples in the dataset estimated to exhibit this type of issue,
+ and the overall severity of the issue across the dataset (via a numeric quality score where lower values indicate that the issue is overall more severe).
+ """
+ if self.issue_summary.empty:
+ raise ValueError(
+ "No issues found in the dataset. "
+ "Call `find_issues` before calling `get_issue_summary`."
+ )
+
+ if issue_name is None:
+ return self.issue_summary
+
+ row_mask = self.issue_summary["issue_type"] == issue_name
+ if not any(row_mask):
+ raise ValueError(f"Issue type {issue_name} not found in the summary.")
+ return self.issue_summary[row_mask].reset_index(drop=True)
+
+ def get_info(self, issue_name: Optional[str] = None) -> Dict[str, Any]:
+ """Get the info for the issue_name key.
+
+ This function is used to get the info for a specific issue_name. If the info is not computed yet, it will raise an error.
+
+ Parameters
+ ----------
+ issue_name :
+ The issue name for which the info is required.
+
+ Returns
+ -------
+ info:
+ The info for the issue_name.
+ """
+ info = self.info.get(issue_name, None) if issue_name else self.info
+ if info is None:
+ raise ValueError(
+ f"issue_name {issue_name} not found in self.info. These have not been computed yet."
+ )
+ info = info.copy()
+ if issue_name == "label":
+ if self._label_map is None:
+ raise ValueError(
+ "The label map is not available. "
+ "Most likely, no label column was provided when creating the Data object."
+ )
+ # Labels that are stored as integers may need to be converted to strings.
+ for key in ["given_label", "predicted_label"]:
+ labels = info.get(key, None)
+ if labels is not None:
+ info[key] = np.vectorize(self._label_map.get)(labels)
+
+ info["class_names"] = self.statistics["class_names"]
+ return info
+
+ def collect_statistics_from_issue_manager(self, issue_manager: IssueManager) -> None:
+ """Update the statistics in the info dictionary.
+
+ Parameters
+ ----------
+ issue_manager :
+ IssueManager whose reported statistics should be added to or updated in the info dictionary.
+
+ Examples
+ --------
+
+ A common use case is to reuse the KNN-graph across multiple issue managers.
+ To avoid recomputing the KNN-graph for each issue manager,
+ we can pass it as a statistic to the issue managers.
+
+ >>> from scipy.sparse import csr_matrix
+ >>> weighted_knn_graph = csr_matrix(...)
+ >>> issue_manager_that_computes_knn_graph = ...
+
+ """
+ key = "statistics"
+ statistics: Dict[str, Any] = issue_manager.info.pop(key, {})
+ if statistics:
+ self.info[key].update(statistics)
+
+ def collect_results_from_issue_manager(self, issue_manager: IssueManager) -> None:
+ """
+ Collects results from an IssueManager and updates the corresponding
+ attributes of this DataIssues object.
+
+ This includes:
+ - self.issues
+ - self.issue_summary
+ - self.info
+
+ Parameters
+ ----------
+ issue_manager :
+ IssueManager object to collect results from.
+ """
+ overlapping_columns = list(set(self.issues.columns) & set(issue_manager.issues.columns))
+ if overlapping_columns:
+ warnings.warn(
+ f"Overwriting columns {overlapping_columns} in self.issues with "
+ f"columns from issue manager {issue_manager}."
+ )
+ self.issues.drop(columns=overlapping_columns, inplace=True)
+ self.issues = self.issues.join(issue_manager.issues, how="outer")
+
+ if issue_manager.issue_name in self.issue_summary["issue_type"].values:
+ warnings.warn(
+ f"Overwriting row in self.issue_summary with "
+ f"row from issue manager {issue_manager}."
+ )
+ self.issue_summary = self.issue_summary[
+ self.issue_summary["issue_type"] != issue_manager.issue_name
+ ]
+ issue_column_name: str = f"is_{issue_manager.issue_name}_issue"
+ num_issues: int = int(issue_manager.issues[issue_column_name].sum())
+ self.issue_summary = pd.concat(
+ [
+ self.issue_summary,
+ issue_manager.summary.assign(num_issues=num_issues),
+ ],
+ axis=0,
+ ignore_index=True,
+ )
+
+ if issue_manager.issue_name in self.info:
+ warnings.warn(
+ f"Overwriting key {issue_manager.issue_name} in self.info with "
+ f"key from issue manager {issue_manager}."
+ )
+ self.info[issue_manager.issue_name] = issue_manager.info
+
+ def set_health_score(self) -> None:
+ """Set the health score for the dataset based on the issue summary.
+
+ Currently, the health score is the mean of the scores for each issue type.
+ """
+ self.info["statistics"]["health_score"] = self.issue_summary["score"].mean()
+
+
+def get_data_statistics(data: Data) -> Dict[str, Any]:
+ """Get statistics about a dataset.
+
+ This function is called to initialize the "statistics" info in all `Datalab` objects.
+
+ Parameters
+ ----------
+ data : Data
+ Data object containing the dataset.
+ """
+ statistics: Dict[str, Any] = {
+ "num_examples": len(data),
+ "multi_label": False,
+ "health_score": None,
+ }
+ if data.labels.is_available:
+ class_names = data.class_names
+ statistics["class_names"] = class_names
+ statistics["num_classes"] = len(class_names)
+ return statistics
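A hedged sketch of how the collected per-example label results surface through the public API (assumes ``lab`` is a ``Datalab`` on which ``find_issues`` was already run with ``pred_probs``; column names follow the ``is_<issue_name>_issue`` / score pattern used above):

.. code-block:: python

    label_issues = lab.get_issues("label")   # slice of DataIssues.issues plus extra info columns
    print(label_issues.columns.tolist())
    # expected to look like: ['is_label_issue', 'label_score', 'given_label', 'predicted_label']

    label_summary = lab.get_issue_summary("label")  # one-row summary: issue_type, score, num_issues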
diff --git a/cleanlab/datalab/datalab.py b/cleanlab/datalab/datalab.py
new file mode 100644
index 0000000000..d48536f9b4
--- /dev/null
+++ b/cleanlab/datalab/datalab.py
@@ -0,0 +1,531 @@
+# Copyright (C) 2017-2023 Cleanlab Inc.
+# This file is part of cleanlab.
+#
+# cleanlab is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published
+# by the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# cleanlab is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with cleanlab. If not, see <https://www.gnu.org/licenses/>.
+"""
+Datalab offers a unified audit to detect all kinds of issues in data and labels.
+
+.. note::
+ .. include:: optional_dependencies.rst
+"""
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
+
+import numpy as np
+import pandas as pd
+
+import cleanlab
+from cleanlab.datalab.data import Data
+from cleanlab.datalab.data_issues import DataIssues
+from cleanlab.datalab.display import _Displayer
+from cleanlab.datalab.issue_finder import IssueFinder
+from cleanlab.datalab.serialize import _Serializer
+from cleanlab.datalab.report import Reporter
+
+if TYPE_CHECKING: # pragma: no cover
+ import numpy.typing as npt
+ from datasets.arrow_dataset import Dataset
+ from scipy.sparse import csr_matrix
+
+ DatasetLike = Union[Dataset, pd.DataFrame, Dict[str, Any], List[Dict[str, Any]], str]
+
+__all__ = ["Datalab"]
+
+
+class Datalab:
+ """
+ A single object to automatically detect all kinds of issues in datasets.
+ This is how we recommend you interface with the cleanlab library if you want to audit the quality of your data and detect issues within it.
+ If you have other specific goals (or are doing a less standard ML task not supported by Datalab), then consider using the other methods across the library.
+ Datalab tracks intermediate state (e.g. data statistics) from certain cleanlab functions that can be re-used across other cleanlab functions for better efficiency.
+
+ Parameters
+ ----------
+ data : Union[Dataset, pd.DataFrame, dict, list, str]
+ Dataset-like object that can be converted to a Hugging Face Dataset object.
+
+ It should contain the labels for all examples, identified by a
+ `label_name` column in the Dataset object.
+
+ Supported formats:
+ - datasets.Dataset
+ - pandas.DataFrame
+ - dict (keys are strings, values are arrays/lists of length ``N``)
+ - list (list of dictionaries that each have the same keys)
+ - str
+
+ - path to a local file: Text (.txt), CSV (.csv), JSON (.json)
+ - or a dataset identifier on the Hugging Face Hub
+
+ label_name : str
+ The name of the label column in the dataset.
+
+ verbosity : int, optional
+ The higher the verbosity level, the more information
+ Datalab prints when auditing a dataset.
+ Valid values are 0 through 4. Default is 1.
+
+ Examples
+ --------
+ >>> import datasets
+ >>> from cleanlab import Datalab
+ >>> data = datasets.load_dataset("glue", "sst2", split="train")
+ >>> datalab = Datalab(data, label_name="label")
+ """
+
+ def __init__(
+ self,
+ data: "DatasetLike",
+ label_name: Optional[str] = None,
+ verbosity: int = 1,
+ ) -> None:
+ self._data = Data(data, label_name)
+ self.data = self._data._data
+ self._labels = self._data.labels
+ self._label_map = self._labels.label_map
+ self.label_name = self._labels.label_name
+ self._data_hash = self._data._data_hash
+ self.data_issues = DataIssues(self._data)
+ self.cleanlab_version = cleanlab.version.__version__
+ self.verbosity = verbosity
+
+ def __repr__(self) -> str:
+ return _Displayer(data_issues=self.data_issues).__repr__()
+
+ def __str__(self) -> str:
+ return _Displayer(data_issues=self.data_issues).__str__()
+
+ @property
+ def labels(self) -> np.ndarray:
+ """Labels of the dataset, in a [0, 1, ..., K-1] format."""
+ return self._labels.labels
+
+ @property
+ def has_labels(self) -> bool:
+ """Whether the dataset has labels."""
+ return self._labels.is_available
+
+ @property
+ def class_names(self) -> List[str]:
+ """Names of the classes in the dataset.
+
+ If the dataset has no labels, returns an empty list.
+ """
+ return self._labels.class_names
+
+ def find_issues(
+ self,
+ *,
+ pred_probs: Optional[np.ndarray] = None,
+ features: Optional[npt.NDArray] = None,
+ knn_graph: Optional[csr_matrix] = None,
+ issue_types: Optional[Dict[str, Any]] = None,
+ ) -> None:
+ """
+ Checks the dataset for all sorts of common issues in real-world data (in both labels and feature values).
+
+ You can use Datalab to find issues in your data, utilizing *any* model you have already trained.
+ This method only interacts with your model via its predictions or embeddings (and other functions thereof).
+ The more of these inputs you provide, the more types of issues Datalab can detect in your dataset/labels.
+ If you provide a subset of these inputs, Datalab will output what insights it can based on the limited information from your model.
+
+ Note
+ ----
+ This method acts as a wrapper around the :py:meth:`IssueFinder.find_issues ` method,
+ where the core logic for issue detection is implemented.
+
+ Note
+ ----
+ The issues are saved in the ``self.issues`` attribute, but are not returned.
+
+ Parameters
+ ----------
+ pred_probs :
+ Out-of-sample predicted class probabilities made by the model for every example in the dataset.
+ To best detect label issues, provide this input obtained from the most accurate model you can produce.
+
+ If provided, this must be a 2D array with shape (num_examples, K) where K is the number of classes in the dataset.
+
+ features : Optional[np.ndarray]
+ Feature embeddings (vector representations) of every example in the dataset.
+
+ If provided, this must be a 2D array with shape (num_examples, num_features).
+
+ knn_graph :
+ Sparse matrix representing distances between examples in the dataset in a k nearest neighbor graph.
+
+ If provided, this must be a square CSR matrix with shape (num_examples, num_examples) and (k*num_examples) non-zero entries (k is the number of nearest neighbors considered for each example)
+ evenly distributed across the rows.
+ The non-zero entries must be the distances between the corresponding examples. Self-distances must be omitted
+ (i.e. the diagonal must be all zeros and the k nearest neighbors of each example must not include itself).
+
+ For any duplicated examples i,j whose distance is 0, there should be an *explicit* zero stored in the matrix, i.e. ``knn_graph[i,j] = 0``.
+
+ If both `knn_graph` and `features` are provided, the `knn_graph` will take precedence.
+ If `knn_graph` is not provided, it is constructed based on the provided `features`.
+ If neither `knn_graph` nor `features` are provided, certain issue types like (near) duplicates will not be considered.
+
+ issue_types :
+ Collection specifying which types of issues to consider in audit and any non-default parameter settings to use.
+ If unspecified, a default set of issue types and recommended parameter settings is considered.
+
+ This is a dictionary of dictionaries, where the keys are the issue types of interest
+ and the values are dictionaries of parameter values that control how each type of issue is detected (only for advanced users).
+ More specifically, the values are constructor keyword arguments passed to the corresponding ``IssueManager``,
+ which is responsible for detecting the particular issue type.
+
+ .. seealso::
+ :py:class:`IssueManager `
+
+ Examples
+ --------
+
+ Here are some ways to provide inputs to :py:meth:`find_issues`:
+
+ - Passing ``pred_probs``:
+ .. code-block:: python
+
+ >>> from sklearn.linear_model import LogisticRegression
+ >>> import numpy as np
+ >>> from cleanlab import Datalab
+ >>> X = np.array([[0, 1], [1, 1], [2, 2], [2, 0]])
+ >>> y = np.array([0, 1, 1, 0])
+ >>> clf = LogisticRegression(random_state=0).fit(X, y)
+ >>> pred_probs = clf.predict_proba(X)
+ >>> lab = Datalab(data={"X": X, "y": y}, label_name="y")
+ >>> lab.find_issues(pred_probs=pred_probs)
+
+
+ - Passing ``features``:
+ .. code-block:: python
+
+ >>> from sklearn.linear_model import LogisticRegression
+ >>> from sklearn.neighbors import NearestNeighbors
+ >>> import numpy as np
+ >>> from cleanlab import Datalab
+ >>> X = np.array([[0, 1], [1, 1], [2, 2], [2, 0]])
+ >>> y = np.array([0, 1, 1, 0])
+ >>> lab = Datalab(data={"X": X, "y": y}, label_name="y")
+ >>> lab.find_issues(features=X)
+
+ .. note::
+
+ You can pass both ``pred_probs`` and ``features`` to :py:meth:`find_issues` for a more comprehensive audit.
+
+ - Passing a ``knn_graph``:
+ .. code-block:: python
+
+ >>> from sklearn.neighbors import NearestNeighbors
+ >>> import numpy as np
+ >>> from cleanlab import Datalab
+ >>> X = np.array([[0, 1], [1, 1], [2, 2], [2, 0]])
+ >>> y = np.array([0, 1, 1, 0])
+ >>> nbrs = NearestNeighbors(n_neighbors=2, metric="euclidean").fit(X)
+ >>> knn_graph = nbrs.kneighbors_graph(mode="distance")
+ >>> knn_graph # Pass this to Datalab
+ <4x4 sparse matrix of type '<class 'numpy.float64'>'
+ with 8 stored elements in Compressed Sparse Row format>
+ >>> knn_graph.toarray() # DO NOT PASS knn_graph.toarray() to Datalab, only pass the sparse matrix itself
+ array([[0. , 1. , 2.23606798, 0. ],
+ [1. , 0. , 1.41421356, 0. ],
+ [0. , 1.41421356, 0. , 2. ],
+ [0. , 1.41421356, 2. , 0. ]])
+ >>> lab = Datalab(data={"X": X, "y": y}, label_name="y")
+ >>> lab.find_issues(knn_graph=knn_graph)
+
+ - Configuring issue types:
+ Suppose you want to only consider label issues. Just pass a dictionary with the key "label" and an empty dictionary as the value (to use default label issue parameters).
+
+ .. code-block:: python
+
+ >>> issue_types = {"label": {}}
+ >>> # lab.find_issues(pred_probs=pred_probs, issue_types=issue_types)
+
+ If you are an advanced user who wants greater control, you can pass keyword arguments to the issue manager that handles the label issues.
+ For example, if you want to pass the keyword argument "clean_learning_kwargs"
+ to the constructor of the :py:class:`LabelIssueManager `, you would pass:
+
+
+ .. code-block:: python
+
+ >>> issue_types = {
+ ... "label": {
+ ... "clean_learning_kwargs": {
+ ... "prune_method": "prune_by_noise_rate",
+ ... },
+ ... },
+ ... }
+ >>> # lab.find_issues(pred_probs=pred_probs, issue_types=issue_types)
+
+ """
+ issue_finder = IssueFinder(datalab=self, verbosity=self.verbosity)
+ issue_finder.find_issues(
+ pred_probs=pred_probs,
+ features=features,
+ knn_graph=knn_graph,
+ issue_types=issue_types,
+ )
+
+ def report(
+ self,
+ *,
+ num_examples: int = 5,
+ verbosity: Optional[int] = None,
+ include_description: bool = True,
+ show_summary_score: bool = False,
+ ) -> None:
+ """Prints informative summary of all issues.
+
+ Parameters
+ ----------
+ num_examples :
+ Number of examples to show for each type of issue.
+ The report shows the top `num_examples` instances in the dataset that suffer the most from each type of issue.
+
+ verbosity :
+ Higher verbosity levels add more information to the report.
+
+ include_description :
+ Whether or not to include a description of each issue type in the report.
+ Consider setting this to ``False`` once you're familiar with how each issue type is defined.
+
+ show_summary_score :
+ Whether or not to include the overall severity score of each issue type in the report.
+
+ See Also
+ --------
+ For advanced usage, see documentation for the
+ :py:class:`Reporter ` class.
+ """
+ if verbosity is None:
+ verbosity = self.verbosity
+ reporter = Reporter(
+ data_issues=self.data_issues,
+ verbosity=verbosity,
+ include_description=include_description,
+ show_summary_score=show_summary_score,
+ )
+ reporter.report(num_examples=num_examples)
+
+ @property
+ def issues(self) -> pd.DataFrame:
+ """Issues found in each example from the dataset."""
+ return self.data_issues.issues
+
+ @issues.setter
+ def issues(self, issues: pd.DataFrame) -> None:
+ self.data_issues.issues = issues
+
+ @property
+ def issue_summary(self) -> pd.DataFrame:
+ """Summary of issues found in the dataset and the overall severity of each type of issue.
+
+ This is a wrapper around the ``DataIssues.issue_summary`` attribute.
+
+ Examples
+ -------
+
+ If checks for "label" and "outlier" issues were run,
+ then the issue summary will look something like this:
+
+ >>> datalab.issue_summary
+ issue_type score
+ outlier 0.123
+ label 0.456
+ """
+ return self.data_issues.issue_summary
+
+ @issue_summary.setter
+ def issue_summary(self, issue_summary: pd.DataFrame) -> None:
+ self.data_issues.issue_summary = issue_summary
+
+ @property
+ def info(self) -> Dict[str, Dict[str, Any]]:
+ """Information and statistics about the dataset issues found.
+
+ This is a wrapper around the ``DataIssues.info`` attribute.
+
+ Examples
+ -------
+
+ If checks for "label" and "outlier" issues were run,
+ then the info will look something like this:
+
+ >>> datalab.info
+ {
+ "label": {
+ "given_labels": [0, 1, 0, 1, 1, 1, 1, 1, 0, 1, ...],
+ "predicted_label": [0, 0, 0, 1, 0, 1, 0, 1, 0, 1, ...],
+ ...,
+ },
+ "outlier": {
+ "nearest_neighbor": [3, 7, 1, 2, 8, 4, 5, 9, 6, 0, ...],
+ "distance_to_nearest_neighbor": [0.123, 0.789, 0.456, ...],
+ ...,
+ },
+ }
+ """
+ return self.data_issues.info
+
+ @info.setter
+ def info(self, info: Dict[str, Dict[str, Any]]) -> None:
+ self.data_issues.info = info
+
+ def get_issues(self, issue_name: Optional[str] = None) -> pd.DataFrame:
+ """
+ Use this after finding issues to see which examples suffer from which types of issues.
+
+ NOTE
+ ----
+ This is a wrapper around the :py:meth:`DataIssues.get_issues ` method.
+
+ Parameters
+ ----------
+ issue_name : str or None
+ The type of issue to focus on. If `None`, returns full DataFrame summarizing all of the types of issues detected in each example from the dataset.
+
+ Raises
+ ------
+ ValueError
+ If `issue_name` is not a type of issue previously considered in the audit.
+
+ Returns
+ -------
+ specific_issues :
+ A DataFrame where each row corresponds to an example from the dataset and columns specify:
+ whether this example exhibits a particular type of issue, and how severely (via a numeric quality score where lower values indicate more severe instances of the issue).
+ The quality scores lie between 0-1 and are directly comparable between examples (for the same issue type), but not across different issue types.
+
+ Additional columns may be present in the DataFrame depending on the type of issue specified.
+ """
+ return self.data_issues.get_issues(issue_name=issue_name)
+
+ def get_issue_summary(self, issue_name: Optional[str] = None) -> pd.DataFrame:
+ """Summarize the issues found in dataset of a particular type,
+ including how severe this type of issue is overall across the dataset.
+
+ NOTE
+ ----
+ This is a wrapper around the
+ :py:meth:`DataIssues.get_issue_summary ` method.
+
+ Parameters
+ ----------
+ issue_name :
+ Name of the issue type to summarize. If `None`, summarizes each of the different issue types previously considered in the audit.
+
+ Returns
+ -------
+ issue_summary :
+ DataFrame where each row corresponds to a type of issue, and columns quantify:
+ the number of examples in the dataset estimated to exhibit this type of issue,
+ and the overall severity of the issue across the dataset (via a numeric quality score where lower values indicate that the issue is overall more severe).
+ The quality scores lie between 0-1 and are directly comparable between multiple datasets (for the same issue type), but not across different issue types.
+ """
+ return self.data_issues.get_issue_summary(issue_name=issue_name)
+
+ def get_info(self, issue_name: Optional[str] = None) -> Dict[str, Any]:
+ """Get the info for the issue_name key.
+
+ This function is used to get the info for a specific issue_name. If the info is not computed yet, it will raise an error.
+
+ NOTE
+ ----
+ This is a wrapper around the
+ :py:meth:`DataIssues.get_info ` method.
+
+ Parameters
+ ----------
+ issue_name :
+ The issue name for which the info is required.
+
+ Returns
+ -------
+ :py:meth:`info ` :
+ The info for the issue_name.
+ """
+ return self.data_issues.get_info(issue_name)
+
+ @staticmethod
+ def list_possible_issue_types() -> List[str]:
+ """Returns a list of all registered issue types.
+
+ Any issue type that is not in this list cannot be used in the :py:meth:`find_issues` method.
+
+ Note
+ ----
+ This method is a wrapper around :py:meth:`IssueFinder.list_possible_issue_types `.
+
+ See Also
+ --------
+ :py:class:`REGISTRY ` : All available issue types and their corresponding issue managers can be found here.
+ """
+ return IssueFinder.list_possible_issue_types()
+
+ @staticmethod
+ def list_default_issue_types() -> List[str]:
+ """Returns a list of the issue types that are run by default
+ when :py:meth:`find_issues` is called without specifying `issue_types`.
+
+ Note
+ ----
+ This method is a wrapper around :py:meth:`IssueFinder.list_default_issue_types `.
+
+ See Also
+ --------
+ :py:class:`REGISTRY ` : All available issue types and their corresponding issue managers can be found here.
+ """
+ return IssueFinder.list_default_issue_types()
+
+ def save(self, path: str, force: bool = False) -> None:
+ """Saves this Datalab object to file (all files are in folder at `path/`).
+ We do not guarantee saved Datalab can be loaded from future versions of cleanlab.
+
+ Parameters
+ ----------
+ path :
+ Folder in which all information about this Datalab should be saved.
+
+ force :
+ If ``True``, overwrites any existing files in the folder at `path`. Use this with caution!
+
+ Note
+ ----
+ You have to save the Dataset yourself separately if you want it saved to file.
+ """
+ _Serializer.serialize(path=path, datalab=self, force=force)
+ save_message = f"Saved Datalab to folder: {path}"
+ print(save_message)
+
+ @staticmethod
+ def load(path: str, data: Optional[Dataset] = None) -> "Datalab":
+ """Loads Datalab object from a previously saved folder.
+
+ Parameters
+ ----------
+ `path` :
+ Path to the folder previously specified in ``Datalab.save()``.
+
+ `data` :
+ The dataset used to originally construct the Datalab.
+ Remember the dataset is not saved as part of the Datalab,
+ you must save/load the data separately.
+
+ Returns
+ -------
+ `datalab` :
+ A Datalab object that is identical to the one originally saved.
+ """
+ datalab = _Serializer.deserialize(path=path, data=data)
+ load_message = f"Datalab loaded from folder: {path}"
+ print(load_message)
+ return datalab
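A brief sketch of the save/load round trip described above (assumes ``lab`` was built from a dataset ``data`` and already audited; the folder name is illustrative):

.. code-block:: python

    lab.save("datalab_audit", force=True)                 # writes the Datalab state to ./datalab_audit/
    restored = Datalab.load("datalab_audit", data=data)   # the dataset itself is passed back in
    print(restored.issue_summary)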
diff --git a/cleanlab/datalab/display.py b/cleanlab/datalab/display.py
new file mode 100644
index 0000000000..520b261516
--- /dev/null
+++ b/cleanlab/datalab/display.py
@@ -0,0 +1,61 @@
+# Copyright (C) 2017-2023 Cleanlab Inc.
+# This file is part of cleanlab.
+#
+# cleanlab is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published
+# by the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# cleanlab is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with cleanlab. If not, see <https://www.gnu.org/licenses/>.
+"""
+Module that handles the string representation of Datalab objects.
+"""
+
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING: # pragma: no cover
+ from cleanlab.datalab.data_issues import DataIssues
+
+
+class _Displayer:
+ def __init__(self, data_issues: "DataIssues") -> None:
+ self.data_issues = data_issues
+
+ def __repr__(self) -> str:
+ """What is displayed in console if user executes: >>> datalab"""
+ checks_run = not self.data_issues.issues.empty
+ display_str = f"checks_run={checks_run}"
+ num_examples = self.data_issues.get_info("statistics")["num_examples"]
+ if num_examples is not None:
+ display_str += f", num_examples={num_examples}"
+ num_classes = self.data_issues.get_info("statistics")["num_classes"]
+ if num_classes is not None:
+ display_str += f", num_classes={num_classes}"
+ if checks_run:
+ issues_identified = self.data_issues.issue_summary["num_issues"].sum()
+ display_str += f", issues_identified={issues_identified}"
+ return f"Datalab({display_str})"
+
+ def __str__(self) -> str:
+ """What is displayed if user executes: print(datalab)"""
+ checks_run = not self.data_issues.issues.empty
+ num_examples = self.data_issues.get_info("statistics").get("num_examples")
+ num_classes = self.data_issues.get_info("statistics").get("num_classes")
+
+ issues_identified = (
+ self.data_issues.issue_summary["num_issues"].sum() if checks_run else "Not checked"
+ )
+ info_list = [
+ f"Checks run: {'Yes' if checks_run else 'No'}",
+ f"Number of examples: {num_examples if num_examples is not None else 'Unknown'}",
+ f"Number of classes: {num_classes if num_classes is not None else 'Unknown'}",
+ f"Issues identified: {issues_identified}",
+ ]
+
+ return "Datalab:\n" + "\n".join(info_list)
diff --git a/cleanlab/datalab/examples/__init__.py b/cleanlab/datalab/examples/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/cleanlab/datalab/factory.py b/cleanlab/datalab/factory.py
new file mode 100644
index 0000000000..0cbe8f5262
--- /dev/null
+++ b/cleanlab/datalab/factory.py
@@ -0,0 +1,155 @@
+# Copyright (C) 2017-2023 Cleanlab Inc.
+# This file is part of cleanlab.
+#
+# cleanlab is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published
+# by the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# cleanlab is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with cleanlab. If not, see <https://www.gnu.org/licenses/>.
+"""The factory module provides a factory class for constructing concrete issue managers
+and a decorator for registering new issue managers.
+
+This module provides the :py:meth:`register` decorator for users to register new subclasses of
+:py:class:`IssueManager `
+in the registry. Each IssueManager detects some particular type of issue in a dataset.
+
+
+Note
+----
+
+The :class:`REGISTRY` variable is used by the factory class to keep track
+of registered issue managers.
+The factory class is used as an implementation detail by
+:py:class:`Datalab `,
+which provides a simplified API for constructing concrete issue managers.
+:py:class:`Datalab ` is intended to be used by users
+and provides detailed documentation on how to use the API.
+
+Warning
+-------
+Neither the :class:`REGISTRY` variable nor the factory class should be used directly by users.
+"""
+from __future__ import annotations
+
+from typing import Dict, List, Type
+
+from cleanlab.datalab.issue_manager import (
+ IssueManager,
+ LabelIssueManager,
+ NearDuplicateIssueManager,
+ OutlierIssueManager,
+ NonIIDIssueManager,
+)
+
+
+REGISTRY: Dict[str, Type[IssueManager]] = {
+ "outlier": OutlierIssueManager,
+ "label": LabelIssueManager,
+ "near_duplicate": NearDuplicateIssueManager,
+ "non_iid": NonIIDIssueManager,
+}
+"""Registry of issue managers that can be constructed from a string
+and used in the Datalab class.
+
+:meta hide-value:
+
+Currently, the following issue managers are registered by default:
+
+- ``"outlier"``: :py:class:`OutlierIssueManager `
+- ``"label"``: :py:class:`LabelIssueManager `
+- ``"near_duplicate"``: :py:class:`NearDuplicateIssueManager `
+- ``"non_iid"``: :py:class:`NonIIDIssueManager `
+
+Warning
+-------
+This variable should not be used directly by users.
+"""
+
+
+# Construct concrete issue manager with a from_str method
+class _IssueManagerFactory:
+ """Factory class for constructing concrete issue managers."""
+
+ @classmethod
+ def from_str(cls, issue_type: str) -> Type[IssueManager]:
+ """Constructs a concrete issue manager class from a string."""
+ if isinstance(issue_type, list):
+ raise ValueError(
+ "issue_type must be a string, not a list. Try using from_list instead."
+ )
+ if issue_type not in REGISTRY:
+ raise ValueError(f"Invalid issue type: {issue_type}")
+ return REGISTRY[issue_type]
+
+ @classmethod
+ def from_list(cls, issue_types: List[str]) -> List[Type[IssueManager]]:
+ """Constructs a list of concrete issue manager classes from a list of strings."""
+ return [cls.from_str(issue_type) for issue_type in issue_types]
+
+
+def register(cls: Type[IssueManager]) -> Type[IssueManager]:
+ """Registers the issue manager factory.
+
+ Parameters
+ ----------
+ cls :
+ A subclass of
+ :py:class:`IssueManager `.
+
+ Returns
+ -------
+ cls :
+ The same class that was passed in.
+
+ Example
+ -------
+
+ When defining a new subclass of
+ :py:class:`IssueManager `,
+ you can register it like so:
+
+ .. code-block:: python
+
+ from cleanlab import IssueManager
+ from cleanlab.datalab.factory import register
+
+ @register
+ class MyIssueManager(IssueManager):
+ issue_name: str = "my_issue"
+ def find_issues(self, **kwargs):
+ # Some logic to find issues
+ pass
+
+ or in a function call:
+
+ .. code-block:: python
+
+ from cleanlab import IssueManager
+ from cleanlab.datalab.factory import register
+
+ class MyIssueManager(IssueManager):
+ issue_name: str = "my_issue"
+ def find_issues(self, **kwargs):
+ # Some logic to find issues
+ pass
+
+ register(MyIssueManager)
+ """
+ name: str = str(cls.issue_name)
+ if name in REGISTRY:
+ # Warn user that they are overwriting an existing issue manager
+ print(
+ f"Warning: Overwriting existing issue manager {name} with {cls}. "
+ "This may cause unexpected behavior."
+ )
+ if not issubclass(cls, IssueManager):
+ raise ValueError(f"Class {cls} must be a subclass of IssueManager")
+ REGISTRY[name] = cls
+ return cls
diff --git a/cleanlab/datalab/issue_finder.py b/cleanlab/datalab/issue_finder.py
new file mode 100644
index 0000000000..8a91e7ea4b
--- /dev/null
+++ b/cleanlab/datalab/issue_finder.py
@@ -0,0 +1,370 @@
+# Copyright (C) 2017-2023 Cleanlab Inc.
+# This file is part of cleanlab.
+#
+# cleanlab is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published
+# by the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# cleanlab is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with cleanlab. If not, see <https://www.gnu.org/licenses/>.
+"""
+Module for the :class:`IssueFinder` class, which is responsible for configuring,
+creating and running issue managers.
+
+It determines which types of issues to look for, instantiates the IssueManagers
+via a factory, runs the issue managers
+(:py:meth:`IssueManager.find_issues `),
+and collects the results to :py:class:`DataIssues `.
+
+.. note::
+
+ This module is not intended to be used directly. Instead, use the public-facing
+ :py:meth:`Datalab.find_issues ` method.
+"""
+from __future__ import annotations
+
+from typing import Any, List, Optional, Dict, TYPE_CHECKING
+import warnings
+
+import numpy as np
+from scipy.sparse import csr_matrix
+
+from cleanlab.datalab.factory import _IssueManagerFactory, REGISTRY
+
+if TYPE_CHECKING: # pragma: no cover
+ import numpy.typing as npt
+ from cleanlab.datalab.datalab import Datalab
+
+
+class IssueFinder:
+ """
+ The IssueFinder class is responsible for managing the process of identifying
+ issues in the dataset by handling the creation and execution of relevant
+ IssueManagers. It serves as a coordinator or helper class for the Datalab class
+ to encapsulate the specific behavior of the issue finding process.
+
+ At a high level, the IssueFinder is responsible for:
+
+ - Determining which types of issues to look for.
+ - Instantiating the appropriate IssueManagers using a factory.
+ - Running the IssueManagers' `find_issues` methods.
+ - Collecting the results into a DataIssues instance.
+
+ Parameters
+ ----------
+ datalab : Datalab
+ The Datalab instance associated with this IssueFinder.
+
+ verbosity : int
+ Controls the verbosity of the output during the issue finding process.
+
+ Note
+ ----
+ This class is not intended to be used directly. Instead, use the
+ `Datalab.find_issues` method which internally utilizes an IssueFinder instance.
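+
+ Example
+ -------
+ A minimal sketch of how this class is used internally (here ``lab`` is an
+ existing :py:class:`Datalab` instance and ``pred_probs`` are model outputs;
+ both are placeholders):
+
+ .. code-block:: python
+
+ from cleanlab.datalab.issue_finder import IssueFinder
+
+ issue_finder = IssueFinder(datalab=lab, verbosity=1)
+ issue_finder.find_issues(pred_probs=pred_probs)
+ # Results are stored in lab.data_issues rather than returned.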
+ """
+
+ def __init__(self, datalab: "Datalab", verbosity=1):
+ self.datalab = datalab
+ self.verbosity = verbosity
+
+ def find_issues(
+ self,
+ *,
+ pred_probs: Optional[np.ndarray] = None,
+ features: Optional[npt.NDArray] = None,
+ knn_graph: Optional[csr_matrix] = None,
+ issue_types: Optional[Dict[str, Any]] = None,
+ ) -> None:
+ """
+ Checks the dataset for all sorts of common issues in real-world data (in both labels and feature values).
+
+ You can use Datalab to find issues in your data, utilizing *any* model you have already trained.
+ This method only interacts with your model via its predictions or embeddings (and other functions thereof).
+ The more of these inputs you provide, the more types of issues Datalab can detect in your dataset/labels.
+ If you provide a subset of these inputs, Datalab will output what insights it can based on the limited information from your model.
+
+ Note
+ ----
+ This method is not intended to be used directly. Instead, use the
+ :py:meth:`Datalab.find_issues ` method.
+
+ Note
+ ----
+ The issues are saved in the ``self.datalab.data_issues.issues`` attribute, but are not returned.
+
+ Parameters
+ ----------
+ pred_probs :
+ Out-of-sample predicted class probabilities made by the model for every example in the dataset.
+ To best detect label issues, provide this input obtained from the most accurate model you can produce.
+
+ If provided, this must be a 2D array with shape (num_examples, K) where K is the number of classes in the dataset.
+
+ features : Optional[np.ndarray]
+ Feature embeddings (vector representations) of every example in the dataset.
+
+ If provided, this must be a 2D array with shape (num_examples, num_features).
+
+ knn_graph :
+ Sparse matrix representing distances between examples in the dataset in a k nearest neighbor graph.
+
+ If provided, this must be a square CSR matrix with shape (num_examples, num_examples) and (k*num_examples) non-zero entries (k is the number of nearest neighbors considered for each example)
+ evenly distributed across the rows.
+ The non-zero entries must be the distances between the corresponding examples. Self-distances must be omitted
+ (i.e. the diagonal must be all zeros and the k nearest neighbors of each example must not include itself).
+
+ For any duplicated examples i,j whose distance is 0, there should be an *explicit* zero stored in the matrix, i.e. ``knn_graph[i,j] = 0``.
+
+ If both `knn_graph` and `features` are provided, the `knn_graph` will take precedence.
+ If `knn_graph` is not provided, it is constructed based on the provided `features`.
+ If neither `knn_graph` nor `features` are provided, certain issue types like (near) duplicates will not be considered.
+
+ issue_types :
+ Collection specifying which types of issues to consider in audit and any non-default parameter settings to use.
+ If unspecified, a default set of issue types and recommended parameter settings is considered.
+
+ This is a dictionary of dictionaries, where the keys are the issue types of interest
+ and the values are dictionaries of parameter values that control how each type of issue is detected (only for advanced users).
+ More specifically, the values are constructor keyword arguments passed to the corresponding ``IssueManager``,
+ which is responsible for detecting the particular issue type.
+
+ .. seealso::
+ :py:class:`IssueManager `
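+
+ Example
+ -------
+ An illustrative call (issued through the public ``Datalab.find_issues`` API;
+ ``lab``, ``pred_probs`` and ``features`` are assumed to already exist):
+
+ .. code-block:: python
+
+ # Run only the label and near-duplicate checks, overriding one default parameter.
+ lab.find_issues(
+ pred_probs=pred_probs,
+ features=features,
+ issue_types={"label": {}, "near_duplicate": {"threshold": 0.2}},
+ )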
+ """
+
+ if issue_types is not None and not issue_types:
+ warnings.warn(
+ "No issue types were specified. " "No issues will be found in the dataset."
+ )
+ return None
+
+ issue_types_copy = self.get_available_issue_types(
+ pred_probs=pred_probs,
+ features=features,
+ knn_graph=knn_graph,
+ issue_types=issue_types,
+ )
+
+ new_issue_managers = [
+ factory(datalab=self.datalab, **issue_types_copy.get(factory.issue_name, {}))
+ for factory in _IssueManagerFactory.from_list(list(issue_types_copy.keys()))
+ ]
+
+ if not new_issue_managers:
+ no_args_passed = all(arg is None for arg in [pred_probs, features, knn_graph])
+ if no_args_passed:
+ warnings.warn("No arguments were passed to find_issues.")
+ warnings.warn("No issue check performed.")
+ return None
+
+ failed_managers = []
+ data_issues = self.datalab.data_issues
+ for issue_manager, arg_dict in zip(new_issue_managers, issue_types_copy.values()):
+ try:
+ if self.verbosity:
+ print(f"Finding {issue_manager.issue_name} issues ...")
+ issue_manager.find_issues(**arg_dict)
+ data_issues.collect_statistics_from_issue_manager(issue_manager)
+ data_issues.collect_results_from_issue_manager(issue_manager)
+ except Exception as e:
+ print(f"Error in {issue_manager.issue_name}: {e}")
+ failed_managers.append(issue_manager)
+
+ if self.verbosity:
+ print(
+ f"Audit complete. {data_issues.issue_summary['num_issues'].sum()} issues found in the dataset."
+ )
+ if failed_managers:
+ print(f"Failed to check for these issue types: {failed_managers}")
+
+ data_issues.set_health_score()
+
+ def _resolve_required_args(self, pred_probs, features, knn_graph):
+ """Resolves the required arguments for each issue type.
+
+ This is a helper function that filters out any issue manager
+ that does not have the required arguments.
+
+ This does not consider custom hyperparameters for each issue type.
+
+
+ Parameters
+ ----------
+ pred_probs :
+ Out-of-sample predicted probabilities made on the data.
+
+ features :
+ Feature embeddings (vector representations) of every example in the dataset.
+
+ knn_graph :
+ Sparse matrix of precomputed distances between examples in a k nearest neighbor graph.
+
+ Returns
+ -------
+ args_dict :
+ Dictionary of required arguments for each issue type, if available.
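+
+ Example
+ -------
+ For instance, if only `features` is provided (no `pred_probs` or `knn_graph`),
+ the returned dictionary looks like:
+
+ .. code-block:: python
+
+ {
+ "outlier": {"features": features},
+ "near_duplicate": {"features": features},
+ "non_iid": {"features": features},
+ }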
+ """
+ args_dict = {
+ "label": {"pred_probs": pred_probs},
+ "outlier": {"pred_probs": pred_probs, "features": features, "knn_graph": knn_graph},
+ "near_duplicate": {"features": features, "knn_graph": knn_graph},
+ "non_iid": {"features": features, "knn_graph": knn_graph},
+ }
+
+ args_dict = {
+ k: {k2: v2 for k2, v2 in v.items() if v2 is not None} for k, v in args_dict.items() if v
+ }
+
+ # Prefer `knn_graph` over `features` if both are provided.
+ for v in args_dict.values():
+ if "knn_graph" in v and "features" in v:
+ warnings.warn(
+ "Both `features` and `knn_graph` were provided. "
+ "Most issue managers will likely prefer using `knn_graph` "
+ "instead of `features` for efficiency."
+ )
+
+ args_dict = {k: v for k, v in args_dict.items() if v}
+
+ return args_dict
+
+ def _set_issue_types(
+ self,
+ issue_types: Optional[Dict[str, Any]],
+ required_defaults_dict: Dict[str, Any],
+ ) -> Dict[str, Any]:
+ """Set necessary configuration for each IssueManager in a dictionary.
+
+ While each IssueManager defines default values for its arguments,
+ the Datalab class needs to organize the calls to each IssueManager
+ with different arguments, some of which may be user-provided.
+
+ Parameters
+ ----------
+ issue_types :
+ Dictionary of issue types and argument configuration for their respective IssueManagers.
+ If None, then the `required_defaults_dict` is used.
+
+ required_defaults_dict :
+ Dictionary of default parameter configuration for each issue type.
+
+ Returns
+ -------
+ issue_types_copy :
+ Dictionary of issue types and their parameter configuration.
+ The input `issue_types` is copied and updated with the necessary default values.
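+
+ Example
+ -------
+ As a sketch, if the user passes ``issue_types={"label": {}}`` and the required
+ defaults are ``{"label": {"pred_probs": pred_probs}}``, the returned dictionary
+ is ``{"label": {"pred_probs": pred_probs}}``.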
+ """
+ if issue_types is not None:
+ issue_types_copy = issue_types.copy()
+ self._check_missing_args(required_defaults_dict, issue_types_copy)
+ else:
+ issue_types_copy = required_defaults_dict.copy()
+ # Check that all required arguments are provided.
+ self._validate_issue_types_dict(issue_types_copy, required_defaults_dict)
+
+ # Remove None values from argument list, rely on default values in IssueManager
+ for key, value in issue_types_copy.items():
+ issue_types_copy[key] = {k: v for k, v in value.items() if v is not None}
+ return issue_types_copy
+
+ @staticmethod
+ def _check_missing_args(required_defaults_dict, issue_types):
+ for key, issue_type_value in issue_types.items():
+ missing_args = set(required_defaults_dict.get(key, {})) - set(issue_type_value.keys())
+ # Impute missing arguments with default values.
+ missing_dict = {
+ missing_arg: required_defaults_dict[key][missing_arg]
+ for missing_arg in missing_args
+ }
+ issue_types[key].update(missing_dict)
+
+ @staticmethod
+ def _validate_issue_types_dict(
+ issue_types: Dict[str, Any], required_defaults_dict: Dict[str, Any]
+ ) -> None:
+ missing_required_args_dict = {}
+ for issue_name, required_args in required_defaults_dict.items():
+ if issue_name in issue_types:
+ missing_args = set(required_args.keys()) - set(issue_types[issue_name].keys())
+ if missing_args:
+ missing_required_args_dict[issue_name] = missing_args
+ if any(missing_required_args_dict.values()):
+ error_message = ""
+ for issue_name, missing_required_args in missing_required_args_dict.items():
+ error_message += f"Required argument {missing_required_args} for issue type {issue_name} was not provided.\n"
+ raise ValueError(error_message)
+
+ @staticmethod
+ def list_possible_issue_types() -> List[str]:
+ """Returns a list of all registered issue types.
+
+ Any issue type that is not in this list cannot be used in the :py:meth:`find_issues` method.
+
+ See Also
+ --------
+ :py:class:`REGISTRY ` : All available issue types and their corresponding issue managers can be found here.
+ """
+ return list(REGISTRY.keys())
+
+ @staticmethod
+ def list_default_issue_types() -> List[str]:
+ """Returns a list of the issue types that are run by default
+ when :py:meth:`find_issues` is called without specifying `issue_types`.
+
+ See Also
+ --------
+ :py:class:`REGISTRY ` : All available issue types and their corresponding issue managers can be found here.
+ """
+ return ["label", "outlier", "near_duplicate", "non_iid"]
+
+ def get_available_issue_types(self, **kwargs):
+ """Returns a dictionary of issue types that can be used in :py:meth:`Datalab.find_issues
+ ` method."""
+
+ pred_probs = kwargs.get("pred_probs", None)
+ features = kwargs.get("features", None)
+ knn_graph = kwargs.get("knn_graph", None)
+ issue_types = kwargs.get("issue_types", None)
+
+ # Determine which parameters are required for each issue type
+ required_args_per_issue_type = self._resolve_required_args(pred_probs, features, knn_graph)
+
+ issue_types_copy = self._set_issue_types(issue_types, required_args_per_issue_type)
+
+ if issue_types is None:
+ # Only run default issue types if no issue types are specified
+ issue_types_copy = {
+ issue: issue_types_copy[issue]
+ for issue in self.list_default_issue_types()
+ if issue in issue_types_copy
+ }
+
+ drop_label_check = "label" in issue_types_copy and not self.datalab.has_labels
+ if drop_label_check:
+ warnings.warn("No labels were provided. " "The 'label' issue type will not be run.")
+ issue_types_copy.pop("label")
+
+ outlier_check_needs_features = "outlier" in issue_types_copy and not self.datalab.has_labels
+ if outlier_check_needs_features:
+ no_features = features is None
+ no_knn_graph = knn_graph is None
+ pred_probs_given = issue_types_copy["outlier"].get("pred_probs", None) is not None
+
+ only_pred_probs_given = pred_probs_given and no_features and no_knn_graph
+ if only_pred_probs_given:
+ warnings.warn(
+ "No labels were provided. " "The 'outlier' issue type will not be run."
+ )
+ issue_types_copy.pop("outlier")
+
+ return issue_types_copy
diff --git a/cleanlab/datalab/issue_manager/__init__.py b/cleanlab/datalab/issue_manager/__init__.py
new file mode 100644
index 0000000000..0352a6a03a
--- /dev/null
+++ b/cleanlab/datalab/issue_manager/__init__.py
@@ -0,0 +1,5 @@
+from .issue_manager import IssueManager # isort:skip
+from .duplicate import NearDuplicateIssueManager
+from .label import LabelIssueManager
+from .outlier import OutlierIssueManager
+from .noniid import NonIIDIssueManager
diff --git a/cleanlab/datalab/issue_manager/duplicate.py b/cleanlab/datalab/issue_manager/duplicate.py
new file mode 100644
index 0000000000..6a8105d285
--- /dev/null
+++ b/cleanlab/datalab/issue_manager/duplicate.py
@@ -0,0 +1,222 @@
+# Copyright (C) 2017-2023 Cleanlab Inc.
+# This file is part of cleanlab.
+#
+# cleanlab is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published
+# by the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# cleanlab is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with cleanlab. If not, see <https://www.gnu.org/licenses/>.
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any, ClassVar, Dict, List, Optional, Union
+import warnings
+
+import numpy as np
+import pandas as pd
+from scipy.sparse import csr_matrix
+from sklearn.neighbors import NearestNeighbors
+from sklearn.utils.validation import check_is_fitted
+
+from cleanlab.datalab.issue_manager import IssueManager
+
+if TYPE_CHECKING: # pragma: no cover
+ import numpy.typing as npt
+ from cleanlab.datalab.datalab import Datalab
+
+
+class NearDuplicateIssueManager(IssueManager):
+ """Manages issues related to near-duplicate examples."""
+
+ description: ClassVar[
+ str
+ ] = """A (near) duplicate issue refers to two or more examples in
+ a dataset that are extremely similar to each other, relative
+ to the rest of the dataset. The examples flagged with this issue
+ may be exactly duplicated, or lie atypically close together when
+ represented as vectors (i.e. feature embeddings).
+ """
+ issue_name: ClassVar[str] = "near_duplicate"
+ verbosity_levels = {
+ 0: [],
+ 1: [],
+ 2: ["threshold"],
+ }
+
+ def __init__(
+ self,
+ datalab: Datalab,
+ metric: Optional[str] = None,
+ threshold: float = 0.13,
+ k: int = 10,
+ **_,
+ ):
+ super().__init__(datalab)
+ self.metric = metric
+ self.threshold = self._set_threshold(threshold)
+ self.k = k
+ self.near_duplicate_sets: List[List[int]] = []
+
+ def find_issues(
+ self,
+ features: Optional[npt.NDArray] = None,
+ **kwargs,
+ ) -> None:
+ knn_graph = self._process_knn_graph_from_inputs(kwargs)
+ old_knn_metric = self.datalab.get_info("statistics").get("knn_metric")
+ metric_changes = self.metric and self.metric != old_knn_metric
+
+ if knn_graph is None or metric_changes:
+ if features is None:
+ raise ValueError(
+ "If a knn_graph is not provided, features must be provided to fit a new knn."
+ )
+ if self.metric is None:
+ self.metric = "cosine" if features.shape[1] > 3 else "euclidean"
+ knn = NearestNeighbors(n_neighbors=self.k, metric=self.metric)
+
+ if self.metric and self.metric != knn.metric:
+ warnings.warn(
+ f"Metric {self.metric} does not match metric {knn.metric} used to fit knn. "
+ "Most likely an existing NearestNeighbors object was passed in, but a different "
+ "metric was specified."
+ )
+ self.metric = knn.metric
+
+ try:
+ check_is_fitted(knn)
+ except:
+ knn.fit(features)
+
+ knn_graph = knn.kneighbors_graph(mode="distance")
+ N = knn_graph.shape[0]
+ nn_distances = knn_graph.data.reshape(N, -1)[:, 0]
+ scores = np.tanh(nn_distances)
+ is_issue_column = nn_distances < self.threshold * np.median(nn_distances)
+
+ self.issues = pd.DataFrame(
+ {
+ f"is_{self.issue_name}_issue": is_issue_column,
+ self.issue_score_key: scores,
+ },
+ )
+
+ self.near_duplicate_sets = self._neighbors_within_radius(knn_graph, self.threshold)
+
+ self.summary = self.make_summary(score=scores.mean())
+ self.info = self.collect_info(knn_graph=knn_graph)
+
+ @staticmethod
+ def _neighbors_within_radius(knn_graph: csr_matrix, radius: float):
+ """Returns a list of lists of indices of near-duplicate examples.
+
+ Each list of indices represents a set of near-duplicate examples.
+
+ If the list is empty for a given example, then that example is not
+ a near-duplicate of any other example.
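+
+ Example
+ -------
+ A small illustrative sketch on a toy 3-example graph with one stored
+ neighbor per row:
+
+ .. code-block:: python
+
+ import numpy as np
+ from scipy.sparse import csr_matrix
+
+ distances = np.array([0.1, 0.1, 2.0]) # distance to each row's stored neighbor
+ indices = np.array([1, 0, 0]) # index of each row's stored neighbor
+ indptr = np.array([0, 1, 2, 3])
+ knn_graph = csr_matrix((distances, indices, indptr), shape=(3, 3))
+
+ sets = NearDuplicateIssueManager._neighbors_within_radius(knn_graph, radius=0.5)
+ # Examples 0 and 1 lie within radius 0.5 of each other, example 2 does not:
+ # sets[0] == [1], sets[1] == [0], sets[2] is empty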
+ """
+
+ N = knn_graph.shape[0]
+ distances = knn_graph.data.reshape(N, -1)
+ # Create a mask for the threshold
+ mask = distances < radius
+
+ # Update the indptr to reflect the new number of neighbors
+ indptr = np.zeros(knn_graph.indptr.shape, dtype=knn_graph.indptr.dtype)
+ indptr[1:] = np.cumsum(mask.sum(axis=1))
+
+ # Filter the knn_graph based on the threshold
+ indices = knn_graph.indices[mask.ravel()]
+ near_duplicate_sets = [indices[indptr[i] : indptr[i + 1]] for i in range(N)]
+
+ return near_duplicate_sets
+
+ def _process_knn_graph_from_inputs(self, kwargs: Dict[str, Any]) -> Union[csr_matrix, None]:
+ """Determine if a knn_graph is provided in the kwargs or if one is already stored in the associated Datalab instance."""
+ knn_graph_kwargs: Optional[csr_matrix] = kwargs.get("knn_graph", None)
+ knn_graph_stats = self.datalab.get_info("statistics").get("weighted_knn_graph", None)
+
+ knn_graph: Optional[csr_matrix] = None
+ if knn_graph_kwargs is not None:
+ knn_graph = knn_graph_kwargs
+ elif knn_graph_stats is not None:
+ knn_graph = knn_graph_stats
+
+ if isinstance(knn_graph, csr_matrix) and kwargs.get("k", 0) > (
+ knn_graph.nnz // knn_graph.shape[0]
+ ):
+ # If the provided knn graph is insufficient, then we need to recompute the knn graph
+ # with the provided features
+ knn_graph = None
+ return knn_graph
+
+ def collect_info(self, knn_graph: csr_matrix) -> dict:
+ issues_dict = {
+ "average_near_duplicate_score": self.issues[self.issue_score_key].mean(),
+ "near_duplicate_sets": self.near_duplicate_sets,
+ }
+
+ params_dict = {
+ "metric": self.metric,
+ "k": self.k,
+ "threshold": self.threshold,
+ }
+
+ N = knn_graph.shape[0]
+ dists = knn_graph.data.reshape(N, -1)[:, 0]
+ nn_ids = knn_graph.indices.reshape(N, -1)[:, 0]
+
+ knn_info_dict = {
+ "nearest_neighbor": nn_ids.tolist(),
+ "distance_to_nearest_neighbor": dists.tolist(),
+ }
+
+ statistics_dict = self._build_statistics_dictionary(knn_graph=knn_graph)
+
+ info_dict = {
+ **issues_dict,
+ **params_dict,
+ **knn_info_dict,
+ **statistics_dict,
+ }
+ return info_dict
+
+ def _build_statistics_dictionary(self, knn_graph: csr_matrix) -> Dict[str, Dict[str, Any]]:
+ statistics_dict: Dict[str, Dict[str, Any]] = {"statistics": {}}
+
+ # Add the knn graph as a statistic if necessary
+ graph_key = "weighted_knn_graph"
+ old_knn_graph = self.datalab.get_info("statistics").get(graph_key, None)
+ old_graph_exists = old_knn_graph is not None
+ prefer_new_graph = (
+ not old_graph_exists
+ or knn_graph.nnz > old_knn_graph.nnz
+ or self.metric != self.datalab.get_info("statistics").get("knn_metric", None)
+ )
+ if prefer_new_graph:
+ statistics_dict["statistics"][graph_key] = knn_graph
+ if self.metric is not None:
+ statistics_dict["statistics"]["knn_metric"] = self.metric
+
+ return statistics_dict
+
+ def _set_threshold(
+ self,
+ threshold: float,
+ ) -> float:
+ """Computes nearest-neighbors thresholding for near-duplicate detection."""
+ if threshold < 0:
+ warnings.warn(
+ f"Computed threshold {threshold} is less than 0. "
+ "Setting threshold to 0."
+ "This may indicate that either the only a few examples are in the dataset, "
+ "or the data is heavily skewed."
+ )
+ threshold = 0
+ return threshold
diff --git a/cleanlab/datalab/issue_manager/issue_manager.py b/cleanlab/datalab/issue_manager/issue_manager.py
new file mode 100644
index 0000000000..96795075ec
--- /dev/null
+++ b/cleanlab/datalab/issue_manager/issue_manager.py
@@ -0,0 +1,345 @@
+# Copyright (C) 2017-2023 Cleanlab Inc.
+# This file is part of cleanlab.
+#
+# cleanlab is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published
+# by the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# cleanlab is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with cleanlab. If not, see <https://www.gnu.org/licenses/>.
+from __future__ import annotations
+
+from abc import ABC, ABCMeta, abstractmethod
+from itertools import chain
+from typing import TYPE_CHECKING, Any, ClassVar, Dict, List, Optional, Set, Tuple, Type, TypeVar
+import json
+
+import numpy as np
+import pandas as pd
+
+if TYPE_CHECKING: # pragma: no cover
+ from cleanlab.datalab.datalab import Datalab
+
+
+T = TypeVar("T", bound="IssueManager")
+TM = TypeVar("TM", bound="IssueManagerMeta")
+
+
+class IssueManagerMeta(ABCMeta):
+ """Metaclass for IssueManager that adds issue_score_key to the class.
+
+ :meta private:
+ """
+
+ issue_name: ClassVar[str]
+ issue_score_key: ClassVar[str]
+ verbosity_levels: ClassVar[Dict[int, List[str]]] = {
+ 0: [],
+ 1: [],
+ 2: [],
+ 3: [],
+ }
+
+ def __new__(
+ meta: Type[TM],
+ name: str,
+ bases: Tuple[Type[Any], ...],
+ class_dict: Dict[str, Any],
+ ) -> TM: # Classes that inherit from ABC don't need to be modified
+ if ABC in bases:
+ return super().__new__(meta, name, bases, class_dict)
+
+ # Ensure that each verbosity level maps to a list of strings
+ verbosity_levels = class_dict.get("verbosity_levels", meta.verbosity_levels)
+ for level, level_list in verbosity_levels.items():
+ if not isinstance(level_list, list):
+ raise ValueError(
+ f"Verbosity levels must be lists. "
+ f"Got {level_list} in {name}.verbosity_levels"
+ )
+ prohibited_keys = [key for key in level_list if not isinstance(key, str)]
+ if prohibited_keys:
+ raise ValueError(
+ f"Verbosity levels must be lists of strings. "
+ f"Got {prohibited_keys} in {name}.verbosity_levels[{level}]"
+ )
+
+ # Concrete classes need to have an issue_name attribute
+ if "issue_name" not in class_dict:
+ raise TypeError("IssueManagers need an issue_name class variable")
+
+ # Add issue_score_key to class
+ class_dict["issue_score_key"] = f"{class_dict['issue_name']}_score"
+ return super().__new__(meta, name, bases, class_dict)
+
+
+class IssueManager(ABC, metaclass=IssueManagerMeta):
+ """Base class for managing data issues of a particular type in a Datalab.
+
+ For each example in a dataset, the IssueManager for a particular type of issue should compute:
+ - A numeric severity score between 0 and 1,
+ with values near 0 indicating severe instances of the issue.
+ - A boolean `is_issue` value, which is True
+ if we believe this example suffers from the issue in question.
+ `is_issue` may be determined by thresholding the severity score
+ (with an a priori determined reasonable threshold value),
+ or via some other means (e.g. Confident Learning for flagging label issues).
+
+ The IssueManager should also report:
+ - A global value between 0 and 1 summarizing how severe this issue is in the dataset overall
+ (e.g. the average severity across all examples in dataset
+ or count of examples where `is_issue=True`).
+ - Other interesting `info` about the issue and examples in the dataset,
+ and statistics estimated from current dataset that may be reused
+ to score this issue in future data.
+ For example, `info` for label issues could contain the:
+ confident_thresholds, confident_joint, predicted label for each example, etc.
+ Another example is for (near)-duplicate detection issue, where `info` could contain:
+ which set of examples in the dataset are all (nearly) identical.
+
+ Implementing a new IssueManager:
+ - Define the `issue_name` class attribute, e.g. "label", "duplicate", "outlier", etc.
+ - Implement the abstract methods `find_issues` and `collect_info`.
+ - `find_issues` is responsible for computing the `issues` and `summary` dataframes.
+ - `collect_info` is responsible for computing the `info` dict. It is called by `find_issues`,
+ once the manager has set the `issues` and `summary` dataframes as instance attributes.
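+
+ Example
+ -------
+ A minimal sketch of a custom IssueManager (the scoring logic is purely
+ illustrative and assumes `features` is passed to `find_issues`):
+
+ .. code-block:: python
+
+ import numpy as np
+ import pandas as pd
+ from cleanlab.datalab.issue_manager import IssueManager
+
+ class ZeroVarianceIssueManager(IssueManager):
+ issue_name = "zero_variance"
+
+ def find_issues(self, features, **kwargs) -> None:
+ # Flag examples whose feature vector has (near-)zero variance.
+ variances = features.var(axis=1)
+ scores = np.tanh(variances) # lower score = more severe
+ is_issue = variances < 1e-12
+ self.issues = pd.DataFrame(
+ {f"is_{self.issue_name}_issue": is_issue, self.issue_score_key: scores}
+ )
+ self.summary = self.make_summary(score=float(scores.mean()))
+ self.info = self.collect_info()
+
+ def collect_info(self) -> dict:
+ return {"num_zero_variance": int(self.issues[f"is_{self.issue_name}_issue"].sum())}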
+ """
+
+ description: ClassVar[str] = ""
+ """Short text that summarizes the type of issues handled by this IssueManager.
+
+ :meta hide-value:
+ """
+ issue_name: ClassVar[str]
+ """Returns a key that is used to store issue summary results about the assigned Lab."""
+ issue_score_key: ClassVar[str]
+ """Returns a key that is used to store issue score results about the assigned Lab."""
+ verbosity_levels: ClassVar[Dict[int, List[str]]] = {
+ 0: [],
+ 1: [],
+ 2: [],
+ 3: [],
+ }
+ """A dictionary of verbosity levels and their corresponding dictionaries of
+ report items to print.
+
+ :meta hide-value:
+
+ Example
+ -------
+
+ >>> verbosity_levels = {
+ ... 0: [],
+ ... 1: ["some_info_key"],
+ ... 2: ["additional_info_key"],
+ ... }
+ """
+
+ def __init__(self, datalab: Datalab, **_):
+ self.datalab = datalab
+ self.info: Dict[str, Any] = {}
+ self.issues: pd.DataFrame = pd.DataFrame()
+ self.summary: pd.DataFrame = pd.DataFrame()
+
+ def __repr__(self):
+ class_name = self.__class__.__name__
+ return class_name
+
+ @classmethod
+ def __init_subclass__(cls):
+ required_class_variables = [
+ "issue_name",
+ ]
+ for var in required_class_variables:
+ if not hasattr(cls, var):
+ raise NotImplementedError(f"Class {cls.__name__} must define class variable {var}")
+
+ @abstractmethod
+ def find_issues(self, *args, **kwargs) -> None:
+ """Finds occurrences of this particular issue in the dataset.
+
+ Computes the `issues` and `summary` dataframes. Calls `collect_info` to compute the `info` dict.
+ """
+ raise NotImplementedError
+
+ def collect_info(self, *args, **kwargs) -> dict:
+ """Collects data for the info attribute of the Datalab.
+
+ NOTE
+ ----
+ This method is called by :py:meth:`find_issues` once it has set the `issues` and `summary` dataframes
+ as instance attributes.
+ """
+ raise NotImplementedError
+
+ @classmethod
+ def make_summary(cls, score: float) -> pd.DataFrame:
+ """Construct a summary dataframe.
+
+ Parameters
+ ----------
+ score :
+ The overall score for this issue.
+
+ Returns
+ -------
+ summary :
+ A summary dataframe.
+ """
+ if not 0 <= score <= 1:
+ raise ValueError(f"Score must be between 0 and 1. Got {score}.")
+
+ return pd.DataFrame(
+ {
+ "issue_type": [cls.issue_name],
+ "score": [score],
+ },
+ )
+
+ @classmethod
+ def report(
+ cls,
+ issues: pd.DataFrame,
+ summary: pd.DataFrame,
+ info: Dict[str, Any],
+ num_examples: int = 5,
+ verbosity: int = 0,
+ include_description: bool = False,
+ info_to_omit: Optional[List[str]] = None,
+ ) -> str:
+ """Compose a report of the issues found by this IssueManager.
+
+ Parameters
+ ----------
+ issues :
+ An issues dataframe.
+
+ Example
+ -------
+ >>> import pandas as pd
+ >>> issues = pd.DataFrame(
+ ... {
+ ... "is_X_issue": [True, False, True],
+ ... "X_score": [0.2, 0.9, 0.4],
+ ... },
+ ... )
+
+ summary :
+ The summary dataframe.
+
+ Example
+ -------
+ >>> summary = pd.DataFrame(
+ ... {
+ ... "issue_type": ["X"],
+ ... "score": [0.5],
+ ... },
+ ... )
+
+ info :
+ The info dict.
+
+ Example
+ -------
+ >>> info = {
+ ... "A": "val_A",
+ ... "B": ["val_B1", "val_B2"],
+ ... }
+
+ num_examples :
+ The number of examples to print.
+
+ verbosity :
+ The verbosity level of the report.
+
+ include_description :
+ Whether to include a description of the issue in the report.
+
+ Returns
+ -------
+ report_str :
+ A string containing the report.
+ """
+
+ max_verbosity = max(cls.verbosity_levels.keys())
+ top_level = max_verbosity + 1
+ if verbosity not in list(cls.verbosity_levels.keys()) + [top_level]:
+ raise ValueError(
+ f"Verbosity level {verbosity} not supported. "
+ f"Supported levels: {cls.verbosity_levels.keys()}"
+ f"Use verbosity={top_level} to print all info."
+ )
+ if issues.empty:
+ print(f"No issues found")
+
+ topk_ids = issues.sort_values(by=cls.issue_score_key, ascending=True).index[:num_examples]
+
+ score = summary["score"].loc[0]
+ report_str = f"{' ' + cls.issue_name + ' issues ':-^60}\n\n"
+
+ if include_description and cls.description:
+ description = cls.description
+ if verbosity == 0:
+ description = description.split("\n\n", maxsplit=1)[0]
+ report_str += "About this issue:\n\t" + description + "\n\n"
+ report_str += (
+ f"Number of examples with this issue: {issues[f'is_{cls.issue_name}_issue'].sum()}\n"
+ f"Overall dataset quality in terms of this issue: {score:.4f}\n\n"
+ )
+
+ info_to_print: Set[str] = set()
+ _info_to_omit = set(issues.columns).union(info_to_omit or [])
+ verbosity_levels_values = chain.from_iterable(
+ list(cls.verbosity_levels.values())[: verbosity + 1]
+ )
+ info_to_print.update(set(verbosity_levels_values) - _info_to_omit)
+ if verbosity == top_level:
+ info_to_print.update(set(info.keys()) - _info_to_omit)
+
+ report_str += "Examples representing most severe instances of this issue:\n"
+ report_str += issues.loc[topk_ids].to_string()
+
+ def truncate(s, max_len=4) -> str:
+ if hasattr(s, "shape") or hasattr(s, "ndim"):
+ s = np.array(s)
+ if s.ndim > 1:
+ description = f"array of shape {s.shape}\n"
+ with np.printoptions(threshold=max_len):
+ description += f"{s}"
+ return description
+ s = s.tolist()
+
+ if isinstance(s, list):
+ if all([isinstance(s_, list) for s_ in s]):
+ return truncate(np.array(s, dtype=object), max_len=max_len)
+ if len(s) > max_len:
+ s = s[:max_len] + ["..."]
+ return str(s)
+
+ if info_to_print:
+ info_to_print_dict = {key: info[key] for key in info_to_print}
+ # Print the info dict, truncating arrays to 4 elements.
+ report_str += "\n\nAdditional Information: "
+ for key, value in info_to_print_dict.items():
+ if key == "statistics":
+ continue
+ if isinstance(value, dict):
+ report_str += f"\n{key}:\n{json.dumps(value, indent=4)}"
+ elif isinstance(value, pd.DataFrame):
+ max_rows = 5
+ df_str = value.head(max_rows).to_string()
+ if len(value) > max_rows:
+ df_str += f"\n... (total {len(value)} rows)"
+ report_str += f"\n{key}:\n{df_str}"
+ else:
+ report_str += f"\n{key}: {truncate(value)}"
+ return report_str
diff --git a/cleanlab/datalab/issue_manager/label.py b/cleanlab/datalab/issue_manager/label.py
new file mode 100644
index 0000000000..e566080386
--- /dev/null
+++ b/cleanlab/datalab/issue_manager/label.py
@@ -0,0 +1,226 @@
+# Copyright (C) 2017-2023 Cleanlab Inc.
+# This file is part of cleanlab.
+#
+# cleanlab is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published
+# by the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# cleanlab is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with cleanlab. If not, see <https://www.gnu.org/licenses/>.
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any, ClassVar, Dict, Optional
+
+import numpy as np
+
+from cleanlab.classification import CleanLearning
+from cleanlab.datalab.issue_manager import IssueManager
+from cleanlab.internal.validation import assert_valid_inputs
+
+if TYPE_CHECKING: # pragma: no cover
+ import pandas as pd
+
+ from cleanlab.datalab.datalab import Datalab
+
+
+class LabelIssueManager(IssueManager):
+ """Manages label issues in a Datalab.
+
+ Parameters
+ ----------
+ datalab :
+ A Datalab instance.
+
+ clean_learning_kwargs :
+ Keyword arguments to pass to the :py:meth:`CleanLearning ` constructor.
+
+ health_summary_parameters :
+ Keyword arguments to pass to the :py:meth:`health_summary ` function.
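+
+ Example
+ -------
+ An illustrative way to customize this manager through ``Datalab.find_issues``,
+ passing constructor keyword arguments (purely illustrative; ``lab`` and
+ ``pred_probs`` are assumed to already exist):
+
+ .. code-block:: python
+
+ lab.find_issues(
+ pred_probs=pred_probs,
+ issue_types={"label": {"clean_learning_kwargs": {"cv_n_folds": 3}}},
+ )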
+ """
+
+ description: ClassVar[
+ str
+ ] = """Examples whose given label is estimated to be potentially incorrect
+ (e.g. due to annotation error) are flagged as having label issues.
+ """
+
+ issue_name: ClassVar[str] = "label"
+ verbosity_levels = {
+ 0: [],
+ 1: [],
+ 2: [],
+ 3: ["classes_by_label_quality", "overlapping_classes"],
+ }
+
+ def __init__(
+ self,
+ datalab: Datalab,
+ clean_learning_kwargs: Optional[Dict[str, Any]] = None,
+ health_summary_parameters: Optional[Dict[str, Any]] = None,
+ **_,
+ ):
+ super().__init__(datalab)
+ self.cl = CleanLearning(**(clean_learning_kwargs or {}))
+ self.health_summary_parameters: Dict[str, Any] = (
+ health_summary_parameters.copy() if health_summary_parameters else {}
+ )
+ self._reset()
+
+ @staticmethod
+ def _process_find_label_issues_kwargs(kwargs: Dict[str, Any]) -> Dict[str, Any]:
+ """Searches for keyword arguments that are meant for the
+ CleanLearning.find_label_issues method call
+
+ Examples
+ --------
+ >>> from cleanlab.datalab.issue_manager.label import LabelIssueManager
+ >>> LabelIssueManager._process_find_label_issues_kwargs({"thresholds": [0.1, 0.9]})
+ {'thresholds': [0.1, 0.9]}
+ """
+ accepted_kwargs = [
+ "thresholds",
+ "noise_matrix",
+ "inverse_noise_matrix",
+ "save_space",
+ "clf_kwargs",
+ "validation_func",
+ ]
+ return {k: v for k, v in kwargs.items() if k in accepted_kwargs and v is not None}
+
+ def _reset(self) -> None:
+ """Reset the attributes of this manager based on the available datalab info
+ and the keyword arguments stored as instance attributes.
+
+ This allows the builder to use pre-computed info from the datalab to speed up
+ some computations in the :py:meth:`find_issues` method.
+ """
+ if not self.health_summary_parameters:
+ statistics_dict = self.datalab.get_info("statistics")
+ self.health_summary_parameters = {
+ "labels": self.datalab.labels,
+ "class_names": list(self.datalab._label_map.values()),
+ "num_examples": statistics_dict.get("num_examples"),
+ "joint": statistics_dict.get("joint", None),
+ "confident_joint": statistics_dict.get("confident_joint", None),
+ "multi_label": statistics_dict.get("multi_label", None),
+ "asymmetric": statistics_dict.get("asymmetric", None),
+ "verbose": False,
+ }
+ self.health_summary_parameters = {
+ k: v for k, v in self.health_summary_parameters.items() if v is not None
+ }
+
+ def find_issues(
+ self,
+ pred_probs: np.ndarray,
+ **kwargs,
+ ) -> None:
+ self.health_summary_parameters.update({"pred_probs": pred_probs})
+ # Find examples with label issues
+ self.issues = self.cl.find_label_issues(
+ labels=self.datalab.labels,
+ pred_probs=pred_probs,
+ **self._process_find_label_issues_kwargs(kwargs),
+ )
+ self.issues.rename(columns={"label_quality": self.issue_score_key}, inplace=True)
+
+ summary_dict = self.get_health_summary(pred_probs=pred_probs)
+
+ # Get a summarized dataframe of the label issues
+ self.summary = self.make_summary(score=summary_dict["overall_label_health_score"])
+
+ # Collect info about the label issues
+ self.info = self.collect_info(issues=self.issues, summary_dict=summary_dict)
+
+ # Drop columns from issues that are in the info
+ self.issues = self.issues.drop(columns=["given_label", "predicted_label"])
+
+ def get_health_summary(self, pred_probs) -> dict:
+ """Returns a short summary of the health of this Lab."""
+ from cleanlab.dataset import health_summary
+
+ # Validate input
+ self._validate_pred_probs(pred_probs)
+
+ summary_kwargs = self._get_summary_parameters(pred_probs)
+ summary = health_summary(**summary_kwargs)
+ return summary
+
+ def _get_summary_parameters(self, pred_probs) -> Dict["str", Any]:
+ """Collects a set of input parameters for the health summary function based on
+ any info available in the datalab.
+
+ Parameters
+ ----------
+ pred_probs :
+ The predicted probabilities for each example.
+
+ Returns
+ -------
+ summary_parameters :
+ A dictionary of parameters to pass to the health summary function.
+ """
+ if "confident_joint" in self.health_summary_parameters:
+ summary_parameters = {
+ "confident_joint": self.health_summary_parameters["confident_joint"]
+ }
+ elif all([x in self.health_summary_parameters for x in ["joint", "num_examples"]]):
+ summary_parameters = {
+ k: self.health_summary_parameters[k] for k in ["joint", "num_examples"]
+ }
+ else:
+ summary_parameters = {
+ "pred_probs": pred_probs,
+ "labels": self.datalab.labels,
+ }
+
+ summary_parameters["class_names"] = self.health_summary_parameters["class_names"]
+
+ for k in ["asymmetric", "verbose"]:
+ # Start with the health_summary_parameters, then override with kwargs
+ if k in self.health_summary_parameters:
+ summary_parameters[k] = self.health_summary_parameters[k]
+
+ return (
+ summary_parameters # will be called in `dataset.health_summary(**summary_parameters)`
+ )
+
+ def collect_info(self, issues: pd.DataFrame, summary_dict: dict) -> dict:
+ issues_info = {
+ "num_label_issues": sum(issues[f"is_{self.issue_name}_issue"]),
+ "average_label_quality": issues[self.issue_score_key].mean(),
+ "given_label": issues["given_label"].tolist(),
+ "predicted_label": issues["predicted_label"].tolist(),
+ }
+
+ health_summary_info = {
+ "confident_joint": summary_dict["joint"],
+ "classes_by_label_quality": summary_dict["classes_by_label_quality"],
+ "overlapping_classes": summary_dict["overlapping_classes"],
+ }
+
+ cl_info = {}
+ for k in self.cl.__dict__:
+ if k not in ["py", "noise_matrix", "inverse_noise_matrix", "confident_joint"]:
+ continue
+ cl_info[k] = self.cl.__dict__[k]
+
+ info_dict = {
+ **issues_info,
+ **health_summary_info,
+ **cl_info,
+ }
+
+ return info_dict
+
+ def _validate_pred_probs(self, pred_probs) -> None:
+ assert_valid_inputs(X=None, y=self.datalab.labels, pred_probs=pred_probs)
diff --git a/cleanlab/datalab/issue_manager/noniid.py b/cleanlab/datalab/issue_manager/noniid.py
new file mode 100644
index 0000000000..658ab2ac17
--- /dev/null
+++ b/cleanlab/datalab/issue_manager/noniid.py
@@ -0,0 +1,440 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any, ClassVar, Dict, Optional, Union, cast
+import warnings
+import itertools
+
+from scipy.stats import gaussian_kde
+import numpy as np
+import pandas as pd
+from scipy.sparse import csr_matrix
+from sklearn.neighbors import NearestNeighbors
+from sklearn.utils.validation import check_is_fitted
+
+from cleanlab.datalab.issue_manager import IssueManager
+
+if TYPE_CHECKING: # pragma: no cover
+ import numpy.typing as npt
+ from cleanlab.datalab.datalab import Datalab
+
+
+def simplified_kolmogorov_smirnov_test(
+ neighbor_histogram: npt.NDArray[np.float64],
+ non_neighbor_histogram: npt.NDArray[np.float64],
+) -> float:
+ """Computes the Kolmogorov-Smirnov statistic between two groups of data.
+ The statistic is the largest difference between the empirical cumulative
+ distribution functions (ECDFs) of the two groups.
+
+ Parameters
+ ----------
+ neighbor_histogram :
+ Histogram data for the nearest neighbor group.
+
+ non_neighbor_histogram :
+ Histogram data for the non-neighbor group.
+
+ Returns
+ -------
+ statistic :
+ The KS statistic between the two ECDFs.
+
+ Note
+ ----
+ - Both input arrays should have the same length.
+ - The input arrays are histograms, which means they contain the count
+ or frequency of values in each group. The data in the histograms
+ should be normalized so that they sum to one.
+
+ To calculate the KS statistic, the function first calculates the ECDFs
+ for both input arrays, which are step functions that show the cumulative
+ sum of the data up to each point. The function then calculates the
+ largest absolute difference between the two ECDFs.
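+
+ Example
+ -------
+ A small worked example (values chosen purely for illustration):
+
+ .. code-block:: python
+
+ import numpy as np
+
+ neighbor_histogram = np.array([0.25, 0.25, 0.25, 0.25]) # ECDF: 0.25, 0.50, 0.75, 1.00
+ non_neighbor_histogram = np.array([0.50, 0.25, 0.25, 0.00]) # ECDF: 0.50, 0.75, 1.00, 1.00
+
+ simplified_kolmogorov_smirnov_test(neighbor_histogram, non_neighbor_histogram)
+ # -> 0.25 (the largest gap between the two ECDFs)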
+ """
+
+ neighbor_cdf = np.cumsum(neighbor_histogram)
+ non_neighbor_cdf = np.cumsum(non_neighbor_histogram)
+
+ statistic = np.max(np.abs(neighbor_cdf - non_neighbor_cdf))
+ return statistic
+
+
+class NonIIDIssueManager(IssueManager):
+ """Manages issues related to non-iid data distributions.
+
+ Parameters
+ ----------
+ datalab :
+ The Datalab instance that this issue manager searches for issues in.
+
+ metric :
+ The distance metric used to compute the KNN graph of the examples in the dataset.
+ If set to `None`, the metric will be automatically selected based on the dimensionality
+ of the features used to represent the examples in the dataset.
+
+ k :
+ The number of nearest neighbors to consider when computing the KNN graph of the examples.
+
+ num_permutations :
+ The number of trials to run when performing permutation testing to determine whether
+ the distribution of index-distances between neighbors in the dataset is IID or not.
+
+ Note
+ ----
+ This class will only flag a single example as an issue if the dataset is considered non-IID. This type of issue
+ is more relevant to the entire dataset as a whole, rather than to individual examples.
+
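+ Example
+ -------
+ An illustrative invocation through ``Datalab.find_issues`` (``lab`` and
+ ``features`` are assumed to already exist):
+
+ .. code-block:: python
+
+ lab.find_issues(
+ features=features,
+ issue_types={"non_iid": {"num_permutations": 50, "seed": 0}},
+ )
+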
+ """
+
+ description: ClassVar[
+ str
+ ] = """Whether the dataset exhibits statistically significant
+ violations of the IID assumption like:
+ changepoints or shift, drift, autocorrelation, etc.
+ The specific violation considered is whether the
+ examples are ordered such that almost adjacent examples
+ tend to have more similar feature values.
+ """
+ issue_name: ClassVar[str] = "non_iid"
+ verbosity_levels = {
+ 0: ["p-value"],
+ 1: [],
+ 2: [],
+ }
+
+ def __init__(
+ self,
+ datalab: Datalab,
+ metric: Optional[str] = None,
+ k: int = 10,
+ num_permutations: int = 25,
+ seed: Optional[int] = 0,
+ significance_threshold: float = 0.05,
+ **_,
+ ):
+ super().__init__(datalab)
+ self.metric = metric
+ self.k = k
+ self.num_permutations = num_permutations
+ self.tests = {
+ "ks": simplified_kolmogorov_smirnov_test,
+ }
+ self.background_distribution = None
+ self.seed = seed
+ self.significance_threshold = significance_threshold
+
+ def find_issues(self, features: Optional[npt.NDArray] = None, **kwargs) -> None:
+ knn_graph = self._process_knn_graph_from_inputs(kwargs)
+ old_knn_metric = self.datalab.get_info("statistics").get("knn_metric")
+ metric_changes = self.metric and self.metric != old_knn_metric
+
+ knn = None # Won't be used if knn_graph is not None
+
+ if knn_graph is None or metric_changes:
+ if features is None:
+ raise ValueError(
+ "If a knn_graph is not provided, features must be provided to fit a new knn."
+ )
+
+ if self.metric is None:
+ self.metric = "cosine" if features.shape[1] > 3 else "euclidean"
+ knn = NearestNeighbors(n_neighbors=self.k, metric=self.metric)
+
+ if self.metric and self.metric != knn.metric:
+ warnings.warn(
+ f"Metric {self.metric} does not match metric {knn.metric} used to fit knn. "
+ "Most likely an existing NearestNeighbors object was passed in, but a different "
+ "metric was specified."
+ )
+ self.metric = knn.metric
+
+ try:
+ check_is_fitted(knn)
+ except:
+ knn.fit(features)
+
+ self.neighbor_index_choices = self._get_neighbors(knn=knn)
+ else:
+ self.neighbor_index_choices = self._get_neighbors(knn_graph=knn_graph)
+
+ self.num_neighbors = self.k
+
+ indices = np.arange(self.N)
+ self.neighbor_index_distances = np.abs(indices.reshape(-1, 1) - self.neighbor_index_choices)
+
+ self.statistics = self._get_statistics(self.neighbor_index_distances)
+
+ self.p_value = self._permutation_test(num_permutations=self.num_permutations)
+
+ scores = self._score_dataset()
+ issue_mask = np.zeros(self.N, dtype=bool)
+ if self.p_value < self.significance_threshold:
+ issue_mask[scores.argmin()] = True
+ self.issues = pd.DataFrame(
+ {
+ f"is_{self.issue_name}_issue": issue_mask,
+ self.issue_score_key: scores,
+ },
+ )
+
+ self.summary = self.make_summary(score=self.p_value)
+
+ self.info = self.collect_info(knn_graph=knn_graph, knn=knn)
+
+ def _process_knn_graph_from_inputs(self, kwargs: Dict[str, Any]) -> Union[csr_matrix, None]:
+ """Determine if a knn_graph is provided in the kwargs or if one is already stored in the associated Datalab instance."""
+ knn_graph_kwargs: Optional[csr_matrix] = kwargs.get("knn_graph", None)
+ knn_graph_stats = self.datalab.get_info("statistics").get("weighted_knn_graph", None)
+
+ knn_graph: Optional[csr_matrix] = None
+ if knn_graph_kwargs is not None:
+ knn_graph = knn_graph_kwargs
+ elif knn_graph_stats is not None:
+ knn_graph = knn_graph_stats
+
+ need_to_recompute_knn = isinstance(knn_graph, csr_matrix) and (
+ kwargs.get("k", 0) > knn_graph.nnz // knn_graph.shape[0]
+ or self.k > knn_graph.nnz // knn_graph.shape[0]
+ )
+
+ if need_to_recompute_knn:
+ # If the provided knn graph is insufficient, then we need to recompute the knn graph
+ # with the provided features
+ knn_graph = None
+ return knn_graph
+
+ def collect_info(
+ self, knn_graph: Optional[csr_matrix] = None, knn: Optional[NearestNeighbors] = None
+ ) -> dict:
+ issues_dict = {
+ "p-value": self.p_value,
+ }
+
+ params_dict = {
+ "metric": self.metric,
+ "k": self.k,
+ }
+ if knn_graph is None:
+ assert knn is not None, "If knn_graph is None, knn must be provided."
+ knn_graph = knn.kneighbors_graph(mode="distance") # type: ignore[union-attr]
+
+ assert knn_graph is not None, "knn_graph must be provided or computed."
+ statistics_dict = self._build_statistics_dictionary(knn_graph=knn_graph)
+
+ info_dict = {
+ **issues_dict,
+ **params_dict, # type: ignore[arg-type]
+ **statistics_dict, # type: ignore[arg-type]
+ }
+ return info_dict
+
+ def _build_statistics_dictionary(self, knn_graph: csr_matrix) -> Dict[str, Dict[str, Any]]:
+ statistics_dict: Dict[str, Dict[str, Any]] = {"statistics": {}}
+
+ # Add the knn graph as a statistic if necessary
+ graph_key = "weighted_knn_graph"
+ old_knn_graph = self.datalab.get_info("statistics").get(graph_key, None)
+ old_graph_exists = old_knn_graph is not None
+ prefer_new_graph = (
+ (knn_graph is not None and not old_graph_exists)
+ or knn_graph.nnz > old_knn_graph.nnz
+ or self.metric != self.datalab.get_info("statistics").get("knn_metric", None)
+ )
+ if prefer_new_graph:
+ statistics_dict["statistics"][graph_key] = knn_graph
+ if self.metric is not None:
+ statistics_dict["statistics"]["knn_metric"] = self.metric
+
+ return statistics_dict
+
+ def _permutation_test(self, num_permutations) -> float:
+ N = self.N
+
+ if self.seed is not None:
+ np.random.seed(self.seed)
+ perms = np.fromiter(
+ itertools.chain.from_iterable(
+ np.random.permutation(N) for i in range(num_permutations)
+ ),
+ dtype=int,
+ ).reshape(num_permutations, N)
+
+ neighbor_index_choices = self.neighbor_index_choices
+ neighbor_index_choices = neighbor_index_choices.reshape(1, *neighbor_index_choices.shape)
+ perm_neighbor_choices = perms[:, neighbor_index_choices].reshape(
+ num_permutations, *neighbor_index_choices.shape[1:]
+ )
+ neighbor_index_distances = np.abs(perms[..., None] - perm_neighbor_choices).reshape(
+ num_permutations, -1
+ )
+
+ statistics = []
+ for neighbor_index_dist in neighbor_index_distances:
+ stats = self._get_statistics(
+ neighbor_index_dist,
+ )
+ statistics.append(stats)
+
+ ks_stats = np.array([stats["ks"] for stats in statistics])
+ ks_stats_kde = gaussian_kde(ks_stats)
+ p_value = ks_stats_kde.integrate_box(self.statistics["ks"], 100)
+
+ return p_value
+
+ def _score_dataset(self) -> npt.NDArray[np.float64]:
+ """This function computes a variant of the KS statistic for each
+ datapoint. Rather than computing the maximum difference
+ between the CDF of the neighbor distances (foreground
+ distribution) and the CDF of the all index distances
+ (background distribution), we compute the absolute difference
+ in area-under-the-curve of the two CDFs.
+
+ The foreground distribution is computed by sampling the
+ neighbor distances from the KNN graph, but the background
+ distribution is computed analytically. The background CDF for
+ a datapoint i can be split up into three parts. Let d = min(i,
+ N - i - 1).
+
+ 1. For 0 < j <= d, the slope of the CDF is 2 / (N - 1) since
+ there are two datapoints in the dataset that are distance j
+ from datapoint i. We call this threshold the 'double distance
+ threshold'
+
+ 2. For d < j <= N - d - 1, the slope of the CDF is
+ 1 / (N - 1) since there is only one datapoint in the dataset
+ that is distance j from datapoint i.
+
+ 3. For j > N - d - 1, the slope of the CDF is 0 and is
+ constant at 1.0 since there are no datapoints in the dataset
+ that are distance j from datapoint i.
+
+ We compute the area differences on each of the k intervals for
+ which the foreground CDF is constant which allows for the
+ possibility that the background CDF may intersect the
+ foreground CDF on this interval. We do not account for these
+ cases when computing absolute AUC difference.
+
+ Our algorithm is simple: sort the k sampled neighbor
+ distances. Then, for each of the k neighbor distances sampled,
+ compute the AUC for each CDF up to that point. Then, subtract
+ from each area the previous area in the sorted order to get
+ the AUC of the CDF on the interval between those two
+ points. Subtract the background interval AUCs from the
+ foreground interval AUCs, take the absolute value, and
+ sum. The algorithm is vectorized such that this statistic is
+ computed for each of the N datapoints simultaneously.
+
+ The statistics are then normalized by their respective maximum
+ possible distance (N - d - 1) and then mapped to [0,1] via
+ tanh.
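+
+ As a concrete illustration of the background CDF: for N = 10 and
+ datapoint i = 2, we have d = min(2, 7) = 2, so the background CDF
+ has slope 2/9 for index-distances 0 < j <= 2, slope 1/9 for
+ 2 < j <= 7, and is constant at 1.0 for j > 7.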
+ """
+ N = self.N
+
+ sorted_neighbors = np.sort(self.neighbor_index_distances, axis=1)
+
+ # find the maximum distance that occurs with double probability
+ middle_idx = np.floor((N - 1) / 2).astype(int)
+ double_distances = np.arange(N).reshape(N, 1)
+ double_distances[double_distances > middle_idx] -= N - 1
+ double_distances = np.abs(double_distances)
+
+ sorted_neighbors = np.hstack([sorted_neighbors, np.ones((N, 1)) * (N - 1)]).astype(int)
+
+ # the set of distances that are less than the double distance threshold
+ set_beginning = sorted_neighbors <= double_distances
+ # the set of distances that are greater than the double distance threshold but have nonzero probability
+ set_middle = (sorted_neighbors > double_distances) & (
+ sorted_neighbors <= (N - double_distances - 1)
+ )
+ # the set of distances that occur with 0 probability
+ set_end = sorted_neighbors > (N - double_distances - 1)
+
+ shifted_neighbors = np.zeros(sorted_neighbors.shape)
+ shifted_neighbors[:, 1:] = sorted_neighbors[:, :-1]
+ diffs = sorted_neighbors - shifted_neighbors # the distances between the sorted indices
+
+ area_beginning = (double_distances**2) / (N - 1)
+ length = N - 2 * double_distances - 1
+ a = 2 * double_distances / (N - 1)
+ area_middle = 0.5 * (a + 1) * length
+
+ # compute the area under the CDF for each of the indices in sorted_neighbors
+ background_area = np.zeros(diffs.shape)
+ background_diffs = np.zeros(diffs.shape)
+ background_area[set_beginning] = ((sorted_neighbors**2) / (N - 1))[set_beginning]
+ background_area[set_middle] = (
+ area_beginning
+ + 0.5
+ * (
+ (sorted_neighbors + 3 * double_distances)
+ * (sorted_neighbors - double_distances)
+ / (N - 1)
+ )
+ )[set_middle]
+ background_area[set_end] = (
+ area_beginning + area_middle + (sorted_neighbors - (N - double_distances - 1) * 1.0)
+ )[set_end]
+
+ # compute the area under the CDF between indices in sorted_neighbors
+ shifted_background = np.zeros(background_area.shape)
+ shifted_background[:, 1:] = background_area[:, :-1]
+ background_diffs = background_area - shifted_background
+
+ # compute the foreground CDF and AUC between indices in sorted_neighbors
+ foreground_cdf = np.arange(sorted_neighbors.shape[1]) / (sorted_neighbors.shape[1] - 1)
+ foreground_diffs = foreground_cdf.reshape(1, -1) * diffs
+
+ # compute the differences between foreground and background area intervals
+ area_diffs = np.abs(foreground_diffs - background_diffs)
+ stats = np.sum(area_diffs, axis=1)
+
+ # normalize scores by the index and transform to [0, 1]
+ indices = np.arange(N)
+ reverse = N - indices
+ normalizer = np.where(indices > reverse, indices, reverse)
+
+ scores = stats / normalizer
+ scores = np.tanh(-1 * scores) + 1
+ return scores
+
+ def _get_neighbors(
+ self, knn: Optional[NearestNeighbors] = None, knn_graph: Optional[csr_matrix] = None
+ ) -> np.ndarray:
+ """
+ Given a fitted knn object or a knn graph, returns an (N, k) array in
+ which j is in A[i] if item i and j are nearest neighbors.
+ """
+ if knn_graph is not None:
+ N = knn_graph.shape[0]
+ kneighbors = knn_graph.indices.reshape(N, -1)
+ elif knn is not None:
+ _, kneighbors = knn.kneighbors()
+ N = kneighbors.shape[0]
+ else:
+ raise ValueError("Must provide either knn or knn_graph")
+ self.N = N
+ return kneighbors
+
+ def _get_statistics(
+ self,
+ neighbor_index_distances,
+ ) -> dict[str, float]:
+ neighbor_index_distances = neighbor_index_distances.flatten()
+ sorted_neighbors = np.sort(neighbor_index_distances)
+ sorted_neighbors = np.hstack([sorted_neighbors, np.ones((1)) * (self.N - 1)]).astype(int)
+
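+ # Background null model (an interpretation of the formula below): if neighbors were
+ # uniformly random, an index gap of d would occur with probability
+ # (N - d) / (N * (N - 1) / 2), since N - d of the unordered index pairs have gap d.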
+ if self.background_distribution is None:
+ self.background_distribution = (self.N - np.arange(1, self.N)) / (
+ self.N * (self.N - 1) / 2
+ )
+
+ background_distribution = cast(np.ndarray, self.background_distribution)
+ background_cdf = np.cumsum(background_distribution)
+
+ foreground_cdf = np.arange(sorted_neighbors.shape[0]) / (sorted_neighbors.shape[0] - 1)
+
+ statistic = np.max(np.abs(foreground_cdf - background_cdf[sorted_neighbors - 1]))
+ statistics = {"ks": statistic}
+ return statistics
diff --git a/cleanlab/datalab/issue_manager/outlier.py b/cleanlab/datalab/issue_manager/outlier.py
new file mode 100644
index 0000000000..68149c84f3
--- /dev/null
+++ b/cleanlab/datalab/issue_manager/outlier.py
@@ -0,0 +1,276 @@
+# Copyright (C) 2017-2023 Cleanlab Inc.
+# This file is part of cleanlab.
+#
+# cleanlab is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published
+# by the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# cleanlab is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with cleanlab. If not, see <https://www.gnu.org/licenses/>.
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any, ClassVar, Dict, Optional, Tuple, Union, cast
+
+from scipy.sparse import csr_matrix
+from scipy.stats import iqr
+import numpy as np
+import pandas as pd
+
+from cleanlab.datalab.issue_manager import IssueManager
+from cleanlab.outlier import OutOfDistribution, transform_distances_to_scores
+
+if TYPE_CHECKING: # pragma: no cover
+ import numpy.typing as npt
+ from sklearn.neighbors import NearestNeighbors
+ from cleanlab.datalab.datalab import Datalab
+
+
+class OutlierIssueManager(IssueManager):
+ """Manages issues related to out-of-distribution examples."""
+
+ description: ClassVar[
+ str
+ ] = """Examples that are very different from the rest of the dataset
+ (i.e. potentially out-of-distribution or rare/anomalous instances).
+ """
+ issue_name: ClassVar[str] = "outlier"
+ verbosity_levels = {
+ 0: [],
+ 1: [],
+ 2: ["average_ood_score"],
+ 3: [],
+ }
+
+ DEFAULT_THRESHOLDS = {
+ "features": 0.37037,
+ "pred_probs": 0.13,
+ }
+ """Default thresholds for outlier detection.
+
+ If outlier detection is performed on the features, an example whose average
+ distance to its k nearest neighbors is greater than
+ Q3_avg_dist + (1 / threshold - 1) * IQR_avg_dist is considered an outlier.
+
+ If outlier detection is performed on the predicted probabilities, an example
+ whose average score is lower than threshold * median_outlier_score is
+ considered an outlier.
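+
+ For intuition, here is a minimal sketch (using a hypothetical ``avg_distances`` array of
+ average neighbor distances) of how the "features" threshold maps to an IQR-based cutoff:
+
+ >>> import numpy as np
+ >>> from scipy.stats import iqr
+ >>> avg_distances = np.array([0.1, 0.2, 0.25, 0.3, 5.0])  # hypothetical values
+ >>> threshold = 0.37037
+ >>> cutoff = np.percentile(avg_distances, 75) + (1 / threshold - 1) * iqr(avg_distances)
+ >>> avg_distances > cutoff  # doctest: +SKIP
+ array([False, False, False, False,  True])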
+ """
+
+ def __init__(
+ self,
+ datalab: Datalab,
+ threshold: Optional[float] = None,
+ **kwargs,
+ ):
+ super().__init__(datalab)
+
+ ood_kwargs = kwargs.get("ood_kwargs", {})
+
+ valid_ood_params = OutOfDistribution.DEFAULT_PARAM_DICT.keys()
+ params = {
+ key: value
+ for key, value in ((k, kwargs.get(k, None)) for k in valid_ood_params)
+ if value is not None
+ }
+
+ if params:
+ ood_kwargs["params"] = params
+
+ self.ood: OutOfDistribution = OutOfDistribution(**ood_kwargs)
+
+ self.threshold = threshold
+ self._embeddings: Optional[np.ndarray] = None
+ self._metric: str = None # type: ignore
+
+ def find_issues(
+ self,
+ features: Optional[npt.NDArray] = None,
+ pred_probs: Optional[np.ndarray] = None,
+ **kwargs,
+ ) -> None:
+ knn_graph = self._process_knn_graph_from_inputs(kwargs)
+ distances: Optional[np.ndarray] = None
+
+ if knn_graph is not None:
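+ # The sparse knn graph stores exactly k distances per row, so k can be recovered
+ # from the total number of stored entries.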
+ N = knn_graph.shape[0]
+ k = knn_graph.nnz // N
+ t = cast(int, self.ood.params["t"])
+ distances = knn_graph.data.reshape(-1, k)
+ assert isinstance(distances, np.ndarray)
+ scores = transform_distances_to_scores(distances, k=k, t=t)
+ elif features is not None:
+ scores = self._score_with_features(features, **kwargs)
+ elif pred_probs is not None:
+ scores = self._score_with_pred_probs(pred_probs, **kwargs)
+ else:
+ if kwargs.get("knn_graph", None) is not None:
+ raise ValueError(
+ "knn_graph is provided, but not sufficiently large to compute the scores based on the provided hyperparameters."
+ )
+ raise ValueError("Either `features` or `pred_probs` must be provided.")
+
+ if features is not None or knn_graph is not None:
+ if knn_graph is None:
+ assert (
+ features is not None
+ ), "features must be provided so that we can compute the knn graph."
+ knn_graph = self._process_knn_graph_from_features(kwargs)
+ distances = knn_graph.data.reshape(knn_graph.shape[0], -1)
+
+ assert isinstance(distances, np.ndarray)
+ (
+ self.threshold,
+ is_issue_column,
+ ) = self._compute_threshold_and_issue_column_from_distances(distances, self.threshold)
+
+ else:
+ assert pred_probs is not None
+ # Threshold based on pred_probs, very small scores are outliers
+ if self.threshold is None:
+ self.threshold = self.DEFAULT_THRESHOLDS["pred_probs"]
+ if not 0 <= self.threshold:
+ raise ValueError(f"threshold must be non-negative, but got {self.threshold}.")
+ is_issue_column = scores < self.threshold * np.median(scores)
+
+ self.issues = pd.DataFrame(
+ {
+ f"is_{self.issue_name}_issue": is_issue_column,
+ self.issue_score_key: scores,
+ },
+ )
+
+ self.summary = self.make_summary(score=scores.mean())
+
+ self.info = self.collect_info(knn_graph=knn_graph)
+
+ def _process_knn_graph_from_inputs(self, kwargs: Dict[str, Any]) -> Union[csr_matrix, None]:
+ """Determine if a knn_graph is provided in the kwargs or if one is already stored in the associated Datalab instance."""
+ knn_graph_kwargs: Optional[csr_matrix] = kwargs.get("knn_graph", None)
+ knn_graph_stats = self.datalab.get_info("statistics").get("weighted_knn_graph", None)
+
+ knn_graph: Optional[csr_matrix] = None
+ if knn_graph_kwargs is not None:
+ knn_graph = knn_graph_kwargs
+ elif knn_graph_stats is not None:
+ knn_graph = knn_graph_stats
+
+ if isinstance(knn_graph, csr_matrix) and kwargs.get("k", 0) > (
+ knn_graph.nnz // knn_graph.shape[0]
+ ):
+ # If the provided knn graph is insufficient, then we need to recompute the knn graph
+ # with the provided features
+ knn_graph = None
+ return knn_graph
+
+ def _compute_threshold_and_issue_column_from_distances(
+ self, distances: np.ndarray, threshold: Optional[float] = None
+ ) -> Tuple[float, np.ndarray]:
+ avg_distances = distances.mean(axis=1)
+ if threshold:
+ if not (isinstance(threshold, (int, float)) and 0 <= threshold <= 1):
+ raise ValueError(
+ f"threshold must be a number between 0 and 1, got {threshold} of type {type(threshold)}."
+ )
+ if threshold is None:
+ threshold = OutlierIssueManager.DEFAULT_THRESHOLDS["features"]
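+ # Tukey-style fence: an example is flagged when its average neighbor distance exceeds
+ # Q3 + (1 / threshold - 1) * IQR of the average distances.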
+ q3_distance = np.percentile(avg_distances, 75)
+ iqr_scale = 1 / threshold - 1 if threshold != 0 else np.inf
+ return threshold, avg_distances > q3_distance + iqr_scale * iqr(avg_distances)
+
+ def _process_knn_graph_from_features(self, kwargs: Dict) -> csr_matrix:
+ # Check if the weighted knn graph exists in info
+ knn_graph = self.datalab.get_info("statistics").get("weighted_knn_graph", None)
+
+ k: int = 0 # number of neighbors per row in the pre-existing knn graph; compared against the knn object's n_neighbors to decide whether the graph must be recomputed
+ if knn_graph is not None:
+ k = knn_graph.nnz // knn_graph.shape[0]
+
+ knn: NearestNeighbors = self.ood.params["knn"] # type: ignore
+ if kwargs.get("knn", None) is not None or knn.n_neighbors > k: # type: ignore[union-attr]
+ # If the pre-existing knn graph has fewer neighbors than the knn object,
+ # then we need to recompute the knn graph
+ assert knn == self.ood.params["knn"] # type: ignore[union-attr]
+ knn_graph = knn.kneighbors_graph(mode="distance") # type: ignore[union-attr]
+ self._metric = knn.metric # type: ignore[union-attr]
+
+ return knn_graph
+
+ def collect_info(self, *, knn_graph: Optional[csr_matrix] = None) -> dict:
+ issues_dict = {
+ "average_ood_score": self.issues[self.issue_score_key].mean(),
+ "threshold": self.threshold,
+ }
+ pred_probs_issues_dict: Dict[str, Any] = {}
+ feature_issues_dict = {}
+
+ if knn_graph is not None:
+ knn = self.ood.params["knn"] # type: ignore
+ N = knn_graph.shape[0]
+ k = knn_graph.nnz // N
+ dists = knn_graph.data.reshape(N, -1)[:, 0]
+ nn_ids = knn_graph.indices.reshape(N, -1)[:, 0]
+
+ feature_issues_dict.update(
+ {
+ "k": k, # type: ignore[union-attr]
+ "nearest_neighbor": nn_ids.tolist(),
+ "distance_to_nearest_neighbor": dists.tolist(),
+ }
+ )
+ if self.ood.params["knn"] is not None:
+ knn = self.ood.params["knn"]
+ feature_issues_dict.update({"metric": knn.metric}) # type: ignore[union-attr]
+
+ if self.ood.params["confident_thresholds"] is not None:
+ pass # currently a no-op: no pred_probs-specific info is collected here yet
+ statistics_dict = self._build_statistics_dictionary(knn_graph=knn_graph)
+ ood_params_dict = self.ood.params
+ knn_dict = {
+ **pred_probs_issues_dict,
+ **feature_issues_dict,
+ }
+ info_dict: Dict[str, Any] = {
+ **issues_dict,
+ **ood_params_dict, # type: ignore[arg-type]
+ **knn_dict,
+ **statistics_dict,
+ }
+ return info_dict
+
+ def _build_statistics_dictionary(
+ self, *, knn_graph: Optional[csr_matrix]
+ ) -> Dict[str, Dict[str, Any]]:
+ statistics_dict: Dict[str, Dict[str, Any]] = {"statistics": {}}
+
+ # Add the knn graph as a statistic if necessary
+ graph_key = "weighted_knn_graph"
+ old_knn_graph = self.datalab.get_info("statistics").get(graph_key, None)
+ old_graph_exists = old_knn_graph is not None
+ prefer_new_graph = (
+ not old_graph_exists
+ or (isinstance(knn_graph, csr_matrix) and knn_graph.nnz > old_knn_graph.nnz)
+ or self._metric != self.datalab.get_info("statistics").get("knn_metric", None)
+ )
+ if prefer_new_graph:
+ if knn_graph is not None:
+ statistics_dict["statistics"][graph_key] = knn_graph
+ if self._metric is not None:
+ statistics_dict["statistics"]["knn_metric"] = self._metric
+
+ return statistics_dict
+
+ def _score_with_pred_probs(self, pred_probs: np.ndarray, **kwargs) -> np.ndarray:
+ # Remove "threshold" from kwargs if it exists
+ kwargs.pop("threshold", None)
+ scores = self.ood.fit_score(pred_probs=pred_probs, labels=self.datalab.labels, **kwargs)
+ return scores
+
+ def _score_with_features(self, features: npt.NDArray, **kwargs) -> npt.NDArray:
+ scores = self.ood.fit_score(features=features)
+ return scores
diff --git a/cleanlab/datalab/report.py b/cleanlab/datalab/report.py
new file mode 100644
index 0000000000..aec0d4e9ec
--- /dev/null
+++ b/cleanlab/datalab/report.py
@@ -0,0 +1,144 @@
+# Copyright (C) 2017-2023 Cleanlab Inc.
+# This file is part of cleanlab.
+#
+# cleanlab is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published
+# by the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# cleanlab is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with cleanlab. If not, see <https://www.gnu.org/licenses/>.
+"""
+Module that handles reporting of all types of issues identified in the data.
+"""
+
+from typing import TYPE_CHECKING
+
+import pandas as pd
+
+from cleanlab.datalab.factory import _IssueManagerFactory
+
+
+if TYPE_CHECKING: # pragma: no cover
+ from cleanlab.datalab.data_issues import DataIssues
+
+
+class Reporter:
+ """Class that generates a report about the issues stored in a :py:class:`DataIssues` object.
+
+ Parameters
+ ----------
+ data_issues :
+ The :py:class:`DataIssues` object containing the issues to report on. This is usually
+ generated by the :py:class:`Datalab` class, stored in the :py:attr:`data_issues` attribute,
+ and then passed to the :py:class:`Reporter` class to generate a report.
+
+ verbosity :
+ The default verbosity of the report to generate. Each :py:class:`IssueManager`
+ specifies the available verbosity levels and what additional information
+ is included at each level.
+
+ include_description :
+ Whether to include the description of each issue type in the report. The description
+ is included by default, but can be excluded by setting this parameter to ``False``.
+
+ Note
+ ----
+ This class is not intended to be used directly. Instead, use the
+ `Datalab.report` method, which internally utilizes a `Reporter` instance.
+ """
+
+ def __init__(
+ self,
+ data_issues: "DataIssues",
+ verbosity: int = 1,
+ include_description: bool = True,
+ show_summary_score: bool = False,
+ ):
+ self.data_issues = data_issues
+ self.verbosity = verbosity
+ self.include_description = include_description
+ self.show_summary_score = show_summary_score
+
+ def report(self, num_examples: int) -> None:
+ """Prints a report about identified issues in the data.
+
+ Parameters
+ ----------
+ num_examples :
+ The number of examples to include in the report for each issue type.
+ """
+ print(self.get_report(num_examples=num_examples))
+
+ def get_report(self, num_examples: int) -> str:
+ """Constructs a report about identified issues in the data.
+
+ Parameters
+ ----------
+ num_examples :
+ The number of examples to include in the report for each issue type.
+
+
+ Returns
+ -------
+ report_str :
+ A string containing the report.
+
+ Examples
+ --------
+ >>> from cleanlab.datalab.report import Reporter
+ >>> reporter = Reporter(data_issues=data_issues, include_description=False)
+ >>> report_str = reporter.get_report(num_examples=5)
+ >>> print(report_str)
+ """
+ report_str = ""
+ issue_summary = self.data_issues.issue_summary
+ issue_summary_sorted = issue_summary.sort_values(by="num_issues", ascending=False)
+ report_str += self._write_summary(summary=issue_summary_sorted)
+
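+ # Build one report section per issue type, ordered from the most to the fewest issues found.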
+ issue_reports = [
+ _IssueManagerFactory.from_str(issue_type=key).report(
+ issues=self.data_issues.get_issues(issue_name=key),
+ summary=self.data_issues.get_issue_summary(issue_name=key),
+ info=self.data_issues.get_info(issue_name=key),
+ num_examples=num_examples,
+ verbosity=self.verbosity,
+ include_description=self.include_description,
+ )
+ for key in issue_summary_sorted["issue_type"].tolist()
+ ]
+
+ report_str += "\n\n\n".join(issue_reports)
+ return report_str
+
+ def _write_summary(self, summary: pd.DataFrame) -> str:
+ statistics = self.data_issues.get_info("statistics")
+ num_examples = statistics["num_examples"]
+ num_classes = statistics.get(
+ "num_classes"
+ ) # This may not be required for all types of datasets in the future (e.g. unlabeled/regression)
+
+ dataset_information = f"Dataset Information: num_examples: {num_examples}"
+ if num_classes is not None:
+ dataset_information += f", num_classes: {num_classes}"
+
+ if self.show_summary_score:
+ return (
+ "Here is a summary of the different kinds of issues found in the data:\n\n"
+ + summary.to_string(index=False)
+ + "\n\n"
+ + "(Note: A lower score indicates a more severe issue across all examples in the dataset.)\n\n"
+ + f"{dataset_information}\n\n\n"
+ )
+
+ return (
+ "Here is a summary of the different kinds of issues found in the data:\n\n"
+ + summary.drop(columns=["score"]).to_string(index=False)
+ + "\n\n"
+ + f"{dataset_information}\n\n\n"
+ )
diff --git a/cleanlab/datalab/serialize.py b/cleanlab/datalab/serialize.py
new file mode 100644
index 0000000000..548661f1a4
--- /dev/null
+++ b/cleanlab/datalab/serialize.py
@@ -0,0 +1,138 @@
+# Copyright (C) 2017-2023 Cleanlab Inc.
+# This file is part of cleanlab.
+#
+# cleanlab is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published
+# by the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# cleanlab is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with cleanlab. If not, see <https://www.gnu.org/licenses/>.
+from __future__ import annotations
+
+import os
+import pickle
+import warnings
+from typing import TYPE_CHECKING, Optional
+
+import pandas as pd
+
+import cleanlab
+from cleanlab.datalab.data import Data
+
+if TYPE_CHECKING: # pragma: no cover
+ from datasets.arrow_dataset import Dataset
+
+ from cleanlab.datalab.datalab import Datalab
+
+
+# Constants:
+OBJECT_FILENAME = "datalab.pkl"
+ISSUES_FILENAME = "issues.csv"
+ISSUE_SUMMARY_FILENAME = "summary.csv"
+INFO_FILENAME = "info.pkl"
+DATA_DIRNAME = "data"
+
+
+class _Serializer:
+ @staticmethod
+ def _save_data_issues(path: str, datalab: Datalab) -> None:
+ """Saves the issues to disk."""
+ issues_path = os.path.join(path, ISSUES_FILENAME)
+ datalab.data_issues.issues.to_csv(issues_path, index=False)
+
+ issue_summary_path = os.path.join(path, ISSUE_SUMMARY_FILENAME)
+ datalab.data_issues.issue_summary.to_csv(issue_summary_path, index=False)
+
+ @staticmethod
+ def _save_data(path: str, datalab: Datalab) -> None:
+ """Saves the dataset to disk."""
+ data_path = os.path.join(path, DATA_DIRNAME)
+ datalab.data.save_to_disk(data_path)
+
+ @staticmethod
+ def _validate_version(datalab: Datalab) -> None:
+ current_version = cleanlab.__version__ # type: ignore[attr-defined]
+ datalab_version = datalab.cleanlab_version
+ if current_version != datalab_version:
+ warnings.warn(
+ f"Saved Datalab was created using different version of cleanlab "
+ f"({datalab_version}) than current version ({current_version}). "
+ f"Things may be broken!"
+ )
+
+ @classmethod
+ def serialize(cls, path: str, datalab: Datalab, force: bool) -> None:
+ """Serializes the datalab object to disk.
+
+ Parameters
+ ----------
+ path : str
+ Path to save the datalab object to.
+
+ datalab : Datalab
+ The datalab object to save.
+
+ force : bool
+ If True, will overwrite existing files at the specified path.
+ """
+ path_exists = os.path.exists(path)
+ if not path_exists:
+ os.mkdir(path)
+ else:
+ if not force:
+ raise FileExistsError("Please specify a new path or set force=True")
+ print(f"WARNING: Existing files will be overwritten by newly saved files at: {path}")
+
+ # Save the datalab object to disk.
+ with open(os.path.join(path, OBJECT_FILENAME), "wb") as f:
+ pickle.dump(datalab, f)
+
+ # Save the issues to disk. Use placeholder method for now.
+ cls._save_data_issues(path=path, datalab=datalab)
+
+ # Save the dataset to disk
+ cls._save_data(path=path, datalab=datalab)
+
+ @classmethod
+ def deserialize(cls, path: str, data: Optional[Dataset] = None) -> Datalab:
+ """Deserializes the datalab object from disk."""
+
+ if not os.path.exists(path):
+ raise ValueError(f"No folder found at specified path: {path}")
+
+ with open(os.path.join(path, OBJECT_FILENAME), "rb") as f:
+ datalab: Datalab = pickle.load(f)
+
+ cls._validate_version(datalab)
+
+ # Load the issues from disk.
+ issues_path = os.path.join(path, ISSUES_FILENAME)
+ if not hasattr(datalab.data_issues, "issues") and os.path.exists(issues_path):
+ datalab.data_issues.issues = pd.read_csv(issues_path)
+
+ issue_summary_path = os.path.join(path, ISSUE_SUMMARY_FILENAME)
+ if not hasattr(datalab.data_issues, "issue_summary") and os.path.exists(issue_summary_path):
+ datalab.data_issues.issue_summary = pd.read_csv(issue_summary_path)
+
+ if data is not None:
+ if hash(data) != hash(datalab._data):
+ raise ValueError(
+ "Data has been modified since Lab was saved. "
+ "Cannot load Lab with modified data."
+ )
+
+ if len(data) != len(datalab.labels):
+ raise ValueError(
+ f"Length of data ({len(data)}) does not match length of labels ({len(datalab.labels)})"
+ )
+
+ datalab._data = Data(data, datalab.label_name)
+ datalab.data = datalab._data._data
+
+ return datalab
diff --git a/cleanlab/dataset.py b/cleanlab/dataset.py
index 40e2905a3a..d1f9a9afe9 100644
--- a/cleanlab/dataset.py
+++ b/cleanlab/dataset.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2017-2022 Cleanlab Inc.
+# Copyright (C) 2017-2023 Cleanlab Inc.
# This file is part of cleanlab.
#
# cleanlab is free software: you can redistribute it and/or modify
@@ -14,7 +14,6 @@
# You should have received a copy of the GNU Affero General Public License
# along with cleanlab. If not, see <https://www.gnu.org/licenses/>.
-
"""
Provides dataset-level and class-level overviews of issues in your classification dataset.
If your task allows you to modify the classes in your dataset, this module can help you determine
@@ -22,6 +21,7 @@
and which classes to merge (see :py:func:`find_overlapping_classes <cleanlab.dataset.find_overlapping_classes>`).
"""
+from typing import Optional, cast
import numpy as np
import pandas as pd
from cleanlab.count import estimate_joint
@@ -53,6 +53,16 @@ def rank_classes_by_label_quality(
Only provide **exactly one of the above input options**, do not provide a combination.
+ Examples
+ --------
+ >>> from cleanlab.dataset import rank_classes_by_label_quality
+ >>> from sklearn.linear_model import LogisticRegression
+ >>> from sklearn.model_selection import cross_val_predict
+ >>> data, labels = get_data_labels_from_dataset()
+ >>> yourFavoriteModel = LogisticRegression()
+ >>> pred_probs = cross_val_predict(yourFavoriteModel, data, labels, cv=3, method="predict_proba")
+ >>> df = rank_classes_by_label_quality(labels=labels, pred_probs=pred_probs)
+
**Parameters**: For parameter info, see the docstring of :py:func:`find_overlapping_classes <cleanlab.dataset.find_overlapping_classes>`.
Returns
@@ -75,13 +85,16 @@ def rank_classes_by_label_quality(
By default, the DataFrame is ordered by "Label Quality Score", ascending.
"""
+ if multi_label:
+ raise ValueError(
+ "For multilabel data, please instead call: multilabel_classification.dataset.overall_multilabel_health_score()"
+ )
if joint is None:
joint = estimate_joint(
labels=labels,
pred_probs=pred_probs,
confident_joint=confident_joint,
- multi_label=multi_label,
)
if num_examples is None:
num_examples = _get_num_examples(labels=labels)
@@ -138,6 +151,16 @@ def find_overlapping_classes(
issues via the approach published in `Northcutt et al.,
2021 `_.
+ Examples
+ --------
+ >>> from cleanlab.dataset import find_overlapping_classes
+ >>> from sklearn.linear_model import LogisticRegression
+ >>> from sklearn.model_selection import cross_val_predict
+ >>> data, labels = get_data_labels_from_dataset()
+ >>> yourFavoriteModel = LogisticRegression()
+ >>> pred_probs = cross_val_predict(yourFavoriteModel, data, labels, cv=3, method="predict_proba")
+ >>> df = find_overlapping_classes(labels=labels, pred_probs=pred_probs)
+
Note
----
The joint distribution of noisy and true labels is asymmetric, and therefore the joint
@@ -203,12 +226,6 @@ class 0, 1, ..., K-1. `pred_probs` should have been computed using 3 (or
The `confident_joint` can be computed using :py:func:`count.compute_confident_joint <cleanlab.count.compute_confident_joint>`.
If not provided, it is computed from the given (noisy) `labels` and `pred_probs`.
- multi_label : bool, optional
- If ``True``, labels should be an iterable (e.g. list) of iterables, containing a
- list of labels for each example, instead of just a single label.
- The multi-label setting supports classification tasks where an example has 1 or more labels.
- Example of a multi-labeled `labels` input: ``[[0,1], [1], [0,2], [0,1,2], [0], [1], ...]``.
-
Returns
-------
overlapping_classes : pd.DataFrame
@@ -240,15 +257,19 @@ def _2d_matrix_to_row_column_value_list(matrix):
return [(*i, v) for i, v in np.ndenumerate(matrix)]
+ if multi_label:
+ raise ValueError(
+ "For multilabel data, please instead call: multilabel_classification.dataset.common_multilabel_issues()"
+ )
+
if joint is None:
joint = estimate_joint(
labels=labels,
pred_probs=pred_probs,
confident_joint=confident_joint,
- multi_label=multi_label,
)
if num_examples is None:
- num_examples = _get_num_examples(labels=labels)
+ num_examples = _get_num_examples(labels=labels, confident_joint=confident_joint)
if asymmetric:
rcv_list = _2d_matrix_to_row_column_value_list(joint)
# Remove diagonal elements
@@ -296,21 +317,35 @@ def overall_label_health_score(
Only provide **exactly one of the above input options**, do not provide a combination.
+ Examples
+ --------
+ >>> from cleanlab.dataset import overall_label_health_score
+ >>> from sklearn.linear_model import LogisticRegression
+ >>> from sklearn.model_selection import cross_val_predict
+ >>> data, labels = get_data_labels_from_dataset()
+ >>> yourFavoriteModel = LogisticRegression()
+ >>> pred_probs = cross_val_predict(yourFavoriteModel, data, labels, cv=3, method="predict_proba")
+ >>> score = overall_label_health_score(labels=labels, pred_probs=pred_probs) # doctest: +SKIP
+
**Parameters**: For parameter info, see the docstring of :py:func:`find_overlapping_classes <cleanlab.dataset.find_overlapping_classes>`.
+
Returns
-------
health_score : float
A score between 0 and 1, where 1 implies all labels in the dataset are estimated to be correct.
A score of 0.5 implies that half of the dataset's labels are estimated to have issues.
"""
+ if multi_label:
+ raise ValueError(
+ "For multilabel data, please instead call: multilabel_classification.dataset.overall_multilabel_health_score()"
+ )
if joint is None:
joint = estimate_joint(
labels=labels,
pred_probs=pred_probs,
confident_joint=confident_joint,
- multi_label=multi_label,
)
if num_examples is None:
num_examples = _get_num_examples(labels=labels)
@@ -337,11 +372,13 @@ def health_summary(
multi_label=False,
verbose=True,
) -> dict:
- """Prints a health summary of your datasets including useful statistics like:
+ """Prints a health summary of your dataset.
+
+ This summary includes useful statistics like:
- * The classes with the most and least label issues
- * Classes that overlap and could potentially be merged
- * Overall data label quality health score statistics for your dataset
+ * The classes with the most and least label issues.
+ * Classes that overlap and could potentially be merged.
+ * Overall label quality scores, summarizing how accurate the labels appear overall.
This method works by providing any one (and only one) of the following inputs:
@@ -351,6 +388,16 @@ def health_summary(
Only provide **exactly one of the above input options**, do not provide a combination.
+ Examples
+ --------
+ >>> from cleanlab.dataset import health_summary
+ >>> from sklearn.linear_model import LogisticRegression
+ >>> from sklearn.model_selection import cross_val_predict
+ >>> data, labels = get_data_labels_from_dataset()
+ >>> yourFavoriteModel = LogisticRegression()
+ >>> pred_probs = cross_val_predict(yourFavoriteModel, data, labels, cv=3, method="predict_proba")
+ >>> summary = health_summary(labels=labels, pred_probs=pred_probs) # doctest: +SKIP
+
**Parameters**: For parameter info, see the docstring of :py:func:`find_overlapping_classes <cleanlab.dataset.find_overlapping_classes>`.
Returns
@@ -365,12 +412,15 @@ def health_summary(
"""
from cleanlab.internal.util import smart_display_dataframe
+ if multi_label:
+ raise ValueError(
+ "For multilabel data, please call multilabel_classification.dataset.health_summary"
+ )
if joint is None:
joint = estimate_joint(
labels=labels,
pred_probs=pred_probs,
confident_joint=confident_joint,
- multi_label=multi_label,
)
if num_examples is None:
num_examples = _get_num_examples(labels=labels)
@@ -397,7 +447,6 @@ def health_summary(
num_examples=num_examples,
joint=joint,
confident_joint=confident_joint,
- multi_label=multi_label,
)
if verbose:
print("Overall Class Quality and Noise across your dataset (below)")
@@ -412,7 +461,6 @@ def health_summary(
num_examples=num_examples,
joint=joint,
confident_joint=confident_joint,
- multi_label=multi_label,
)
if verbose:
print(
@@ -431,7 +479,6 @@ def health_summary(
num_examples=num_examples,
joint=joint,
confident_joint=confident_joint,
- multi_label=multi_label,
verbose=verbose,
)
if verbose:
@@ -444,13 +491,11 @@ def health_summary(
}
-def _get_num_examples(labels=None) -> int:
+def _get_num_examples(labels=None, confident_joint: Optional[np.ndarray] = None) -> int:
"""Helper method that finds the number of examples from the parameters or throws an error
if neither parameter is provided.
- Parameters
- ----------
- For parameter info, see the docstring of `dataset.find_overlapping_classes`
+ **Parameters:** For information about the arguments to this method, see the documentation of `dataset.find_overlapping_classes`
Returns
-------
@@ -462,11 +507,11 @@ def _get_num_examples(labels=None) -> int:
ValueError
If `labels` is None."""
- if labels is not None:
- num_examples = len(labels)
- else:
+ if labels is None and confident_joint is None:
raise ValueError(
- "Error: num_examples is None. You must provide a value for num_examples "
- "when calling this method using the joint as an input parameter."
+ "Error: num_examples is None. You must either provide confident_joint, "
+ "or provide both num_example and joint as input parameters."
)
+ _confident_joint = cast(np.ndarray, confident_joint)
+ num_examples = len(labels) if labels is not None else cast(int, np.sum(_confident_joint))
return num_examples
diff --git a/cleanlab/experimental/README.md b/cleanlab/experimental/README.md
index 3c93ec9c71..3b24007b89 100644
--- a/cleanlab/experimental/README.md
+++ b/cleanlab/experimental/README.md
@@ -2,13 +2,9 @@
Methods in this `experimental` module are bleeding edge and may have sharp edges. They are not guaranteed to be stable between different cleanlab versions.
-Some of these files include various models that can be used with cleanlab to find issues in specific types of data. These require dependencies on deep learning and other machine learning packages that are not official cleanlab dependencies. You must install these dependencies on your own if you wish to use them.
+Some of these files include various models that can be used with cleanlab to find issues in specific types of data. These require dependencies on deep learning and other machine learning packages that are not official cleanlab dependencies. You must install these dependencies on your own if you wish to use them.
-The dependencies are as follows:
-* keras.py - a wrapper to make any Keras model compatible with cleanlab and sklearn
- - tensorflow
-* fasttext.py - a cleanlab-compatible FastText classifier for text data
- - fasttext
+The modules and required dependencies are as follows:
* mnist_pytorch.py - a cleanlab-compatible simplified AlexNet for MNIST using PyTorch
- torch
- torchvision
diff --git a/cleanlab/experimental/cifar_cnn.py b/cleanlab/experimental/cifar_cnn.py
index 15e82d1b8e..13c08d35d3 100644
--- a/cleanlab/experimental/cifar_cnn.py
+++ b/cleanlab/experimental/cifar_cnn.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2017-2022 Cleanlab Inc.
+# Copyright (C) 2017-2023 Cleanlab Inc.
# This file is part of cleanlab.
#
# cleanlab is free software: you can redistribute it and/or modify
diff --git a/cleanlab/experimental/coteaching.py b/cleanlab/experimental/coteaching.py
index b6247ee57d..83223ff523 100644
--- a/cleanlab/experimental/coteaching.py
+++ b/cleanlab/experimental/coteaching.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2017-2022 Cleanlab Inc.
+# Copyright (C) 2017-2023 Cleanlab Inc.
# This file is part of cleanlab.
#
# cleanlab is free software: you can redistribute it and/or modify
@@ -35,6 +35,7 @@
MINIMUM_BATCH_SIZE = 16
+
# Loss function for Co-Teaching
def loss_coteaching(
y_1,
diff --git a/cleanlab/experimental/label_issues_batched.py b/cleanlab/experimental/label_issues_batched.py
new file mode 100644
index 0000000000..ef11f91b65
--- /dev/null
+++ b/cleanlab/experimental/label_issues_batched.py
@@ -0,0 +1,752 @@
+# Copyright (C) 2017-2023 Cleanlab Inc.
+# This file is part of cleanlab.
+#
+# cleanlab is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published
+# by the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# cleanlab is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with cleanlab. If not, see <https://www.gnu.org/licenses/>.
+
+"""
+Implementation of :py:func:`filter.find_label_issues <cleanlab.filter.find_label_issues>`
+that does not need much memory by operating in mini-batches.
+You can also use this approach to estimate label quality scores or the number of label issues
+for big datasets with limited memory.
+
+With default settings, the results returned from this approach closely approximate those returned from:
+``cleanlab.filter.find_label_issues(..., filter_by="low_self_confidence", return_indices_ranked_by="self_confidence")``
+
+To run this approach, either use the ``find_label_issues_batched()`` convenience function defined in this module,
+or follow the example script for the ``LabelInspector`` class if you require greater customization.
+"""
+
+import numpy as np
+from typing import Optional, List, Tuple, Any
+
+from cleanlab.count import get_confident_thresholds
+from cleanlab.rank import find_top_issues, _compute_label_quality_scores
+from cleanlab.typing import LabelLike
+from cleanlab.internal.util import value_counts_fill_missing_classes
+from cleanlab.internal.constants import (
+ CONFIDENT_THRESHOLDS_LOWER_BOUND,
+ FLOATING_POINT_COMPARISON,
+ CLIPPING_LOWER_BOUND,
+)
+
+import platform
+import multiprocessing as mp
+
+try:
+ import psutil
+
+ PSUTIL_EXISTS = True
+except ImportError: # pragma: no cover
+ PSUTIL_EXISTS = False
+
+# global variable for multiproc on linux
+adj_confident_thresholds_shared: np.ndarray
+labels_shared: LabelLike
+pred_probs_shared: np.ndarray
+
+
+def find_label_issues_batched(
+ labels: Optional[LabelLike] = None,
+ pred_probs: Optional[np.ndarray] = None,
+ *,
+ labels_file: Optional[str] = None,
+ pred_probs_file: Optional[str] = None,
+ batch_size: int = 10000,
+ n_jobs: Optional[int] = 1,
+ verbose: bool = True,
+ quality_score_kwargs: Optional[dict] = None,
+ num_issue_kwargs: Optional[dict] = None,
+) -> np.ndarray:
+ """
+ Variant of :py:func:`filter.find_label_issues <cleanlab.filter.find_label_issues>`
+ that requires less memory by reading from `pred_probs`, `labels` in mini-batches.
+ To avoid loading big `pred_probs`, `labels` arrays into memory,
+ provide these as memory-mapped objects like Zarr arrays or memmap arrays instead of regular numpy arrays.
+ See: https://pythonspeed.com/articles/mmap-vs-zarr-hdf5/
+
+ With default settings, the results returned from this method closely approximate those returned from:
+ ``cleanlab.filter.find_label_issues(..., filter_by="low_self_confidence", return_indices_ranked_by="self_confidence")``
+
+ This function internally implements the example usage script of the ``LabelInspector`` class,
+ but you can further customize that script by running it yourself instead of this function.
+ See the documentation of ``LabelInspector`` to learn more about how this method works internally.
+
+ Parameters
+ ----------
+ labels: np.ndarray-like object, optional
+ 1D array of given class labels for each example in the dataset, (int) values in ``0,1,2,...,K-1``.
+ To avoid loading big objects into memory, you should pass this as a memory-mapped object like:
+ Zarr array loaded with ``zarr.convenience.open(YOURFILE.zarr, mode="r")``,
+ or memmap array loaded with ``np.load(YOURFILE.npy, mmap_mode="r")``.
+
+ Tip: You can save an existing numpy array to Zarr via: ``zarr.convenience.save_array(YOURFILE.zarr, your_array)``,
+ or to .npy file that can be loaded with mmap via: ``np.save(YOURFILE.npy, your_array)``.
+
+ pred_probs: np.ndarray-like object, optional
+ 2D array of model-predicted class probabilities (floats) for each example in the dataset.
+ To avoid loading big objects into memory, you should pass this as a memory-mapped object like:
+ Zarr array loaded with ``zarr.convenience.open(YOURFILE.zarr, mode="r")``
+ or memmap array loaded with ``np.load(YOURFILE.npy, mmap_mode="r")``.
+
+ labels_file: str, optional
+ Specify this instead of `labels` if you want this method to load from file for you into a memmap array.
+ Path to .npy file where the entire 1D `labels` numpy array is stored on disk (list format is not supported).
+ This is loaded using: ``np.load(labels_file, mmap_mode="r")``
+ so make sure this file was created via: ``np.save()`` or other compatible methods (.npz not supported).
+
+ pred_probs_file: str, optional
+ Specify this instead of `pred_probs` if you want this method to load from file for you into a memmap array.
+ Path to .npy file where the entire `pred_probs` numpy array is stored on disk.
+ This is loaded using: ``np.load(pred_probs_file, mmap_mode="r")``
+ so make sure this file was created via: ``np.save()`` or other compatible methods (.npz not supported).
+
+ batch_size : int, optional
+ Size of mini-batches to use for estimating the label issues.
+ To maximize efficiency, try to use the largest `batch_size` your memory allows.
+
+ n_jobs: int, optional
+ Number of processes for multiprocessing (default value = 1). Only used on Linux.
+ If `n_jobs=None`, the number of physical cores is used if psutil is installed, otherwise the number of logical cores.
+
+ verbose : bool, optional
+ Whether to print information and progress updates while running.
+
+ quality_score_kwargs : dict, optional
+ Keyword arguments to pass into :py:func:`rank.get_label_quality_scores <cleanlab.rank.get_label_quality_scores>`.
+
+ num_issue_kwargs : dict, optional
+ Keyword arguments to :py:func:`count.num_label_issues <cleanlab.count.num_label_issues>`
+ to control estimation of the number of label issues.
+ The only supported kwarg here for now is: `estimation_method`.
+
+ Returns
+ -------
+ issue_indices : np.ndarray
+ Indices of examples with label issues, sorted by label quality score.
+
+ Examples
+ --------
+ >>> batch_size = 10000 # for efficiency, set this to as large of a value as your memory can handle
+ >>> # Just demonstrating how to save your existing numpy labels, pred_probs arrays to compatible .npy files:
+ >>> np.save("LABELS.npy", labels_array)
+ >>> np.save("PREDPROBS.npy", pred_probs_array)
+ >>> # You can load these back into memmap arrays via: labels = np.load("LABELS.npy", mmap_mode="r")
+ >>> # and then run this method on the memmap arrays, or just run it directly on the .npy files like this:
+ >>> issues = find_label_issues_batched(labels_file="LABELS.npy", pred_probs_file="PREDPROBS.npy", batch_size=batch_size)
+ >>> # This method also works with Zarr arrays:
+ >>> import zarr
+ >>> # Just demonstrating how to save your existing numpy labels, pred_probs arrays to compatible .zarr files:
+ >>> zarr.convenience.save_array("LABELS.zarr", labels_array)
+ >>> zarr.convenience.save_array("PREDPROBS.zarr", pred_probs_array)
+ >>> # You can load from such files into Zarr arrays:
+ >>> labels = zarr.convenience.open("LABELS.zarr", mode="r")
+ >>> pred_probs = zarr.convenience.open("PREDPROBS.zarr", mode="r")
+ >>> # This method can be directly run on Zarr arrays, memmap arrays, or regular numpy arrays:
+ >>> issues = find_label_issues_batched(labels=labels, pred_probs=pred_probs, batch_size=batch_size)
+ """
+ if labels_file is not None:
+ if labels is not None:
+ raise ValueError("only specify one of: `labels` or `labels_file`")
+ if not isinstance(labels_file, str):
+ raise ValueError(
+ "labels_file must be str specifying path to .npy file containing the array of labels"
+ )
+ labels = np.load(labels_file, mmap_mode="r")
+ assert isinstance(labels, np.ndarray)
+
+ if pred_probs_file is not None:
+ if pred_probs is not None:
+ raise ValueError("only specify one of: `pred_probs` or `pred_probs_file`")
+ if not isinstance(pred_probs_file, str):
+ raise ValueError(
+ "pred_probs_file must be str specifying path to .npy file containing 2D array of pred_probs"
+ )
+ pred_probs = np.load(pred_probs_file, mmap_mode="r")
+ assert isinstance(pred_probs, np.ndarray)
+ if verbose:
+ print(
+ f"mmap-loaded numpy arrays have: {len(pred_probs)} examples, {pred_probs.shape[1]} classes"
+ )
+ if labels is None:
+ raise ValueError("must provide one of: `labels` or `labels_file`")
+ if pred_probs is None:
+ raise ValueError("must provide one of: `pred_probs` or `pred_probs_file`")
+
+ assert pred_probs is not None
+ if len(labels) != len(pred_probs):
+ raise ValueError(
+ f"len(labels)={len(labels)} does not match len(pred_probs)={len(pred_probs)}. Perhaps an issue loading mmap numpy arrays from file."
+ )
+ lab = LabelInspector(
+ num_class=pred_probs.shape[1],
+ verbose=verbose,
+ n_jobs=n_jobs,
+ quality_score_kwargs=quality_score_kwargs,
+ num_issue_kwargs=num_issue_kwargs,
+ )
+ n = len(labels)
+ if verbose:
+ from tqdm.auto import tqdm
+
+ pbar = tqdm(desc="number of examples processed for estimating thresholds", total=n)
+ i = 0
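+ # First pass: stream over the data in mini-batches to estimate per-class confident thresholds.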
+ while i < n:
+ end_index = i + batch_size
+ labels_batch = labels[i:end_index]
+ pred_probs_batch = pred_probs[i:end_index, :]
+ i = end_index
+ lab.update_confident_thresholds(labels_batch, pred_probs_batch)
+ if verbose:
+ pbar.update(batch_size)
+
+ # Next evaluate the quality of the labels (run this on full dataset you want to evaluate):
+ if verbose:
+ pbar.close()
+ pbar = tqdm(desc="number of examples processed for checking labels", total=n)
+ i = 0
+ while i < n:
+ end_index = i + batch_size
+ labels_batch = labels[i:end_index]
+ pred_probs_batch = pred_probs[i:end_index, :]
+ i = end_index
+ _ = lab.score_label_quality(labels_batch, pred_probs_batch)
+ if verbose:
+ pbar.update(batch_size)
+
+ if verbose:
+ pbar.close()
+
+ return lab.get_label_issues()
+
+
+class LabelInspector:
+ """
+ Class for finding label issues in big datasets where memory becomes a problem for other cleanlab methods.
+ Only create one such object per dataset and do not try to use the same ``LabelInspector`` across 2 datasets.
+ For efficiency, this class does little input checking.
+ You can first run :py:func:`filter.find_label_issues <cleanlab.filter.find_label_issues>`
+ on a small subset of your data to verify your inputs are properly formatted.
+ Do NOT modify any of the attributes of this class yourself!
+ Multi-label classification is not supported by this class; it only supports multi-class classification.
+
+ The recommended usage, demonstrated in the example script below, involves two passes over your data:
+ one pass to compute `confident_thresholds`, another to evaluate each label.
+ To maximize efficiency, try to use the largest batch_size your memory allows.
+ To reduce runtime further, you can run the first pass on a subset of your dataset
+ as long as it contains enough data from each class to estimate `confident_thresholds` accurately.
+
+ In the example script below:
+ - ``labels`` is a (big) 1D ``np.ndarray`` of class labels represented as integers in ``0,1,...,K-1``.
+ - ``pred_probs`` is a (big) 2D ``np.ndarray`` of predicted class probabilities,
+ where each row is an example, each column represents a class.
+
+ `labels` and `pred_probs` can be stored in a file instead where you load chunks of them at a time.
+ Methods to load arrays in chunks include: ``np.load(...,mmap_mode='r')``, ``numpy.memmap()``,
+ HDF5 or Zarr files, see: https://pythonspeed.com/articles/mmap-vs-zarr-hdf5/
+
+ Examples
+ --------
+ >>> n = len(labels)
+ >>> batch_size = 10000 # you can change this in between batches, set as big as your RAM allows
+ >>> lab = LabelInspector(num_class = pred_probs.shape[1])
+ >>> # First compute confident thresholds (for faster results, can also do this on a random subset of your data):
+ >>> i = 0
+ >>> while i < n:
+ >>> end_index = i + batch_size
+ >>> labels_batch = labels[i:end_index]
+ >>> pred_probs_batch = pred_probs[i:end_index,:]
+ >>> i = end_index
+ >>> lab.update_confident_thresholds(labels_batch, pred_probs_batch)
+ >>> # See what we calculated:
+ >>> confident_thresholds = lab.get_confident_thresholds()
+ >>> # Evaluate the quality of the labels (run this on full dataset you want to evaluate):
+ >>> i = 0
+ >>> while i < n:
+ >>> end_index = i + batch_size
+ >>> labels_batch = labels[i:end_index]
+ >>> pred_probs_batch = pred_probs[i:end_index,:]
+ >>> i = end_index
+ >>> batch_results = lab.score_label_quality(labels_batch, pred_probs_batch)
+ >>> # Indices of examples with label issues, sorted by label quality score (most severe to least severe):
+ >>> indices_of_examples_with_issues = lab.get_label_issues()
+ >>> # If your `pred_probs` and `labels` are arrays already in memory,
+ >>> # then you can use this shortcut for all of the above:
+ >>> indices_of_examples_with_issues = find_label_issues_batched(labels, pred_probs, batch_size=10000)
+
+ Parameters
+ ----------
+ num_class : int
+ The number of classes in your multi-class classification task.
+
+ store_results : bool, optional
+ Whether this object will store all label quality scores, a 1D array of shape ``(N,)``
+ where ``N`` is the total number of examples in your dataset.
+ Set this to False if you encounter memory problems even for small batch sizes (~1000).
+ If ``False``, you can still identify the label issues yourself by aggregating
+ the label quality scores for each batch, sorting them across all batches, and returning the top ``T`` indices
+ with ``T = self.get_num_issues()``.
+
+ verbose : bool, optional
+ Whether to print information and progress updates while running.
+
+ n_jobs: int, optional
+ Number of processes for multiprocessing (default value = 1). Only used on Linux.
+ If `n_jobs=None`, the number of physical cores is used if psutil is installed, otherwise the number of logical cores.
+
+ quality_score_kwargs : dict, optional
+ Keyword arguments to pass into :py:func:`rank.get_label_quality_scores <cleanlab.rank.get_label_quality_scores>`.
+
+ num_issue_kwargs : dict, optional
+ Keyword arguments to :py:func:`count.num_label_issues <cleanlab.count.num_label_issues>`
+ to control estimation of the number of label issues.
+ The only supported kwarg here for now is: `estimation_method`.
+ """
+
+ def __init__(
+ self,
+ *,
+ num_class: int,
+ store_results: bool = True,
+ verbose: bool = True,
+ quality_score_kwargs: Optional[dict] = None,
+ num_issue_kwargs: Optional[dict] = None,
+ n_jobs: Optional[int] = 1,
+ ):
+ if quality_score_kwargs is None:
+ quality_score_kwargs = {}
+ if num_issue_kwargs is None:
+ num_issue_kwargs = {}
+
+ self.num_class = num_class
+ self.store_results = store_results
+ self.verbose = verbose
+ self.quality_score_kwargs = quality_score_kwargs # extra arguments for ``rank.get_label_quality_scores()`` to control label quality scoring
+ self.num_issue_kwargs = num_issue_kwargs # extra arguments for ``count.num_label_issues()`` to control estimation of the number of label issues (only supported argument for now is: `estimation_method`).
+ self.off_diagonal_calibrated = False
+ if num_issue_kwargs.get("estimation_method") == "off_diagonal_calibrated":
+ # store extra attributes later needed for calibration:
+ self.off_diagonal_calibrated = True
+ self.prune_counts = np.zeros(self.num_class)
+ self.class_counts = np.zeros(self.num_class)
+ self.normalization = np.zeros(self.num_class)
+ else:
+ self.prune_count = 0 # number of label issues estimated based on data seen so far (only used when estimation_method is not calibrated)
+
+ if self.store_results:
+ self.label_quality_scores: List[float] = []
+
+ self.confident_thresholds = np.zeros(
+ (num_class,)
+ ) # current estimate of thresholds based on data seen so far
+ self.examples_per_class = np.zeros(
+ (num_class,)
+ ) # current counts of examples with each given label seen so far
+ self.examples_processed_thresh = (
+ 0 # number of examples seen so far for estimating thresholds
+ )
+ self.examples_processed_quality = 0 # number of examples seen so far for estimating label quality and number of label issues
+ # Determine number of cores for multiprocessing:
+ self.n_jobs: Optional[int] = None
+ os_name = platform.system()
+ if os_name != "Linux":
+ self.n_jobs = 1
+ if n_jobs is not None and n_jobs != 1 and self.verbose:
+ print(
+ "n_jobs is overridden to 1 because multiprocessing is only supported for Linux."
+ )
+ elif n_jobs is not None:
+ self.n_jobs = n_jobs
+ else:
+ if PSUTIL_EXISTS:
+ self.n_jobs = psutil.cpu_count(logical=False) # physical cores
+ if not self.n_jobs:
+ # switch to logical cores
+ self.n_jobs = mp.cpu_count()
+ if self.verbose:
+ print(
+ f"Multiprocessing will default to using the number of logical cores ({self.n_jobs}). To default to number of physical cores: pip install psutil"
+ )
+
+ def get_confident_thresholds(self, silent: bool = False) -> np.ndarray:
+ """
+ Fetches already-computed confident thresholds from the data seen so far
+ in the same format as: :py:func:`count.get_confident_thresholds <cleanlab.count.get_confident_thresholds>`.
+
+
+ Returns
+ -------
+ confident_thresholds : np.ndarray
+ An array of shape ``(K, )`` where ``K`` is the number of classes.
+ """
+ if self.examples_processed_thresh < 1:
+ raise ValueError(
+ "Have not computed any confident_thresholds yet. Call `update_confident_thresholds()` first."
+ )
+ else:
+ if self.verbose and not silent:
+ print(
+ f"Total number of examples used to estimate confident thresholds: {self.examples_processed_thresh}"
+ )
+ return self.confident_thresholds
+
+ def get_num_issues(self, silent: bool = False) -> int:
+ """
+ Fetches already-computed estimate of the number of label issues in the data seen so far
+ in the same format as: :py:func:`count.num_label_issues <cleanlab.count.num_label_issues>`.
+
+ Note: The estimated number of issues may differ from :py:func:`count.num_label_issues <cleanlab.count.num_label_issues>`
+ by 1 due to rounding differences.
+
+ Returns
+ -------
+ num_issues : int
+ The estimated number of examples with label issues in the data seen so far.
+ """
+ if self.examples_processed_quality < 1:
+ raise ValueError(
+ "Have not evaluated any labels yet. Call `score_label_quality()` first."
+ )
+ else:
+ if self.verbose and not silent:
+ print(
+ f"Total number of examples whose labels have been evaluated: {self.examples_processed_quality}"
+ )
+ if self.off_diagonal_calibrated:
+ calibrated_prune_counts = (
+ self.prune_counts
+ * self.class_counts
+ / np.clip(self.normalization, a_min=CLIPPING_LOWER_BOUND, a_max=None)
+ ) # avoid division by 0
+ return np.rint(np.sum(calibrated_prune_counts)).astype("int")
+ else: # not calibrated
+ return self.prune_count
+
+ def get_quality_scores(self) -> np.ndarray:
+ """
+ Fetches already-computed estimate of the label quality of each example seen so far
+ in the same format as: :py:func:`rank.get_label_quality_scores <cleanlab.rank.get_label_quality_scores>`.
+
+ Returns
+ -------
+ label_quality_scores : np.ndarray
+ Contains one score (between 0 and 1) per example seen so far.
+ Lower scores indicate more likely mislabeled examples.
+ """
+ if not self.store_results:
+ raise ValueError(
+ "Must initialize the LabelInspector with `store_results` == True. "
+ "Otherwise you can assemble the label quality scores yourself based on "
+ "the scores returned for each batch of data from `score_label_quality()`"
+ )
+ else:
+ return np.asarray(self.label_quality_scores)
+
+ def get_label_issues(self) -> np.ndarray:
+ """
+ Fetches already-computed estimate of indices of examples with label issues in the data seen so far,
+ in the same format as: :py:func:`filter.find_label_issues <cleanlab.filter.find_label_issues>`
+ with its `return_indices_ranked_by` argument specified.
+
+ Note: this method corresponds to ``filter.find_label_issues(..., filter_by=METHOD1, return_indices_ranked_by=METHOD2)``
+ where by default: ``METHOD1="low_self_confidence"``, ``METHOD2="self_confidence"``
+ or if this object was instantiated with ``quality_score_kwargs = {"method": "normalized_margin"}`` then we instead have:
+ ``METHOD1="low_normalized_margin"``, ``METHOD2="normalized_margin"``.
+
+ Note: The estimated number of issues may differ from :py:func:`filter.find_label_issues <cleanlab.filter.find_label_issues>`
+ by 1 due to rounding differences.
+
+ Returns
+ -------
+ issue_indices : np.ndarray
+ Indices of examples with label issues, sorted by label quality score.
+ """
+ if not self.store_results:
+ raise ValueError(
+ "Must initialize the LabelInspector with `store_results` == True. "
+ "Otherwise you can identify label issues yourself based on the scores from all "
+ "the batches of data and the total number of issues returned by `get_num_issues()`"
+ )
+ if self.examples_processed_quality < 1:
+ raise ValueError(
+ "Have not evaluated any labels yet. Call `score_label_quality()` first."
+ )
+ if self.verbose:
+ print(
+ f"Total number of examples whose labels have been evaluated: {self.examples_processed_quality}"
+ )
+ return find_top_issues(self.get_quality_scores(), top=self.get_num_issues(silent=True))
+
+ def update_confident_thresholds(self, labels: LabelLike, pred_probs: np.ndarray):
+ """
+ Updates the estimate of confident_thresholds stored in this class using a new batch of data.
+ Inputs should be in the same format as for: :py:func:`count.get_confident_thresholds <cleanlab.count.get_confident_thresholds>`.
+
+ Parameters
+ ----------
+ labels: np.ndarray or list
+ Given class labels for each example in the batch, values in ``0,1,2,...,K-1``.
+
+ pred_probs: np.ndarray
+ 2D array of model-predicted class probabilities for each example in the batch.
+ """
+ labels = _batch_check(labels, pred_probs, self.num_class)
+ batch_size = len(labels)
+ batch_thresholds = get_confident_thresholds(
+ labels, pred_probs
+ ) # thresholds for classes missing from this batch may exceed 1, but that does not matter since they are multiplied by those classes' counts in the batch (which are 0)
+ batch_class_counts = value_counts_fill_missing_classes(labels, num_classes=self.num_class)
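+ # Update the running per-class thresholds as a weighted average of the previous estimate
+ # and this batch's thresholds, weighted by the number of examples seen per class.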
+ self.confident_thresholds = (
+ self.examples_per_class * self.confident_thresholds
+ + batch_class_counts * batch_thresholds
+ ) / np.clip(
+ self.examples_per_class + batch_class_counts, a_min=1, a_max=None
+ ) # avoid division by 0
+ self.confident_thresholds = np.clip(
+ self.confident_thresholds, a_min=CONFIDENT_THRESHOLDS_LOWER_BOUND, a_max=None
+ )
+ self.examples_per_class += batch_class_counts
+ self.examples_processed_thresh += batch_size
+
+ def score_label_quality(
+ self,
+ labels: LabelLike,
+ pred_probs: np.ndarray,
+ *,
+ update_num_issues: bool = True,
+ ) -> np.ndarray:
+ """
+ Scores the label quality of each example in the provided batch of data,
+ and also updates the number of label issues stored in this class.
+ Inputs should be in the same format as for: :py:func:`rank.get_label_quality_scores <cleanlab.rank.get_label_quality_scores>`.
+
+ Parameters
+ ----------
+ labels: np.ndarray
+ Given class labels for each example in the batch, values in ``0,1,2,...,K-1``.
+
+ pred_probs: np.ndarray
+ 2D array of model-predicted class probabilities for each example in the batch of data.
+
+ update_num_issues: bool, optional
+ Whether or not to update the number of label issues or only compute label quality scores.
+ For lower runtimes, set this to ``False`` if you only want to score label quality and not find label issues.
+
+ Returns
+ -------
+ label_quality_scores : np.ndarray
+ Contains one score (between 0 and 1) for each example in the batch of data.
+ """
+ labels = _batch_check(labels, pred_probs, self.num_class)
+ batch_size = len(labels)
+ scores = _compute_label_quality_scores(
+ labels,
+ pred_probs,
+ confident_thresholds=self.get_confident_thresholds(silent=True),
+ **self.quality_score_kwargs,
+ )
+ class_counts = value_counts_fill_missing_classes(labels, num_classes=self.num_class)
+ if update_num_issues:
+ self._update_num_label_issues(labels, pred_probs, **self.num_issue_kwargs)
+ self.examples_processed_quality += batch_size
+ if self.store_results:
+ self.label_quality_scores += list(scores)
+
+ return scores
+
+ def _update_num_label_issues(
+ self,
+ labels: LabelLike,
+ pred_probs: np.ndarray,
+ **kwargs,
+ ):
+ """
+ Update the estimate of num_label_issues stored in this class using a new batch of data.
+ Kwargs are ignored here for now (included for forwards compatibility).
+ Instead of being specified here, `estimation_method` should be declared when this class is initialized.
+ """
+
+ # whether to match the output of count.num_label_issues exactly
+ # default is False, which gives significant speedup on large batches
+ # and empirically matches num_label_issues even on input sizes of
+ # 1M x 10k
+ thorough = False
+ if self.examples_processed_thresh < 1:
+ raise ValueError(
+ "Have not computed any confident_thresholds yet. Call `update_confident_thresholds()` first."
+ )
+
+ if self.n_jobs == 1:
+ adj_confident_thresholds = self.confident_thresholds - FLOATING_POINT_COMPARISON
+ pred_class = np.argmax(pred_probs, axis=1)
+ batch_size = len(labels)
+ if thorough:
+ # add margin for floating point comparison operations:
+ pred_gt_thresholds = pred_probs >= adj_confident_thresholds
+ max_ind = np.argmax(pred_probs * pred_gt_thresholds, axis=1)
+ if not self.off_diagonal_calibrated:
+ mask = (max_ind != labels) & (pred_class != labels)
+ else:
+ # calibrated
+ # should we change to above?
+ mask = pred_class != labels
+ else:
+ max_ind = pred_class
+ mask = pred_class != labels
+
+ if not self.off_diagonal_calibrated:
+ prune_count_batch = np.sum(
+ (
+ pred_probs[np.arange(batch_size), max_ind]
+ >= adj_confident_thresholds[max_ind]
+ )
+ & mask
+ )
+ self.prune_count += prune_count_batch
+ else: # calibrated
+ self.class_counts += value_counts_fill_missing_classes(
+ labels, num_classes=self.num_class
+ )
+ to_increment = (
+ pred_probs[np.arange(batch_size), max_ind] >= adj_confident_thresholds[max_ind]
+ )
+ for class_label in range(self.num_class):
+ labels_equal_to_class = labels == class_label
+ self.normalization[class_label] += np.sum(labels_equal_to_class & to_increment)
+ self.prune_counts[class_label] += np.sum(
+ labels_equal_to_class
+ & to_increment
+ & (max_ind != labels)
+ # & (pred_class != labels)
+ # This is not applied in num_label_issues(..., estimation_method="off_diagonal_custom"). Do we want to add it?
+ )
+ else: # multiprocessing implementation
+ global adj_confident_thresholds_shared
+ adj_confident_thresholds_shared = self.confident_thresholds - FLOATING_POINT_COMPARISON
+
+ global labels_shared, pred_probs_shared
+ labels_shared = labels
+ pred_probs_shared = pred_probs
+
+ # good values for this are ~1000-10000 in benchmarks where pred_probs has 1B entries:
+ processes = 5000
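+ # `processes` here is the target number of chunks the batch is split into (the pool itself runs n_jobs workers)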
+ if len(labels) <= processes:
+ chunksize = 1
+ else:
+ chunksize = len(labels) // processes
+ inds = split_arr(np.arange(len(labels)), chunksize)
+
+ if thorough:
+ use_thorough = np.ones(len(inds), dtype=bool)
+ else:
+ use_thorough = np.zeros(len(inds), dtype=bool)
+ args = zip(inds, use_thorough)
+ with mp.Pool(self.n_jobs) as pool:
+ if not self.off_diagonal_calibrated:
+ prune_count_batch = np.sum(
+ np.asarray(list(pool.imap_unordered(_compute_num_issues, args)))
+ )
+ self.prune_count += prune_count_batch
+ else:
+ results = list(pool.imap_unordered(_compute_num_issues_calibrated, args))
+ for result in results:
+ class_label = result[0]
+ self.class_counts[class_label] += 1
+ self.normalization[class_label] += result[1]
+ self.prune_counts[class_label] += result[2]
+
+
+def split_arr(arr: np.ndarray, chunksize: int) -> List[np.ndarray]:
+ """
+ Helper function to split array into chunks for multiprocessing.
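+
+ For example, ``split_arr(np.arange(5), 2)`` returns ``[array([0, 1]), array([2, 3]), array([4])]``.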
+ """
+ return np.split(arr, np.arange(chunksize, arr.shape[0], chunksize), axis=0)
+
+
+def _compute_num_issues(arg: Tuple[np.ndarray, bool]) -> int:
+ """
+ Helper function for `_update_num_label_issues` multiprocessing without calibration.
+ """
+ ind = arg[0]
+ thorough = arg[1]
+ label = labels_shared[ind]
+ pred_prob = pred_probs_shared[ind, :]
+ pred_class = np.argmax(pred_prob, axis=-1)
+ batch_size = len(label)
+ if thorough:
+ pred_gt_thresholds = pred_prob >= adj_confident_thresholds_shared
+ max_ind = np.argmax(pred_prob * pred_gt_thresholds, axis=-1)
+ prune_count_batch = np.sum(
+ (pred_prob[np.arange(batch_size), max_ind] >= adj_confident_thresholds_shared[max_ind])
+ & (max_ind != label)
+ & (pred_class != label)
+ )
+ else:
+ prune_count_batch = np.sum(
+ (
+ pred_prob[np.arange(batch_size), pred_class]
+ >= adj_confident_thresholds_shared[pred_class]
+ )
+ & (pred_class != label)
+ )
+ return prune_count_batch
+
+
+def _compute_num_issues_calibrated(arg: Tuple[np.ndarray, bool]) -> Tuple[Any, int, int]:
+ """
+ Helper function for `_update_num_label_issues` multiprocessing with calibration.
+ """
+ ind = arg[0]
+ thorough = arg[1]
+ label = labels_shared[ind]
+ pred_prob = pred_probs_shared[ind, :]
+ batch_size = len(label)
+
+ pred_class = np.argmax(pred_prob, axis=-1)
+ if thorough:
+ pred_gt_thresholds = pred_prob >= adj_confident_thresholds_shared
+ max_ind = np.argmax(pred_prob * pred_gt_thresholds, axis=-1)
+ to_inc = (
+ pred_prob[np.arange(batch_size), max_ind] >= adj_confident_thresholds_shared[max_ind]
+ )
+
+ prune_count_batch = to_inc & (max_ind != label)
+ normalization_batch = to_inc
+ else:
+ to_inc = (
+ pred_prob[np.arange(batch_size), pred_class]
+ >= adj_confident_thresholds_shared[pred_class]
+ )
+ normalization_batch = to_inc
+ prune_count_batch = to_inc & (pred_class != label)
+
+ return (label, normalization_batch, prune_count_batch)
+
+
+def _batch_check(labels: LabelLike, pred_probs: np.ndarray, num_class: int) -> np.ndarray:
+ """
+ Basic checks to ensure the batch of data looks OK. For efficiency, these checks are quite minimal.
+
+ Returns
+ -------
+ labels : np.ndarray
+ `labels` formatted as a 1D array.
+ """
+ batch_size = pred_probs.shape[0]
+ labels = np.asarray(labels)
+ if len(labels) != batch_size:
+ raise ValueError("labels and pred_probs must have same length")
+ if pred_probs.shape[1] != num_class:
+ raise ValueError("num_class must equal pred_probs.shape[1]")
+
+ return labels
diff --git a/cleanlab/experimental/mnist_pytorch.py b/cleanlab/experimental/mnist_pytorch.py
index bdc9b6ffba..180f628f82 100644
--- a/cleanlab/experimental/mnist_pytorch.py
+++ b/cleanlab/experimental/mnist_pytorch.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2017-2022 Cleanlab Inc.
+# Copyright (C) 2017-2023 Cleanlab Inc.
# This file is part of cleanlab.
#
# cleanlab is free software: you can redistribute it and/or modify
@@ -311,7 +311,6 @@ def fit(self, train_idx, train_labels=None, sample_weight=None, loader="train"):
# Train for self.epochs epochs
for epoch in range(1, self.epochs + 1):
-
# Enable dropout and batch norm layers
self.model.train()
for batch_idx, (data, target) in enumerate(train_loader):
diff --git a/cleanlab/filter.py b/cleanlab/filter.py
index 7a65da6622..70b2d8d5da 100644
--- a/cleanlab/filter.py
+++ b/cleanlab/filter.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2017-2022 Cleanlab Inc.
+# Copyright (C) 2017-2023 Cleanlab Inc.
# This file is part of cleanlab.
#
# cleanlab is free software: you can redistribute it and/or modify
@@ -17,25 +17,22 @@
"""
Methods to identify which examples have label issues in a classification dataset.
The documentation below assumes a dataset with ``N`` examples and ``K`` classes.
-This module considers two types of datasets:
-
-* standard (multi-class) classification where each example is labeled as belonging to exactly one of K classes (e.g. ``labels = np.array([0,0,1,0,2,1])``)
-* multi-label classification where each example can be labeled as belonging to multiple classes (e.g. ``labels = [[1,2],[1],[0],[],...]``)
+This module is for standard (multi-class) classification where each example is labeled as belonging to exactly one of K classes (e.g. ``labels = np.array([0,0,1,0,2,1])``).
+Some methods here also work for multi-label classification data where each example can be labeled as belonging to multiple classes (e.g. ``labels = [[1,2],[1],[0],[],...]``),
+but we encourage using the methods in the ``cleanlab.multilabel_classification`` module instead for such data.
"""
import numpy as np
from sklearn.metrics import confusion_matrix
import multiprocessing
-from multiprocessing.sharedctypes import RawArray
import sys
import warnings
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, Optional, Tuple, List
from functools import reduce
+import platform
-from cleanlab.count import calibrate_confident_joint
-from cleanlab.rank import (
- order_label_issues,
-)
+from cleanlab.count import calibrate_confident_joint, num_label_issues
+from cleanlab.rank import order_label_issues, get_label_quality_scores
import cleanlab.internal.multilabel_scorer as ml_scorer
from cleanlab.internal.validation import assert_valid_inputs
from cleanlab.internal.util import (
@@ -45,10 +42,10 @@
)
from cleanlab.internal.multilabel_utils import stack_complement, get_onehot_num_classes, int2onehot
from cleanlab.typing import LabelLike
+from cleanlab.multilabel_classification.filter import find_multilabel_issues_per_class
-# tqdm is a module used to print time-to-complete when multiprocessing is used.
-# This module is not necessary, and therefore is not a package dependency, but
-# when installed it improves user experience for large datasets.
+# tqdm is a package to print time-to-complete when multiprocessing is used.
+# This package is not necessary, but when installed it improves the user experience for large datasets.
try:
import tqdm
@@ -59,6 +56,19 @@
w = """To see estimated completion times for methods in cleanlab.filter, "pip install tqdm"."""
warnings.warn(w)
+# psutil is a package used to count physical cores for multiprocessing
+# This package is not necessary, because we can always fall back to logical cores as the default
+try:
+ import psutil
+
+ psutil_exists = True
+except ImportError as e: # pragma: no cover
+ psutil_exists = False
+
+# global variable for find_label_issues multiprocessing
+pred_probs_by_class: Dict[int, np.ndarray]
+prune_count_matrix_cols: Dict[int, np.ndarray]
+
def find_label_issues(
labels: LabelLike,
@@ -67,13 +77,13 @@ def find_label_issues(
return_indices_ranked_by: Optional[str] = None,
rank_by_kwargs: Optional[Dict[str, Any]] = None,
filter_by: str = "prune_by_noise_rate",
- multi_label: bool = False,
frac_noise: float = 1.0,
- num_to_remove_per_class: Optional[int] = None,
+ num_to_remove_per_class: Optional[List[int]] = None,
min_examples_per_class=1,
confident_joint: Optional[np.ndarray] = None,
n_jobs: Optional[int] = None,
verbose: bool = False,
+ multi_label: bool = False,
) -> np.ndarray:
"""
Identifies potentially bad labels in a classification dataset using confident learning.
@@ -99,8 +109,6 @@ def find_label_issues(
*Format requirements*: for dataset with K classes, each label must be integer in 0, 1, ..., K-1.
For a standard (multi-class) classification dataset where each example is labeled with one class,
`labels` should be 1D array of shape ``(N,)``, for example: ``labels = [1,0,2,1,1,0...]``.
- For a multi-label classification dataset where each example can belong to multiple (or no) classes,
- `labels` should be an iterable of iterables (e.g. ``List[List[int]]``) whose i-th element corresponds to list of classes that i-th example belongs to (e.g. ``labels = [[1,2],[1],[0],[],...]``).
pred_probs : np.ndarray, optional
An array of shape ``(N, K)`` of model-predicted class probabilities,
@@ -129,7 +137,7 @@ class 0, 1, ..., K-1.
label quality score (see :py:func:`rank.get_label_quality_scores
`).
- filter_by : {'prune_by_class', 'prune_by_noise_rate', 'both', 'confident_learning', 'predicted_neq_given'}, default='prune_by_noise_rate'
+ filter_by : {'prune_by_class', 'prune_by_noise_rate', 'both', 'confident_learning', 'predicted_neq_given', 'low_normalized_margin', 'low_self_confidence'}, default='prune_by_noise_rate'
Method to determine which examples are flagged as having label issue, so you can filter/prune them from the dataset. Options:
- ``'prune_by_noise_rate'``: filters examples with *high probability* of being mislabeled for every non-diagonal in the confident joint (see `prune_counts_matrix` in `filter.py`). These are the examples where (with high confidence) the given label is unlikely to match the predicted label for the example.
@@ -137,13 +145,8 @@ class 0, 1, ..., K-1.
- ``'both'``: filters only those examples that would be filtered by both ``'prune_by_noise_rate'`` and ``'prune_by_class'``.
- ``'confident_learning'``: filters the examples counted as part of the off-diagonals of the confident joint. These are the examples that are confidently predicted to be a different label than their given label.
- ``'predicted_neq_given'``: filters examples for which the predicted class (i.e. argmax of the predicted probabilities) does not match the given label.
-
- multi_label : bool, optional
- If ``True``, labels should be an iterable (e.g. list) of iterables, containing a
- list of class labels for each example, instead of just a single label.
- The multi-label setting supports classification tasks where an example can belong to more than 1 class or none of the classes (rather than exactly one class as in standard multi-class classification).
- Example of a multi-labeled `labels` input: ``[[0,1], [1], [0,2], [0,1,2], [0], [1], [], ...]``. This says the first example in dataset belongs to both class 0 and class 1, according to its given label.
- Each row of `pred_probs` no longer needs to sum to 1 in multi-label settings, since one example can now belong to multiple classes simultaneously.
+ - ``'low_normalized_margin'``: filters the examples with *smallest* normalized margin label quality score. The number of issues returned matches :py:func:`count.num_label_issues `.
+ - ``'low_self_confidence'``: filters the examples with *smallest* self confidence label quality score. The number of issues returned matches :py:func:`count.num_label_issues `.
frac_noise : float, default=1.0
Used to only return the "top" ``frac_noise * num_label_issues``. The choice of which "top"
@@ -155,7 +158,6 @@ class 0, 1, ..., K-1.
When ``frac_noise=1.0``, return all "confident" estimated noise indices (recommended).
frac_noise * number_of_mislabeled_examples_in_class_k.
- Note: specifying `frac_noise` is not yet supported if `multi_label` is True.
num_to_remove_per_class : array_like
An iterable of length K, the number of classes.
@@ -183,11 +185,10 @@ class 0, 1, ..., K-1.
Entry ``(j, k)`` in the matrix is the number of examples confidently counted into the pair of ``(noisy label=j, true label=k)`` classes.
The `confident_joint` can be computed using :py:func:`count.compute_confident_joint `.
If not provided, it is computed from the given (noisy) `labels` and `pred_probs`.
- If `multi_label` is True, `confident_joint` should instead be a one-vs-rest array with shape ``(K, 2, 2)`` as returned by :py:func:`count.compute_confident_joint ` function.
n_jobs : optional
Number of processing threads used by multiprocessing. Default ``None``
- sets to the number of cores on your CPU.
+ sets to the number of cores on your CPU (physical cores if you have the ``psutil`` package installed, otherwise logical cores).
Set this to 1 to *disable* parallel processing (if its causing issues).
Windows users may see a speed-up with ``n_jobs=1``.
@@ -211,6 +212,8 @@ class 0, 1, ..., K-1.
rank_by_kwargs = {}
assert filter_by in [
+ "low_normalized_margin",
+ "low_self_confidence",
"prune_by_noise_rate",
"prune_by_class",
"both",
@@ -229,34 +232,79 @@ class 0, 1, ..., K-1.
allow_one_class=allow_one_class,
)
- if filter_by in ["confident_learning", "predicted_neq_given"] and (
- frac_noise != 1.0 or num_to_remove_per_class is not None
- ):
+ if filter_by in [
+ "confident_learning",
+ "predicted_neq_given",
+ "low_normalized_margin",
+ "low_self_confidence",
+ ] and (frac_noise != 1.0 or num_to_remove_per_class is not None):
warn_str = (
- "WARNING! frac_noise and num_to_remove_per_class parameters are only supported"
+ "frac_noise and num_to_remove_per_class parameters are only supported"
" for filter_by 'prune_by_noise_rate', 'prune_by_class', and 'both'. They "
- "are not supported for methods 'confident_learning' or "
- "'predicted_neq_given'."
+ "are not supported for methods 'confident_learning', 'predicted_neq_given', "
+ "'low_normalized_margin' or 'low_self_confidence'."
)
warnings.warn(warn_str)
if (num_to_remove_per_class is not None) and (
- filter_by in ["confident_learning", "predicted_neq_given"]
+ filter_by
+ in [
+ "confident_learning",
+ "predicted_neq_given",
+ "low_normalized_margin",
+ "low_self_confidence",
+ ]
):
- # TODO - add support for these two filters
+ # TODO - add support for these filters
raise ValueError(
- "filter_by 'confident_learning' or 'predicted_neq_given' is not supported (yet) when setting 'num_to_remove_per_class'"
+ "filter_by 'confident_learning', 'predicted_neq_given', 'low_normalized_margin' "
+ "or 'low_self_confidence' is not supported (yet) when setting 'num_to_remove_per_class'"
+ )
+ if filter_by == "confident_learning" and isinstance(confident_joint, np.ndarray):
+ warn_str = (
+ "The supplied `confident_joint` is ignored when `filter_by = 'confident_learning'`; confident joint will be "
+ "re-estimated from the given labels. To use your supplied `confident_joint`, please specify a different "
+ "`filter_by` value."
)
+ warnings.warn(warn_str)
+
+ K = get_num_classes(
+ labels=labels, pred_probs=pred_probs, label_matrix=confident_joint, multi_label=multi_label
+ )
+ # Boolean set to true if dataset is large
+ big_dataset = K * len(labels) > 1e8
# Set-up number of multiprocessing threads
+ # On Windows/macOS, when multi_label is True, multiprocessing is much slower
+ # even for fairly large input arrays, so we default to n_jobs=1 in this case
+ os_name = platform.system()
if n_jobs is None:
- n_jobs = multiprocessing.cpu_count()
+ if multi_label and os_name != "Linux":
+ n_jobs = 1
+ else:
+ if psutil_exists:
+ n_jobs = psutil.cpu_count(logical=False) # physical cores
+ elif big_dataset:
+ print(
+ "To default `n_jobs` to the number of physical cores for multiprocessing in find_label_issues(), please: `pip install psutil`.\n"
+ "Note: You can safely ignore this message. `n_jobs` only affects runtimes, results will be the same no matter its value.\n"
+ "Since psutil is not installed, `n_jobs` was set to the number of logical cores by default.\n"
+ "Disable this message by either installing psutil or specifying the `n_jobs` argument."
+ ) # pragma: no cover
+ if not n_jobs:
+ # either psutil does not exist
+ # or psutil can return None when physical cores cannot be determined
+ # switch to logical cores
+ n_jobs = multiprocessing.cpu_count()
else:
assert n_jobs >= 1
if multi_label:
if not isinstance(labels, list):
raise TypeError("`labels` must be list when `multi_label=True`.")
-
+ warnings.warn(
+ "The multi_label argument to filter.find_label_issues() is deprecated and will be removed in future versions. Please use `multilabel_classification.filter.find_label_issues()` instead.",
+ DeprecationWarning,
+ )
return _find_label_issues_multilabel(
labels,
pred_probs,
@@ -272,13 +320,8 @@ class 0, 1, ..., K-1.
)
# Else this is standard multi-class classification
- K = get_num_classes(
- labels=labels, pred_probs=pred_probs, label_matrix=confident_joint, multi_label=multi_label
- )
# Number of examples in each class of labels
label_counts = value_counts_fill_missing_classes(labels, K, multi_label=multi_label)
- # Boolean set to true if dataset is large
- big_dataset = K * len(labels) > 1e8
# Ensure labels are of type np.ndarray()
labels = np.asarray(labels)
if confident_joint is None or filter_by == "confident_learning":
@@ -290,6 +333,25 @@ class 0, 1, ..., K-1.
multi_label=multi_label,
return_indices_of_off_diagonals=True,
)
+
+ if filter_by in ["low_normalized_margin", "low_self_confidence"]:
+ # TODO: consider setting adjust_pred_probs to true based on benchmarks (or adding it kwargs, or ignoring and leaving as false by default)
+ scores = get_label_quality_scores(
+ labels,
+ pred_probs,
+ method=filter_by[4:],
+ adjust_pred_probs=False,
+ )
+ num_errors = num_label_issues(
+ labels, pred_probs, multi_label=multi_label # TODO: Check usage of multilabel
+ )
+ # Find label issues O(nlogn) solution (mapped to boolean mask later in the method)
+ cl_error_indices = np.argsort(scores)[:num_errors]
+ # The following is the fastest O(n) solution (beware off-by-one errors), but if many scores are identical it will overcount:
+ # it can return more or fewer issues than intended, and the boolean mask is unranked, so there is no way to choose which of the tied highest scores to drop
+ # boundary = np.partition(scores, num_errors)[num_errors] # O(n) solution
+ # label_issues_mask = scores <= boundary
+
if filter_by in ["prune_by_noise_rate", "prune_by_class", "both"]:
# Create `prune_count_matrix` with the number of examples to remove in each class and
# leave at least min_examples_per_class examples per class.
@@ -310,88 +372,71 @@ class 0, 1, ..., K-1.
prune_count_matrix = round_preserving_row_totals(tmp)
# Prepare multiprocessing shared data
- if n_jobs > 1:
- _labels = RawArray("I", labels) # type: ignore
- _label_counts = RawArray("I", label_counts) # type: ignore
- _prune_count_matrix = RawArray("I", prune_count_matrix.flatten()) # type: ignore
- _pred_probs = RawArray("f", pred_probs.flatten()) # type: ignore
- else: # Multiprocessing is turned off. Create tuple with all parameters
- args = (
- labels,
- label_counts,
- prune_count_matrix,
- pred_probs,
- multi_label,
- min_examples_per_class,
- )
+ # On Linux, multiprocessing is started with fork,
+ # so data can be shared with global variables + COW
+ # On Windows/macOS, processes are started with spawn,
+ # so data will need to be pickled to the subprocesses through input args
+ chunksize = max(1, K // n_jobs)
+ if n_jobs == 1 or os_name == "Linux":
+ global pred_probs_by_class, prune_count_matrix_cols
+ pred_probs_by_class = {k: pred_probs[labels == k] for k in range(K)}
+ prune_count_matrix_cols = {k: prune_count_matrix[:, k] for k in range(K)}
+ args = [[k, min_examples_per_class, None] for k in range(K)]
+ else:
+ args = [
+ [k, min_examples_per_class, [pred_probs[labels == k], prune_count_matrix[:, k]]]
+ for k in range(K)
+ ]
# Perform Pruning with threshold probabilities from BFPRT algorithm in O(n)
# Operations are parallelized across all CPU processes
if filter_by == "prune_by_class" or filter_by == "both":
- if n_jobs > 1: # parallelize
- with multiprocessing.Pool(
- n_jobs,
- initializer=_init,
- initargs=(
- _labels,
- _label_counts,
- _prune_count_matrix,
- prune_count_matrix.shape,
- _pred_probs,
- pred_probs.shape,
- multi_label,
- min_examples_per_class,
- ),
- ) as p:
+ if n_jobs > 1:
+ with multiprocessing.Pool(n_jobs) as p:
if verbose: # pragma: no cover
print("Parallel processing label issues by class.")
sys.stdout.flush()
if big_dataset and tqdm_exists:
label_issues_masks_per_class = list(
- tqdm.tqdm(p.imap(_prune_by_class, range(K)), total=K),
+ tqdm.tqdm(p.imap(_prune_by_class, args, chunksize=chunksize), total=K)
)
else:
- label_issues_masks_per_class = p.map(_prune_by_class, range(K))
- else: # n_jobs = 1, so no parallelization
- label_issues_masks_per_class = [_prune_by_class(k, args) for k in range(K)]
- label_issues_mask = np.stack(label_issues_masks_per_class).any(axis=0)
+ label_issues_masks_per_class = p.map(_prune_by_class, args, chunksize=chunksize)
+ else:
+ label_issues_masks_per_class = [_prune_by_class(arg) for arg in args]
+
+ label_issues_mask = np.zeros(len(labels), dtype=bool)
+ for k, mask in enumerate(label_issues_masks_per_class):
+ if len(mask) > 1:
+ label_issues_mask[labels == k] = mask
if filter_by == "both":
label_issues_mask_by_class = label_issues_mask
if filter_by == "prune_by_noise_rate" or filter_by == "both":
- if n_jobs > 1: # parallelize
- with multiprocessing.Pool(
- n_jobs,
- initializer=_init,
- initargs=(
- _labels,
- _label_counts,
- _prune_count_matrix,
- prune_count_matrix.shape,
- _pred_probs,
- pred_probs.shape,
- multi_label,
- min_examples_per_class,
- ),
- ) as p:
+ if n_jobs > 1:
+ with multiprocessing.Pool(n_jobs) as p:
if verbose: # pragma: no cover
print("Parallel processing label issues by noise rate.")
sys.stdout.flush()
if big_dataset and tqdm_exists:
label_issues_masks_per_class = list(
- tqdm.tqdm(p.imap(_prune_by_count, range(K)), total=K)
+ tqdm.tqdm(p.imap(_prune_by_count, args, chunksize=chunksize), total=K)
)
else:
- label_issues_masks_per_class = p.map(_prune_by_count, range(K))
- else: # n_jobs = 1, so no parallelization
- label_issues_masks_per_class = [_prune_by_count(k, args) for k in range(K)]
- label_issues_mask = np.stack(label_issues_masks_per_class).any(axis=0)
+ label_issues_masks_per_class = p.map(_prune_by_count, args, chunksize=chunksize)
+ else:
+ label_issues_masks_per_class = [_prune_by_count(arg) for arg in args]
+
+ label_issues_mask = np.zeros(len(labels), dtype=bool)
+ for k, mask in enumerate(label_issues_masks_per_class):
+ if len(mask) > 1:
+ label_issues_mask[labels == k] = mask
if filter_by == "both":
label_issues_mask = label_issues_mask & label_issues_mask_by_class
- if filter_by == "confident_learning":
+ if filter_by in ["confident_learning", "low_normalized_margin", "low_self_confidence"]:
label_issues_mask = np.zeros(len(labels), dtype=bool)
for idx in cl_error_indices:
label_issues_mask[idx] = True
@@ -399,12 +444,12 @@ class 0, 1, ..., K-1.
if filter_by == "predicted_neq_given":
label_issues_mask = find_predicted_neq_given(labels, pred_probs, multi_label=multi_label)
- # Remove label issues if given label == model prediction
- # TODO: consider use of _multiclass_crossval_predict() here
- pred = pred_probs.argmax(axis=1)
- for i, pred_label in enumerate(pred):
- if pred_label == labels[i]:
- label_issues_mask[i] = False
+ if filter_by not in ["low_self_confidence", "low_normalized_margin"]:
+ # Remove label issues if given label == model prediction if issues haven't been removed yet
+ pred = pred_probs.argmax(axis=1)
+ for i, pred_label in enumerate(pred):
+ if pred_label == labels[i]:
+ label_issues_mask[i] = False
if verbose:
print("Number of label issues found: {}".format(sum(label_issues_mask)))
@@ -429,7 +474,7 @@ def _find_label_issues_multilabel(
rank_by_kwargs={},
filter_by: str = "prune_by_noise_rate",
frac_noise: float = 1.0,
- num_to_remove_per_class: Optional[int] = None,
+ num_to_remove_per_class: Optional[List[int]] = None,
min_examples_per_class=1,
confident_joint: Optional[np.ndarray] = None,
n_jobs: Optional[int] = None,
@@ -440,7 +485,42 @@ def _find_label_issues_multilabel(
This is done via a one-vs-rest reduction for each class and the results are subsequently aggregated across all classes.
Here `labels` must be formatted as an iterable of iterables, e.g. ``List[List[int]]``.
"""
- per_class_issues = _find_multilabel_issues_per_class(
+ if filter_by in ["low_normalized_margin", "low_self_confidence"]:
+ num_errors = sum(
+ find_label_issues(
+ labels=labels,
+ pred_probs=pred_probs,
+ confident_joint=confident_joint,
+ multi_label=True,
+ filter_by="confident_learning",
+ )
+ )
+
+ y_one, num_classes = get_onehot_num_classes(labels, pred_probs)
+ label_quality_scores = ml_scorer.get_label_quality_scores(
+ labels=y_one,
+ pred_probs=pred_probs,
+ )
+
+ cl_error_indices = np.argsort(label_quality_scores)[:num_errors]
+ label_issues_mask = np.zeros(len(labels), dtype=bool)
+ for idx in cl_error_indices:
+ label_issues_mask[idx] = True
+
+ if return_indices_ranked_by is not None:
+ label_quality_scores_issues = ml_scorer.get_label_quality_scores(
+ labels=y_one[label_issues_mask],
+ pred_probs=pred_probs[label_issues_mask],
+ method=ml_scorer.MultilabelScorer(
+ base_scorer=ml_scorer.ClassLabelScorer.from_str(return_indices_ranked_by),
+ ),
+ base_scorer_kwargs=rank_by_kwargs,
+ )
+ return cl_error_indices[np.argsort(label_quality_scores_issues)]
+
+ return label_issues_mask
+
+ per_class_issues = find_multilabel_issues_per_class(
labels,
pred_probs,
return_indices_ranked_by,
@@ -472,131 +552,6 @@ def _find_label_issues_multilabel(
return label_issues_idx[np.argsort(label_quality_scores_issues)]
-def _find_multilabel_issues_per_class(
- labels: list,
- pred_probs: np.ndarray,
- return_indices_ranked_by: Optional[str] = None,
- rank_by_kwargs={},
- filter_by: str = "prune_by_noise_rate",
- frac_noise: float = 1.0,
- num_to_remove_per_class: Optional[int] = None,
- min_examples_per_class=1,
- confident_joint: Optional[np.ndarray] = None,
- n_jobs: Optional[int] = None,
- verbose: bool = False,
-) -> Union[np.ndarray, Tuple[List[np.ndarray], List[Any], List[np.ndarray]]]:
- """
- Parameters
- ----------
- labels : List[List[int]]
- List of noisy labels for multi-label classification where each example can belong to multiple classes (e.g. ``labels = [[1,2],[1],[0],[],...]`` indicates the first example in dataset belongs to both class 1 and class 2.
-
-
- pred_probs : np.ndarray
- An array of shape ``(N, K)`` of model-predicted probabilities,
- ``P(label=k|x)``. Each row of this matrix corresponds
- to an example `x` and contains the model-predicted probabilities that
- `x` belongs to each possible class, for each of the K classes. The
- columns must be ordered such that these probabilities correspond to
- class 0, 1, ..., K-1. They need not sum to 1.0
-
-
- return_indices_ranked_by : {None, 'self_confidence', 'normalized_margin', 'confidence_weighted_entropy'}, default=None
- Refer to documentation for this argument in filter.find_label_issues() for details.
-
- rank_by_kwargs : dict, optional
- Refer to documentation for this argument in filter.find_label_issues() for details.
-
- filter_by : {'prune_by_class', 'prune_by_noise_rate', 'both', 'confident_learning', 'predicted_neq_given'}, default='prune_by_noise_rate'
- Refer to documentation for this argument in filter.find_label_issues() for details.
-
- frac_noise : float, default=1.0
- Refer to documentation for this argument in filter.find_label_issues() for details.
-
- num_to_remove_per_class : array_like
- Refer to documentation for this argument in filter.find_label_issues() for details.
-
- min_examples_per_class : int, default=1
- Refer to documentation for this argument in filter.find_label_issues() for details.
-
- confident_joint : np.ndarray, optional
- An array of shape ``(K, 2, 2)`` representing a one-vs-rest formatted confident joint.
- Entry ``(c, i, j)`` in this array is the number of examples confidently counted into a ``(class c, noisy label=i, true label=j)`` bin,
- where `i, j` are either 0 or 1 to denote whether this example belongs to class `c` or not
- (recall examples can belong to multiple classes in multi-label classification).
- The `confident_joint` can be computed using :py:func:`count.compute_confident_joint `.
- If not provided, it is computed from the given (noisy) `labels` and `pred_probs`.
-
- n_jobs : optional
- Refer to documentation for this argument in filter.find_label_issues() for details.
-
- verbose : optional
- If ``True``, prints when multiprocessing happens.
-
- Returns
- -------
- per_class_label_issues : list(np.ndarray)
- If `return_indices_ranked_by` left unspecified, returns a list of boolean **masks** for the entire dataset
- where ``True`` represents a label issue and ``False`` represents an example that is
- accurately labeled with high confidence.
- If `return_indices_ranked_by` is specified, returns a list of shorter arrays of **indices** of examples identified to have
- label issues (i.e. those indices where the mask would be ``True``), sorting by likelihood that the corresponding label is correct is not supported yet.
-
- Note
- ----
- Obtain the *indices* of label issues in your dataset by setting
- `return_indices_ranked_by`.
-
- """
- y_one, num_classes = get_onehot_num_classes(labels, pred_probs)
- if return_indices_ranked_by is None:
- bissues = np.zeros(y_one.shape).astype(bool)
- else:
- label_issues_list = []
- labels_list = []
- pred_probs_list = []
- if confident_joint is not None:
- confident_joint_shape = confident_joint.shape
- if confident_joint_shape == (num_classes, num_classes):
- warnings.warn(
- f"The new recommended format for `confident_joint` in multi_label settings is (num_classes,2,2) as output by compute_confident_joint(...,multi_label=True). Your K x K confident_joint in the old format is being ignored."
- )
- confident_joint = None
- elif confident_joint_shape != (num_classes, 2, 2):
- raise ValueError("confident_joint should be of shape (num_classes, 2, 2)")
- for class_num, (label, pred_prob_for_class) in enumerate(zip(y_one.T, pred_probs.T)):
- pred_probs_binary = stack_complement(pred_prob_for_class)
- if confident_joint is None:
- conf = None
- else:
- conf = confident_joint[class_num]
- binary_label_issues = find_label_issues(
- labels=label,
- pred_probs=pred_probs_binary,
- return_indices_ranked_by=return_indices_ranked_by,
- frac_noise=frac_noise,
- rank_by_kwargs=rank_by_kwargs,
- filter_by=filter_by,
- multi_label=False,
- num_to_remove_per_class=num_to_remove_per_class,
- min_examples_per_class=min_examples_per_class,
- confident_joint=conf,
- n_jobs=n_jobs,
- verbose=verbose,
- )
-
- if return_indices_ranked_by is None:
- bissues[:, class_num] = binary_label_issues
- else:
- label_issues_list.append(binary_label_issues)
- labels_list.append(label)
- pred_probs_list.append(pred_probs_binary)
- if return_indices_ranked_by is None:
- return bissues
- else:
- return label_issues_list, labels_list, pred_probs_list
-
-
def _keep_at_least_n_per_class(
prune_count_matrix: np.ndarray, n: int, *, frac_noise: float = 1.0
) -> np.ndarray:
@@ -807,9 +762,6 @@ class 0, 1, ..., K-1. `pred_probs` should have been computed using 3 (or
label issue and ``False`` represents an example that is accurately
labeled with high confidence.
- Note
- ----
- Multi-label classification is not supported in this method.
"""
assert_valid_inputs(X=None, y=labels, pred_probs=pred_probs, multi_label=False)
@@ -891,7 +843,7 @@ def _get_shared_data() -> Any: # pragma: no cover
# TODO figure out what the types inside args are.
-def _prune_by_class(k: int, args=None) -> np.ndarray:
+def _prune_by_class(args: list) -> np.ndarray:
"""multiprocessing Helper function for find_label_issues()
that assumes globals and produces a mask for class k for each example by
removing the examples with *smallest probability* of
@@ -902,41 +854,34 @@ def _prune_by_class(k: int, args=None) -> np.ndarray:
k : int (between 0 and num classes - 1)
The class of interest."""
- if args: # Single processing - params are passed in
- (
- labels,
- label_counts,
- prune_count_matrix,
- pred_probs,
- multi_label,
- min_examples_per_class,
- ) = args
- else: # Multiprocessing - data is shared across sub-processes
- (
- labels,
- label_counts,
- prune_count_matrix,
- pred_probs,
- multi_label,
- min_examples_per_class,
- ) = _get_shared_data()
+ k, min_examples_per_class, arrays = args
+ if arrays is None:
+ pred_probs = pred_probs_by_class[k]
+ prune_count_matrix = prune_count_matrix_cols[k]
+ else:
+ pred_probs = arrays[0]
+ prune_count_matrix = arrays[1]
- if label_counts[k] > min_examples_per_class: # No prune if not at least min_examples_per_class
- num_issues = label_counts[k] - prune_count_matrix[k][k]
+ label_counts = pred_probs.shape[0]
+ label_issues = np.zeros(label_counts, dtype=bool)
+ if label_counts > min_examples_per_class: # No prune if not at least min_examples_per_class
+ num_issues = label_counts - prune_count_matrix[k]
# Get return_indices_ranked_by of the smallest prob of class k for examples with noisy label k
- label_filter = np.array([k in lst for lst in labels]) if multi_label else labels == k
- class_probs = pred_probs[:, k]
- rank = np.partition(class_probs[label_filter], num_issues)[num_issues]
- return label_filter & (class_probs < rank)
- else:
- warnings.warn(
- f"May not flag all label issues in class: {k}, it has too few examples (see argument: `min_examples_per_class`)"
- )
- return np.zeros(len(labels), dtype=bool)
+ # rank = np.partition(class_probs, num_issues)[num_issues]
+ if num_issues >= 1:
+ class_probs = pred_probs[:, k]
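+ # Flag the num_issues examples with the smallest predicted probability of their given class k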
+ order = np.argsort(class_probs)
+ label_issues[order[:num_issues]] = True
+ return label_issues
+
+ warnings.warn(
+ f"May not flag all label issues in class: {k}, it has too few examples (see argument: `min_examples_per_class`)"
+ )
+ return label_issues
# TODO figure out what the types inside args are.
-def _prune_by_count(k: int, args=None) -> np.ndarray:
+def _prune_by_count(args: list) -> np.ndarray:
"""multiprocessing Helper function for find_label_issues() that assumes
globals and produces a mask for class k for each example by
removing the example with noisy label k having *largest margin*,
@@ -948,43 +893,34 @@ def _prune_by_count(k: int, args=None) -> np.ndarray:
k : int (between 0 and num classes - 1)
The true_label class of interest."""
- if args: # Single processing - params are passed in
- (
- labels,
- label_counts,
- prune_count_matrix,
- pred_probs,
- multi_label,
- min_examples_per_class,
- ) = args
- else: # Multiprocessing - data is shared across sub-processes
- (
- labels,
- label_counts,
- prune_count_matrix,
- pred_probs,
- multi_label,
- min_examples_per_class,
- ) = _get_shared_data()
+ k, min_examples_per_class, arrays = args
+ if arrays is None:
+ pred_probs = pred_probs_by_class[k]
+ prune_count_matrix = prune_count_matrix_cols[k]
+ else:
+ pred_probs = arrays[0]
+ prune_count_matrix = arrays[1]
- label_issues_mask = np.zeros(len(pred_probs), dtype=bool)
- pred_probs_k = pred_probs[:, k]
- K = get_num_classes(labels, pred_probs, multi_label=multi_label)
- if label_counts[k] <= min_examples_per_class: # No prune if not at least min_examples_per_class
+ label_counts = pred_probs.shape[0]
+ label_issues_mask = np.zeros(label_counts, dtype=bool)
+ if label_counts <= min_examples_per_class:
warnings.warn(
f"May not flag all label issues in class: {k}, it has too few examples (see `min_examples_per_class` argument)"
)
- return np.zeros(len(labels), dtype=bool)
- for j in range(K): # j is true label index (k is noisy label index)
- num2prune = prune_count_matrix[j][k]
+ return label_issues_mask
+
+ K = pred_probs.shape[1]
+ if K < 1:
+ raise ValueError("Must have at least 1 class.")
+ for j in range(K):
+ num2prune = prune_count_matrix[j]
# Only prune for noise rates, not diagonal entries
if k != j and num2prune > 0:
# num2prune's largest p(true class k) - p(noisy class k)
# for x with true label j
- margin = pred_probs[:, j] - pred_probs_k
- label_filter = np.array([k in lst for lst in labels]) if multi_label else labels == k
- cut = -np.partition(-margin[label_filter], num2prune - 1)[num2prune - 1]
- label_issues_mask = label_issues_mask | (label_filter & (margin >= cut))
+ margin = pred_probs[:, j] - pred_probs[:, k]
+ order = np.argsort(-margin)
+ label_issues_mask[order[:num2prune]] = True
return label_issues_mask
diff --git a/cleanlab/internal/constants.py b/cleanlab/internal/constants.py
new file mode 100644
index 0000000000..a848d48508
--- /dev/null
+++ b/cleanlab/internal/constants.py
@@ -0,0 +1,49 @@
+# Copyright (C) 2017-2023 Cleanlab Inc.
+# This file is part of cleanlab.
+#
+# cleanlab is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published
+# by the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# cleanlab is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with cleanlab. If not, see <https://www.gnu.org/licenses/>.
+
+
+FLOATING_POINT_COMPARISON = 1e-6 # tolerance used for fuzzy floating point equality comparisons
+CLIPPING_LOWER_BOUND = 1e-6 # lower-bound clipping threshold for expected behavior
+CONFIDENT_THRESHOLDS_LOWER_BOUND = (
+ 2 * FLOATING_POINT_COMPARISON
+) # lower bound imposed to clip confident thresholds from below, has to be larger than floating point comparison
+TINY_VALUE = 1e-100 # very tiny value for clipping
+
+
+# Object Detection Constants
+EUC_FACTOR = 0.1 # Factor to control the magnitude of the Euclidean distance. Increasing the factor makes the distance between two objects go to zero more rapidly.
+MAX_ALLOWED_BOX_PRUNE = 0.97 # Maximum allowed fraction of boxes pruned at a given threshold before a warning is thrown. Pruning too many boxes negatively affects performance.
+
+ALPHA = 0.9 # Param for objectlab, weight between IoU and distance when considering similarity matrix. High alpha means considering IoU more strongly over distance
+LOW_PROBABILITY_THRESHOLD = 0.5 # Param for get_label_quality_score, lowest predicted class probability threshold allowed when considering predicted boxes to identify badly located label boxes.
+HIGH_PROBABILITY_THRESHOLD = 0.95 # Param for objectlab, high probability threshold for considering predicted boxes to identify overlooked and swapped label boxes
+TEMPERATURE = 0.1 # Param for objectlab, temperature of the softmin function used to pool the per-box quality scores for an error subtype across all boxes into a single subtype score for the image. With a lower temperature, softmin pooling acts more like minimum pooling; with a higher temperature, it acts more like mean pooling.
+
+OVERLOOKED_THRESHOLD = 0.3 # Param for find_label_issues. Per-box label quality score threshold to determine max score for a box to be considered an overlooked issue
+BADLOC_THRESHOLD = 0.3 # Param for find_label_issues. Per-box label quality score threshold to determine max score for a box to be considered a bad location issue
+SWAP_THRESHOLD = 0.3 # Param for find_label_issues. Per-box label quality score threshold to determine max score for a box to be considered a swap issue
+
+CUSTOM_SCORE_WEIGHT_OVERLOOKED = (
+ 1 / 3
+) # Param for get_label_quality_score, weight to determine how much to value overlooked scores over other subtypes when deciding the overall label quality score for an image.
+CUSTOM_SCORE_WEIGHT_BADLOC = (
+ 1 / 3
+) # Param for get_label_quality_score, weight to determine how much to value badloc scores over other subtypes when deciding issues
+CUSTOM_SCORE_WEIGHT_SWAP = (
+ 1 / 3
+) # Param for get_label_quality_score, weight to determine how much to value swap scores over other subtypes when deciding issues
+
+MAX_CLASS_TO_SHOW = 10 # Number of classes to show in the legend of the visualize method. Classes beyond max_class_to_show are cut off.
diff --git a/cleanlab/internal/label_quality_utils.py b/cleanlab/internal/label_quality_utils.py
index 2fbead808d..1215ee665a 100644
--- a/cleanlab/internal/label_quality_utils.py
+++ b/cleanlab/internal/label_quality_utils.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2017-2022 Cleanlab Inc.
+# Copyright (C) 2017-2023 Cleanlab Inc.
# This file is part of cleanlab.
#
# cleanlab is free software: you can redistribute it and/or modify
@@ -14,12 +14,11 @@
# You should have received a copy of the GNU Affero General Public License
# along with cleanlab. If not, see .
-"""
-Helper methods used internally for computing label quality scores
-"""
-
+"""Helper methods used internally for computing label quality scores."""
+import warnings
import numpy as np
from typing import Optional
+from scipy.special import xlogy
from cleanlab.count import get_confident_thresholds
@@ -30,9 +29,12 @@ def _subtract_confident_thresholds(
multi_label: bool = False,
confident_thresholds: Optional[np.ndarray] = None,
) -> np.ndarray:
- """Returns adjusted predicted probabilities by subtracting the class confident thresholds and renormalizing.
+ """
+ Return adjusted predicted probabilities by subtracting the class confident thresholds and renormalizing.
+
The confident class threshold for a class j is the expected (average) "self-confidence" for class j.
The purpose of this adjustment is to handle class imbalance.
+
Parameters
----------
labels : np.ndarray
@@ -52,39 +54,36 @@ def _subtract_confident_thresholds(
the total number of errors considered is based on the number of labels,
not the number of examples. So, the calibrated `confident_joint` will sum
to the number of total labels.
+
Returns
-------
pred_probs_adj : np.ndarray (float)
Adjusted pred_probs.
"""
-
# Get expected (average) self-confidence for each class
# TODO: Test this for multi-label
if confident_thresholds is None:
if labels is None:
raise ValueError(
- f"Cannot calculate confident_thresholds without labels. Pass in either labels or already calculated "
- f"confident_thresholds parameter. "
- )
- else:
- confident_thresholds = get_confident_thresholds(
- labels, pred_probs, multi_label=multi_label
+ "Cannot calculate confident_thresholds without labels. Pass in either labels or already calculated "
+ "confident_thresholds parameter. "
)
+ confident_thresholds = get_confident_thresholds(labels, pred_probs, multi_label=multi_label)
# Subtract the class confident thresholds
pred_probs_adj = pred_probs - confident_thresholds
# Re-normalize by shifting data to take care of negative values from the subtraction
pred_probs_adj += confident_thresholds.max()
- pred_probs_adj /= pred_probs_adj.sum(axis=1)[
- :, None
- ] # The [:, None] adds a dimension to make the /= operator work for broadcasting.
+ pred_probs_adj /= pred_probs_adj.sum(axis=1, keepdims=True)
return pred_probs_adj
-def get_normalized_entropy(pred_probs: np.ndarray, min_allowed_prob: float = 1e-6) -> np.ndarray:
- """Returns the normalized entropy of pred_probs.
+def get_normalized_entropy(
+ pred_probs: np.ndarray, min_allowed_prob: Optional[float] = None
+) -> np.ndarray:
+ """Return the normalized entropy of pred_probs.
Normalized entropy is between 0 and 1. Higher values of entropy indicate higher uncertainty in the model's prediction of the correct label.
@@ -96,22 +95,39 @@ def get_normalized_entropy(pred_probs: np.ndarray, min_allowed_prob: float = 1e-
Parameters
----------
- pred_probs:
+ pred_probs : np.ndarray (shape (N, K))
Each row of this matrix corresponds to an example x and contains the model-predicted
probabilities that x belongs to each possible class: P(label=k|x)
- min_allowed_prob:
- Minimum allowed probability value. Entries of `pred_probs` below this value will be clipped to this value.
- Ensures entropy remains well-behaved even when `pred_probs` contains zeros.
+ min_allowed_prob : float, default: None, deprecated
+ Minimum allowed probability value. If not `None` (default),
+ entries of `pred_probs` below this value will be clipped to this value.
+
+ .. deprecated:: 2.5.0
+ This keyword is deprecated and should be left to the default.
+ The entropy is well-behaved even if `pred_probs` contains zeros,
+ clipping is unnecessary and (slightly) changes the results.
Returns
-------
- entropy:
+ entropy : np.ndarray (shape (N, ))
Each element is the normalized entropy of the corresponding row of ``pred_probs``.
- """
+ Raises
+ ------
+ ValueError
+ An error is raised if any of the probabilities is not in the interval [0, 1].
+ """
+ if np.any(pred_probs < 0) or np.any(pred_probs > 1):
+ raise ValueError("All probabilities are required to be in the interval [0, 1].")
num_classes = pred_probs.shape[1]
+ if min_allowed_prob is not None:
+ warnings.warn(
+ "Using `min_allowed_prob` is not necessary anymore and will be removed.",
+ DeprecationWarning,
+ )
+ pred_probs = np.clip(pred_probs, a_min=min_allowed_prob, a_max=None)
+
# Note that dividing by log(num_classes) changes the base of the log which rescales entropy to 0-1 range
- clipped_pred_probs = np.clip(pred_probs, a_min=min_allowed_prob, a_max=None)
- return -np.sum(pred_probs * np.log(clipped_pred_probs), axis=1) / np.log(num_classes)
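+ # xlogy(p, p) evaluates to 0 where p == 0, so zero probabilities contribute no entropy and no clipping is needed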
+ return -np.sum(xlogy(pred_probs, pred_probs), axis=1) / np.log(num_classes)
diff --git a/cleanlab/internal/latent_algebra.py b/cleanlab/internal/latent_algebra.py
index 20e1c62de2..d42c16c436 100644
--- a/cleanlab/internal/latent_algebra.py
+++ b/cleanlab/internal/latent_algebra.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2017-2022 Cleanlab Inc.
+# Copyright (C) 2017-2023 Cleanlab Inc.
# This file is part of cleanlab.
#
# cleanlab is free software: you can redistribute it and/or modify
@@ -18,8 +18,8 @@
"""
Contains mathematical functions relating the latent terms,
``P(given_label)``, ``P(given_label | true_label)``, ``P(true_label | given_label)``, ``P(true_label)``, etc. together.
-For every function here, if the inputs are exact, the output is guaranteed to be exact.
-Every function herein is the computational equivalent of a mathematical equation having a closed, exact form.
+For every function here, if the inputs are exact, the output is guaranteed to be exact.
+Every function herein is the computational equivalent of a mathematical equation having a closed, exact form.
If the inputs are inexact, the error will of course propagate.
Throughout `K` denotes the number of classes in the classification task.
"""
@@ -28,7 +28,8 @@
import numpy as np
from typing import Tuple
-from cleanlab.internal.util import value_counts, clip_values, clip_noise_rates, TINY_VALUE
+from cleanlab.internal.util import value_counts, clip_values, clip_noise_rates
+from cleanlab.internal.constants import TINY_VALUE, CLIPPING_LOWER_BOUND
def compute_ps_py_inv_noise_matrix(
@@ -73,7 +74,7 @@ def compute_py_inv_noise_matrix(ps, noise_matrix) -> Tuple[np.ndarray, np.ndarra
# No class should have probability 0, so we use .000001
# Make sure valid probabilities that sum to 1.0
- py = clip_values(py, low=1e-6, high=1.0, new_sum=1.0)
+ py = clip_values(py, low=CLIPPING_LOWER_BOUND, high=1.0, new_sum=1.0)
# All the work is done in this function (below)
return py, compute_inv_noise_matrix(py=py, noise_matrix=noise_matrix, ps=ps)
@@ -150,7 +151,7 @@ def compute_noise_matrix_from_inverse(ps, inverse_noise_matrix, *, py=None) -> n
Returns
-------
- noise_matrix : np.ndarray
+ noise_matrix : np.ndarray
Array of shape ``(K, K)``, where `K` = number of classes, whose columns sum to 1.
A conditional probability matrix of the form ``P(label=k_s|true_label=k_y)`` containing
the fraction of examples in every class, labeled as every other class.
@@ -267,8 +268,8 @@ def compute_py(
err += " should be in [cnt, eqn, marginal, marginal_ps]"
raise ValueError(err)
- # Clip py (0,1), s.t. no class should have prob 0, hence 1e-5
- py = clip_values(py, low=1e-5, high=1.0, new_sum=1.0)
+ # Clip py (0,1), s.t. no class should have prob 0, hence 1e-6
+ py = clip_values(py, low=CLIPPING_LOWER_BOUND, high=1.0, new_sum=1.0)
return py
diff --git a/cleanlab/internal/multiannotator_utils.py b/cleanlab/internal/multiannotator_utils.py
index 42e34d7903..429321b13e 100644
--- a/cleanlab/internal/multiannotator_utils.py
+++ b/cleanlab/internal/multiannotator_utils.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2017-2022 Cleanlab Inc.
+# Copyright (C) 2017-2023 Cleanlab Inc.
# This file is part of cleanlab.
#
# cleanlab is free software: you can redistribute it and/or modify
@@ -129,17 +129,23 @@ def assert_valid_inputs_multiannotator(
def assert_valid_pred_probs(
- pred_probs: np.ndarray,
+ pred_probs: Optional[np.ndarray] = None,
pred_probs_unlabeled: Optional[np.ndarray] = None,
ensemble: bool = False,
):
- """Validate format of pred_probs for multiannotator functions"""
+ """Validate format of pred_probs for multiannotator active learning functions"""
+ if pred_probs is None and pred_probs_unlabeled is None:
+ raise ValueError(
+ "pred_probs and pred_probs_unlabeled cannot both be None, specify at least one of the two."
+ )
+
if ensemble:
- if pred_probs.ndim != 3:
- error_message = "pred_probs must be a 3d array."
- if pred_probs.ndim == 2: # pragma: no cover
- error_message += " If you have a 2d pred_probs array, use the non-ensemble version of this function."
- raise ValueError(error_message)
+ if pred_probs is not None:
+ if pred_probs.ndim != 3:
+ error_message = "pred_probs must be a 3d array."
+ if pred_probs.ndim == 2: # pragma: no cover
+ error_message += " If you have a 2d pred_probs array (ie. only one predictor), use the non-ensemble version of this function."
+ raise ValueError(error_message)
if pred_probs_unlabeled is not None:
if pred_probs_unlabeled.ndim != 3:
@@ -148,19 +154,19 @@ def assert_valid_pred_probs(
error_message += " If you have a 2d pred_probs_unlabeled array, use the non-ensemble version of this function."
raise ValueError(error_message)
+ if pred_probs is not None and pred_probs_unlabeled is not None:
if pred_probs.shape[2] != pred_probs_unlabeled.shape[2]:
raise ValueError(
"pred_probs and pred_probs_unlabeled must have the same number of classes"
)
else:
- if pred_probs.ndim != 2:
- error_message = "pred_probs must be a 2d array."
- if pred_probs.ndim == 3: # pragma: no cover
- error_message += (
- " If you have a 3d pred_probs array, use the ensemble version of this function."
- )
- raise ValueError(error_message)
+ if pred_probs is not None:
+ if pred_probs.ndim != 2:
+ error_message = "pred_probs must be a 2d array."
+ if pred_probs.ndim == 3: # pragma: no cover
+ error_message += " If you have a 3d pred_probs array, use the ensemble version of this function."
+ raise ValueError(error_message)
if pred_probs_unlabeled is not None:
if pred_probs_unlabeled.ndim != 2:
@@ -169,6 +175,7 @@ def assert_valid_pred_probs(
error_message += " If you have a 3d pred_probs_unlabeled array, use the non-ensemble version of this function."
raise ValueError(error_message)
+ if pred_probs is not None and pred_probs_unlabeled is not None:
if pred_probs.shape[1] != pred_probs_unlabeled.shape[1]:
raise ValueError(
"pred_probs and pred_probs_unlabeled must have the same number of classes"
@@ -200,7 +207,7 @@ def format_multiannotator_labels(labels: LabelLike) -> Tuple[pd.DataFrame, dict]
try:
unique_labels = unique_labels[~np.isnan(unique_labels)]
unique_labels.sort()
- except (TypeError): # np.unique / np.sort cannot handle string values or pd.NA types
+ except TypeError: # np.unique / np.sort cannot handle string values or pd.NA types
nan_mask = np.array([(l is np.NaN) or (l is pd.NA) or (l == "nan") for l in unique_labels])
unique_labels = unique_labels[~nan_mask]
unique_labels.sort()
diff --git a/cleanlab/internal/multilabel_scorer.py b/cleanlab/internal/multilabel_scorer.py
index 287a7a7e34..4d1cbd4d3c 100644
--- a/cleanlab/internal/multilabel_scorer.py
+++ b/cleanlab/internal/multilabel_scorer.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2017-2022 Cleanlab Inc.
+# Copyright (C) 2017-2023 Cleanlab Inc.
# This file is part of cleanlab.
#
# cleanlab is free software: you can redistribute it and/or modify
@@ -243,8 +243,10 @@ def softmin(
def softmax(scores: np.ndarray) -> np.ndarray:
"""Softmax function."""
- exp_scores = np.exp(scores / temperature)
- return exp_scores / np.sum(exp_scores, axis=axis, keepdims=True)
+ scores = scores / temperature
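+ # Shift scores by their max before exponentiating to avoid overflow; softmax is invariant to this constant shift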
+ scores_max = np.amax(scores, axis=axis, keepdims=True)
+ exp_scores_shifted = np.exp(scores - scores_max)
+ return exp_scores_shifted / np.sum(exp_scores_shifted, axis=axis, keepdims=True)
return np.einsum("ij,ij->i", s, softmax(1 - s))
@@ -488,7 +490,7 @@ def get_class_label_quality_scores(
>>> labels = np.array([[0, 1, 0], [1, 0, 1]])
>>> pred_probs = np.array([[0.1, 0.9, 0.7], [0.4, 0.1, 0.6]])
>>> scorer = MultilabelScorer() # Use the default base scorer (SELF_CONFIDENCE)
- >>> class_label_quality_scores = scorer.get_class_label_quality_scores(labels, pred_probs)
+ >>> class_label_quality_scores = scorer.get_label_quality_scores_per_class(labels, pred_probs)
>>> class_label_quality_scores
array([[0.9, 0.9, 0.3],
[0.4, 0.9, 0.6]])
diff --git a/cleanlab/internal/multilabel_utils.py b/cleanlab/internal/multilabel_utils.py
index 22df37cf8f..57fb5da17b 100644
--- a/cleanlab/internal/multilabel_utils.py
+++ b/cleanlab/internal/multilabel_utils.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2017-2022 Cleanlab Inc.
+# Copyright (C) 2017-2023 Cleanlab Inc.
# This file is part of cleanlab.
#
# cleanlab is free software: you can redistribute it and/or modify
diff --git a/cleanlab/internal/object_detection_utils.py b/cleanlab/internal/object_detection_utils.py
new file mode 100644
index 0000000000..5b6168c0dc
--- /dev/null
+++ b/cleanlab/internal/object_detection_utils.py
@@ -0,0 +1,102 @@
+# Copyright (C) 2017-2023 Cleanlab Inc.
+# This file is part of cleanlab.
+#
+# cleanlab is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published
+# by the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# cleanlab is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with cleanlab. If not, see <https://www.gnu.org/licenses/>.
+
+"""
+Helper functions used internally for object detection tasks.
+"""
+from typing import List, Optional, Dict, Any
+
+import numpy as np
+
+
+def bbox_xyxy_to_xywh(bbox: List[float]) -> Optional[List[float]]:
+ """Converts bounding box coodrinate types from x1y1,x2y2 to x,y,w,h"""
+ if len(bbox) == 4:
+ x1, y1, x2, y2 = bbox
+ w = x2 - x1
+ h = y2 - y1
+ return [x1, y1, w, h]
+ else:
+ print("Wrong bbox shape", len(bbox))
+ return None
+
+
+def softmax(x: np.ndarray, temperature: float = 0.99, axis: int = 0) -> np.ndarray:
+ """Gets softmax of scores."""
+ x = x / temperature
+ x_max = np.amax(x, axis=axis, keepdims=True)
+ exp_x_shifted = np.exp(x - x_max)
+ return exp_x_shifted / np.sum(exp_x_shifted, axis=axis, keepdims=True)
+
+
+def softmin1d(scores: np.ndarray, temperature: float = 0.99, axis: int = 0) -> float:
+ """Returns softmin of passed in scores."""
+ scores = np.array(scores)
+ softmax_scores = softmax(-1 * scores, temperature, axis)
+ return np.dot(softmax_scores, scores)
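``softmin1d`` is a soft version of ``min``: each score is weighted by a softmax of the negated scores, so the smallest scores dominate the weighted average, and the result approaches the hard minimum as the temperature shrinks. A quick sketch of that behavior using the helpers defined above (the score values are made up):

```python
import numpy as np
from cleanlab.internal.object_detection_utils import softmax, softmin1d

scores = np.array([0.9, 0.2, 0.8])
weights = softmax(-1 * scores, temperature=0.99, axis=0)
print(weights)                              # ~[0.24, 0.49, 0.27]; the lowest score gets the largest weight
print(softmin1d(scores))                    # ~0.53, a weighted average pulled toward min(scores)
print(softmin1d(scores, temperature=0.01))  # ~0.2, close to the hard minimum as temperature -> 0
```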
+
+
+def assert_valid_aggregation_weights(aggregation_weights: Dict[str, Any]) -> None:
+ """assert aggregation weights are in the proper format"""
+ weights = np.array(list(aggregation_weights.values()))
+ if (not np.isclose(np.sum(weights), 1.0)) or (np.min(weights) < 0.0):
+ raise ValueError(
+ f"""Aggregation weights should be non-negative and must sum to 1.0
+ """
+ )
+
+
+def assert_valid_inputs(
+ labels: List[Dict[str, Any]],
+ predictions,
+ method: Optional[str] = None,
+ threshold: Optional[float] = None,
+):
+ """Asserts proper input format."""
+ if len(labels) != len(predictions):
+ raise ValueError(
+ f"labels and predictions length needs to match. len(labels) == {len(labels)} while len(predictions) == {len(predictions)}."
+ )
+ # Typecheck labels and predictions
+ if not isinstance(labels[0], dict):
+ raise ValueError(
+ f"Labels has to be a list of dicts. Instead it is list of {type(labels[0])}."
+ )
+ # check that the last column of predictions holds predicted probabilities (values < 1.0)
+ if not isinstance(predictions[0], (list, np.ndarray)):
+ raise ValueError(
+ f"Prediction has to be a list or np.ndarray. Instead it is type {type(predictions[0])}."
+ )
+ if not predictions[0][0].shape[1] == 5:
+ raise ValueError(
+ f"Prediction values have to be of format [x1,y1,x2,y2,pred_prob]. Please refer to the documentation for predicted probabilities under object_detection.rank.get_label_quality_scores for details"
+ )
+
+ valid_methods = ["objectlab"]
+ if method is not None and method not in valid_methods:
+ raise ValueError(
+ f"""
+ {method} is not a valid object detection scoring method!
+ Please choose a valid scoring_method: {valid_methods}
+ """
+ )
+
+ if threshold is not None and threshold > 1.0:
+ raise ValueError(
+ f"""
+ Threshold is a cutoff of predicted probabilities and therefore should be <= 1.
+ """
+ )
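The checks above imply a specific input layout: ``labels`` is a list with one dict per image, and ``predictions[i]`` holds, for each class, an array of predicted boxes whose columns are ``[x1, y1, x2, y2, pred_prob]``. A minimal sketch of inputs that would pass these checks (the dict keys in ``labels`` are illustrative; the function only verifies that each entry is a dict):

```python
import numpy as np
from cleanlab.internal.object_detection_utils import assert_valid_inputs

# One image, two classes: class 0 has a single predicted box, class 1 has none.
predictions = [
    [
        np.array([[10.0, 20.0, 50.0, 80.0, 0.94]]),  # shape (1, 5): x1, y1, x2, y2, pred_prob
        np.zeros((0, 5)),                            # no detections for this class
    ]
]
# The keys below are hypothetical; assert_valid_inputs only checks that each label is a dict.
labels = [{"bboxes": np.array([[12.0, 22.0, 48.0, 78.0]]), "labels": np.array([0])}]

assert_valid_inputs(labels, predictions, method="objectlab", threshold=0.5)  # passes silently
```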
diff --git a/cleanlab/internal/outlier.py b/cleanlab/internal/outlier.py
new file mode 100644
index 0000000000..57004a0a21
--- /dev/null
+++ b/cleanlab/internal/outlier.py
@@ -0,0 +1,68 @@
+# Copyright (C) 2017-2023 Cleanlab Inc.
+# This file is part of cleanlab.
+#
+# cleanlab is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published
+# by the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# cleanlab is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with cleanlab. If not, see .
+
+"""
+Helper functions used internally for outlier detection tasks.
+"""
+
+import numpy as np
+
+
+def transform_distances_to_scores(distances: np.ndarray, k: int, t: int) -> np.ndarray:
+ """Returns an outlier score for each example based on its average distance to its k nearest neighbors.
+
+ The transformation of a distance, :math:`d` , to a score, :math:`o` , is based on the following formula:
+
+ .. math::
+ o = \\exp\\left(-dt\\right)
+
+ where :math:`t` scales the distance to a score in the range [0,1].
+
+ Parameters
+ ----------
+ distances : np.ndarray
+ An array of distances of shape ``(N, num_neighbors)``, where N is the number of examples.
+ Each row contains the distances to each example's `num_neighbors` nearest neighbors.
+ It is assumed that each row is sorted in ascending order.
+
+ k : int
+ Number of neighbors used to compute the average distance to each example.
+ This assumes that the second dimension of distances is k or greater, but it
+ uses slicing to avoid indexing errors.
+
+ t : int
+ Controls transformation of distances between examples into similarity scores that lie in [0,1].
+
+ Returns
+ -------
+ ood_features_scores : np.ndarray
+ An array of outlier scores of shape ``(N,)`` for N examples.
+
+ Examples
+ --------
+ >>> import numpy as np
+ >>> from cleanlab.internal.outlier import transform_distances_to_scores
+ >>> distances = np.array([[0.0, 0.1, 0.25],
+ ... [0.15, 0.2, 0.3]])
+ >>> transform_distances_to_scores(distances, k=2, t=1)
+ array([0.95122942, 0.83945702])
+ """
+ # Calculate average distance to k-nearest neighbors
+ avg_knn_distances = distances[:, :k].mean(axis=1)
+
+ # Map ood_features_scores to range 0-1 with 0 = most concerning
+ ood_features_scores: np.ndarray = np.exp(-1 * avg_knn_distances * t)
+ return ood_features_scores
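As the docstring notes, ``k`` may exceed the number of neighbor columns because the averaging is done with slicing, and the score is simply ``exp(-d * t)`` of the average distance. A short sketch confirming both points:

```python
import numpy as np
from cleanlab.internal.outlier import transform_distances_to_scores

distances = np.array([[0.0, 0.1, 0.25],
                      [0.15, 0.2, 0.3]])

# k larger than the number of columns: distances[:, :k] silently falls back to all 3 columns
scores = transform_distances_to_scores(distances, k=10, t=1)
avg_dist = distances.mean(axis=1)
print(np.allclose(scores, np.exp(-avg_dist)))  # True
```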
diff --git a/cleanlab/internal/regression_utils.py b/cleanlab/internal/regression_utils.py
new file mode 100644
index 0000000000..50e096db9d
--- /dev/null
+++ b/cleanlab/internal/regression_utils.py
@@ -0,0 +1,130 @@
+# Copyright (C) 2017-2023 Cleanlab Inc.
+# This file is part of cleanlab.
+#
+# cleanlab is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published
+# by the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# cleanlab is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with cleanlab. If not, see .
+
+
+"""
+Helper functions internally used in cleanlab.regression.
+"""
+
+import numpy as np
+import pandas as pd
+from numpy.typing import ArrayLike
+from typing import Tuple, Union
+
+
+def assert_valid_prediction_inputs(
+ labels: ArrayLike,
+ predictions: ArrayLike,
+ method: str,
+) -> Tuple[np.ndarray, np.ndarray]:
+ """Checks that ``labels``, ``predictions``, ``method`` are correctly formatted."""
+
+ # Load array_like input as numpy array. If not raise error.
+ try:
+ labels = np.asarray(labels)
+ except:
+ raise ValueError(f"labels must be array_like.")
+
+ try:
+ predictions = np.asarray(predictions)
+ except:
+ raise ValueError(f"predictions must be array_like.")
+
+ # Check if labels and predictions are 1-D and numeric
+ valid_labels = check_dimension_and_datatype(check_input=labels, text="labels")
+ valid_predictions = check_dimension_and_datatype(check_input=predictions, text="predictions")
+
+ # Check that the number of examples is the same.
+ assert (
+ valid_labels.shape == valid_predictions.shape
+ ), f"Number of examples in labels {labels.shape} and predictions {predictions.shape} are not same."
+
+ # Check if inputs have missing values
+ check_missing_values(valid_labels, text="labels")
+ check_missing_values(valid_predictions, text="predictions")
+
+ # Check if method is among the allowed scoring methods
+ scoring_methods = ["residual", "outre"]
+ if method not in scoring_methods:
+ raise ValueError(f"Specified method '{method}' must be one of: {scoring_methods}.")
+
+ # return 1-D numpy array
+ return valid_labels, valid_predictions
+
+
+def assert_valid_regression_inputs(
+ X: Union[np.ndarray, pd.DataFrame],
+ y: ArrayLike,
+) -> Tuple[np.ndarray, np.ndarray]:
+ """
+ Checks that regression inputs are properly formatted and returns the inputs in numpy array format.
+ """
+ try:
+ X = np.asarray(X)
+ except:
+ raise ValueError(f"X must be array_like.")
+
+ y = check_dimension_and_datatype(y, "y")
+ check_missing_values(y, text="y")
+
+ if len(X) != len(y):
+ raise ValueError("X and y must have same length.")
+
+ return X, y
+
+
+def check_dimension_and_datatype(check_input: ArrayLike, text: str) -> np.ndarray:
+ """
+ Raises an error if the input:
+ 1. is empty,
+ 2. is not 1-D (after squeezing singleton dimensions), or
+ 3. is not numeric.
+
+ If all checks pass, returns the squeezed 1-D array required by the main algorithm.
+ """
+
+ try:
+ check_input = np.asarray(check_input)
+ except:
+ raise ValueError(f"{text} could not be converted to numpy array, check input.")
+
+ # Check if input is empty
+ if not check_input.size:
+ raise ValueError(f"{text} cannot be empty array.")
+
+ # Remove axis with length one
+ check_input = np.squeeze(check_input)
+
+ # Check if input is 1-D
+ if check_input.ndim != 1:
+ raise ValueError(
+ f"Expected 1-Dimensional inputs for {text}, got {check_input.ndim} dimensions."
+ )
+
+ # Check if datatype is numeric
+ if not np.issubdtype(check_input.dtype, np.number):
+ raise ValueError(
+ f"Expected {text} to contain numeric values, got values of type {check_input.dtype}."
+ )
+
+ return check_input
+
+
+def check_missing_values(check_input: np.ndarray, text: str):
+ """Raise error if there are any missing values in Numpy array."""
+
+ if np.isnan(check_input).any():
+ raise ValueError(f"{text} cannot contain missing values.")
diff --git a/cleanlab/internal/segmentation_utils.py b/cleanlab/internal/segmentation_utils.py
new file mode 100644
index 0000000000..f1354273b3
--- /dev/null
+++ b/cleanlab/internal/segmentation_utils.py
@@ -0,0 +1,70 @@
+# Copyright (C) 2017-2023 Cleanlab Inc.
+# This file is part of cleanlab.
+#
+# cleanlab is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published
+# by the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# cleanlab is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with cleanlab. If not, see .
+
+"""
+Helper functions used internally for segmentation tasks.
+"""
+from typing import Optional, List
+
+import numpy as np
+
+
+def _get_valid_optional_params(
+ batch_size: Optional[int] = None,
+ n_jobs: Optional[int] = None,
+):
+ """Takes in optional args and returns good values for them if they are None."""
+ if batch_size is None:
+ batch_size = 10000
+ return batch_size, n_jobs
+
+
+def _get_summary_optional_params(
+ class_names: Optional[List[str]] = None,
+ exclude: Optional[List[int]] = None,
+ top: Optional[int] = None,
+):
+ """Takes in optional args and returns good values for them if they are None for summary functions."""
+ if exclude is None:
+ exclude = []
+ if top is None:
+ top = 20
+ return class_names, exclude, top
+
+
+def _check_input(labels: np.ndarray, pred_probs: np.ndarray) -> None:
+ """
+ Checks that the input labels and predicted probabilities are valid.
+
+ Parameters
+ ----------
+ labels:
+ Array of shape ``(N, H, W)`` of integer labels, where `N` is the number of images in the dataset and `H` and `W` are the height and width of the images.
+
+ pred_probs:
+ Array of shape ``(N, K, H, W)`` of predicted probabilities, where `N` is the number of images in the dataset, `K` is the number of classes, and `H` and `W` are the height and width of the images.
+ """
+ if len(labels.shape) != 3:
+ raise ValueError("labels must have a shape of (N, H, W)")
+
+ if len(pred_probs.shape) != 4:
+ raise ValueError("pred_probs must have a shape of (N, K, H, W)")
+
+ num_images, height, width = labels.shape
+ num_images_pred, num_classes, height_pred, width_pred = pred_probs.shape
+
+ if num_images != num_images_pred or height != height_pred or width != width_pred:
+ raise ValueError("labels and pred_probs must have matching dimensions for N, H, and W")
diff --git a/cleanlab/internal/token_classification_utils.py b/cleanlab/internal/token_classification_utils.py
index fdc012e3a6..d6abafea75 100644
--- a/cleanlab/internal/token_classification_utils.py
+++ b/cleanlab/internal/token_classification_utils.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2017-2022 Cleanlab Inc.
+# Copyright (C) 2017-2023 Cleanlab Inc.
# This file is part of cleanlab.
#
# cleanlab is free software: you can redistribute it and/or modify
@@ -17,12 +17,18 @@
"""
Helper methods used internally in cleanlab.token_classification
"""
+from __future__ import annotations
import re
import string
import numpy as np
from termcolor import colored
-from typing import List, Optional, Callable, Tuple
+from typing import List, Optional, Callable, Tuple, TypeVar, TYPE_CHECKING
+
+if TYPE_CHECKING: # pragma: no cover
+ import numpy.typing as npt
+
+ T = TypeVar("T", bound=npt.NBitBase)
def get_sentence(words: List[str]) -> str:
@@ -171,14 +177,16 @@ def mapping(entities: List[int], maps: List[int]) -> List[int]:
return list(map(f, entities))
-def merge_probs(probs: np.ndarray, maps: List[int]) -> np.ndarray:
+def merge_probs(
+ probs: npt.NDArray["np.floating[T]"], maps: List[int]
+) -> npt.NDArray["np.floating[T]"]:
"""
Merges model-predictive probabilities with desired mapping
Parameters
----------
probs:
- np.array of shape `(N, K)`, where N is the number of tokens, and K is the number of classes for the model
+ A 2D np.array of shape `(N, K)`, where N is the number of tokens, and K is the number of classes for the model
maps:
a list of mapped index, such that the probability of the token being in the i'th class is mapped to the
@@ -188,7 +196,7 @@ def merge_probs(probs: np.ndarray, maps: List[int]) -> np.ndarray:
Returns
---------
probs_merged:
- np.array of shape ``(N, K')``, where `K` is the number of new classes. Probabilities are merged and
+ A 2D np.array of shape ``(N, K')``, where `K'` is the number of new classes. Probabilities are merged and
re-normalized if necessary.
Examples
diff --git a/cleanlab/internal/util.py b/cleanlab/internal/util.py
index ea7ef5f8c3..ee2d53f2b0 100644
--- a/cleanlab/internal/util.py
+++ b/cleanlab/internal/util.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2017-2022 Cleanlab Inc.
+# Copyright (C) 2017-2023 Cleanlab Inc.
# This file is part of cleanlab.
#
# cleanlab is free software: you can redistribute it and/or modify
@@ -25,9 +25,7 @@
from cleanlab.typing import DatasetLike, LabelLike
from cleanlab.internal.validation import labels_to_array
-
-
-TINY_VALUE = 1e-100
+from cleanlab.internal.constants import FLOATING_POINT_COMPARISON, TINY_VALUE
def remove_noise_from_class(noise_matrix, class_without_noise) -> np.ndarray:
@@ -234,7 +232,7 @@ def round_preserving_sum(iterable) -> np.ndarray:
orig_sum = np.sum(floats).round()
int_sum = np.sum(ints).round()
# Adjust the integers so that they sum to orig_sum
- while abs(int_sum - orig_sum) > 1e-6:
+ while abs(int_sum - orig_sum) > FLOATING_POINT_COMPARISON:
diff = np.round(orig_sum - int_sum)
increment = -1 if int(diff < 0.0) else 1
changes = min(int(abs(diff)), len(iterable))
diff --git a/cleanlab/internal/validation.py b/cleanlab/internal/validation.py
index 5ce017d1d7..f36603515f 100644
--- a/cleanlab/internal/validation.py
+++ b/cleanlab/internal/validation.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2017-2022 Cleanlab Inc.
+# Copyright (C) 2017-2023 Cleanlab Inc.
# This file is part of cleanlab.
#
# cleanlab is free software: you can redistribute it and/or modify
@@ -19,6 +19,7 @@
"""
from cleanlab.typing import LabelLike, DatasetLike
+from cleanlab.internal.constants import FLOATING_POINT_COMPARISON
from typing import Any, List, Optional, Union
import warnings
import numpy as np
@@ -95,7 +96,9 @@ def assert_valid_inputs(
f"pred_probs must have at least {highest_class} columns, based on the largest class index which appears in labels."
)
# Check for valid probabilities.
- if (np.min(pred_probs) < 0) or (np.max(pred_probs) > 1):
+ if (np.min(pred_probs) < 0 - FLOATING_POINT_COMPARISON) or (
+ np.max(pred_probs) > 1 + FLOATING_POINT_COMPARISON
+ ):
raise ValueError("Values in pred_probs must be between 0 and 1.")
if X is not None:
warnings.warn("When X and pred_probs are both provided, the former may be ignored.")
diff --git a/cleanlab/models/README.md b/cleanlab/models/README.md
new file mode 100644
index 0000000000..6551aa13ea
--- /dev/null
+++ b/cleanlab/models/README.md
@@ -0,0 +1,11 @@
+# Useful models adapted for use with cleanlab
+
+Methods in this ``models`` module are not guaranteed to be stable between different ``cleanlab`` versions.
+
+These files provide models that can be used with cleanlab to find issues in specific types of data. They depend on deep learning and other machine learning packages that are not official cleanlab dependencies; you must install those packages yourself if you wish to use these models.
+
+The dependencies are as follows:
+* keras.py - a wrapper to make any Keras model compatible with cleanlab and sklearn
+ - tensorflow
+* fasttext.py - a cleanlab-compatible FastText classifier for text data
+ - fasttext
diff --git a/cleanlab/models/__init__.py b/cleanlab/models/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/cleanlab/experimental/fasttext.py b/cleanlab/models/fasttext.py
similarity index 92%
rename from cleanlab/experimental/fasttext.py
rename to cleanlab/models/fasttext.py
index 6247e3cf97..3b2cd66259 100644
--- a/cleanlab/experimental/fasttext.py
+++ b/cleanlab/models/fasttext.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2017-2022 Cleanlab Inc.
+# Copyright (C) 2017-2023 Cleanlab Inc.
# This file is part of cleanlab.
#
# cleanlab is free software: you can redistribute it and/or modify
@@ -15,10 +15,16 @@
# along with cleanlab. If not, see .
"""
-Text classification with FastText models that are compatible with cleanlab.
+Text classification with fastText models that are compatible with cleanlab.
This module allows you to easily find label issues in your text datasets.
-You must first ``pip install fasttext``
+You must have fastText installed: ``pip install fasttext``.
+
+Tips:
+
+* Check out our example using this class: `fasttext_amazon_reviews `_
+* Our `unit tests `_ also provide basic usage examples.
+
"""
import time
@@ -96,6 +102,17 @@ def _split_labels_and_text(batch):
class FastTextClassifier(BaseEstimator): # Inherits sklearn base classifier
+ """Instantiate a fastText classifier that is compatible with :py:class:`CleanLearning `.
+
+ Parameters
+ ----------
+ train_data_fn: str
+ File name of the training data in the format compatible with fastText.
+
+ test_data_fn: str, optional
+ File name of the test data in the format compatible with fastText.
+ """
+
def __init__(
self,
train_data_fn,
@@ -151,7 +168,7 @@ def _create_train_data(self, data_indices):
masked_fn = "fastTextClf_" + str(int(time.time())) + ".txt"
open(masked_fn, "w").close()
# Read in training data one line at a time
- with open(self.train_data_fn, "rU") as rf:
+ with open(self.train_data_fn, "r") as rf:
idx = 0
data_idx = data_indices.pop()
for line in rf:
diff --git a/cleanlab/experimental/keras.py b/cleanlab/models/keras.py
similarity index 57%
rename from cleanlab/experimental/keras.py
rename to cleanlab/models/keras.py
index d4d3e8cf03..13e2442719 100644
--- a/cleanlab/experimental/keras.py
+++ b/cleanlab/models/keras.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2017-2022 Cleanlab Inc.
+# Copyright (C) 2017-2023 Cleanlab Inc.
# This file is part of cleanlab.
#
# cleanlab is free software: you can redistribute it and/or modify
@@ -16,26 +16,29 @@
"""
Wrapper class you can use to make any Keras model compatible with :py:class:`CleanLearning ` and sklearn.
-Use :py:class:`KerasWrapperModel` to wrap existing functional API code for ``keras.Model`` objects,
+Use :py:class:`KerasWrapperModel` to wrap existing functional API code for ``keras.Model`` objects,
and :py:class:`KerasWrapperSequential` to wrap existing ``tf.keras.models.Sequential`` objects.
-Most of the instance methods of this class work the same as the ones for the wrapped Keras model,
+Most of the instance methods of this class work the same as the ones for the wrapped Keras model,
see the `Keras documentation `_ for details.
This is a good example of making any bespoke neural network compatible with cleanlab.
You must have `Tensorflow 2 installed `_ (only compatible with Python versions >= 3.7).
+This wrapper class is only fully compatible with ``tensorflow<2.11``. If you are using ``tensorflow>=2.11``,
+please replace your Optimizer class with the legacy Optimizer `here `_.
Tips:
* If this class lacks certain functionality, you can alternatively try `scikeras `_.
* Unlike scikeras, our `KerasWrapper` classes can operate directly on ``tensorflow.data.Dataset`` objects (like regular Keras models).
* To call ``fit()`` on a tensorflow ``Dataset`` object with a Keras model, the ``Dataset`` should already be batched.
-* Check out our `example `_ using this class: `huggingface_keras_imdb `_
-* Our `unit tests `_ also provide basic usage examples.
+* Check out our example using this class: `huggingface_keras_imdb `_
+* Our `unit tests `_ also provide basic usage examples.
"""
import tensorflow as tf
+import keras # type: ignore
import numpy as np
from typing import Callable, Optional
@@ -50,7 +53,7 @@ class KerasWrapperModel:
Parameters
----------
model: Callable
- A callable function to construct the Keras Model (using functional API). Pass in the function here, not the constructed model!
+ A callable function to construct the Keras Model (using functional API). Pass in the function here, not the constructed model!
For example::
@@ -73,29 +76,63 @@ def __init__(
compile_kwargs: dict = {
"loss": tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
},
+ params: Optional[dict] = None,
):
+ if params is None:
+ params = {}
+
self.model = model
self.model_kwargs = model_kwargs
self.compile_kwargs = compile_kwargs
+ self.params = params
self.net = None
def get_params(self, deep=True):
+ """Returns the parameters of the Keras model."""
return {
"model": self.model,
"model_kwargs": self.model_kwargs,
"compile_kwargs": self.compile_kwargs,
+ "params": self.params,
}
+ def set_params(self, **params):
+ """Set the parameters of the Keras model."""
+ self.params.update(params)
+ return self
+
def fit(self, X, y=None, **kwargs):
- """Note that `X` dataset object must already contain the labels as is required for standard Keras fit.
- You can optionally provide the labels again here as argument `y` to be compatible with sklearn, but they are ignored.
+ """Trains a Keras model.
+
+ Parameters
+ ----------
+ X : tf.Dataset or np.array or pd.DataFrame
+ If ``X`` is a tensorflow dataset object, it must already contain the labels as is required for standard Keras fit.
+
+ y : np.array or pd.DataFrame, default = None
+ If ``X`` is a tensorflow dataset object, you can optionally provide the labels again here as argument `y` to be compatible with sklearn,
+ but they are ignored.
+ If ``X`` is a numpy array or pandas dataframe, the labels have to be passed in using this argument.
"""
- self.net = self.model(**self.model_kwargs)
- self.net.compile(**self.compile_kwargs)
- self.net.fit(X, **kwargs)
+ if self.net is None:
+ self.net = self.model(**self.model_kwargs)
+ self.net.compile(**self.compile_kwargs)
+
+ # TODO: check for generators
+ if y is not None and not isinstance(X, (tf.data.Dataset, keras.utils.Sequence)):
+ kwargs["y"] = y
+
+ self.net.fit(X, **{**self.params, **kwargs})
def predict_proba(self, X, *, apply_softmax=True, **kwargs):
- """Set extra argument `apply_softmax` to True to indicate your network only outputs logits not probabilities."""
+ """Predict class probabilities for all classes using the wrapped Keras model.
+ Set the extra argument `apply_softmax` to True to indicate that your network outputs logits rather than probabilities.
+
+ Parameters
+ ----------
+ X : tf.Dataset or np.array or pd.DataFrame
+ Data in the same format as the original ``X`` provided to ``fit()``.
+ """
if self.net is None:
raise ValueError("must call fit() before predict()")
pred_probs = self.net.predict(X, **kwargs)
@@ -104,11 +141,24 @@ def predict_proba(self, X, *, apply_softmax=True, **kwargs):
return pred_probs
def predict(self, X, **kwargs):
+ """Predict class labels using the wrapped Keras model.
+
+ Parameters
+ ----------
+ X : tf.Dataset or np.array or pd.DataFrame
+ Data in the same format as the original ``X`` provided to ``fit()``.
+
+ """
pred_probs = self.predict_proba(X, **kwargs)
return np.argmax(pred_probs, axis=1)
def summary(self, **kwargs):
- self.net.summary(**kwargs)
+ """Returns the summary of the Keras model."""
+ if self.net is None:
+ self.net = self.model(**self.model_kwargs)
+ self.net.compile(**self.compile_kwargs)
+
+ return self.net.summary(**kwargs)
class KerasWrapperSequential:
@@ -137,29 +187,63 @@ def __init__(
compile_kwargs: dict = {
"loss": tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
},
+ params: Optional[dict] = None,
):
+ if params is None:
+ params = {}
+
self.layers = layers
self.name = name
self.compile_kwargs = compile_kwargs
+ self.params = params
self.net = None
def get_params(self, deep=True):
+ """Returns the parameters of the Keras model."""
return {
"layers": self.layers,
"name": self.name,
"compile_kwargs": self.compile_kwargs,
+ "params": self.params,
}
+ def set_params(self, **params):
+ """Set the parameters of the Keras model."""
+ self.params.update(params)
+ return self
+
def fit(self, X, y=None, **kwargs):
- """Note that `X` dataset object must already contain the labels as is required for standard Keras fit.
- You can optionally provide the labels again here as argument `y` to be compatible with sklearn, but they are ignored.
+ """Trains a Sequential Keras model.
+
+ Parameters
+ ----------
+ X : tf.Dataset or np.array or pd.DataFrame
+ If ``X`` is a tensorflow dataset object, it must already contain the labels as is required for standard Keras fit.
+
+ y : np.array or pd.DataFrame, default = None
+ If ``X`` is a tensorflow dataset object, you can optionally provide the labels again here as argument `y` to be compatible with sklearn,
+ but they are ignored.
+ If ``X`` is a numpy array or pandas dataframe, the labels have to be passed in using this argument.
"""
- self.net = tf.keras.models.Sequential(self.layers, self.name)
- self.net.compile(**self.compile_kwargs)
- self.net.fit(X, **kwargs)
+ if self.net is None:
+ self.net = tf.keras.models.Sequential(self.layers, self.name)
+ self.net.compile(**self.compile_kwargs)
+
+ # TODO: check for generators
+ if y is not None and not isinstance(X, (tf.data.Dataset, keras.utils.Sequence)):
+ kwargs["y"] = y
+
+ self.net.fit(X, **{**self.params, **kwargs})
def predict_proba(self, X, *, apply_softmax=True, **kwargs):
- """Set extra argument `apply_softmax` to True to indicate your network only outputs logits not probabilities."""
+ """Predict class probabilities for all classes using the wrapped Keras model.
+ Set the extra argument `apply_softmax` to True to indicate that your network outputs logits rather than probabilities.
+
+ Parameters
+ ----------
+ X : tf.Dataset or np.array or pd.DataFrame
+ Data in the same format as the original ``X`` provided to ``fit()``.
+ """
if self.net is None:
raise ValueError("must call fit() before predict()")
pred_probs = self.net.predict(X, **kwargs)
@@ -168,8 +252,20 @@ def predict_proba(self, X, *, apply_softmax=True, **kwargs):
return pred_probs
def predict(self, X, **kwargs):
+ """Predict class labels using the wrapped Keras model.
+
+ Parameters
+ ----------
+ X : tf.Dataset or np.array or pd.DataFrame
+ Data in the same format as the original ``X`` provided to ``fit()``.
+ """
pred_probs = self.predict_proba(X, **kwargs)
return np.argmax(pred_probs, axis=1)
def summary(self, **kwargs):
- self.net.summary(**kwargs)
+ """Returns the summary of the Keras model."""
+ if self.net is None:
+ self.net = tf.keras.models.Sequential(self.layers, self.name)
+ self.net.compile(**self.compile_kwargs)
+
+ return self.net.summary(**kwargs)
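With these changes both wrappers compile lazily (only while ``self.net`` is still ``None``), forward labels passed via ``y`` when ``X`` is a plain array rather than a ``tf.data.Dataset``, and merge the new ``params`` dict of default ``fit()`` arguments into every call via ``{**self.params, **kwargs}``. A hedged usage sketch of ``KerasWrapperSequential`` exercising those additions (layer sizes and hyperparameters below are illustrative only):

```python
import numpy as np
import tensorflow as tf
from cleanlab.models.keras import KerasWrapperSequential

model = KerasWrapperSequential(
    layers=[
        tf.keras.layers.Dense(16, activation="relu", input_shape=(4,)),
        tf.keras.layers.Dense(3),  # logits for 3 classes
    ],
    compile_kwargs={
        "optimizer": "adam",
        "loss": tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    },
    params={"epochs": 2, "verbose": 0},  # default fit() kwargs, merged into every fit() call
)

X = np.random.rand(30, 4).astype("float32")
y = np.random.randint(0, 3, size=30)

model.fit(X, y)                      # y is forwarded because X is a plain numpy array
pred_probs = model.predict_proba(X)  # softmax applied since the network outputs logits
print(pred_probs.shape)              # (30, 3)
```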
diff --git a/cleanlab/multiannotator.py b/cleanlab/multiannotator.py
index 214cd8b247..a985cd7925 100644
--- a/cleanlab/multiannotator.py
+++ b/cleanlab/multiannotator.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2017-2022 Cleanlab Inc.
+# Copyright (C) 2017-2023 Cleanlab Inc.
# This file is part of cleanlab.
#
# cleanlab is free software: you can redistribute it and/or modify
@@ -25,14 +25,17 @@
* An analogous label quality score for each individual label chosen by one annotator for a particular example.
* An overall quality score for each annotator which measures our confidence in the overall correctness of labels obtained from this annotator.
-The underlying algorithms used to compute the statistics are described in `the CROWDLAB paper `_.
+The algorithms to compute these estimates are described in `the CROWDLAB paper `_.
If you have some labeled and unlabeled data (with multiple annotators for some labeled examples) and want to decide what data to collect additional labels for,
use the :py:func:`get_active_learning_scores ` function, which is intended for active learning.
-This function estimates an active learning quality score for each example,
+This function estimates an ActiveLab quality score for each example,
which can be used to prioritize which examples are most informative to collect additional labels for.
-This function is effective for settings where some examples have been labeled by one or more annotators and other examples can have no labels at all so far,
-as well as settings where new labels are collected either in batches of examples or one at a time.
+This function is effective for settings where some examples have been labeled by one or more annotators and other examples can have no labels at all so far,
+as well as settings where new labels are collected either in batches of examples or one at a time.
+Here is an `example notebook `_ showcasing the use of this ActiveLab method for active learning with data re-labeling.
+
+The algorithms to compute these active learning scores are described in `the ActiveLab paper `_.
Each of the main functions in this module utilizes any trained classifier model.
Variants of these functions are provided for settings where you have trained an ensemble of multiple models.
@@ -46,6 +49,8 @@
from cleanlab.rank import get_label_quality_scores
from cleanlab.internal.util import get_num_classes, value_counts
+from cleanlab.internal.constants import CLIPPING_LOWER_BOUND
+
from cleanlab.internal.multiannotator_utils import (
assert_valid_inputs_multiannotator,
assert_valid_pred_probs,
@@ -68,7 +73,7 @@ def get_label_quality_multiannotator(
verbose: bool = True,
label_quality_score_kwargs: dict = {},
) -> Dict[str, Any]:
- """Returns label quality scores for each example and for each annotator.
+ """Returns label quality scores for each example and for each annotator in a dataset labeled by multiple annotators.
This function is for multiclass classification datasets where examples have been labeled by
multiple annotators (not necessarily the same number of annotators per example).
@@ -76,14 +81,14 @@ def get_label_quality_multiannotator(
It computes one consensus label for each example that best accounts for the labels chosen by each
annotator (and their quality), as well as a consensus quality score for how confident we are that this consensus label is actually correct.
It also computes similar quality scores for each annotator's individual labels, and the quality of each annotator.
- Scores are between 0 and 1; lower scores indicate labels/annotators less likely to be correct.
+ Scores are between 0 and 1 (estimated via methods like CROWDLAB); lower scores indicate labels/annotators less likely to be correct.
To decide what data to collect additional labels for, try the :py:func:`get_active_learning_scores `
- function, which is intended for active learning with multiple annotators.
+ (ActiveLab) function, which is intended for active learning with multiple annotators.
Parameters
----------
- labels_multiannotator : pd.DataFrame of np.ndarray
+ labels_multiannotator : pd.DataFrame or np.ndarray
2D pandas DataFrame or array of multiple given labels for each example with shape ``(N, M)``,
where N is the number of examples and M is the number of annotators.
``labels_multiannotator[n][m]`` = label for n-th example given by m-th annotator.
@@ -358,7 +363,7 @@ def get_label_quality_multiannotator_ensemble(
Parameters
----------
- labels_multiannotator : pd.DataFrame of np.ndarray
+ labels_multiannotator : pd.DataFrame or np.ndarray
Multiannotator labels in the same format expected by :py:func:`get_label_quality_multiannotator `.
pred_probs : np.ndarray
An array of shape ``(P, N, K)`` where P is the number of models, consisting of predicted class probabilities from the ensemble models.
@@ -534,13 +539,13 @@ def get_label_quality_multiannotator_ensemble(
def get_active_learning_scores(
- labels_multiannotator: Union[pd.DataFrame, np.ndarray],
- pred_probs: np.ndarray,
+ labels_multiannotator: Optional[Union[pd.DataFrame, np.ndarray]] = None,
+ pred_probs: Optional[np.ndarray] = None,
pred_probs_unlabeled: Optional[np.ndarray] = None,
) -> Tuple[np.ndarray, np.ndarray]:
- """Returns an active learning quality score for each example in the dataset.
+ """Returns an ActiveLab quality score for each example in the dataset, to estimate which examples are most informative to (re)label next in active learning.
- We consider settings where one example can be labeled by multiple annotators and some examples have no labels at all so far.
+ We consider settings where one example can be labeled by one or more annotators and some examples have no labels at all so far.
The score is in between 0 and 1, and can be used to prioritize what data to collect additional labels for.
Lower scores indicate examples whose true label we are least confident about based on the current data;
@@ -548,27 +553,35 @@ def get_active_learning_scores(
To use an annotation budget most efficiently, select a batch of examples with the lowest scores and collect one additional label for each example,
and repeat this process after retraining your classifier.
+ You can use this function to get active learning scores for: examples that already have one or more labels (specify ``labels_multiannotator`` and ``pred_probs``
+ as arguments), unlabeled examples (specify ``pred_probs_unlabeled``), or both types of examples (specify all of the above arguments).
+
To analyze a fixed dataset labeled by multiple annotators rather than collecting additional labels, try the
- :py:func:`get_label_quality_multiannotator ` function instead.
+ :py:func:`get_label_quality_multiannotator ` (CROWDLAB) function instead.
Parameters
----------
- labels_multiannotator : pd.DataFrame of np.ndarray
+ labels_multiannotator : pd.DataFrame or np.ndarray, optional
2D pandas DataFrame or array of multiple given labels for each example with shape ``(N, M)``,
- where N is the number of examples and M is the number of annotators.
+ where N is the number of examples and M is the number of annotators. Note that this function also works with
+ datasets where there is only one annotator (M=1).
For more details, labels in the same format expected by the :py:func:`get_label_quality_multiannotator `.
Note that examples that have no annotator labels should not be included in this DataFrame/array.
- pred_probs : np.ndarray
+ This argument is optional if ``pred_probs`` is not provided (you might provide only ``pred_probs_unlabeled`` to get active learning scores just for the unlabeled examples).
+ pred_probs : np.ndarray, optional
An array of shape ``(N, K)`` of predicted class probabilities from a trained classifier model.
Predicted probabilities in the same format expected by the :py:func:`get_label_quality_scores `.
+ This argument is optional if you only want to get active learning scores for unlabeled examples (specify only ``pred_probs_unlabeled`` instead).
pred_probs_unlabeled : np.ndarray, optional
An array of shape ``(N, K)`` of predicted class probabilities from a trained classifier model for examples that have no annotator labels.
Predicted probabilities in the same format expected by the :py:func:`get_label_quality_scores `.
+ This argument is optional if you only want to get active learning scores for already-labeled examples (specify only ``pred_probs`` instead).
Returns
-------
active_learning_scores : np.ndarray
- Array of shape ``(N,)`` indicating the active learning quality scores for each example.
+ Array of shape ``(N,)`` indicating the ActiveLab quality scores for each example.
+ This array is empty if no already-labeled data was provided via ``labels_multiannotator``.
Examples with the lowest scores are those we should label next in order to maximally improve our classifier model.
active_learning_scores_unlabeled : np.ndarray
@@ -578,58 +591,83 @@ def get_active_learning_scores(
(scores for unlabeled data are directly comparable with the `active_learning_scores` for labeled data).
"""
- if isinstance(labels_multiannotator, np.ndarray):
- labels_multiannotator = pd.DataFrame(labels_multiannotator)
-
assert_valid_pred_probs(pred_probs=pred_probs, pred_probs_unlabeled=pred_probs_unlabeled)
- num_classes = get_num_classes(pred_probs=pred_probs)
+ # compute multiannotator stats if labeled data is provided
+ if pred_probs is not None:
+ if labels_multiannotator is None:
+ raise ValueError(
+ "labels_multiannotator cannot be None when passing in pred_probs. ",
+ "Either provide labels_multiannotator to obtain active learning scores for the labeled examples, "
+ "or just pass in pred_probs_unlabeled to get active learning scores for unlabeled examples.",
+ )
- optimal_temp = find_best_temp_scaler(labels_multiannotator, pred_probs)
- pred_probs = temp_scale_pred_probs(pred_probs, optimal_temp)
+ if isinstance(labels_multiannotator, np.ndarray):
+ labels_multiannotator = pd.DataFrame(labels_multiannotator)
- # if all examples are only labeled by a single annotator
- if labels_multiannotator.apply(lambda s: len(s.dropna()) == 1, axis=1).all():
- assert_valid_inputs_multiannotator(
- labels_multiannotator, pred_probs, allow_single_label=True
- )
+ num_classes = get_num_classes(pred_probs=pred_probs)
- consensus_label = get_majority_vote_label(
- labels_multiannotator=labels_multiannotator,
- pred_probs=pred_probs,
- verbose=False,
- )
- quality_of_consensus_labeled = get_label_quality_scores(consensus_label, pred_probs)
+ # if all examples are only labeled by a single annotator
+ if labels_multiannotator.apply(lambda s: len(s.dropna()) == 1, axis=1).all():
+ optimal_temp = 1.0 # no temperature scaling in the single-annotator case; defined here for later use
+
+ assert_valid_inputs_multiannotator(
+ labels_multiannotator, pred_probs, allow_single_label=True
+ )
+
+ consensus_label = get_majority_vote_label(
+ labels_multiannotator=labels_multiannotator,
+ pred_probs=pred_probs,
+ verbose=False,
+ )
+ quality_of_consensus_labeled = get_label_quality_scores(consensus_label, pred_probs)
+ model_weight = 1
+ annotator_weight = np.full(labels_multiannotator.shape[1], 1)
+ avg_annotator_weight = np.mean(annotator_weight)
+
+ # examples are annotated by multiple annotators
+ else:
+ optimal_temp = find_best_temp_scaler(labels_multiannotator, pred_probs)
+ pred_probs = temp_scale_pred_probs(pred_probs, optimal_temp)
+
+ multiannotator_info = get_label_quality_multiannotator(
+ labels_multiannotator,
+ pred_probs,
+ return_annotator_stats=False,
+ return_detailed_quality=False,
+ return_weights=True,
+ )
+
+ quality_of_consensus_labeled = multiannotator_info["label_quality"][
+ "consensus_quality_score"
+ ]
+ model_weight = multiannotator_info["model_weight"]
+ annotator_weight = multiannotator_info["annotator_weight"]
+ avg_annotator_weight = np.mean(annotator_weight)
+
+ # compute scores for labeled data
+ active_learning_scores = np.full(len(labels_multiannotator), np.nan)
+ for i in range(len(active_learning_scores)):
+ annotator_labels = labels_multiannotator.iloc[i]
+ active_learning_scores[i] = np.average(
+ (quality_of_consensus_labeled[i], 1 / num_classes),
+ weights=(
+ np.sum(annotator_weight[annotator_labels.notna()]) + model_weight,
+ avg_annotator_weight,
+ ),
+ )
+
+ # no labeled data provided so do not estimate temperature and model/annotator weights
+ elif pred_probs_unlabeled is not None:
+ num_classes = get_num_classes(pred_probs=pred_probs_unlabeled)
+ optimal_temp = 1
model_weight = 1
- annotator_weight = np.full(labels_multiannotator.shape[1], 1)
- avg_annotator_weight = np.mean(annotator_weight)
+ avg_annotator_weight = 1
+ active_learning_scores = np.array([])
else:
- multiannotator_info = get_label_quality_multiannotator(
- labels_multiannotator,
- pred_probs,
- return_annotator_stats=False,
- return_detailed_quality=False,
- return_weights=True,
- )
-
- quality_of_consensus_labeled = multiannotator_info["label_quality"][
- "consensus_quality_score"
- ]
- model_weight = multiannotator_info["model_weight"]
- annotator_weight = multiannotator_info["annotator_weight"]
- avg_annotator_weight = np.mean(annotator_weight)
-
- # compute scores for labeled data
- active_learning_scores = np.full(len(labels_multiannotator), np.nan)
- for i in range(len(active_learning_scores)):
- annotator_labels = labels_multiannotator.iloc[i]
- active_learning_scores[i] = np.average(
- (quality_of_consensus_labeled[i], 1 / num_classes),
- weights=(
- np.sum(annotator_weight[annotator_labels.notna()]) + model_weight,
- avg_annotator_weight,
- ),
+ raise ValueError(
+ "pred_probs and pred_probs_unlabeled cannot both be None, specify at least one of the two."
)
# compute scores for unlabeled data
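Because ``labels_multiannotator`` and ``pred_probs`` are now optional, a fully unlabeled pool can be scored by passing only ``pred_probs_unlabeled``; the labeled-score array comes back empty in that case. A hedged sketch (the predicted probabilities below are made up):

```python
import numpy as np
from cleanlab.multiannotator import get_active_learning_scores

# Predicted class probabilities for 3 unlabeled examples and 2 classes (illustrative values).
pred_probs_unlabeled = np.array([
    [0.55, 0.45],
    [0.95, 0.05],
    [0.50, 0.50],
])

scores_labeled, scores_unlabeled = get_active_learning_scores(
    pred_probs_unlabeled=pred_probs_unlabeled
)
print(scores_labeled)    # array([]) -- no labeled data was provided
print(scores_unlabeled)  # lowest-scoring examples are the most informative to label next
```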
@@ -655,26 +693,30 @@ def get_active_learning_scores(
def get_active_learning_scores_ensemble(
- labels_multiannotator: Union[pd.DataFrame, np.ndarray],
- pred_probs: np.ndarray,
+ labels_multiannotator: Optional[Union[pd.DataFrame, np.ndarray]] = None,
+ pred_probs: Optional[np.ndarray] = None,
pred_probs_unlabeled: Optional[np.ndarray] = None,
) -> Tuple[np.ndarray, np.ndarray]:
- """Returns an active learning quality score for each example in the dataset, based on predictions from an ensemble of models.
+ """Returns an ActiveLab quality score for each example in the dataset, based on predictions from an ensemble of models.
This function is similar to :py:func:`get_active_learning_scores ` but allows for an
- ensemble of multiple classifier models to be trained and will aggregate predictions from the models to compute the active learning quality score.
+ ensemble of multiple classifier models to be trained and will aggregate predictions from the models to compute the ActiveLab quality score.
Parameters
----------
labels_multiannotator : pd.DataFrame or np.ndarray
Multiannotator labels in the same format expected by :py:func:`get_active_learning_scores `.
+ This argument is optional if ``pred_probs`` is not provided (in cases where you only provide ``pred_probs_unlabeled`` to get active learning scores for unlabeled examples).
pred_probs : np.ndarray
An array of shape ``(P, N, K)`` where P is the number of models, consisting of predicted class probabilities from the ensemble models.
+ Note that this function also works with datasets where there is only one annotator (M=1).
Each set of predicted probabilities with shape ``(N, K)`` is in the same format expected by the :py:func:`get_label_quality_scores `.
+ This argument is optional if you only want to get active learning scores for unlabeled examples (pass in ``pred_probs_unlabeled`` instead).
pred_probs_unlabeled : np.ndarray, optional
An array of shape ``(P, N, K)`` where P is the number of models, consisting of predicted class probabilities from a trained classifier model
for examples that have no annotated labels so far (but which we may want to label in the future, and hence compute active learning quality scores for).
Each set of predicted probabilities with shape ``(N, K)`` is in the same format expected by the :py:func:`get_label_quality_scores `.
+ This argument is optional if you only want to get active learning scores for labeled examples (pass in ``pred_probs`` instead).
Returns
-------
@@ -688,66 +730,91 @@ def get_active_learning_scores_ensemble(
get_active_learning_scores
"""
- if isinstance(labels_multiannotator, np.ndarray):
- labels_multiannotator = pd.DataFrame(labels_multiannotator)
-
assert_valid_pred_probs(
pred_probs=pred_probs, pred_probs_unlabeled=pred_probs_unlabeled, ensemble=True
)
- num_classes = get_num_classes(pred_probs=pred_probs[0])
+ # compute multiannotator stats if labeled data is provided
+ if pred_probs is not None:
+ if labels_multiannotator is None:
+ raise ValueError(
+ "labels_multiannotator cannot be None when passing in pred_probs. ",
+ "You can either provide labels_multiannotator to obtain active learning scores for the labeled examples, "
+ "or just pass in pred_probs_unlabeled to get active learning scores for unlabeled examples.",
+ )
- # temp scale pred_probs
- optimal_temp = np.full(len(pred_probs), np.NaN)
- for i in range(len(pred_probs)):
- curr_pred_probs = pred_probs[i]
- curr_optimal_temp = find_best_temp_scaler(labels_multiannotator, curr_pred_probs)
- pred_probs[i] = temp_scale_pred_probs(curr_pred_probs, curr_optimal_temp)
- optimal_temp[i] = curr_optimal_temp
-
- # if all examples are only labeled by a single annotator
- if labels_multiannotator.apply(lambda s: len(s.dropna()) == 1, axis=1).all():
- assert_valid_inputs_multiannotator(
- labels_multiannotator, pred_probs, ensemble=True, allow_single_label=True
- )
+ if isinstance(labels_multiannotator, np.ndarray):
+ labels_multiannotator = pd.DataFrame(labels_multiannotator)
- avg_pred_probs = np.mean(pred_probs, axis=0)
- consensus_label = get_majority_vote_label(
- labels_multiannotator=labels_multiannotator,
- pred_probs=avg_pred_probs,
- verbose=False,
- )
- quality_of_consensus_labeled = get_label_quality_scores(consensus_label, avg_pred_probs)
- model_weight = np.full(len(pred_probs), 1)
- annotator_weight = np.full(labels_multiannotator.shape[1], 1)
- avg_annotator_weight = np.mean(annotator_weight)
+ num_classes = get_num_classes(pred_probs=pred_probs[0])
- else:
- multiannotator_info = get_label_quality_multiannotator_ensemble(
- labels_multiannotator,
- pred_probs,
- return_annotator_stats=False,
- return_detailed_quality=False,
- return_weights=True,
- )
+ # if all examples are only labeled by a single annotator
+ if labels_multiannotator.apply(lambda s: len(s.dropna()) == 1, axis=1).all():
+ # no temperature scaling in the single-annotator case; temperature is defined here for later use
+ optimal_temp = np.full(len(pred_probs), 1.0)
- quality_of_consensus_labeled = multiannotator_info["label_quality"][
- "consensus_quality_score"
- ]
- model_weight = multiannotator_info["model_weight"]
- annotator_weight = multiannotator_info["annotator_weight"]
- avg_annotator_weight = np.mean(annotator_weight)
-
- # compute scores for labeled data
- active_learning_scores = np.full(len(labels_multiannotator), np.nan)
- for i in range(len(active_learning_scores)):
- annotator_labels = labels_multiannotator.iloc[i]
- active_learning_scores[i] = np.average(
- (quality_of_consensus_labeled[i], 1 / num_classes),
- weights=(
- np.sum(annotator_weight[annotator_labels.notna()]) + np.sum(model_weight),
- avg_annotator_weight,
- ),
+ assert_valid_inputs_multiannotator(
+ labels_multiannotator, pred_probs, ensemble=True, allow_single_label=True
+ )
+
+ avg_pred_probs = np.mean(pred_probs, axis=0)
+ consensus_label = get_majority_vote_label(
+ labels_multiannotator=labels_multiannotator,
+ pred_probs=avg_pred_probs,
+ verbose=False,
+ )
+ quality_of_consensus_labeled = get_label_quality_scores(consensus_label, avg_pred_probs)
+ model_weight = np.full(len(pred_probs), 1)
+ annotator_weight = np.full(labels_multiannotator.shape[1], 1)
+ avg_annotator_weight = np.mean(annotator_weight)
+
+ # examples are annotated by multiple annotators
+ else:
+ optimal_temp = np.full(len(pred_probs), np.NaN)
+ for i in range(len(pred_probs)):
+ curr_pred_probs = pred_probs[i]
+ curr_optimal_temp = find_best_temp_scaler(labels_multiannotator, curr_pred_probs)
+ pred_probs[i] = temp_scale_pred_probs(curr_pred_probs, curr_optimal_temp)
+ optimal_temp[i] = curr_optimal_temp
+
+ multiannotator_info = get_label_quality_multiannotator_ensemble(
+ labels_multiannotator,
+ pred_probs,
+ return_annotator_stats=False,
+ return_detailed_quality=False,
+ return_weights=True,
+ )
+
+ quality_of_consensus_labeled = multiannotator_info["label_quality"][
+ "consensus_quality_score"
+ ]
+ model_weight = multiannotator_info["model_weight"]
+ annotator_weight = multiannotator_info["annotator_weight"]
+ avg_annotator_weight = np.mean(annotator_weight)
+
+ # compute scores for labeled data
+ active_learning_scores = np.full(len(labels_multiannotator), np.nan)
+ for i in range(len(active_learning_scores)):
+ annotator_labels = labels_multiannotator.iloc[i]
+ active_learning_scores[i] = np.average(
+ (quality_of_consensus_labeled[i], 1 / num_classes),
+ weights=(
+ np.sum(annotator_weight[annotator_labels.notna()]) + np.sum(model_weight),
+ avg_annotator_weight,
+ ),
+ )
+
+ # no labeled data provided so do not estimate temperature and model/annotator weights
+ elif pred_probs_unlabeled is not None:
+ num_classes = get_num_classes(pred_probs=pred_probs_unlabeled[0])
+ optimal_temp = np.full(len(pred_probs_unlabeled), 1.0)
+ model_weight = np.full(len(pred_probs_unlabeled), 1)
+ avg_annotator_weight = 1
+ active_learning_scores = np.array([])
+
+ else:
+ raise ValueError(
+ "pred_probs and pred_probs_unlabeled cannot both be None, specify at least one of the two."
)
# compute scores for unlabeled data
@@ -848,6 +915,7 @@ def get_majority_vote_label(
tied_idx[idx] = label_mode[max_pred_probs]
# tiebreak 2: using empirical class frequencies
+ # this tiebreak selects the least frequent of the tied classes (to avoid amplifying class imbalance)
if len(tied_idx) > 0:
if pred_probs is not None:
num_classes = pred_probs.shape[1]
@@ -859,14 +927,14 @@ def get_majority_vote_label(
lambda s: pd.Series(np.bincount(s[s.notna()], minlength=num_classes)), axis=1
).sum()
for idx, label_mode in tied_idx.copy().items():
- max_frequency = np.where(
- class_frequencies[label_mode] == np.max(class_frequencies[label_mode])
+ min_frequency = np.where(
+ class_frequencies[label_mode] == np.min(class_frequencies[label_mode])
)[0]
- if len(max_frequency) == 1:
- majority_vote_label[idx] = label_mode[max_frequency[0]]
+ if len(min_frequency) == 1:
+ majority_vote_label[idx] = label_mode[min_frequency[0]]
del tied_idx[idx]
else:
- tied_idx[idx] = label_mode[max_frequency]
+ tied_idx[idx] = label_mode[min_frequency]
# tiebreak 3: using initial annotator quality scores
if len(tied_idx) > 0:
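The hunk above flips tiebreak 2 from the most frequent to the least frequent of the tied classes, so repeated ties no longer keep inflating the already-dominant class. The selection boils down to something like the following standalone sketch (not the function's exact internals):

```python
import numpy as np

class_frequencies = np.array([40, 25, 35])  # empirical counts of each class across the dataset (made up)
label_mode = np.array([0, 2])               # classes tied for the majority vote on one example

# Old behavior picked the tied class with the highest overall frequency (class 0 here);
# the new behavior picks the tied class with the lowest overall frequency (class 2).
min_frequency = np.where(
    class_frequencies[label_mode] == np.min(class_frequencies[label_mode])
)[0]
if len(min_frequency) == 1:
    majority_vote_label = label_mode[min_frequency[0]]
    print(majority_vote_label)  # 2
```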
@@ -875,7 +943,13 @@ def get_majority_vote_label(
annotator_agreement_with_consensus = nontied_labels_multiannotator.apply(
lambda s: np.mean(s[pd.notna(s)] == nontied_majority_vote_label[pd.notna(s)]),
axis=0,
- ).to_numpy()
+ )
+
+ # impute the average annotator accuracy for any annotator whose labels do not overlap with the consensus
+ mask = annotator_agreement_with_consensus.isna()
+ avg_annotator_agreement = np.mean(annotator_agreement_with_consensus[~mask])
+ annotator_agreement_with_consensus[mask] = avg_annotator_agreement
+
for idx, label_mode in tied_idx.copy().items():
label_quality_score = np.array(
[
@@ -1221,7 +1295,6 @@ def _get_post_pred_probs_and_weights(
quality_method: str = "crowdlab",
verbose: bool = True,
) -> Tuple[np.ndarray, Any, Any]:
-
"""Return the posterior predicted probabilities of each example given a specified quality method.
Parameters
@@ -1286,7 +1359,7 @@ def _get_post_pred_probs_and_weights(
consensus_label_subset
!= np.argmax(np.bincount(consensus_label_subset, minlength=num_classes))
),
- a_min=1e-6,
+ a_min=CLIPPING_LOWER_BOUND,
a_max=None,
)
@@ -1296,14 +1369,14 @@ def _get_post_pred_probs_and_weights(
)
annotator_error = 1 - annotator_agreement_with_annotators
adjusted_annotator_agreement = np.clip(
- 1 - (annotator_error / most_likely_class_error), a_min=1e-6, a_max=None
+ 1 - (annotator_error / most_likely_class_error), a_min=CLIPPING_LOWER_BOUND, a_max=None
)
# compute model weight
model_error = np.mean(np.argmax(prior_pred_probs_subset, axis=1) != consensus_label_subset)
- model_weight = np.max([(1 - (model_error / most_likely_class_error)), 1e-6]) * np.sqrt(
- np.mean(num_annotations)
- )
+ model_weight = np.max(
+ [(1 - (model_error / most_likely_class_error)), CLIPPING_LOWER_BOUND]
+ ) * np.sqrt(np.mean(num_annotations))
# compute weighted average
post_pred_probs = np.full(prior_pred_probs.shape, np.nan)
@@ -1412,7 +1485,7 @@ def _get_post_pred_probs_and_weights_ensemble(
consensus_label_subset
!= np.argmax(np.bincount(consensus_label_subset, minlength=num_classes))
),
- a_min=1e-6,
+ a_min=CLIPPING_LOWER_BOUND,
a_max=None,
)
@@ -1422,7 +1495,7 @@ def _get_post_pred_probs_and_weights_ensemble(
)
annotator_error = 1 - annotator_agreement_with_annotators
adjusted_annotator_agreement = np.clip(
- 1 - (annotator_error / most_likely_class_error), a_min=1e-6, a_max=None
+ 1 - (annotator_error / most_likely_class_error), a_min=CLIPPING_LOWER_BOUND, a_max=None
)
# compute model weight
@@ -1431,9 +1504,9 @@ def _get_post_pred_probs_and_weights_ensemble(
prior_pred_probs_subset = prior_pred_probs[idx][mask]
model_error = np.mean(np.argmax(prior_pred_probs_subset, axis=1) != consensus_label_subset)
- model_weight[idx] = np.max([(1 - (model_error / most_likely_class_error)), 1e-6]) * np.sqrt(
- np.mean(num_annotations)
- )
+ model_weight[idx] = np.max(
+ [(1 - (model_error / most_likely_class_error)), CLIPPING_LOWER_BOUND]
+ ) * np.sqrt(np.mean(num_annotations))
# compute weighted average
post_pred_probs = np.full(prior_pred_probs[0].shape, np.nan)
diff --git a/cleanlab/multilabel_classification.py b/cleanlab/multilabel_classification.py
deleted file mode 100644
index 45307adbfa..0000000000
--- a/cleanlab/multilabel_classification.py
+++ /dev/null
@@ -1,120 +0,0 @@
-# Copyright (C) 2017-2022 Cleanlab Inc.
-# This file is part of cleanlab.
-#
-# cleanlab is free software: you can redistribute it and/or modify
-# it under the terms of the GNU Affero General Public License as published
-# by the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# cleanlab is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Affero General Public License for more details.
-#
-# You should have received a copy of the GNU Affero General Public License
-# along with cleanlab. If not, see .
-
-"""
-Methods to rank the severity of label issues in multi-label classification datasets.
-Here each example can belong to one or more classes, or none of the classes at all.
-Unlike in standard multi-class classification, predicted class probabilities from model need not sum to 1 for each row in multi-label classification.
-"""
-
-import numpy as np
-from typing import List
-
-from cleanlab.internal.validation import assert_valid_inputs
-from cleanlab.internal.util import get_num_classes
-from cleanlab.internal.multilabel_scorer import MultilabelScorer, ClassLabelScorer, Aggregator
-from cleanlab.internal.multilabel_utils import int2onehot
-
-
-def get_label_quality_scores(
- labels: List,
- pred_probs: np.ndarray,
- *,
- method: str = "self_confidence",
- adjust_pred_probs: bool = False,
- aggregator_kwargs: dict = {"method": "exponential_moving_average", "alpha": 0.8}
-) -> np.ndarray:
- """Computes a label quality score each example in a multi-label classification dataset.
-
- Scores are between 0 and 1 with lower scores indicating examples whose label more likely contains an error.
- For each example, this method internally computes a separate score for each individual class
- and then aggregates these per-class scores into an overall label quality score for the example.
-
- To estimate exactly which examples are mislabeled in a multi-label classification dataset,
- you can also use :py:func:`filter.find_label_issues ` with argument ``multi_label=True``.
-
- Parameters
- ----------
- labels : List[List[int]]
- Multi-label classification labels for each example, which is allowed to belong to multiple classes.
- The i-th element of `labels` corresponds to list of classes that i-th example belongs to (e.g. ``labels = [[1,2],[1],[0],..]``).
-
- Important
- ---------
- *Format requirements*: For dataset with K classes, individual class labels must be integers in 0, 1, ..., K-1.
-
- pred_probs : np.ndarray
- An array of shape ``(N, K)`` of model-predicted probabilities,
- ``P(label=k|x)``. Each row of this matrix corresponds
- to an example `x` and contains the model-predicted probabilities that
- `x` belongs to each possible class, for each of the K classes. The
- columns must be ordered such that these probabilities correspond to
- class 0, 1, ..., K-1. In multi-label classification, the rows of `pred_probs` need not sum to 1.
-
- Note
- ----
- Estimated label quality scores are most accurate when they are computed based on out-of-sample ``pred_probs`` from your model.
- To obtain out-of-sample predicted probabilities for every example in your dataset, you can use :ref:`cross-validation `.
- This is encouraged to get better results.
-
- method : {"self_confidence", "normalized_margin", "confidence_weighted_entropy"}, default = "self_confidence"
- Method to calculate separate per class annotation scores that are subsequently aggregated to form an overall label quality score.
- These scores are separately calculated for each class based on the corresponding column of `pred_probs` in a one-vs-rest manner,
- and are standard label quality scores for multi-class classification.
-
- See also
- --------
- :py:func:`rank.get_label_quality_scores ` function for details about each option.
-
- adjust_pred_probs : bool, default = False
- Account for class imbalance in the label-quality scoring by adjusting predicted probabilities
- via subtraction of class confident thresholds and renormalization.
- Set this to ``True`` if you prefer to account for class-imbalance.
- See `Northcutt et al., 2021 `_.
-
- aggregator_kwargs : dict, default = {"method": "exponential_moving_average", "alpha": 0.8}
- A dictionary of hyperparameter values for aggregating per class scores into an overall label quality score for each example.
- Options for ``"method"`` include: ``"exponential_moving_average"`` or ``"softmin"`` or your own callable function.
- See :py:class:`internal.multilabel_scorer.Aggregator ` for details about each option and other possible hyperparameters.
-
- Returns
- -------
- label_quality_scores : np.ndarray
- A 1D array of shape ``(N,)`` with a label quality score (between 0 and 1) for each example in the dataset.
- Lower scores indicate examples whose label is more likely to contain annotation errors.
-
-
- Examples
- --------
- >>> from cleanlab.multilabel_classification import get_label_quality_scores
- >>> import numpy as np
- >>> labels = [[1], [0,2]]
- >>> pred_probs = np.array([[0.1, 0.9, 0.1], [0.4, 0.1, 0.9]])
- >>> scores = get_label_quality_scores(labels, pred_probs)
- >>> scores
- array([0.9, 0.5])
- """
-
- assert_valid_inputs(
- X=None, y=labels, pred_probs=pred_probs, multi_label=True, allow_one_class=True
- )
- num_classes = get_num_classes(labels=labels, pred_probs=pred_probs, multi_label=True)
- binary_labels = int2onehot(labels, K=num_classes)
- base_scorer = ClassLabelScorer.from_str(method)
- base_scorer_kwargs = {"adjust_pred_probs": adjust_pred_probs}
- aggregator = Aggregator(**aggregator_kwargs)
- scorer = MultilabelScorer(base_scorer, aggregator)
- return scorer(binary_labels, pred_probs, base_scorer_kwargs=base_scorer_kwargs)
diff --git a/cleanlab/multilabel_classification/__init__.py b/cleanlab/multilabel_classification/__init__.py
new file mode 100644
index 0000000000..fcddecdb5d
--- /dev/null
+++ b/cleanlab/multilabel_classification/__init__.py
@@ -0,0 +1,4 @@
+from .rank import get_label_quality_scores
+from . import rank
+from . import dataset
+from . import filter
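
A minimal usage sketch (illustrative, not part of the patch) of the reorganized package, based only on the re-exports declared in this `__init__.py`; the toy `labels`/`pred_probs` values are placeholders.

import numpy as np
from cleanlab.multilabel_classification import get_label_quality_scores  # re-exported from .rank
from cleanlab.multilabel_classification import rank, dataset, filter as ml_filter  # submodules

labels = [[1], [0, 2]]                                      # toy multi-label annotations
pred_probs = np.array([[0.1, 0.9, 0.1], [0.4, 0.1, 0.9]])   # rows need not sum to 1

scores = get_label_quality_scores(labels, pred_probs)       # one score per example, shape (2,)
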
diff --git a/cleanlab/multilabel_classification/dataset.py b/cleanlab/multilabel_classification/dataset.py
new file mode 100644
index 0000000000..1f040a98bc
--- /dev/null
+++ b/cleanlab/multilabel_classification/dataset.py
@@ -0,0 +1,342 @@
+# Copyright (C) 2017-2023 Cleanlab Inc.
+# This file is part of cleanlab.
+#
+# cleanlab is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published
+# by the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# cleanlab is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with cleanlab. If not, see <https://www.gnu.org/licenses/>.
+
+"""
+Methods to summarize overall labeling issues across a multi-label classification dataset.
+Here each example can belong to one or more classes, or none of the classes at all.
+Unlike in standard multi-class classification, model-predicted class probabilities need not sum to 1 for each row in multi-label classification.
+"""
+
+import pandas as pd
+import numpy as np
+from typing import Optional, cast, Dict, Any # noqa: F401
+from cleanlab.multilabel_classification.filter import (
+ find_multilabel_issues_per_class,
+ find_label_issues,
+)
+from cleanlab.internal.multilabel_utils import get_onehot_num_classes
+from collections import defaultdict
+
+
+def common_multilabel_issues(
+ labels=None,
+ pred_probs=None,
+ *,
+ class_names=None,
+ confident_joint=None,
+) -> pd.DataFrame:
+ """Summarizes which classes in a multi-label dataset appear most often mislabeled overall.
+
+ Since classes are not mutually exclusive in multi-label classification, this method summarizes the label issues for each class independently of the others.
+
+ Parameters
+ ----------
+ labels : List[List[int]]
+ List of noisy labels for multi-label classification where each example can belong to multiple classes.
+ Refer to documentation for this argument in :py:func:`multilabel_classification.filter.find_label_issues ` for further details.
+
+ pred_probs : np.ndarray
+ An array of shape ``(N, K)`` of model-predicted class probabilities.
+ Refer to documentation for this argument in :py:func:`multilabel_classification.filter.find_label_issues ` for further details.
+
+ class_names : Iterable[str], optional
+ A list or other iterable of the string class names. Its order must match the label indices.
+ If class 0 is 'dog' and class 1 is 'cat', then ``class_names = ['dog', 'cat']``.
+ If provided, the returned DataFrame will have an extra *Class Name* column with this info.
+
+ confident_joint : np.ndarray, optional
+ An array of shape ``(K, 2, 2)`` representing a one-vs-rest formatted confident joint.
+ Refer to documentation for this argument in :py:func:`multilabel_classification.filter.find_label_issues ` for details.
+
+ Returns
+ -------
+ common_multilabel_issues : pd.DataFrame
+ DataFrame where each row corresponds to a class summarized by the following columns:
+ - *Class Name*: The name of the class if class_names is provided.
+ - *Class Index*: The index of the class.
+ - *In Given Label*: Whether the Class is originally annotated True or False in the given label.
+ - *In Suggested Label*: Whether the Class should be True or False in the suggested label (based on model's prediction).
+ - *Num Examples*: Number of examples flagged as a label issue where this Class is True/False "In Given Label" but cleanlab estimates the annotation should actually be as specified "In Suggested Label". I.e. the number of examples in your dataset where this Class was labeled as True but likely should have been False (or vice versa).
+ - *Issue Probability*: The *Num Examples* column divided by the total number of examples in the dataset; i.e. the relative overall frequency of each type of label issue in your dataset.
+
+ By default, the rows in this DataFrame are ordered by "Issue Probability" (descending).
+ """
+
+ num_examples = _get_num_examples_multilabel(labels=labels, confident_joint=confident_joint)
+ summary_issue_counts = defaultdict(list)
+ y_one, num_classes = get_onehot_num_classes(labels, pred_probs)
+ label_issues_list, labels_list, pred_probs_list = find_multilabel_issues_per_class(
+ labels=labels,
+ pred_probs=pred_probs,
+ confident_joint=confident_joint,
+ return_indices_ranked_by="self_confidence",
+ )
+
+ for class_num, (label, issues_for_class) in enumerate(zip(y_one.T, label_issues_list)):
+ binary_label_issues = np.zeros(len(label)).astype(bool)
+ binary_label_issues[issues_for_class] = True
+ true_but_false_count = sum(np.logical_and(label == 1, binary_label_issues))
+ false_but_true_count = sum(np.logical_and(label == 0, binary_label_issues))
+
+ if class_names is not None:
+ summary_issue_counts["Class Name"].append(class_names[class_num])
+ summary_issue_counts["Class Index"].append(class_num)
+ summary_issue_counts["In Given Label"].append(True)
+ summary_issue_counts["In Suggested Label"].append(False)
+ summary_issue_counts["Num Examples"].append(true_but_false_count)
+ summary_issue_counts["Issue Probability"].append(true_but_false_count / num_examples)
+
+ if class_names is not None:
+ summary_issue_counts["Class Name"].append(class_names[class_num])
+ summary_issue_counts["Class Index"].append(class_num)
+ summary_issue_counts["In Given Label"].append(False)
+ summary_issue_counts["In Suggested Label"].append(True)
+ summary_issue_counts["Num Examples"].append(false_but_true_count)
+ summary_issue_counts["Issue Probability"].append(false_but_true_count / num_examples)
+ return (
+ pd.DataFrame.from_dict(summary_issue_counts)
+ .sort_values(by=["Issue Probability"], ascending=False)
+ .reset_index(drop=True)
+ )
+
+
+def rank_classes_by_multilabel_quality(
+ labels=None,
+ pred_probs=None,
+ *,
+ class_names=None,
+ joint=None,
+ confident_joint=None,
+) -> pd.DataFrame:
+ """
+ Returns a DataFrame with several overall label quality scores per class for a multi-label dataset.
+
+ These numbers summarize all examples annotated with the class (details listed below under the Returns parameter).
+ By default, classes are ordered by "Label Quality Score", so the most problematic classes are reported first in the DataFrame.
+
+ Score values are unnormalized and may be very small. What matters is their relative ranking across the classes.
+
+ **Parameters**:
+
+ For information about the arguments to this method, see the documentation of
+ :py:func:`common_multilabel_issues `.
+
+ Returns
+ -------
+ overall_label_quality : pd.DataFrame
+ Pandas DataFrame with one row per class and columns: "Class Index", "Label Issues",
+ "Inverse Label Issues", "Label Noise", "Inverse Label Noise", "Label Quality Score".
+ Some entries are overall quality scores between 0 and 1, summarizing how good overall the labels
+ appear to be for that class (lower values indicate more erroneous labels).
+ Other entries are estimated counts of annotation errors related to this class.
+
+ Here is what each column represents:
+ - *Class Name*: The name of the class if class_names is provided.
+ - *Class Index*: The index of the class in 0, 1, ..., K-1.
+ - *Label Issues*: Estimated number of examples in the dataset that are labeled as belonging to class k but actually should not belong to this class.
+ - *Inverse Label Issues*: Estimated number of examples in the dataset that should actually be labeled as class k but did not receive this label.
+ - *Label Noise*: Estimated proportion of examples in the dataset that are labeled as class k but should not be. For each class k: this is computed by dividing the number of examples with "Label Issues" that were labeled as class k by the total number of examples labeled as class k.
+ - *Inverse Label Noise*: Estimated proportion of examples in the dataset that should actually be labeled as class k but did not receive this label.
+ - *Label Quality Score*: Estimated proportion of examples labeled as class k that have been labeled correctly, i.e. ``1 - label_noise``.
+
+ By default, the DataFrame is ordered by "Label Quality Score" (in ascending order), so the classes with the most label issues appear first.
+ """
+
+ issues_df = common_multilabel_issues(
+ labels=labels, pred_probs=pred_probs, class_names=class_names, confident_joint=joint
+ )
+ issues_dict = defaultdict(defaultdict) # type: Dict[str, Any]
+ num_examples = _get_num_examples_multilabel(labels=labels, confident_joint=confident_joint)
+ return_columns = [
+ "Class Name",
+ "Class Index",
+ "Label Issues",
+ "Inverse Label Issues",
+ "Label Noise",
+ "Inverse Label Noise",
+ "Label Quality Score",
+ ]
+ if class_names is None:
+ return_columns = return_columns[1:]
+ for class_num, row in issues_df.iterrows():
+ if row["In Given Label"]:
+ if class_names is not None:
+ issues_dict[row["Class Index"]]["Class Name"] = row["Class Name"]
+ issues_dict[row["Class Index"]]["Label Issues"] = int(
+ row["Issue Probability"] * num_examples
+ )
+ issues_dict[row["Class Index"]]["Label Noise"] = row["Issue Probability"]
+ issues_dict[row["Class Index"]]["Label Quality Score"] = (
+ 1 - issues_dict[row["Class Index"]]["Label Noise"]
+ )
+ else:
+ if class_names is not None:
+ issues_dict[row["Class Index"]]["Class Name"] = row["Class Name"]
+ issues_dict[row["Class Index"]]["Inverse Label Issues"] = int(
+ row["Issue Probability"] * num_examples
+ )
+ issues_dict[row["Class Index"]]["Inverse Label Noise"] = row["Issue Probability"]
+
+ issues_df_dict = defaultdict(list)
+ for i in issues_dict:
+ issues_df_dict["Class Index"].append(i)
+ for j in issues_dict[i]:
+ issues_df_dict[j].append(issues_dict[i][j])
+ return (
+ pd.DataFrame.from_dict(issues_df_dict)
+ .sort_values(by="Label Quality Score", ascending=True)
+ .reset_index(drop=True)
+ )[return_columns]
+
+
+def _get_num_examples_multilabel(labels=None, confident_joint: Optional[np.ndarray] = None) -> int:
+ """Helper method that finds the number of examples from the parameters or throws an error
+ if neither parameter is provided.
+
+ Parameters
+ ----------
+ For parameter info, see the docstring of :py:func:`common_multilabel_issues `.
+
+ Returns
+ -------
+ num_examples : int
+ The number of examples in the dataset.
+
+ Raises
+ ------
+ ValueError
+ If both `labels` and `confident_joint` are None.
+ """
+
+ if labels is None and confident_joint is None:
+ raise ValueError(
+ "Error: num_examples is None. You must either provide confident_joint, "
+ "or provide both num_example and joint as input parameters."
+ )
+ _confident_joint = cast(np.ndarray, confident_joint)
+ num_examples = len(labels) if labels is not None else cast(int, np.sum(_confident_joint[0]))
+ return num_examples
+
+
+def overall_multilabel_health_score(
+ labels=None,
+ pred_probs=None,
+ *,
+ confident_joint=None,
+) -> float:
+ """Returns a single score between 0 and 1 measuring the overall quality of all labels in a multi-label classification dataset.
+ Intuitively, the score is the average correctness of the given labels across all examples in the
+ dataset. So a score of 1 suggests your data is perfectly labeled and a score of 0.5 suggests
+ half of the examples in the dataset may be incorrectly labeled. Thus, a higher
+ score implies a higher quality dataset.
+
+ **Parameters**: For information about the arguments to this method, see the documentation of
+ :py:func:`common_multilabel_issues `.
+
+ Returns
+ -------
+ health_score : float
+ An overall score between 0 and 1, where 1 implies all labels in the dataset are estimated to be correct.
+ A score of 0.5 implies that half of the dataset's labels are estimated to have issues.
+ """
+ num_examples = _get_num_examples_multilabel(labels=labels)
+ issues = find_label_issues(
+ labels=labels, pred_probs=pred_probs, confident_joint=confident_joint
+ )
+ return 1.0 - sum(issues) / num_examples
+
+
+def multilabel_health_summary(
+ labels=None,
+ pred_probs=None,
+ *,
+ class_names=None,
+ num_examples=None,
+ confident_joint=None,
+ verbose=True,
+) -> Dict:
+ """Prints a health summary of your multi-label dataset.
+
+ This summary includes useful statistics like:
+
+ * The classes with the most and least label issues.
+ * Overall label quality scores, summarizing how accurate the labels appear across the entire dataset.
+
+ **Parameters**: For information about the arguments to this method, see the documentation of
+ :py:func:`common_multilabel_issues `.
+
+ Returns
+ -------
+ summary : dict
+ A dictionary containing keys (see the corresponding functions' documentation to understand the values):
+ - ``"overall_label_health_score"``, corresponding to output of :py:func:`overall_multilabel_health_score `
+ - ``"classes_by_multilabel_quality"``, corresponding to output of :py:func:`rank_classes_by_multilabel_quality `
+ - ``"common_multilabel_issues"``, corresponding to output of :py:func:`common_multilabel_issues `
+ """
+ from cleanlab.internal.util import smart_display_dataframe
+
+ if num_examples is None:
+ num_examples = _get_num_examples_multilabel(labels=labels)
+
+ if verbose:
+ longest_line = f"| for your dataset with {num_examples:,} examples "
+ print(
+ "-" * (len(longest_line) - 1)
+ + "\n"
+ + f"| Generating a Cleanlab Dataset Health Summary{' ' * (len(longest_line) - 49)}|\n"
+ + longest_line
+ + f"| Note, Cleanlab is not a medical doctor... yet.{' ' * (len(longest_line) - 51)}|\n"
+ + "-" * (len(longest_line) - 1)
+ + "\n",
+ )
+
+ df_class_label_quality = rank_classes_by_multilabel_quality(
+ labels=labels,
+ pred_probs=pred_probs,
+ class_names=class_names,
+ confident_joint=confident_joint,
+ )
+ if verbose:
+ print("Overall Class Quality and Noise across your dataset (below)")
+ print("-" * 60, "\n", flush=True)
+ smart_display_dataframe(df_class_label_quality)
+
+ df_common_issues = common_multilabel_issues(
+ labels=labels,
+ pred_probs=pred_probs,
+ class_names=class_names,
+ confident_joint=confident_joint,
+ )
+ if verbose:
+ print(
+ "\nCommon multilabel issues are" + "\n" + "-" * 83 + "\n",
+ flush=True,
+ )
+ smart_display_dataframe(df_common_issues)
+ print()
+
+ health_score = overall_multilabel_health_score(
+ labels=labels,
+ pred_probs=pred_probs,
+ confident_joint=confident_joint,
+ )
+ if verbose:
+ print("\nGenerated with <3 from Cleanlab.\n")
+ return {
+ "overall_multilabel_health_score": health_score,
+ "classes_by_multilabel_quality": df_class_label_quality,
+ "common_multilabel_issues": df_common_issues,
+ }
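
A sketch (illustrative, not part of the patch) of the dataset-level summary functions added in this file, assuming small made-up labels/pred_probs and hypothetical class names; exact output values are not asserted.

import numpy as np
from cleanlab.multilabel_classification.dataset import (
    common_multilabel_issues,
    rank_classes_by_multilabel_quality,
    overall_multilabel_health_score,
    multilabel_health_summary,
)

labels = [[1], [0, 2], [0], [2]]        # toy annotations: list of classes per example
pred_probs = np.array([                 # per-class probabilities; rows need not sum to 1
    [0.1, 0.9, 0.2],
    [0.8, 0.1, 0.9],
    [0.9, 0.2, 0.1],
    [0.1, 0.2, 0.8],
])

per_class_issues = common_multilabel_issues(
    labels=labels, pred_probs=pred_probs, class_names=["dog", "cat", "bird"]
)                                       # DataFrame summarizing issues per class
class_quality = rank_classes_by_multilabel_quality(labels=labels, pred_probs=pred_probs)
health_score = overall_multilabel_health_score(labels=labels, pred_probs=pred_probs)  # float in [0, 1]
summary = multilabel_health_summary(labels=labels, pred_probs=pred_probs, verbose=False)  # dict of the above
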
diff --git a/cleanlab/multilabel_classification/filter.py b/cleanlab/multilabel_classification/filter.py
new file mode 100644
index 0000000000..bbddc065dc
--- /dev/null
+++ b/cleanlab/multilabel_classification/filter.py
@@ -0,0 +1,275 @@
+# Copyright (C) 2017-2023 Cleanlab Inc.
+# This file is part of cleanlab.
+#
+# cleanlab is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published
+# by the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# cleanlab is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with cleanlab. If not, see <https://www.gnu.org/licenses/>.
+
+"""
+Methods to flag which examples have label issues in multi-label classification datasets.
+Here each example can belong to one or more classes, or none of the classes at all.
+Unlike in standard multi-class classification, model-predicted class probabilities need not sum to 1 for each row in multi-label classification.
+"""
+
+import warnings
+from typing import Optional, Union, Tuple, List, Any
+import numpy as np
+
+
+def find_label_issues(
+ labels: list,
+ pred_probs: np.ndarray,
+ return_indices_ranked_by: Optional[str] = None,
+ rank_by_kwargs={},
+ filter_by: str = "prune_by_noise_rate",
+ frac_noise: float = 1.0,
+ num_to_remove_per_class: Optional[List[int]] = None,
+ min_examples_per_class=1,
+ confident_joint: Optional[np.ndarray] = None,
+ n_jobs: Optional[int] = None,
+ verbose: bool = False,
+) -> np.ndarray:
+ """
+ Identifies potentially mislabeled examples in a multi-label classification dataset.
+ An example is flagged with a label issue if *any* of the classes appear to be incorrectly annotated for this example.
+
+ Parameters
+ ----------
+ labels : List[List[int]]
+ List of noisy labels for multi-label classification where each example can belong to multiple classes.
+ This is an iterable of iterables where the i-th element of `labels` corresponds to a list of classes that the i-th example belongs to,
+ according to the original data annotation (e.g. ``labels = [[1,2],[1],[0],..]``).
+ This method will return the indices i where the inner list ``labels[i]`` is estimated to have some error.
+ For a dataset with K classes, each class must be represented as an integer in 0, 1, ..., K-1 within the labels.
+
+ pred_probs : np.ndarray
+ An array of shape ``(N, K)`` of model-predicted class probabilities.
+ Each row of this matrix corresponds to an example `x`
+ and contains the predicted probability that `x` belongs to each possible class,
+ for each of the K classes (along its columns).
+ The columns need not sum to 1 but must be ordered such that
+ these probabilities correspond to class 0, 1, ..., K-1.
+
+ Note
+ ----
+ Estimated label quality scores are most accurate when they are computed based on out-of-sample ``pred_probs`` from your model.
+ To obtain out-of-sample predicted probabilities for every example in your dataset, you can use :ref:`cross-validation `.
+ This is encouraged to get better results.
+
+ return_indices_ranked_by : {None, 'self_confidence', 'normalized_margin', 'confidence_weighted_entropy'}, default = None
+ This function can return a boolean mask (if None) or an array of the example-indices with issues sorted based on the specified ranking method.
+ Refer to documentation for this argument in :py:func:`filter.find_label_issues ` for details.
+
+ rank_by_kwargs : dict, optional
+ Optional keyword arguments to pass into scoring functions for ranking by
+ label quality score (see :py:func:`rank.get_label_quality_scores
+ `).
+
+ filter_by : {'prune_by_class', 'prune_by_noise_rate', 'both', 'confident_learning', 'predicted_neq_given', 'low_normalized_margin', 'low_self_confidence'}, default='prune_by_noise_rate'
+ The specific Confident Learning method to determine precisely which examples have label issues in a dataset.
+ Refer to documentation for this argument in :py:func:`filter.find_label_issues ` for details.
+
+ frac_noise : float, default = 1.0
+ This will return the "top" frac_noise * num_label_issues estimated label errors, dependent on the filtering method used.
+ Refer to documentation for this argument in :py:func:`filter.find_label_issues ` for details.
+
+ num_to_remove_per_class : array_like
+ An iterable that specifies the number of mislabeled examples to return from each class.
+ Refer to documentation for this argument in :py:func:`filter.find_label_issues ` for details.
+
+ min_examples_per_class : int, default = 1
+ The minimum number of examples required per class below which examples from this class will not be flagged as label issues.
+ Refer to documentation for this argument in :py:func:`filter.find_label_issues ` for details.
+
+ confident_joint : np.ndarray, optional
+ An array of shape ``(K, 2, 2)`` representing a one-vs-rest formatted confident joint, as is appropriate for multi-label classification tasks.
+ Entry ``(c, i, j)`` in this array is the number of examples confidently counted into a ``(class c, noisy label=i, true label=j)`` bin,
+ where `i, j` are either 0 or 1 to denote whether this example belongs to class `c` or not
+ (recall examples can belong to multiple classes in multi-label classification).
+ The `confident_joint` can be computed using :py:func:`count.compute_confident_joint ` with ``multi_label=True``.
+ If not provided, it is computed from the given (noisy) `labels` and `pred_probs`.
+
+ n_jobs : optional
+ Number of processing threads used by multiprocessing.
+ Refer to documentation for this argument in :py:func:`filter.find_label_issues ` for details.
+
+ verbose : optional
+ If ``True``, prints when multiprocessing happens.
+
+ Returns
+ -------
+ label_issues : np.ndarray
+ If `return_indices_ranked_by` is left unspecified, returns a boolean **mask** for the entire dataset
+ where ``True`` represents an example suffering from some label issue and
+ ``False`` represents an example that appears accurately labeled.
+
+ If `return_indices_ranked_by` is specified, this method instead returns a list of **indices** of examples identified with
+ label issues (i.e. those indices where the mask would be ``True``).
+ Indices are sorted by the likelihood that *all* classes are correctly annotated for the corresponding example.
+
+ Note
+ ----
+ Obtain the *indices* of examples with label issues in your dataset by setting
+ `return_indices_ranked_by`.
+
+ """
+ from cleanlab.filter import _find_label_issues_multilabel
+
+ return _find_label_issues_multilabel(
+ labels=labels,
+ pred_probs=pred_probs,
+ return_indices_ranked_by=return_indices_ranked_by,
+ rank_by_kwargs=rank_by_kwargs,
+ filter_by=filter_by,
+ frac_noise=frac_noise,
+ num_to_remove_per_class=num_to_remove_per_class,
+ min_examples_per_class=min_examples_per_class,
+ confident_joint=confident_joint,
+ n_jobs=n_jobs,
+ verbose=verbose,
+ )
+
+
+def find_multilabel_issues_per_class(
+ labels: list,
+ pred_probs: np.ndarray,
+ return_indices_ranked_by: Optional[str] = None,
+ rank_by_kwargs={},
+ filter_by: str = "prune_by_noise_rate",
+ frac_noise: float = 1.0,
+ num_to_remove_per_class: Optional[List[int]] = None,
+ min_examples_per_class=1,
+ confident_joint: Optional[np.ndarray] = None,
+ n_jobs: Optional[int] = None,
+ verbose: bool = False,
+) -> Union[np.ndarray, Tuple[List[np.ndarray], List[Any], List[np.ndarray]]]:
+ """
+ Identifies potentially bad labels for each example and each class in a multi-label classification dataset.
+ Whereas :py:func:`find_label_issues `
+ estimates which examples have an erroneous annotation for *any* class, this method estimates which specific classes are incorrectly annotated as well.
+ Returned estimates are organized per class (for a dataset with K classes); see the Returns section below for the exact format.
+
+ Parameters
+ ----------
+ labels : List[List[int]]
+ List of noisy labels for multi-label classification where each example can belong to multiple classes.
+ Refer to documentation for this argument in :py:func:`find_label_issues ` for further details.
+ This method will identify whether ``labels[i][k]`` appears correct, for every example ``i`` and class ``k``.
+
+ pred_probs : np.ndarray
+ An array of shape ``(N, K)`` of model-predicted class probabilities.
+ Refer to documentation for this argument in :py:func:`find_label_issues ` for further details.
+
+ return_indices_ranked_by : {None, 'self_confidence', 'normalized_margin', 'confidence_weighted_entropy'}, default = None
+ This function can return a boolean mask (if this argument is ``None``) or a sorted array of indices based on the specified ranking method (if not ``None``).
+ Refer to documentation for this argument in :py:func:`filter.find_label_issues ` for details.
+
+ rank_by_kwargs : dict, optional
+ Optional keyword arguments to pass into scoring functions for ranking by
+ label quality score (see :py:func:`rank.get_label_quality_scores
+ `).
+
+ filter_by : {'prune_by_class', 'prune_by_noise_rate', 'both', 'confident_learning', 'predicted_neq_given', 'low_normalized_margin', 'low_self_confidence'}, default = 'prune_by_noise_rate'
+ The specific method that can be used to filter or prune examples with label issues from a dataset.
+ Refer to documentation for this argument in :py:func:`filter.find_label_issues ` for details.
+
+ frac_noise : float, default = 1.0
+ This will return the "top" frac_noise * num_label_issues estimated label errors, dependent on the filtering method used.
+ Refer to documentation for this argument in :py:func:`filter.find_label_issues ` for details.
+
+ num_to_remove_per_class : array_like
+ This parameter is an iterable that specifies the number of mislabeled examples to return from each class.
+ Refer to documentation for this argument in :py:func:`filter.find_label_issues ` for details.
+
+ min_examples_per_class : int, default = 1
+ The minimum number of examples required per class to avoid flagging as label issues.
+ Refer to documentation for this argument in :py:func:`filter.find_label_issues ` for details.
+
+ confident_joint : np.ndarray, optional
+ An array of shape ``(K, 2, 2)`` representing a one-vs-rest formatted confident joint.
+ Refer to documentation for this argument in :py:func:`cleanlab.multilabel_classification.filter.find_label_issues ` for details.
+
+ n_jobs : optional
+ Number of processing threads used by multiprocessing.
+ Refer to documentation for this argument in :py:func:`filter.find_label_issues ` for details.
+
+ verbose : optional
+ If ``True``, prints when multiprocessing happens.
+
+ Returns
+ -------
+ per_class_label_issues : np.ndarray or list(np.ndarray)
+ By default, this is a boolean array of shape ``(N, K)`` whose column ``k`` is a mask over the dataset,
+ where ``True`` values indicate examples where class ``k`` appears incorrectly annotated.
+
+ For more details, refer to :py:func:`cleanlab.multilabel_classification.filter.find_label_issues `.
+
+ Otherwise if `return_indices_ranked_by` is not ``None``, then this method returns 3 objects (each of length K, the number of classes): `label_issues_list`, `labels_list`, `pred_probs_list`.
+ - *label_issues_list*: an ordered list of indices of examples where class k appears incorrectly annotated, sorted by the likelihood that class k is correctly annotated.
+ - *labels_list*: a binary one-hot representation of the original labels, useful if you want to compute label quality scores.
+ - *pred_probs_list*: a one-vs-rest representation of the original predicted probabilities of shape ``(N, 2)``, useful if you want to compute label quality scores.
+ ``pred_probs_list[k][i][0]`` is the estimated probability that example ``i`` belongs to class ``k``, and is equal to: ``1 - pred_probs_list[k][i][1]``.
+ """
+ import cleanlab.filter
+ from cleanlab.internal.multilabel_utils import get_onehot_num_classes, stack_complement
+
+ y_one, num_classes = get_onehot_num_classes(labels, pred_probs)
+ if return_indices_ranked_by is None:
+ bissues = np.zeros(y_one.shape).astype(bool)
+ else:
+ label_issues_list = []
+ labels_list = []
+ pred_probs_list = []
+ if confident_joint is not None:
+ confident_joint_shape = confident_joint.shape
+ if confident_joint_shape == (num_classes, num_classes):
+ warnings.warn(
+ f"The new recommended format for `confident_joint` in multi_label settings is (num_classes,2,2) as output by compute_confident_joint(...,multi_label=True). Your K x K confident_joint in the old format is being ignored."
+ )
+ confident_joint = None
+ elif confident_joint_shape != (num_classes, 2, 2):
+ raise ValueError("confident_joint should be of shape (num_classes, 2, 2)")
+ for class_num, (label, pred_prob_for_class) in enumerate(zip(y_one.T, pred_probs.T)):
+ pred_probs_binary = stack_complement(pred_prob_for_class)
+ if confident_joint is None:
+ conf = None
+ else:
+ conf = confident_joint[class_num]
+ if num_to_remove_per_class is not None:
+ ml_num_to_remove_per_class = [num_to_remove_per_class[class_num], 0]
+ else:
+ ml_num_to_remove_per_class = None
+ binary_label_issues = cleanlab.filter.find_label_issues(
+ labels=label,
+ pred_probs=pred_probs_binary,
+ return_indices_ranked_by=return_indices_ranked_by,
+ frac_noise=frac_noise,
+ rank_by_kwargs=rank_by_kwargs,
+ filter_by=filter_by,
+ num_to_remove_per_class=ml_num_to_remove_per_class,
+ min_examples_per_class=min_examples_per_class,
+ confident_joint=conf,
+ n_jobs=n_jobs,
+ verbose=verbose,
+ )
+
+ if return_indices_ranked_by is None:
+ bissues[:, class_num] = binary_label_issues
+ else:
+ label_issues_list.append(binary_label_issues)
+ labels_list.append(label)
+ pred_probs_list.append(pred_probs_binary)
+ if return_indices_ranked_by is None:
+ return bissues
+ else:
+ return label_issues_list, labels_list, pred_probs_list
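
A sketch (illustrative, not part of the patch) of the two return modes of the new multi-label `find_label_issues` wrapper and its per-class companion, using the same toy data as above.

import numpy as np
from cleanlab.multilabel_classification.filter import (
    find_label_issues,
    find_multilabel_issues_per_class,
)

labels = [[1], [0, 2], [0], [2]]
pred_probs = np.array([
    [0.1, 0.9, 0.2],
    [0.8, 0.1, 0.9],
    [0.9, 0.2, 0.1],
    [0.1, 0.2, 0.8],
])

issue_mask = find_label_issues(labels, pred_probs)      # boolean mask, one entry per example
ranked_idx = find_label_issues(
    labels, pred_probs, return_indices_ranked_by="self_confidence"
)                                                       # indices of flagged examples, worst first

# Per-class view: which specific class annotations look wrong for each example.
per_class_issues = find_multilabel_issues_per_class(labels, pred_probs)
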
diff --git a/cleanlab/multilabel_classification/rank.py b/cleanlab/multilabel_classification/rank.py
new file mode 100644
index 0000000000..75eb7960fe
--- /dev/null
+++ b/cleanlab/multilabel_classification/rank.py
@@ -0,0 +1,194 @@
+# Copyright (C) 2017-2023 Cleanlab Inc.
+# This file is part of cleanlab.
+#
+# cleanlab is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published
+# by the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# cleanlab is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with cleanlab. If not, see <https://www.gnu.org/licenses/>.
+
+"""
+Methods to rank the severity of label issues in multi-label classification datasets.
+Here each example can belong to one or more classes, or none of the classes at all.
+Unlike in standard multi-class classification, model-predicted class probabilities need not sum to 1 for each row in multi-label classification.
+"""
+from __future__ import annotations
+
+import numpy as np # noqa: F401: Imported for type annotations
+from typing import List, TypeVar, Dict, Any, Optional, Tuple, TYPE_CHECKING
+
+from cleanlab.internal.validation import assert_valid_inputs
+from cleanlab.internal.util import get_num_classes
+from cleanlab.internal.multilabel_utils import int2onehot
+from cleanlab.internal.multilabel_scorer import MultilabelScorer, ClassLabelScorer, Aggregator
+
+
+if TYPE_CHECKING: # pragma: no cover
+ import numpy.typing as npt
+
+ T = TypeVar("T", bound=npt.NBitBase)
+
+
+def _labels_to_binary(
+ labels: List[List[int]],
+ pred_probs: npt.NDArray["np.floating[T]"],
+) -> np.ndarray:
+ """Validate the inputs to the multilabel scorer. Also transform the labels to a binary representation."""
+ assert_valid_inputs(
+ X=None, y=labels, pred_probs=pred_probs, multi_label=True, allow_one_class=True
+ )
+ num_classes = get_num_classes(labels=labels, pred_probs=pred_probs, multi_label=True)
+ binary_labels = int2onehot(labels, K=num_classes)
+ return binary_labels
+
+
+def _create_multilabel_scorer(
+ method: str,
+ adjust_pred_probs: bool,
+ aggregator_kwargs: Optional[Dict[str, Any]] = None,
+) -> Tuple[MultilabelScorer, Dict]:
+ """This function acts as a factory that creates a MultilabelScorer."""
+ base_scorer = ClassLabelScorer.from_str(method)
+ base_scorer_kwargs = {"adjust_pred_probs": adjust_pred_probs}
+ if aggregator_kwargs:
+ aggregator = Aggregator(**aggregator_kwargs)
+ scorer = MultilabelScorer(base_scorer, aggregator)
+ else:
+ scorer = MultilabelScorer(base_scorer)
+ return scorer, base_scorer_kwargs
+
+
+def get_label_quality_scores(
+ labels: List[List[int]],
+ pred_probs: npt.NDArray["np.floating[T]"],
+ *,
+ method: str = "self_confidence",
+ adjust_pred_probs: bool = False,
+ aggregator_kwargs: Dict[str, Any] = {"method": "exponential_moving_average", "alpha": 0.8},
+) -> npt.NDArray["np.floating[T]"]:
+ """Computes a label quality score for each example in a multi-label classification dataset.
+
+ Scores are between 0 and 1 with lower scores indicating examples whose label more likely contains an error.
+ For each example, this method internally computes a separate score for each individual class
+ and then aggregates these per-class scores into an overall label quality score for the example.
+
+
+ Parameters
+ ----------
+ labels : List[List[int]]
+ List of noisy labels for multi-label classification where each example can belong to multiple classes.
+ Refer to documentation for this argument in :py:func:`multilabel_classification.filter.find_label_issues ` for further details.
+
+ pred_probs : np.ndarray
+ An array of shape ``(N, K)`` of model-predicted class probabilities.
+ Refer to documentation for this argument in :py:func:`multilabel_classification.filter.find_label_issues ` for further details.
+
+ method : {"self_confidence", "normalized_margin", "confidence_weighted_entropy"}, default = "self_confidence"
+ Method to calculate separate per-class annotation scores for an example that are then aggregated into an overall label quality score for the example.
+ These scores are separately calculated for each class based on the corresponding column of `pred_probs` in a one-vs-rest manner,
+ and are standard label quality scores for binary classification (based on whether the class should or should not apply to this example).
+
+ See also
+ --------
+ :py:func:`rank.get_label_quality_scores ` function for details about each option.
+
+ adjust_pred_probs : bool, default = False
+ Account for class imbalance in the label-quality scoring by adjusting predicted probabilities.
+ Refer to documentation for this argument in :py:func:`rank.get_label_quality_scores ` for details.
+
+
+ aggregator_kwargs : dict, default = {"method": "exponential_moving_average", "alpha": 0.8}
+ A dictionary of hyperparameter values to use when aggregating per-class scores into an overall label quality score for each example.
+ Options for ``"method"`` include: ``"exponential_moving_average"`` or ``"softmin"`` or your own callable function.
+ See :py:class:`internal.multilabel_scorer.Aggregator ` for details about each option and other possible hyperparameters.
+
+ To get a score for each class annotation for each example, use the :py:func:`multilabel_classification.rank.get_label_quality_scores_per_class ` method instead.
+
+ Returns
+ -------
+ label_quality_scores : np.ndarray
+ A 1D array of shape ``(N,)`` with a label quality score (between 0 and 1) for each example in the dataset.
+ Lower scores indicate examples whose label is more likely to contain some annotation error (for any of the classes).
+
+ Examples
+ --------
+ >>> from cleanlab.multilabel_classification import get_label_quality_scores
+ >>> import numpy as np
+ >>> labels = [[1], [0,2]]
+ >>> pred_probs = np.array([[0.1, 0.9, 0.1], [0.4, 0.1, 0.9]])
+ >>> scores = get_label_quality_scores(labels, pred_probs)
+ >>> scores
+ array([0.9, 0.5])
+ """
+ binary_labels = _labels_to_binary(labels, pred_probs)
+ scorer, base_scorer_kwargs = _create_multilabel_scorer(
+ method=method,
+ adjust_pred_probs=adjust_pred_probs,
+ aggregator_kwargs=aggregator_kwargs,
+ )
+ return scorer(binary_labels, pred_probs, base_scorer_kwargs=base_scorer_kwargs)
+
+
+def get_label_quality_scores_per_class(
+ labels: List[List[int]],
+ pred_probs: npt.NDArray["np.floating[T]"],
+ *,
+ method: str = "self_confidence",
+ adjust_pred_probs: bool = False,
+) -> np.ndarray:
+ """
+ Computes a quality score quantifying how likely each individual class annotation is correct in a multi-label classification dataset.
+ This is similar to :py:func:`get_label_quality_scores `
+ but instead returns the per-class results without aggregation.
+ For a dataset with K classes, each example receives K scores from this method.
+ Refer to documentation in :py:func:`get_label_quality_scores ` for details.
+
+ Parameters
+ ----------
+ labels : List[List[int]]
+ List of noisy labels for multi-label classification where each example can belong to multiple classes.
+ Refer to documentation for this argument in :py:func:`find_label_issues ` for further details.
+
+ pred_probs : np.ndarray
+ An array of shape ``(N, K)`` of model-predicted class probabilities.
+ Refer to documentation for this argument in :py:func:`find_label_issues ` for further details.
+
+ method : {"self_confidence", "normalized_margin", "confidence_weighted_entropy"}, default = "self_confidence"
+ Method to calculate separate per-class annotation scores (that quantify how likely a particular class annotation is correct for a particular example).
+ Refer to documentation for this argument in :py:func:`get_label_quality_scores ` for further details.
+
+ adjust_pred_probs : bool, default = False
+ Account for class imbalance in the label-quality scoring by adjusting predicted probabilities.
+ Refer to documentation for this argument in :py:func:`rank.get_label_quality_scores ` for details.
+
+ Returns
+ -------
+ label_quality_scores : list(np.ndarray)
+ A list containing K arrays, each of shape (N,). Here K is the number of classes in the dataset and N is the number of examples.
+ ``label_quality_scores[k][i]`` is a score between 0 and 1 quantifying how likely the annotation for class ``k`` is correct for example ``i``.
+
+ Examples
+ --------
+ >>> from cleanlab.multilabel_classification.rank import get_label_quality_scores_per_class
+ >>> import numpy as np
+ >>> labels = [[1], [0,2]]
+ >>> pred_probs = np.array([[0.1, 0.9, 0.1], [0.4, 0.1, 0.9]])
+ >>> scores = get_label_quality_scores_per_class(labels, pred_probs)
+ """
+ binary_labels = _labels_to_binary(labels, pred_probs)
+ scorer, base_scorer_kwargs = _create_multilabel_scorer(
+ method=method,
+ adjust_pred_probs=adjust_pred_probs,
+ )
+ return scorer.get_class_label_quality_scores(
+ labels=binary_labels, pred_probs=pred_probs, base_scorer_kwargs=base_scorer_kwargs
+ )
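
A sketch (illustrative, not part of the patch) contrasting the aggregated label quality scores with the new per-class scores, reusing the docstring's toy example.

import numpy as np
from cleanlab.multilabel_classification.rank import (
    get_label_quality_scores,
    get_label_quality_scores_per_class,
)

labels = [[1], [0, 2]]
pred_probs = np.array([[0.1, 0.9, 0.1], [0.4, 0.1, 0.9]])

overall_scores = get_label_quality_scores(labels, pred_probs)              # shape (N,), one score per example
per_class_scores = get_label_quality_scores_per_class(labels, pred_probs)  # K arrays, each of shape (N,)
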
diff --git a/cleanlab/object_detection/__init__.py b/cleanlab/object_detection/__init__.py
new file mode 100644
index 0000000000..fbc2eb7eac
--- /dev/null
+++ b/cleanlab/object_detection/__init__.py
@@ -0,0 +1,3 @@
+from . import rank
+from . import filter
+from . import summary
diff --git a/cleanlab/object_detection/filter.py b/cleanlab/object_detection/filter.py
new file mode 100644
index 0000000000..cf51f66214
--- /dev/null
+++ b/cleanlab/object_detection/filter.py
@@ -0,0 +1,211 @@
+# Copyright (C) 2017-2023 Cleanlab Inc.
+# This file is part of cleanlab.
+#
+# cleanlab is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published
+# by the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# cleanlab is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with cleanlab. If not, see <https://www.gnu.org/licenses/>.
+
+"""Methods to find label issues in an object detection dataset, where each annotated bounding box in an image receives its own class label."""
+
+from typing import List, Any, Dict
+import numpy as np
+
+from cleanlab.internal.constants import (
+ ALPHA,
+ LOW_PROBABILITY_THRESHOLD,
+ HIGH_PROBABILITY_THRESHOLD,
+ OVERLOOKED_THRESHOLD,
+ BADLOC_THRESHOLD,
+ SWAP_THRESHOLD,
+)
+from cleanlab.internal.object_detection_utils import assert_valid_inputs
+
+from cleanlab.object_detection.rank import (
+ _get_valid_inputs_for_compute_scores,
+ compute_overlooked_box_scores,
+ compute_badloc_box_scores,
+ compute_swap_box_scores,
+ get_label_quality_scores,
+ issues_from_scores,
+)
+
+
+def find_label_issues(
+ labels: List[Dict[str, Any]],
+ predictions: List[np.ndarray],
+ *,
+ return_indices_ranked_by_score: bool = False,
+) -> np.ndarray:
+ """
+ Identifies potentially mislabeled images in an object detection dataset.
+ An image is flagged with a label issue if *any* of its bounding boxes appear incorrectly annotated.
+ This includes images for which a bounding box: should have been annotated but is missing,
+ has been annotated with the wrong class, or has been annotated in a suboptimal location.
+
+ Suppose the dataset has ``N`` images and ``K`` possible class labels.
+ If ``return_indices_ranked_by_score`` is ``False``, a boolean mask of length ``N`` is returned,
+ indicating whether each image has a label issue (``True``) or not (``False``).
+ If ``return_indices_ranked_by_score`` is ``True``, the indices of images flagged with label issues are returned,
+ sorted with the most likely-mislabeled images ordered first.
+
+ Parameters
+ ----------
+ labels:
+ Annotated boxes and class labels in the original dataset, which may contain some errors.
+ This is a list of ``N`` dictionaries such that ``labels[i]`` contains the given labels for the `i`-th image in the following format:
+ ``{'bboxes': np.ndarray((L,4)), 'labels': np.ndarray((L,)), 'image_name': str}`` where ``L`` is the number of annotated bounding boxes
+ for the `i`-th image and ``bboxes[l]`` is a bounding box with coordinates in ``[x1,y1,x2,y2]`` format and given class label ``labels[l]``.
+ ``image_name`` is an optional part of the labels that can be used to later refer to specific images.
+
+ For more information on proper labels formatting, check out the `MMDetection library `_.
+
+ predictions:
+ Predictions output by a trained object detection model.
+ For the most accurate results, predictions should be out-of-sample to avoid overfitting, e.g. obtained via :ref:`cross-validation `.
+ This is a list of ``N`` ``np.ndarray`` such that ``predictions[i]`` corresponds to the model prediction for the `i`-th image.
+ For each possible class ``k`` in 0, 1, ..., K-1: ``predictions[i][k]`` is a ``np.ndarray`` of shape ``(M,5)``,
+ where ``M`` is the number of predicted bounding boxes for class ``k``. Here the five columns correspond to ``[x1,y1,x2,y2,pred_prob]``,
+ where ``[x1,y1,x2,y2]`` are coordinates of the bounding box predicted by the model
+ and ``pred_prob`` is the model's confidence in the predicted class label for this bounding box.
+
+ Note: Here, ``[x1,y1]`` corresponds to the coordinates of the bottom-left corner of the bounding box, while ``[x2,y2]`` corresponds to the coordinates of the top-right corner of the bounding box. The last column, pred_prob, represents the predicted probability that the bounding box contains an object of the class k.
+
+ For more information see the `MMDetection package `_ for an example object detection library that outputs predictions in the correct format.
+
+ return_indices_ranked_by_score:
+ Determines what is returned by this method (see description of return value for details).
+
+ Returns
+ -------
+ label_issues : np.ndarray
+ Specifies which images are identified to have a label issue.
+ If ``return_indices_ranked_by_score = False``, this function returns a boolean mask of length ``N`` (``True`` entries indicate which images have label issue).
+ If ``return_indices_ranked_by_score = True``, this function returns a (shorter) array of indices of images with label issues, sorted by how likely the image is mislabeled.
+
+ More precisely, indices are sorted by image label quality score calculated via :py:func:`object_detection.rank.get_label_quality_scores `.
+ """
+ scoring_method = "objectlab"
+
+ assert_valid_inputs(
+ labels=labels,
+ predictions=predictions,
+ method=scoring_method,
+ )
+
+ is_issue = _find_label_issues(
+ labels,
+ predictions,
+ scoring_method=scoring_method,
+ return_indices_ranked_by_score=return_indices_ranked_by_score,
+ )
+
+ return is_issue
+
+
+def _find_label_issues(
+ labels: List[Dict[str, Any]],
+ predictions: List[np.ndarray],
+ *,
+ return_indices_ranked_by_score: bool = True,
+ scoring_method: str = "objectlab",
+):
+ """Internal function to find label issues based on passed in method."""
+
+ if scoring_method == "objectlab":
+ auxiliary_inputs = _get_valid_inputs_for_compute_scores(ALPHA, labels, predictions)
+
+ overlooked_scores_per_box = compute_overlooked_box_scores(
+ alpha=ALPHA,
+ high_probability_threshold=HIGH_PROBABILITY_THRESHOLD,
+ auxiliary_inputs=auxiliary_inputs,
+ )
+ overlooked_issues_per_box = _find_label_issues_per_box(
+ overlooked_scores_per_box, OVERLOOKED_THRESHOLD
+ )
+ overlooked_issues_per_image = _pool_box_scores_per_image(overlooked_issues_per_box)
+
+ badloc_scores_per_box = compute_badloc_box_scores(
+ alpha=ALPHA,
+ low_probability_threshold=LOW_PROBABILITY_THRESHOLD,
+ auxiliary_inputs=auxiliary_inputs,
+ )
+ badloc_issues_per_box = _find_label_issues_per_box(badloc_scores_per_box, BADLOC_THRESHOLD)
+ badloc_issues_per_image = _pool_box_scores_per_image(badloc_issues_per_box)
+
+ swap_scores_per_box = compute_swap_box_scores(
+ alpha=ALPHA,
+ high_probability_threshold=HIGH_PROBABILITY_THRESHOLD,
+ auxiliary_inputs=auxiliary_inputs,
+ )
+ swap_issues_per_box = _find_label_issues_per_box(swap_scores_per_box, SWAP_THRESHOLD)
+ swap_issues_per_image = _pool_box_scores_per_image(swap_issues_per_box)
+
+ issues_per_image = (
+ overlooked_issues_per_image + badloc_issues_per_image + swap_issues_per_image
+ )
+ is_issue = issues_per_image > 0
+ else:
+ is_issue = np.full(
+ shape=[
+ len(labels),
+ ],
+ fill_value=-1,
+ )
+
+ if return_indices_ranked_by_score:
+ scores = get_label_quality_scores(labels, predictions)
+ sorted_scores_idx = issues_from_scores(scores, threshold=1.0)
+ is_issue_idx = np.where(is_issue == True)[0]
+ sorted_issue_mask = np.in1d(sorted_scores_idx, is_issue_idx, assume_unique=True)
+ issue_idx = sorted_scores_idx[sorted_issue_mask]
+ return issue_idx
+ else:
+ return is_issue
+
+
+def _find_label_issues_per_box(
+ scores_per_box: List[np.ndarray], threshold: float
+) -> List[np.ndarray]:
+ """Takes in a list of size ``N`` where each index is an array of scores for each bounding box in the `n-th` example
+ and a threshold. Each box below or equal to the threshold will be marked as an issue.
+
+ Returns a list of size ``N`` where each index is a boolean array of length number of boxes per example `n`
+ marking if a specific box is an issue - 1 or not - 0."""
+ is_issue_per_box = []
+ for idx, score_per_box in enumerate(scores_per_box):
+ if len(score_per_box) == 0:  # if no boxes are scored for this image, the image is not flagged as an issue
+ is_issue_per_box.append(np.array([False]))
+ else:
+ score_per_box[np.isnan(score_per_box)] = 1.0  # treat NaN scores as non-issues
+ issue_per_box = score_per_box <= threshold
+ is_issue_per_box.append(issue_per_box)
+ return is_issue_per_box
+
+
+def _pool_box_scores_per_image(is_issue_per_box: List[np.ndarray]) -> np.ndarray:
+ """Takes in a list of size ``N`` where each index is a boolean array of length number of boxes per image `n `
+ marking if a specific box is an issue - 1 or not - 0.
+
+ Returns a list of size ``N`` where each index marks if the image contains an issue - 1 or not - 0.
+ Images are marked as issues if 1 or more bounding boxes in the image is an issue."""
+ is_issue = np.zeros(
+ shape=[
+ len(
+ is_issue_per_box,
+ )
+ ]
+ )
+ for idx, issue_per_box in enumerate(is_issue_per_box):
+ if np.sum(issue_per_box) > 0:
+ is_issue[idx] = 1
+ return is_issue
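
A sketch (illustrative, not part of the patch) of the label/prediction format documented for the new object-detection `find_label_issues`, using made-up boxes for a 2-image, 2-class dataset; predictions are shown as plain lists of per-class arrays, and exact validation details may require wrapping them as arrays.

import numpy as np
from cleanlab.object_detection.filter import find_label_issues
from cleanlab.object_detection.rank import get_label_quality_scores, issues_from_scores

# Given labels: one dict per image with annotated [x1, y1, x2, y2] boxes and their class labels.
labels = [
    {"bboxes": np.array([[10.0, 10.0, 50.0, 60.0]]), "labels": np.array([0]), "image_name": "img0.png"},
    {"bboxes": np.array([[20.0, 15.0, 80.0, 90.0]]), "labels": np.array([1]), "image_name": "img1.png"},
]

# Predictions: predictions[i][k] is an (M, 5) array of [x1, y1, x2, y2, pred_prob] boxes for class k.
predictions = [
    [np.array([[11.0, 12.0, 49.0, 58.0, 0.95]]), np.zeros((0, 5))],
    [np.zeros((0, 5)), np.array([[22.0, 14.0, 78.0, 92.0, 0.90]])],
]

issue_mask = find_label_issues(labels, predictions)                     # boolean mask of length N
scores = get_label_quality_scores(labels, predictions, verbose=False)   # one quality score per image
worst_first = issues_from_scores(scores, threshold=0.5)                 # flagged image indices, worst first
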
diff --git a/cleanlab/object_detection/rank.py b/cleanlab/object_detection/rank.py
new file mode 100644
index 0000000000..ceca1c0c52
--- /dev/null
+++ b/cleanlab/object_detection/rank.py
@@ -0,0 +1,1062 @@
+# Copyright (C) 2017-2023 Cleanlab Inc.
+# This file is part of cleanlab.
+#
+# cleanlab is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published
+# by the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# cleanlab is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with cleanlab. If not, see <https://www.gnu.org/licenses/>.
+
+"""Methods to rank and score images in an object detection dataset (object detection data), based on how likely they
+are to contain label errors. """
+
+import warnings
+
+from cleanlab.internal.constants import (
+ ALPHA,
+ CUSTOM_SCORE_WEIGHT_BADLOC,
+ CUSTOM_SCORE_WEIGHT_OVERLOOKED,
+ CUSTOM_SCORE_WEIGHT_SWAP,
+ EUC_FACTOR,
+ HIGH_PROBABILITY_THRESHOLD,
+ LOW_PROBABILITY_THRESHOLD,
+ MAX_ALLOWED_BOX_PRUNE,
+ TINY_VALUE,
+ TEMPERATURE,
+)
+
+
+import copy
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, TypeVar
+
+import numpy as np
+from cleanlab.internal.object_detection_utils import (
+ softmin1d,
+ assert_valid_aggregation_weights,
+ assert_valid_inputs,
+)
+
+if TYPE_CHECKING: # pragma: no cover
+ from typing import TypedDict
+
+ AuxiliaryTypesDict = TypedDict(
+ "AuxiliaryTypesDict",
+ {
+ "pred_labels": np.ndarray,
+ "pred_label_probs": np.ndarray,
+ "pred_bboxes": np.ndarray,
+ "lab_labels": np.ndarray,
+ "lab_bboxes": np.ndarray,
+ "similarity_matrix": np.ndarray,
+ "min_possible_similarity": float,
+ },
+ )
+else:
+ AuxiliaryTypesDict = TypeVar("AuxiliaryTypesDict")
+
+
+def get_label_quality_scores(
+ labels: List[Dict[str, Any]],
+ predictions: List[np.ndarray],
+ aggregation_weights: Optional[Dict[str, float]] = None,
+ *,
+ verbose: bool = True,
+) -> np.ndarray:
+ """Computes a label quality score for each image of the ``N`` images in the dataset.
+
+ For object detection datasets, the label quality score for an image estimates how likely it has been correctly labeled.
+ Lower scores indicate images whose annotation is more likely imperfect.
+ Annotators may have mislabeled an image because they:
+
+ - overlooked an object (missing annotated bounding box),
+ - chose the wrong class label for an annotated box in the correct location,
+ - imperfectly annotated the location/edges of a bounding box.
+
+ Any of these annotation errors should lead to an image with a lower label quality score. This quality score is between 0 and 1.
+
+ - 1 - clean label (given label is likely correct).
+ - 0 - dirty label (given label is likely incorrect).
+
+ Parameters
+ ----------
+ labels:
+ A list of ``N`` dictionaries such that ``labels[i]`` contains the given labels for the `i`-th image.
+ Refer to documentation for this argument in :py:func:`find_label_issues ` for further details.
+
+ predictions:
+ A list of ``N`` ``np.ndarray`` such that ``predictions[i]`` corresponds to the model predictions for the `i`-th image.
+ Refer to documentation for this argument in :py:func:`find_label_issues ` for further details.
+
+ aggregation_weights:
+ Optional dictionary to specify weights for aggregating quality scores for each subtype of label issue into an overall label quality score for the image.
+ Its keys are: "overlooked", "swap", "badloc", and its values should be nonnegative weights that sum to 1.
+ Increase one of these weights to prioritize images with bounding boxes that were either:
+ missing in the annotations (overlooked object), annotated with the wrong class label (class should be swapped to another), or annotated in a suboptimal location (badly located).
+
+ verbose : bool, default = True
+ Set to ``False`` to suppress all print statements.
+
+ Returns
+ ---------
+ label_quality_scores:
+ Array of shape ``(N, )`` of scores between 0 and 1, one per image in the object detection dataset.
+ Lower scores indicate images that are more likely mislabeled.
+ """
+ method = "objectlab"
+ probability_threshold = 0.0
+
+ assert_valid_inputs(
+ labels=labels,
+ predictions=predictions,
+ method=method,
+ threshold=probability_threshold,
+ )
+ aggregation_weights = _get_aggregation_weights(aggregation_weights)
+
+ return _compute_label_quality_scores(
+ labels=labels,
+ predictions=predictions,
+ method=method,
+ threshold=probability_threshold,
+ aggregation_weights=aggregation_weights,
+ verbose=verbose,
+ )
+
+
+def issues_from_scores(label_quality_scores: np.ndarray, *, threshold: float = 0.1) -> np.ndarray:
+ """Convert label quality scores to a list of indices of images with issues sorted from most to least severe cut off at threshold.
+
+ Returns the list of indices of images with issues sorted from most to least severe cut off at threshold.
+
+ Parameters
+ ----------
+ label_quality_scores:
+ Array of shape ``(N, )`` of scores between 0 and 1, one per image in the object detection dataset.
+ Lower scores indicate images are more likely to contain a label issue.
+
+ threshold:
+ Label quality scores above the threshold are not considered to be label issues. The corresponding examples' indices are omitted from the returned array.
+
+ Returns
+ ---------
+ issue_indices:
+ Array of issue indices sorted from most to least severe, whose label quality scores fall at or below the threshold.
+ """
+
+ if threshold > 1.0:
+ raise ValueError(
+ f"""
+ Threshold is a cutoff of label_quality_scores and therefore should be <= 1.
+ """
+ )
+
+ issue_indices = np.argwhere(label_quality_scores <= threshold).flatten()
+ issue_vals = label_quality_scores[issue_indices]
+ sorted_idx = issue_vals.argsort()
+ return issue_indices[sorted_idx]
+
+
+def _compute_label_quality_scores(
+ labels: List[Dict[str, Any]],
+ predictions: List[np.ndarray],
+ aggregation_weights: Optional[Dict[str, float]] = None,
+ *,
+ method: str = "objectlab",
+ threshold: Optional[float] = None,
+ verbose: bool = True,
+) -> np.ndarray:
+ """Internal function to prune extra bounding boxes and compute label quality scores based on passed in method."""
+
+ pred_probs_prepruned = False
+ min_pred_prob = _get_min_pred_prob(predictions)
+ aggregation_weights = _get_aggregation_weights(aggregation_weights)
+
+ if threshold is not None:
+ predictions = _prune_by_threshold(
+ predictions=predictions, threshold=threshold, verbose=verbose
+ )
+ if np.abs(min_pred_prob - threshold) < 0.001 and threshold > 0:
+ pred_probs_prepruned = True # the provided threshold is the threshold used for pre_pruning the pred_probs during model prediction.
+ else:
+ threshold = min_pred_prob # assume model was not pre_pruned if no threshold was provided
+
+ if method == "objectlab":
+ scores = _get_subtype_label_quality_scores(
+ labels,
+ predictions,
+ alpha=ALPHA,
+ low_probability_threshold=LOW_PROBABILITY_THRESHOLD,
+ high_probability_threshold=HIGH_PROBABILITY_THRESHOLD,
+ temperature=TEMPERATURE,
+ aggregation_weights=aggregation_weights,
+ )
+
+ return scores
+
+
+def _get_min_pred_prob(
+ predictions: List[np.ndarray],
+) -> float:
+ """Returns min pred_prob out of all predictions."""
+ pred_probs = [1.0] # avoid calling np.min on empty array.
+ for prediction in predictions:
+ for class_prediction in prediction:
+ pred_probs.extend(list(class_prediction[:, -1]))
+
+ min_pred_prob = np.min(pred_probs)
+ return min_pred_prob
+
+
+def _prune_by_threshold(
+ predictions: List[np.ndarray], threshold: float, verbose: bool = True
+) -> List[np.ndarray]:
+ """Removes predicted bounding boxes from predictions who's pred_prob is below the cuttoff threshold."""
+
+ predictions_copy = copy.deepcopy(predictions)
+ num_ann_to_zero = 0
+ total_ann = 0
+ for idx_predictions, prediction in enumerate(predictions_copy):
+ for idx_class, class_prediction in enumerate(prediction):
+ filtered_class_prediction = class_prediction[class_prediction[:, -1] >= threshold]
+ if len(class_prediction) > 0:
+ total_ann += 1
+ if len(filtered_class_prediction) == 0:
+ num_ann_to_zero += 1
+
+ predictions_copy[idx_predictions][idx_class] = filtered_class_prediction
+
+ p_ann_pruned = total_ann and num_ann_to_zero / total_ann or 0 # avoid division by zero
+ if p_ann_pruned > MAX_ALLOWED_BOX_PRUNE:
+ warnings.warn(
+ f"Pruning with threshold=={threshold} prunes {p_ann_pruned}% labels. Consider lowering the threshold.",
+ UserWarning,
+ )
+ if verbose:
+ print(
+ f"Pruning {num_ann_to_zero} predictions out of {total_ann} using threshold=={threshold}. These predictions are no longer considered as potential candidates for identifying label issues as their similarity with the given labels is no longer considered."
+ )
+ return predictions_copy
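The per-class filtering step above boils down to dropping rows whose last column (the pred_prob) falls below the threshold; a minimal numpy illustration with toy values:

```python
import numpy as np

class_prediction = np.array([[10.0, 10.0, 50.0, 50.0, 0.92],
                             [60.0, 60.0, 90.0, 90.0, 0.05]])  # two predicted boxes for one class
threshold = 0.1
class_prediction[class_prediction[:, -1] >= threshold]  # only the 0.92-confidence box survives
```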
+
+
+def _separate_label(label: Dict[str, Any]) -> Tuple[np.ndarray, np.ndarray]:
+ """Separates labels into bounding box and class label lists."""
+ bboxes = label["bboxes"]
+ labels = label["labels"]
+ return bboxes, labels
+
+
+# TODO: make object detection work for all predicted probabilities
+def _separate_prediction_all_preds(
+ prediction: List[np.ndarray],
+) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
+ pred_bboxes, pred_labels, det_probs = prediction
+ return pred_bboxes, pred_labels, det_probs
+
+
+def _separate_prediction_single_box(
+ prediction: np.ndarray,
+) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
+ """Separates predictions into class labels, bounding boxes and pred_prob lists"""
+ labels = []
+ boxes = []
+ for idx, prediction_class in enumerate(prediction):
+ labels.extend([idx] * len(prediction_class))
+ boxes.extend(prediction_class.tolist())
+ bboxes = [box[:4] for box in boxes]
+ pred_probs = [box[-1] for box in boxes]
+ return np.array(bboxes), np.array(labels), np.array(pred_probs)
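To make the per-class prediction format concrete, here is a small sketch (plain numpy, toy values) of the flattening that `_separate_prediction_single_box` performs:

```python
import numpy as np

# Toy single-image prediction for K=2 classes: entry k is an (M_k, 5) array
# of [x1, y1, x2, y2, pred_prob] rows for class k.
prediction = [
    np.array([[10.0, 10.0, 50.0, 50.0, 0.9]]),    # one box for class 0
    np.array([[30.0, 30.0, 70.0, 70.0, 0.6],
              [80.0, 80.0, 90.0, 90.0, 0.4]]),    # two boxes for class 1
]

labels, boxes = [], []
for class_idx, class_pred in enumerate(prediction):
    labels.extend([class_idx] * len(class_pred))  # class index repeated per box
    boxes.extend(class_pred.tolist())
bboxes = np.array([b[:4] for b in boxes])         # shape (3, 4): all boxes, classes flattened
pred_probs = np.array([b[-1] for b in boxes])     # array([0.9, 0.6, 0.4])
```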
+
+
+def _get_prediction_type(prediction: np.ndarray) -> str:
+ if (
+ len(prediction) == 3
+ and prediction[0].shape[0] == prediction[2].shape[1]
+ and prediction[1].shape[0] == prediction[2].shape[0]
+ ):
+ return "all_pred"
+ else:
+ return "single_pred"
+
+
+def _separate_prediction(
+ prediction, prediction_type="single_pred"
+) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
+ """Returns bbox, label and pred_prob values for prediction."""
+
+ if prediction_type == "all_pred":
+ boxes, labels, pred_probs = _separate_prediction_all_preds(prediction)
+ else:
+ boxes, labels, pred_probs = _separate_prediction_single_box(prediction)
+ return boxes, labels, pred_probs
+
+
+def _mod_coordinates(x: List[float]) -> Dict[str, Any]:
+ """Takes is a list of xyxy coordinates and returns them in dictionary format."""
+
+ wd = {"x1": x[0], "y1": x[1], "x2": x[2], "y2": x[3]}
+ return wd
+
+
+def _get_overlap(bb1: List[float], bb2: List[float]) -> float:
+ """Takes in two bounding boxes `bb1` and `bb2` and returns their IoU overlap."""
+
+ return _get_iou(_mod_coordinates(bb1), _mod_coordinates(bb2))
+
+
+def _get_overlap_matrix(bb1_list: np.ndarray, bb2_list: np.ndarray) -> np.ndarray:
+ """Takes in two lists of bounding boxes and returns an IoU matrix where IoU[i][j] is the overlap between
+ the i-th box in `bb1_list` and the j-th box in `bb2_list`."""
+ wd = np.zeros(shape=(len(bb1_list), len(bb2_list)))
+ for i in range(len(bb1_list)):
+ for j in range(len(bb2_list)):
+ wd[i][j] = _get_overlap(bb1_list[i], bb2_list[j])
+ return wd
+
+
+def _get_iou(bb1: Dict[str, Any], bb2: Dict[str, Any]) -> float:
+ """
+ Calculate the Intersection over Union (IoU) of two axis-aligned bounding boxes.
+
+ Parameters
+ ----------
+ bb1 : dict
+ Keys: {'x1', 'x2', 'y1', 'y2'}
+ The (x1, y1) position is at the top left corner,
+ the (x2, y2) position is at the bottom right corner
+ bb2 : dict
+ Keys: {'x1', 'x2', 'y1', 'y2'}
+ The (x1, y1) position is at the top left corner,
+ the (x2, y2) position is at the bottom right corner
+ Returns
+ -------
+ float
+ in [0, 1]
+ """
+ # determine the coordinates of the intersection rectangle
+ x_left = max(bb1["x1"], bb2["x1"])
+ y_top = max(bb1["y1"], bb2["y1"])
+ x_right = min(bb1["x2"], bb2["x2"])
+ y_bottom = min(bb1["y2"], bb2["y2"])
+
+ if x_right < x_left or y_bottom < y_top:
+ return 0.0
+
+ # The intersection of two axis-aligned bounding boxes is always an
+ # axis-aligned bounding box
+ intersection_area = (x_right - x_left) * (y_bottom - y_top)
+
+ # compute the area of both AABBs
+ bb1_area = (bb1["x2"] - bb1["x1"]) * (bb1["y2"] - bb1["y1"])
+ bb2_area = (bb2["x2"] - bb2["x1"]) * (bb2["y2"] - bb2["y1"])
+
+ # compute the intersection over union by taking the intersection
+ # area and dividing it by the sum of prediction + ground-truth
+ # areas - the intersection area
+ iou = intersection_area / float(bb1_area + bb2_area - intersection_area)
+ # There are some hyper-parameters here like consider tile area/object area
+ return iou
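A quick worked example of the IoU computation above, using two 2x2 boxes that overlap on half their width:

```python
bb1 = {"x1": 0.0, "y1": 0.0, "x2": 2.0, "y2": 2.0}  # area 4
bb2 = {"x1": 1.0, "y1": 0.0, "x2": 3.0, "y2": 2.0}  # area 4

# Intersection rectangle spans x in [1, 2], y in [0, 2], so its area is 2.
intersection_area = (min(2.0, 3.0) - max(0.0, 1.0)) * (min(2.0, 2.0) - max(0.0, 0.0))
iou = intersection_area / (4.0 + 4.0 - intersection_area)  # union = 4 + 4 - 2 = 6
assert abs(iou - 1 / 3) < 1e-9
```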
+
+
+def _euc_dis(box1: List[float], box2: List[float]) -> float:
+ """Calculates the Euclidean distance between `box1` and `box2`."""
+ x1, y1 = (box1[0] + box1[2]) / 2, (box1[1] + box1[3]) / 2
+ x2, y2 = (box2[0] + box2[2]) / 2, (box2[1] + box2[3]) / 2
+ p1 = np.array([x1, y1])
+ p2 = np.array([x2, y2])
+ val2 = np.exp(-np.linalg.norm(p1 - p2) * EUC_FACTOR)
+ return val2
+
+
+def _get_dist_matrix(bb1_list: np.ndarray, bb2_list: np.ndarray) -> np.ndarray:
+ """Returns a distance matrix of distances from all of boxes in bb1_list to all of boxes in bb2_list."""
+ wd = np.zeros(shape=(len(bb1_list), len(bb2_list)))
+ for i in range(len(bb1_list)):
+ for j in range(len(bb2_list)):
+ wd[i][j] = _euc_dis(bb1_list[i], bb2_list[j])
+ return wd
+
+
+def _get_min_possible_similarity(
+ alpha: float,
+ predictions,
+ labels: List[Dict[str, Any]],
+) -> float:
+ """Gets the min possible similarity score between two bounding boxes out of all images."""
+ min_possible_similarity = 1.0
+ for prediction, label in zip(predictions, labels):
+ lab_bboxes, lab_labels = _separate_label(label)
+ pred_bboxes, pred_labels, _ = _separate_prediction(prediction)
+ iou_matrix = _get_overlap_matrix(lab_bboxes, pred_bboxes)
+ dist_matrix = 1 - _get_dist_matrix(lab_bboxes, pred_bboxes)
+ similarity_matrix = iou_matrix * alpha + (1 - alpha) * (1 - dist_matrix)
+ non_zero_similarity_matrix = similarity_matrix[np.nonzero(similarity_matrix)]
+ min_image_similarity = (
+ 1.0 if 0 in non_zero_similarity_matrix.shape else np.min(non_zero_similarity_matrix)
+ )
+ min_possible_similarity = np.min([min_possible_similarity, min_image_similarity])
+ return min_possible_similarity
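Putting the two kernels together, the similarity used above for a label/prediction box pair reduces to an alpha-weighted mix of IoU and an exponential centroid-distance term. The sketch below illustrates this; `alpha` and `euc_factor` are placeholder values standing in for the `ALPHA` and `EUC_FACTOR` constants imported from `cleanlab.internal.constants`:

```python
import numpy as np

alpha, euc_factor = 0.9, 0.1  # placeholders for ALPHA and EUC_FACTOR


def box_similarity(lab_box, pred_box, iou):
    """Similarity entry: iou * alpha + (1 - alpha) * (1 - dist), where dist = 1 - exp(-||c1 - c2|| * euc_factor)."""
    cx1, cy1 = (lab_box[0] + lab_box[2]) / 2, (lab_box[1] + lab_box[3]) / 2
    cx2, cy2 = (pred_box[0] + pred_box[2]) / 2, (pred_box[1] + pred_box[3]) / 2
    euc_sim = np.exp(-np.hypot(cx1 - cx2, cy1 - cy2) * euc_factor)  # what _euc_dis returns
    return iou * alpha + (1 - alpha) * euc_sim


box_similarity([0, 0, 2, 2], [1, 0, 3, 2], iou=1 / 3)  # ≈ 0.9 * 0.333 + 0.1 * exp(-0.1) ≈ 0.39
```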
+
+
+def _get_valid_inputs_for_compute_scores_per_image(
+ *,
+ alpha: float,
+ label: Optional[Dict[str, Any]] = None,
+ prediction: Optional[np.ndarray] = None,
+ pred_labels: Optional[np.ndarray] = None,
+ pred_label_probs: Optional[np.ndarray] = None,
+ pred_bboxes: Optional[np.ndarray] = None,
+ lab_labels: Optional[np.ndarray] = None,
+ lab_bboxes=None,
+ similarity_matrix=None,
+ min_possible_similarity: Optional[float] = None,
+) -> AuxiliaryTypesDict:
+ """Returns valid inputs for compute scores by either passing through values or calculating the inputs internally."""
+ if lab_labels is None or lab_bboxes is None:
+ if label is None:
+ raise ValueError(
+ f"Pass in either one of label or label labels into auxiliary inputs. Both can not be None."
+ )
+ lab_bboxes, lab_labels = _separate_label(label)
+
+ if pred_labels is None or pred_label_probs is None or pred_bboxes is None:
+ if prediction is None:
+ raise ValueError(
+ f"Pass in either one of prediction or prediction labels and prediction probabilities into auxiliary inputs. Both can not be None."
+ )
+ pred_bboxes, pred_labels, pred_label_probs = _separate_prediction(prediction)
+
+ if similarity_matrix is None:
+ iou_matrix = _get_overlap_matrix(lab_bboxes, pred_bboxes)
+ dist_matrix = 1 - _get_dist_matrix(lab_bboxes, pred_bboxes)
+ similarity_matrix = iou_matrix * alpha + (1 - alpha) * (1 - dist_matrix)
+
+ if min_possible_similarity is None:
+ min_possible_similarity = (
+ 1.0
+ if 0 in similarity_matrix.shape
+ else np.min(similarity_matrix[np.nonzero(similarity_matrix)])
+ )
+
+ auxiliary_input_dict: AuxiliaryTypesDict = {
+ "pred_labels": pred_labels,
+ "pred_label_probs": pred_label_probs,
+ "pred_bboxes": pred_bboxes,
+ "lab_labels": lab_labels,
+ "lab_bboxes": lab_bboxes,
+ "similarity_matrix": similarity_matrix,
+ "min_possible_similarity": min_possible_similarity,
+ }
+
+ return auxiliary_input_dict
+
+
+def _get_valid_inputs_for_compute_scores(
+ alpha: float,
+ labels: Optional[List[Dict[str, Any]]] = None,
+ predictions: Optional[List[np.ndarray]] = None,
+) -> List[AuxiliaryTypesDict]:
+ """Takes in alpha, labels and predictions and returns auxiliary input dictionary containing divided parts of labels and prediction per image."""
+ if predictions is None or labels is None:
+ raise ValueError(
+ f"Predictions and labels can not be None. Both are needed to get valid inputs."
+ )
+ min_possible_similarity = _get_min_possible_similarity(alpha, predictions, labels)
+
+ auxiliary_inputs = []
+
+ for prediction, label in zip(predictions, labels):
+ auxiliary_input_dict = _get_valid_inputs_for_compute_scores_per_image(
+ alpha=alpha,
+ label=label,
+ prediction=prediction,
+ min_possible_similarity=min_possible_similarity,
+ )
+ auxiliary_inputs.append(auxiliary_input_dict)
+
+ return auxiliary_inputs
+
+
+def _get_valid_score(scores_arr: np.ndarray, temperature: float) -> float:
+ """Given scores array, returns valid score (softmin) or 1. Checks validity of score."""
+ scores_arr = scores_arr[~np.isnan(scores_arr)]
+ if len(scores_arr) > 0:
+ valid_score = softmin1d(scores_arr, temperature=temperature)
+ else:
+ valid_score = 1.0
+ return valid_score
+
+
+def _get_valid_subtype_score_params(
+ alpha: Optional[float] = None,
+ low_probability_threshold: Optional[float] = None,
+ high_probability_threshold: Optional[float] = None,
+ temperature: Optional[float] = None,
+):
+ """This function returns valid params for subtype score. If param is None, then default constant is returned"""
+ if alpha is None:
+ alpha = ALPHA
+ if low_probability_threshold is None:
+ low_probability_threshold = LOW_PROBABILITY_THRESHOLD
+ if high_probability_threshold is None:
+ high_probability_threshold = HIGH_PROBABILITY_THRESHOLD
+ if temperature is None:
+ temperature = TEMPERATURE
+ return alpha, low_probability_threshold, high_probability_threshold, temperature
+
+
+def _get_aggregation_weights(
+ aggregation_weights: Optional[Dict[str, Any]] = None
+) -> Dict[str, Any]:
+ """This function validates aggregation weights, returning the default weights if none are provided."""
+ if aggregation_weights is None:
+ aggregation_weights = {
+ "overlooked": CUSTOM_SCORE_WEIGHT_OVERLOOKED,
+ "swap": CUSTOM_SCORE_WEIGHT_SWAP,
+ "badloc": CUSTOM_SCORE_WEIGHT_BADLOC,
+ }
+ else:
+ assert_valid_aggregation_weights(aggregation_weights)
+ return aggregation_weights
+
+
+def _compute_overlooked_box_scores_for_image(
+ alpha: float,
+ high_probability_threshold: float,
+ label: Optional[Dict[str, Any]] = None,
+ prediction: Optional[np.ndarray] = None,
+ pred_labels: Optional[np.ndarray] = None,
+ pred_label_probs: Optional[np.ndarray] = None,
+ pred_bboxes: Optional[np.ndarray] = None,
+ lab_labels: Optional[np.ndarray] = None,
+ lab_bboxes: Optional[np.ndarray] = None,
+ similarity_matrix: Optional[np.ndarray] = None,
+ min_possible_similarity: Optional[float] = None,
+) -> np.ndarray:
+ """This method returns one score per predicted box (above threshold) in an image. Score from 0 to 1 ranking how overlooked the box is."""
+
+ auxiliary_input_dict = _get_valid_inputs_for_compute_scores_per_image(
+ alpha=alpha,
+ label=label,
+ prediction=prediction,
+ pred_labels=pred_labels,
+ pred_label_probs=pred_label_probs,
+ pred_bboxes=pred_bboxes,
+ lab_labels=lab_labels,
+ lab_bboxes=lab_bboxes,
+ similarity_matrix=similarity_matrix,
+ min_possible_similarity=min_possible_similarity,
+ )
+
+ pred_labels = auxiliary_input_dict["pred_labels"]
+ pred_label_probs = auxiliary_input_dict["pred_label_probs"]
+ lab_labels = auxiliary_input_dict["lab_labels"]
+ similarity_matrix = auxiliary_input_dict["similarity_matrix"]
+ min_possible_similarity = auxiliary_input_dict["min_possible_similarity"]
+
+ scores_overlooked = np.empty(
+ shape=[
+ len(pred_labels),
+ ]
+ ) # same length as num of predicted boxes
+
+ for iid, k in enumerate(pred_labels):
+ if pred_label_probs[iid] < high_probability_threshold:
+ scores_overlooked[iid] = np.nan
+ continue
+
+ k_similarity = similarity_matrix[lab_labels == k, iid]
+ if len(k_similarity) == 0: # if there is no annotated box
+ score = min_possible_similarity * (1 - pred_label_probs[iid])
+ else:
+ closest_annotated_box = np.argmax(k_similarity)
+ score = k_similarity[closest_annotated_box]
+ scores_overlooked[iid] = score
+
+ return scores_overlooked
+
+
+def compute_overlooked_box_scores(
+ labels: Optional[List[Dict[str, Any]]] = None,
+ predictions: Optional[List[np.ndarray]] = None,
+ *,
+ alpha: Optional[float] = None,
+ high_probability_threshold: Optional[float] = None,
+ auxiliary_inputs: Optional[List[AuxiliaryTypesDict]] = None,
+) -> List[np.ndarray]:
+ """
+ Returns an array of overlooked box scores for each image.
+ Score per high-confidence predicted bounding box is between 0 and 1, with lower values indicating boxes we are more confident were overlooked in the given label.
+
+ Each image has ``L`` annotated bounding boxes and ``M`` predicted bounding boxes.
+ A score is calculated for each predicted box in each of the ``N`` images in the dataset.
+
+ Note: ``M`` and ``L`` can be different values for each image, as the number of annotated and predicted boxes varies.
+
+ Parameters
+ ----------
+ labels:
+ A list of ``N`` dictionaries such that ``labels[i]`` contains the given labels for the `i`-th image.
+ Refer to documentation for this argument in :py:func:`find_label_issues ` for further details.
+
+ predictions:
+ A list of ``N`` ``np.ndarray`` such that ``predictions[i]`` corresponds to the model predictions for the `i`-th image.
+ Refer to documentation for this argument in :py:func:`find_label_issues ` for further details.
+
+ alpha:
+ Optional weighting between IoU and Euclidean distance when calculating similarity between predicted and annotated boxes. High alpha means weighting IoU more heavily over Euclidean distance. If no alpha is provided, a good default is used.
+
+ high_probability_threshold:
+ Optional probability threshold that determines which predicted boxes are considered high-confidence when computing overlooked scores. If not provided, a good default is used.
+
+ auxiliary_inputs:
+ Optional list of ``N`` dictionaries containing keys for sub-parts of label and prediction per image. Useful to minimize computation when computing multiple box scores for a single set of images. For the `i`-th image, `auxiliary_inputs[i]` should contain following keys:
+
+ * pred_labels: np.ndarray
+ Array of predicted classes for `i`-th image of shape ``(M,)``.
+ * pred_label_probs: np.ndarray
+ Array of predicted class probabilities for `i`-th image of shape ``(M,)``.
+ * pred_bboxes: np.ndarray
+ Array of predicted bounding boxes for `i`-th image of shape ``(M, 4)``.
+ * lab_labels: np.ndarray
+ Array of given label classes for `i`-th image of shape ``(L,)``.
+ * lab_bboxes: np.ndarray
+ Array of given label bounding boxes for `i`-th image of shape ``(L, 4)``.
+ * similarity_matrix: np.ndarray
+ Similarity matrix between labels and predictions for the `i`-th image.
+ * min_possible_similarity: float
+ Minimum possible similarity value greater than 0 between labels and predictions for the entire dataset.
+
+ Returns
+ ---------
+ scores_overlooked:
+ A list of ``N`` numpy arrays where scores_overlooked[i] is an array of size ``M`` of overlooked scores per predicted box for the `i`-th image.
+ """
+ (
+ alpha,
+ low_probability_threshold,
+ high_probability_threshold,
+ temperature,
+ ) = _get_valid_subtype_score_params(alpha, None, high_probability_threshold, None)
+
+ if auxiliary_inputs is None:
+ auxiliary_inputs = _get_valid_inputs_for_compute_scores(alpha, labels, predictions)
+
+ scores_overlooked = []
+ for auxiliary_input_dict in auxiliary_inputs:
+ scores_overlooked_per_box = _compute_overlooked_box_scores_for_image(
+ alpha=alpha,
+ high_probability_threshold=high_probability_threshold,
+ **auxiliary_input_dict,
+ )
+ scores_overlooked.append(scores_overlooked_per_box)
+ return scores_overlooked
+
+
+def _compute_badloc_box_scores_for_image(
+ alpha: float,
+ low_probability_threshold: float,
+ label: Optional[Dict[str, Any]] = None,
+ prediction: Optional[np.ndarray] = None,
+ pred_labels: Optional[np.ndarray] = None,
+ pred_label_probs: Optional[np.ndarray] = None,
+ pred_bboxes: Optional[np.ndarray] = None,
+ lab_labels: Optional[np.ndarray] = None,
+ lab_bboxes: Optional[np.ndarray] = None,
+ similarity_matrix: Optional[np.ndarray] = None,
+ min_possible_similarity: Optional[float] = None,
+) -> np.ndarray:
+ """This method returns one score per labeled box in an image. Score from 0 to 1 ranking how badly located the box is."""
+
+ auxiliary_input_dict = _get_valid_inputs_for_compute_scores_per_image(
+ alpha=alpha,
+ label=label,
+ prediction=prediction,
+ pred_labels=pred_labels,
+ pred_label_probs=pred_label_probs,
+ pred_bboxes=pred_bboxes,
+ lab_labels=lab_labels,
+ lab_bboxes=lab_bboxes,
+ similarity_matrix=similarity_matrix,
+ min_possible_similarity=min_possible_similarity,
+ )
+
+ pred_labels = auxiliary_input_dict["pred_labels"]
+ pred_label_probs = auxiliary_input_dict["pred_label_probs"]
+ lab_labels = auxiliary_input_dict["lab_labels"]
+ similarity_matrix = auxiliary_input_dict["similarity_matrix"]
+
+ scores_badloc = np.empty(
+ shape=[
+ len(lab_labels),
+ ]
+ ) # same length as number of labeled boxes
+ for iid, k in enumerate(lab_labels): # for every annotated box
+ k_similarity = similarity_matrix[iid, pred_labels == k]
+ k_pred = pred_label_probs[pred_labels == k]
+
+ if len(k_pred) == 0: # there are no predicted boxes of class k
+ scores_badloc[iid] = 1.0
+ continue
+
+ idx_at_least_low_probability_threshold = k_pred > low_probability_threshold
+ k_similarity = k_similarity[idx_at_least_low_probability_threshold]
+ k_pred = k_pred[idx_at_least_low_probability_threshold]
+
+ if len(k_pred) == 0:
+ scores_badloc[iid] = 1.0
+ else:
+ scores_badloc[iid] = np.max(k_similarity)
+ return scores_badloc
+
+
+def compute_badloc_box_scores(
+ labels: Optional[List[Dict[str, Any]]] = None,
+ predictions: Optional[List[np.ndarray]] = None,
+ *,
+ alpha: Optional[float] = None,
+ low_probability_threshold: Optional[float] = None,
+ auxiliary_inputs: Optional[List[AuxiliaryTypesDict]] = None,
+) -> List[np.ndarray]:
+ """
+ Returns a numeric score for each annotated bounding box in each image, estimating the likelihood that the edges of this box are not badly located.
+ Each annotated box receives a score between 0 and 1, with lower values indicating boxes we are more confident were badly located in the given label.
+
+ Each image has ``L`` annotated bounding boxes and ``M`` predicted bounding boxes.
+ A score is calculated for each annotated box in each of the ``N`` images in the dataset.
+
+ Note: ``M`` and ``L`` can be different values for each image, as the number of annotated and predicted boxes varies.
+
+ Parameters
+ ----------
+ labels:
+ A list of ``N`` dictionaries such that ``labels[i]`` contains the given labels for the `i`-th image.
+ Refer to documentation for this argument in :py:func:`find_label_issues ` for further details.
+
+ predictions:
+ A list of ``N`` ``np.ndarray`` such that ``predictions[i]`` corresponds to the model predictions for the `i`-th image.
+ Refer to documentation for this argument in :py:func:`find_label_issues ` for further details.
+
+ alpha:
+ Optional weighting between IoU and Euclidean distance when calculating similarity between predicted and annotated boxes. High alpha means weighting IoU more heavily over Euclidean distance. If no alpha is provided, a good default is used.
+
+ low_probability_threshold:
+ Optional minimum probability threshold that determines which predicted boxes are considered when computing badly located scores. If not provided, a good default is used.
+
+ auxiliary_inputs:
+ Optional list of ``N`` dictionaries containing keys for sub-parts of label and prediction per image. Useful to minimize computation when computing multiple box scores for a single set of images. For the `i`-th image, `auxiliary_inputs[i]` should contain following keys:
+
+ * pred_labels: np.ndarray
+ Array of predicted classes for `i`-th image of shape ``(M,)``.
+ * pred_label_probs: np.ndarray
+ Array of predicted class probabilities for `i`-th image of shape ``(M,)``.
+ * pred_bboxes: np.ndarray
+ Array of predicted bounding boxes for `i`-th image of shape ``(M, 4)``.
+ * lab_labels: np.ndarray
+ Array of given label classes for `i`-th image of shape ``(L,)``.
+ * lab_bboxes: np.ndarray
+ Array of given label bounding boxes for `i`-th image of shape ``(L, 4)``.
+ * similarity_matrix: np.ndarray
+ Similarity matrix between labels and predictions for the `i`-th image.
+ * min_possible_similarity: float
+ Minimum possible similarity value greater than 0 between labels and predictions for the entire dataset.
+
+ Returns
+ ---------
+ scores_badloc:
+ A list of ``N`` numpy arrays where scores_badloc[i] is an array of size ``L`` of badly located scores per annotated box for the `i`-th image.
+ """
+ (
+ alpha,
+ low_probability_threshold,
+ high_probability_threshold,
+ temperature,
+ ) = _get_valid_subtype_score_params(alpha, low_probability_threshold, None, None)
+
+ if auxiliary_inputs is None:
+ auxiliary_inputs = _get_valid_inputs_for_compute_scores(alpha, labels, predictions)
+
+ scores_badloc = []
+ for auxiliary_input_dict in auxiliary_inputs:
+ scores_badloc_per_box = _compute_badloc_box_scores_for_image(
+ alpha=alpha, low_probability_threshold=low_probability_threshold, **auxiliary_input_dict
+ )
+ scores_badloc.append(scores_badloc_per_box)
+ return scores_badloc
+
+
+def _compute_swap_box_scores_for_image(
+ alpha: float,
+ high_probability_threshold: float,
+ label: Optional[Dict[str, Any]] = None,
+ prediction: Optional[np.ndarray] = None,
+ pred_labels: Optional[np.ndarray] = None,
+ pred_label_probs: Optional[np.ndarray] = None,
+ pred_bboxes: Optional[np.ndarray] = None,
+ lab_labels: Optional[np.ndarray] = None,
+ lab_bboxes: Optional[np.ndarray] = None,
+ similarity_matrix: Optional[np.ndarray] = None,
+ min_possible_similarity: Optional[float] = None,
+) -> np.ndarray:
+ """This method returns one score per labeled box in an image. Score from 0 to 1 ranking how likeley swapped the box is."""
+
+ auxiliary_input_dict = _get_valid_inputs_for_compute_scores_per_image(
+ alpha=alpha,
+ label=label,
+ prediction=prediction,
+ pred_labels=pred_labels,
+ pred_label_probs=pred_label_probs,
+ pred_bboxes=pred_bboxes,
+ lab_labels=lab_labels,
+ lab_bboxes=lab_bboxes,
+ similarity_matrix=similarity_matrix,
+ min_possible_similarity=min_possible_similarity,
+ )
+
+ pred_labels = auxiliary_input_dict["pred_labels"]
+ pred_label_probs = auxiliary_input_dict["pred_label_probs"]
+ lab_labels = auxiliary_input_dict["lab_labels"]
+ similarity_matrix = auxiliary_input_dict["similarity_matrix"]
+ min_possible_similarity = auxiliary_input_dict["min_possible_similarity"]
+
+ scores_swap = np.empty(
+ shape=[
+ len(lab_labels),
+ ]
+ ) # same length as number of labeled boxes
+ for iid, k in enumerate(lab_labels):
+ not_k_idx = pred_labels != k
+
+ if len(not_k_idx) == 0:
+ scores_swap[iid] = 1.0
+ continue
+
+ not_k_similarity = similarity_matrix[iid, not_k_idx]
+ not_k_pred = pred_label_probs[not_k_idx]
+
+ idx_at_least_high_probability_threshold = not_k_pred > high_probability_threshold
+ if not np.any(idx_at_least_high_probability_threshold):
+ scores_swap[iid] = 1.0
+ continue
+
+ not_k_similarity = not_k_similarity[idx_at_least_high_probability_threshold]
+ if len(not_k_similarity) == 0: # no high-confidence predicted box of a different class
+ scores_swap[iid] = 1.0
+ else:
+ closest_predicted_box = np.argmax(not_k_similarity)
+ score = np.max([min_possible_similarity, 1 - not_k_similarity[closest_predicted_box]])
+ scores_swap[iid] = score
+ return scores_swap
+
+
+def compute_swap_box_scores(
+ labels: Optional[List[Dict[str, Any]]] = None,
+ predictions: Optional[List[np.ndarray]] = None,
+ *,
+ alpha: Optional[float] = None,
+ high_probability_threshold: Optional[float] = None,
+ auxiliary_inputs: Optional[List[AuxiliaryTypesDict]] = None,
+) -> List[np.ndarray]:
+ """
+ Returns a numeric score for each annotated bounding box in each image, estimating the likelihood that the class label for this box was not accidentally swapped with another class.
+ Each annotated box receives a score between 0 and 1, with lower values indicating boxes we are more confident had their class label accidentally swapped in the given annotation.
+
+ Each image has ``L`` annotated bounding boxes and ``M`` predicted bounding boxes.
+ A score is calculated for each annotated box in each of the ``N`` images in the dataset.
+
+ Note: ``M`` and ``L`` can be different values for each image, as the number of annotated and predicted boxes varies.
+
+ Parameters
+ ----------
+ labels:
+ A list of ``N`` dictionaries such that ``labels[i]`` contains the given labels for the `i`-th image.
+ Refer to documentation for this argument in :py:func:`find_label_issues ` for further details.
+
+ predictions:
+ A list of ``N`` ``np.ndarray`` such that ``predictions[i]`` corresponds to the model predictions for the `i`-th image.
+ Refer to documentation for this argument in :py:func:`find_label_issues ` for further details.
+
+ alpha:
+ Optional weighting between IoU and Euclidean distance when calculating similarity between predicted and annotated boxes. High alpha means weighting IoU more heavily over Euclidean distance. If no alpha is provided, a good default is used.
+
+ high_probability_threshold:
+ Optional probability threshold that determines which predicted boxes are considered high-confidence when computing overlooked scores. If not provided, a good default is used.
+
+ auxiliary_inputs:
+ Optional list of ``N`` dictionaries containing keys for sub-parts of label and prediction per image. Useful to minimize computation when computing multiple box scores for a single set of images. For the `i`-th image, `auxiliary_inputs[i]` should contain following keys:
+
+ * pred_labels: np.ndarray
+ Array of predicted classes for `i`-th image of shape ``(M,)``.
+ * pred_label_probs: np.ndarray
+ Array of predicted class probabilities for `i`-th image of shape ``(M,)``.
+ * pred_bboxes: np.ndarray
+ Array of predicted bounding boxes for `i`-th image of shape ``(M, 4)``.
+ * lab_labels: np.ndarray
+ Array of given label classes for `i`-th image of shape ``(L,)``.
+ * lab_bboxes: np.ndarray
+ Array of given label bounding boxes for `i`-th image of shape ``(L, 4)``.
+ * similarity_matrix: np.ndarray
+ Similarity matrix between labels and predictions for the `i`-th image.
+ * min_possible_similarity: float
+ Minimum possible similarity value greater than 0 between labels and predictions for the entire dataset.
+
+ Returns
+ ---------
+ scores_swap:
+ A list of ``N`` numpy arrays where scores_swap[i] is an array of size ``L`` of swap scores per annotated box for the `i`-th image.
+ """
+ (
+ alpha,
+ low_probability_threshold,
+ high_probability_threshold,
+ temperature,
+ ) = _get_valid_subtype_score_params(alpha, None, high_probability_threshold, None)
+
+ if auxiliary_inputs is None:
+ auxiliary_inputs = _get_valid_inputs_for_compute_scores(alpha, labels, predictions)
+
+ scores_swap = []
+ for auxiliary_input_dict in auxiliary_inputs:
+ scores_swap_per_box = _compute_swap_box_scores_for_image(
+ alpha=alpha, high_probability_threshold=high_probability_threshold, **auxiliary_input_dict
+ )
+ scores_swap.append(scores_swap_per_box)
+ return scores_swap
+
+
+def pool_box_scores_per_image(
+ box_scores: List[np.ndarray], *, temperature: Optional[float] = None
+) -> np.ndarray:
+ """
+ Aggregates multiple per-box scores within an image to return a single quality score for the image rather than for individual boxes within it.
+ Score per image is between 0 and 1, with lower values indicating we are more confident image contains an error.
+
+ Parameters
+ ----------
+ box_scores:
+ A list of ``N`` numpy arrays where box_scores[i] is an array of per-box scores (from one of the subtype scoring functions) for the `i`-th image.
+
+ temperature:
+ Optional temperature of the softmin function where a lower value suggests softmin acts closer to min. If not provided, a good default is used.
+
+ Returns
+ ---------
+ image_scores:
+ An array of size ``N`` where ``image_scores[i]`` represents the score for the `i`-th image.
+ """
+
+ (
+ alpha,
+ low_probability_threshold,
+ high_probability_threshold,
+ temperature,
+ ) = _get_valid_subtype_score_params(None, None, None, temperature)
+
+ image_scores = np.empty(
+ shape=[
+ len(box_scores),
+ ]
+ )
+ for idx, box_score in enumerate(box_scores):
+ image_score = _get_valid_score(box_score, temperature=temperature)
+ image_scores[idx] = image_score
+ return image_scores
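The pooling above delegates to cleanlab's internal `softmin1d`; the snippet below is only a rough sketch of the idea (not the library implementation): a temperature-weighted average whose weights concentrate on the smallest, i.e. worst, per-box scores.

```python
import numpy as np


def softmin_sketch(scores, temperature=0.1):
    """Illustrative softmin: weights exp(-score / temperature), so small scores dominate."""
    weights = np.exp(-scores / temperature)
    weights /= weights.sum()
    return float(np.dot(weights, scores))


box_scores = np.array([0.9, 0.8, 0.1])
softmin_sketch(box_scores, temperature=0.1)   # ≈ 0.10, close to the worst box score
softmin_sketch(box_scores, temperature=10.0)  # ≈ 0.59, close to the plain mean
```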
+
+
+def _get_subtype_label_quality_scores(
+ labels: List[Dict[str, Any]],
+ predictions: List[np.ndarray],
+ *,
+ alpha: Optional[float] = None,
+ low_probability_threshold: Optional[float] = None,
+ high_probability_threshold: Optional[float] = None,
+ temperature: Optional[float] = None,
+ aggregation_weights: Optional[Dict[str, float]] = None,
+) -> np.ndarray:
+ """
+ Returns a label quality score for each of the ``N`` images in the dataset.
+ Score is between 0 and 1.
+
+ 1 - clean label (given label is likely correct).
+ 0 - dirty label (given label is likely incorrect).
+
+ Parameters
+ ----------
+ labels:
+ A list of ``N`` dictionaries such that ``labels[i]`` contains the given labels for the `i`-th image.
+ Refer to documentation for this argument in :py:func:`find_label_issues ` for further details.
+
+ predictions:
+ A list of ``N`` ``np.ndarray`` such that ``predictions[i]`` corresponds to the model predictions for the `i`-th image.
+ Refer to documentation for this argument in :py:func:`find_label_issues ` for further details.
+
+ alpha:
+ Optional weighting between IoU and Euclidean distance when calculating similarity between predicted and annotated boxes. High alpha means weighting IoU more heavily over Euclidean distance. If no alpha is provided, a good default is used.
+
+ low_probability_threshold:
+ Optional minimum probability threshold that determines which predicted boxes are considered when computing badly located scores. If not provided, a good default is used.
+
+ high_probability_threshold:
+ Optional probability threshold that determines which predicted boxes are considered high-confidence when computing overlooked and swapped scores. If not provided, a good default is used.
+
+ temperature:
+ Optional temperature of the softmin function where a lower value suggests softmin acts closer to min. If not provided, a good default is used.
+
+ Returns
+ ---------
+ label_quality_scores:
+ As returned by :py:func:`get_label_quality_scores `. See function for more details.
+ """
+ (
+ alpha,
+ low_probability_threshold,
+ high_probability_threshold,
+ temperature,
+ ) = _get_valid_subtype_score_params(
+ alpha, low_probability_threshold, high_probability_threshold, temperature
+ )
+ auxiliary_inputs = _get_valid_inputs_for_compute_scores(alpha, labels, predictions)
+ aggregation_weights = _get_aggregation_weights(aggregation_weights)
+
+ overlooked_scores_per_box = compute_overlooked_box_scores(
+ alpha=alpha,
+ high_probability_threshold=high_probability_threshold,
+ auxiliary_inputs=auxiliary_inputs,
+ )
+ overlooked_score_per_image = pool_box_scores_per_image(
+ overlooked_scores_per_box, temperature=temperature
+ )
+
+ badloc_scores_per_box = compute_badloc_box_scores(
+ alpha=alpha,
+ low_probability_threshold=low_probability_threshold,
+ auxiliary_inputs=auxiliary_inputs,
+ )
+ badloc_score_per_image = pool_box_scores_per_image(
+ badloc_scores_per_box, temperature=temperature
+ )
+
+ swap_scores_per_box = compute_swap_box_scores(
+ alpha=alpha,
+ high_probability_threshold=high_probability_threshold,
+ auxiliary_inputs=auxiliary_inputs,
+ )
+ swap_score_per_image = pool_box_scores_per_image(swap_scores_per_box, temperature=temperature)
+
+ scores = (
+ aggregation_weights["overlooked"] * np.log(TINY_VALUE + overlooked_score_per_image)
+ + aggregation_weights["badloc"] * np.log(TINY_VALUE + badloc_score_per_image)
+ + aggregation_weights["swap"] * np.log(TINY_VALUE + swap_score_per_image)
+ )
+
+ scores = np.exp(scores)
+
+ return scores
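Numerically, the final aggregation above is a weighted geometric mean of the three per-image subtype scores. The weights and the tiny offset below are illustrative placeholders (the real defaults live in `cleanlab.internal.constants`):

```python
import numpy as np

tiny = 1e-9  # placeholder for TINY_VALUE, only there to avoid log(0)
weights = {"overlooked": 1 / 3, "badloc": 1 / 3, "swap": 1 / 3}  # illustrative equal weights

overlooked, badloc, swap = 0.9, 0.8, 0.2  # per-image subtype scores for one image
log_score = (
    weights["overlooked"] * np.log(tiny + overlooked)
    + weights["badloc"] * np.log(tiny + badloc)
    + weights["swap"] * np.log(tiny + swap)
)
np.exp(log_score)  # ≈ 0.52: dragged down mostly by the poor swap score
```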
diff --git a/cleanlab/object_detection/summary.py b/cleanlab/object_detection/summary.py
new file mode 100644
index 0000000000..1e83ee89f7
--- /dev/null
+++ b/cleanlab/object_detection/summary.py
@@ -0,0 +1,239 @@
+# Copyright (C) 2017-2023 Cleanlab Inc.
+# This file is part of cleanlab.
+#
+# cleanlab is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published
+# by the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# cleanlab is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with cleanlab. If not, see <https://www.gnu.org/licenses/>.
+
+"""Methods to display examples and their label issues in an object detection dataset."""
+from typing import Optional, Any, Dict, Tuple, Union, TYPE_CHECKING, TypeVar
+
+import numpy as np
+
+from cleanlab.internal.constants import MAX_CLASS_TO_SHOW
+from cleanlab.object_detection.rank import (
+ _separate_prediction,
+ _separate_label,
+ _get_prediction_type,
+)
+
+from cleanlab.internal.object_detection_utils import bbox_xyxy_to_xywh
+
+if TYPE_CHECKING:
+ from PIL.Image import Image as Image # pragma: no cover
+else:
+ Image = TypeVar("Image")
+
+
+def visualize(
+ image: Union[str, Image],
+ *,
+ label: Optional[Dict[str, Any]] = None,
+ prediction: Optional[np.ndarray] = None,
+ prediction_threshold: Optional[float] = None,
+ overlay: bool = True,
+ class_names: Optional[Dict[Any, Any]] = None,
+ figsize: Optional[Tuple[int, int]] = None,
+ save_path: Optional[str] = None,
+) -> None:
+ """Display the annotated bounding boxes (given labels) and predicted bounding boxes (model predictions) for a particular image.
+ Given labels are shown in red, model predictions in blue.
+
+
+ Parameters
+ ----------
+ image:
+ Image object loaded into memory or full path to the image file. If path is provided, image is loaded into memory.
+
+ label:
+ The given label for a single image in the format ``{'bboxes': np.ndarray((L,4)), 'labels': np.ndarray((L,))}`` where
+ ``L`` is the number of bounding boxes for the `i`-th image and ``bboxes[j]`` is in the format ``[x1,y1,x2,y2]`` with given label ``labels[j]``.
+
+ Note: Here, ``[x1,y1]`` corresponds to the coordinates of the bottom-left corner of the bounding box, while ``[x2,y2]`` corresponds to the coordinates of the top-right corner of the bounding box. The last column, pred_prob, represents the predicted probability that the bounding box contains an object of the class k.
+
+ prediction:
+ A prediction for a single image in the format ``np.ndarray((K,))`` where ``prediction[k]`` is of shape ``np.ndarray(M,5)``,
+ ``M`` is the number of predicted bounding boxes for class ``k``, and the five columns correspond to ``[x,y,x,y,pred_prob]`` where
+ ``[x,y,x,y]`` are the bounding box coordinates predicted by the model and ``pred_prob`` is the model's confidence that this box contains an object of class ``k``.
+
+ prediction_threshold:
+ All model-predicted bounding boxes with confidence (`pred_prob`)
+ below this threshold are omitted from the visualization.
+
+ overlay: bool
+ If True, display a single image with given labels and predictions overlaid.
+ If False, display two images (side by side) with the left image showing the model predictions and the right image showing the given label.
+
+ class_names:
+ Optional dictionary mapping one-hot-encoded class labels back to their original class names in the format ``{"integer-label": "original-class-name"}``.
+
+ save_path:
+ Path to save figure at. If a path is provided, the figure is saved. To save in a specific image format, add desired file extension to the end of `save_path`. Allowed file extensions are: 'png', 'pdf', 'ps', 'eps', and 'svg'.
+
+ figsize:
+ Optional figure size for plotting the image.
+ Corresponds to ``matplotlib.figure.figsize``.
+ """
+ try:
+ import matplotlib.pyplot as plt
+ except ImportError as e:
+ raise ImportError(
+ "This functionality requires matplotlib. Install it via: `pip install matplotlib`"
+ )
+
+ # Create figure and axes
+ if isinstance(image, str):
+ image = plt.imread(image)
+
+ if prediction is not None:
+ prediction_type = _get_prediction_type(prediction)
+ pbbox, plabels, pred_probs = _separate_prediction(
+ prediction, prediction_type=prediction_type
+ )
+
+ if prediction_threshold is not None:
+ keep_idx = np.where(pred_probs > prediction_threshold)
+ pbbox = pbbox[keep_idx]
+ plabels = plabels[keep_idx]
+
+ if label is not None:
+ abbox, alabels = _separate_label(label)
+
+ if overlay:
+ figsize = (8, 5) if figsize is None else figsize
+ fig, ax = plt.subplots(frameon=False, figsize=figsize)
+ plt.axis("off")
+ ax.imshow(image)
+ if label is not None:
+ fig, ax = _draw_boxes(
+ fig, ax, abbox, alabels, edgecolor="r", linestyle="-", linewidth=1
+ )
+ if prediction is not None:
+ _, _ = _draw_boxes(fig, ax, pbbox, plabels, edgecolor="b", linestyle="-.", linewidth=1)
+ else:
+ figsize = (14, 10) if figsize is None else figsize
+ fig, axes = plt.subplots(nrows=1, ncols=2, frameon=False, figsize=figsize)
+ axes[0].axis("off")
+ axes[0].imshow(image)
+ axes[1].axis("off")
+ axes[1].imshow(image)
+
+ if label is not None:
+ fig, ax = _draw_boxes(
+ fig, axes[0], abbox, alabels, edgecolor="r", linestyle="-", linewidth=1
+ )
+ if prediction is not None:
+ _, _ = _draw_boxes(
+ fig, axes[1], pbbox, plabels, edgecolor="b", linestyle="-.", linewidth=1
+ )
+ bbox_extra_artists = None
+ if label is not None or prediction is not None:
+ legend, plt = _plot_legend(class_names, label, prediction)
+ bbox_extra_artists = (legend,)
+
+ if save_path:
+ allowed_image_formats = set(["png", "pdf", "ps", "eps", "svg"])
+ image_format: Optional[str] = None
+ if save_path.split(".")[-1] in allowed_image_formats and "." in save_path:
+ image_format = save_path.split(".")[-1]
+ plt.savefig(
+ save_path,
+ format=image_format,
+ bbox_extra_artists=bbox_extra_artists,
+ bbox_inches="tight",
+ transparent=True,
+ pad_inches=0.5,
+ )
+ plt.show()
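A hedged usage sketch of `visualize` (the image path, label, prediction, and class names below are hypothetical):

```python
import numpy as np
from cleanlab.object_detection.summary import visualize

label = {"bboxes": np.array([[10.0, 10.0, 50.0, 50.0]]), "labels": np.array([0])}
# K=2 classes; class 1 has no predicted boxes in this toy prediction
prediction = np.array(
    [np.array([[12.0, 11.0, 51.0, 49.0, 0.87]]), np.zeros((0, 5))], dtype=object
)

visualize(
    "example_image.png",           # hypothetical path; a PIL Image object also works
    label=label,
    prediction=prediction,
    prediction_threshold=0.5,      # hide low-confidence predicted boxes
    class_names={0: "car", 1: "person"},
    save_path="example_visualization.png",
)
```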
+
+
+def _plot_legend(class_names, label, prediction):
+ colors = ["black"]
+ colors.extend(["red"] if label is not None else [])
+ colors.extend(["blue"] if prediction is not None else [])
+
+ markers = [None]
+ markers.extend(["s"] if label is not None else [])
+ markers.extend(["s"] if prediction is not None else [])
+
+ labels = [r"$\bf{Legend}$"]
+ labels.extend(["given label"] if label is not None else [])
+ labels.extend(["predicted label"] if prediction is not None else [])
+
+ if class_names:
+ colors += ["black"] + ["black"] * min(len(class_names), MAX_CLASS_TO_SHOW)
+ markers += [None] + [f"${class_key}$" for class_key in class_names.keys()]
+ labels += [r"$\bf{classes}$"] + list(class_names.values())
+
+ try:
+ import matplotlib.pyplot as plt
+ except ImportError as e:
+ raise ImportError(
+ "This functionality requires matplotlib. Install it via: `pip install matplotlib`"
+ )
+
+ f = lambda m, c: plt.plot([], [], marker=m, color=c, ls="none")[0]
+ handles = [f(marker, color) for marker, color in zip(markers, colors)]
+ legend = plt.legend(
+ handles, labels, bbox_to_anchor=(1.04, 0.05), loc="lower left", borderaxespad=0
+ )
+
+ return legend, plt
+
+
+def _draw_labels(ax, rect, label, edgecolor):
+ """Helper function to draw labels on an axis."""
+
+ rx, ry = rect.get_xy()
+ c_xleft = rx + 10
+ c_xright = rx + rect.get_width() - 10
+ c_ytop = ry + 12
+
+ if edgecolor == "r":
+ cx, cy = c_xleft, c_ytop
+ else: # edgecolor == b
+ cx, cy = c_xright, c_ytop
+
+ l = ax.annotate(
+ label, (cx, cy), fontsize=8, fontweight="bold", color="white", ha="center", va="center"
+ )
+ l.set_bbox(dict(facecolor=edgecolor, alpha=0.35, edgecolor=edgecolor, pad=2))
+ return ax
+
+
+def _draw_boxes(fig, ax, bboxes, labels, edgecolor="g", linestyle="-", linewidth=3):
+ """Helper function to draw bboxes and labels on an axis."""
+ bboxes = [bbox_xyxy_to_xywh(box) for box in bboxes]
+
+ try:
+ from matplotlib.patches import Rectangle
+ except Exception as e:
+ raise ImportError(
+ "This functionality requires matplotlib. Install it via: `pip install matplotlib`"
+ )
+
+ for (x, y, w, h), label in zip(bboxes, labels):
+ rect = Rectangle(
+ (x, y),
+ w,
+ h,
+ linewidth=linewidth,
+ linestyle=linestyle,
+ edgecolor=edgecolor,
+ facecolor="none",
+ )
+ ax.add_patch(rect)
+
+ if labels is not None:
+ ax = _draw_labels(ax, rect, label, edgecolor)
+
+ return fig, ax
diff --git a/cleanlab/outlier.py b/cleanlab/outlier.py
index b2190ada65..8fb50e8252 100644
--- a/cleanlab/outlier.py
+++ b/cleanlab/outlier.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2017-2022 Cleanlab Inc.
+# Copyright (C) 2017-2023 Cleanlab Inc.
# This file is part of cleanlab.
#
# cleanlab is free software: you can redistribute it and/or modify
@@ -25,11 +25,12 @@
from cleanlab.count import get_confident_thresholds
from sklearn.neighbors import NearestNeighbors
from sklearn.exceptions import NotFittedError
-from typing import Optional, Union, Tuple, Dict
+from typing import Optional, Union, Tuple, Dict, cast
from cleanlab.internal.label_quality_utils import (
_subtract_confident_thresholds,
get_normalized_entropy,
)
+from cleanlab.internal.outlier import transform_distances_to_scores
from cleanlab.internal.validation import assert_valid_inputs, labels_to_array
from cleanlab.typing import LabelLike
@@ -37,10 +38,11 @@
class OutOfDistribution:
"""
Provides scores to detect Out Of Distribution (OOD) examples that are outliers in a dataset.
+
Each example's OOD score lies in [0,1] with smaller values indicating examples that are less typical under the data distribution.
OOD scores may be estimated from either: numeric feature embeddings or predicted probabilities from a trained classifier.
- To get indices of examples that are the most severe outliers, call :py:func:`find_top_issues ` function on the returned OOD scores.
+ To get indices of examples that are the most severe outliers, call `~cleanlab.rank.find_top_issues` function on the returned OOD scores.
Parameters
----------
@@ -55,7 +57,8 @@ class OutOfDistribution:
You can also pass in a subclass of ``sklearn.neighbors.NearestNeighbors`` which allows you to use faster
approximate neighbor libraries as long as you wrap them behind the same sklearn API.
If you specify ``knn`` here, there is no need to later call ``fit()`` before calling ``score()``.
- If ``knn = None``, then by default: ``knn = sklearn.neighbors.NearestNeighbors(n_neighbors=k, metric="cosine").fit(features)``
+ If ``knn = None``, then by default: ``knn = sklearn.neighbors.NearestNeighbors(n_neighbors=k, metric=dist_metric).fit(features)``
+ where ``dist_metric == "cosine"`` if ``dim(features) > 3`` or ``dist_metric == "euclidean"`` otherwise.
See: https://scikit-learn.org/stable/modules/neighbors.html
* k : int, default=None
Optional number of neighbors to use when calculating outlier score (average distance to neighbors).
@@ -92,7 +95,7 @@ class OutOfDistribution:
OUTLIER_PARAMS = {"k", "t", "knn"}
OOD_PARAMS = {"confident_thresholds", "adjust_pred_probs", "method"}
- DEFAULT_PARAM_DICT: Dict[Union[str, int, None], Union[str, int, None, np.ndarray]] = {
+ DEFAULT_PARAM_DICT: Dict[str, Union[str, int, None, np.ndarray]] = {
"k": None, # ood features param
"t": 1, # ood features param
"knn": None, # ood features param
@@ -101,10 +104,11 @@ class OutOfDistribution:
"confident_thresholds": None, # ood pred_probs param
}
- def __init__(self, params: dict = {}):
+ def __init__(self, params: Optional[dict] = None) -> None:
self._assert_valid_params(params, self.DEFAULT_PARAM_DICT)
- self.params = self.DEFAULT_PARAM_DICT
- self.params = {**self.params, **params}
+ self.params = self.DEFAULT_PARAM_DICT.copy()
+ if params is not None:
+ self.params.update(params)
def fit_score(
self,
@@ -116,27 +120,27 @@ def fit_score(
) -> np.ndarray:
"""
Fits this estimator to a given dataset and returns out-of-distribution scores for the same dataset.
+
Scores lie in [0,1] with smaller values indicating examples that are less typical under the dataset
distribution (values near 0 indicate outliers). Exactly one of `features` or `pred_probs` needs to be passed
in to calculate scores.
If `features` are passed in a ``NearestNeighbors`` object is fit. If `pred_probs` and 'labels' are passed in a
- `confident_thresholds` ``np.ndarray`` is fit. For details see :py:func:`fit
- `.
+ `confident_thresholds` ``np.ndarray`` is fit. For details see `~cleanlab.outlier.OutOfDistribution.fit`.
Parameters
----------
features : np.ndarray, optional
Feature array of shape ``(N, M)``, where N is the number of examples and M is the number of features used to represent each example.
- For details, `features` in the same format expected by the :py:func:`fit ` function.
+ For details, `features` in the same format expected by the `~cleanlab.outlier.OutOfDistribution.fit` function.
pred_probs : np.ndarray, optional
An array of shape ``(N, K)`` of predicted class probabilities output by a trained classifier.
- For details, `pred_probs` in the same format expected by the :py:func:`fit ` function.
+ For details, `pred_probs` in the same format expected by the `~cleanlab.outlier.OutOfDistribution.fit` function.
labels : array_like, optional
A discrete array of given class labels for the data of shape ``(N,)``.
- For details, `labels` in the same format expected by the :py:func:`fit ` function.
+ For details, `labels` in the same format expected by the `~cleanlab.outlier.OutOfDistribution.fit` function.
verbose : bool, default = True
Set to ``False`` to suppress all print statements.
@@ -146,7 +150,7 @@ def fit_score(
scores : np.ndarray
If `features` are passed in, `ood_features_scores` are returned.
If `pred_probs` are passed in, `ood_predictions_scores` are returned.
- For details see return of :py:func:`score ` function.
+ For details see return of `~cleanlab.outlier.OutOfDistribution.score` function.
"""
scores = self._shared_fit(
@@ -171,18 +175,19 @@ def fit(
):
"""
Fits this estimator to a given dataset.
+
One of `features` or `pred_probs` must be specified.
If `features` are passed in, a ``NearestNeighbors`` object is fit.
If `pred_probs` and 'labels' are passed in, a `confident_thresholds` ``np.ndarray`` is fit.
- For details see :py:class:`OutOfDistribution ` documentation.
+ For details see `~cleanlab.outlier.OutOfDistribution` documentation.
Parameters
----------
features : np.ndarray, optional
Feature array of shape ``(N, M)``, where N is the number of examples and M is the number of features used to represent each example.
- All features should be **numeric**. For less structured data (eg. images, text, categorical values, ...), you should provide
- vector embeddings to represent each example (eg. extracted from some pretrained neural network).
+ All features should be **numeric**. For less structured data (e.g. images, text, categorical values, ...), you should provide
+ vector embeddings to represent each example (e.g. extracted from some pretrained neural network).
pred_probs : np.ndarray, optional
An array of shape ``(N, K)`` of model-predicted probabilities,
@@ -214,7 +219,8 @@ def score(
self, *, features: Optional[np.ndarray] = None, pred_probs: Optional[np.ndarray] = None
) -> np.ndarray:
"""
- Uses fitted estimator and passed in `features` or `pred_probs` to calculate out-of-distribution scores for a dataset.
+ Use fitted estimator and passed in `features` or `pred_probs` to calculate out-of-distribution scores for a dataset.
+
Score for each example corresponds to the likelihood this example stems from the same distribution as the dataset previously specified in ``fit()`` (i.e. is not an outlier).
If `features` are passed, returns OOD score for each example based on its feature values.
@@ -225,11 +231,11 @@ def score(
----------
features : np.ndarray, optional
Feature array of shape ``(N, M)``, where N is the number of examples and M is the number of features used to represent each example.
- For details, see `features` in :py:func:`fit ` function.
+ For details, see `features` in `~cleanlab.outlier.OutOfDistribution.fit` function.
pred_probs : np.ndarray, optional
An array of shape ``(N, K)`` of predicted class probabilities output by a trained classifier.
- For details, see `pred_probs` in :py:func:`fit ` function.
+ For details, see `pred_probs` in `~cleanlab.outlier.OutOfDistribution.fit` function.
Returns
-------
@@ -249,37 +255,27 @@ def score(
if features is not None:
if self.params["knn"] is None:
raise ValueError(
- f"OOD estimator needs to be fit on features first. Call `fit()` or `fit_scores()` before this function."
- )
- else:
- scores, _ = _get_ood_features_scores(
- features, **self._get_params(self.OUTLIER_PARAMS)
+ "OOD estimator needs to be fit on features first. Call `fit()` or `fit_scores()` before this function."
)
+ scores, _ = _get_ood_features_scores(features, **self._get_params(self.OUTLIER_PARAMS))
if pred_probs is not None:
if self.params["confident_thresholds"] is None and self.params["adjust_pred_probs"]:
raise ValueError(
- f"OOD estimator needs to be fit on pred_probs first since params['adjust_pred_probs']=True. Call `fit()` or `fit_scores()` before this function."
- )
- else:
- scores, _ = _get_ood_predictions_scores(
- pred_probs, **self._get_params(self.OOD_PARAMS)
+ "OOD estimator needs to be fit on pred_probs first since params['adjust_pred_probs']=True. Call `fit()` or `fit_scores()` before this function."
)
+ scores, _ = _get_ood_predictions_scores(pred_probs, **self._get_params(self.OOD_PARAMS))
return scores
def _get_params(self, param_keys) -> dict:
- """
- Helper method to get function specific dictionary of parameters (i.e. only those in param_keys).
- """
+ """Get function specific dictionary of parameters (i.e. only those in param_keys)."""
return {k: v for k, v in self.params.items() if k in param_keys}
@staticmethod
def _assert_valid_params(params, param_keys):
- """
- Helper method to check passed in params valid and get list of parameters in param that are not in param_keys.
- """
- if len(params) > 0:
+ """Validate passed in params and get list of parameters in param that are not in param_keys."""
+ if params is not None:
wrong_params = list(set(params.keys()).difference(set(param_keys)))
if len(wrong_params) > 0:
raise ValueError(
@@ -288,23 +284,21 @@ def _assert_valid_params(params, param_keys):
@staticmethod
def _assert_valid_inputs(features, pred_probs):
- """
- Helper method to check features and pred_prob inputs are valid. Throws error if not.
- """
+ """Check whether features and pred_prob inputs are valid, throw error if not."""
if features is None and pred_probs is None:
raise ValueError(
- f"Not enough information to compute scores. Pass in either features or pred_probs."
+ "Not enough information to compute scores. Pass in either features or pred_probs."
)
if features is not None and pred_probs is not None:
raise ValueError(
- f"Cannot fit to OOD Estimator to both features and pred_probs. Pass in either one or the other."
+ "Cannot fit to OOD Estimator to both features and pred_probs. Pass in either one or the other."
)
if features is not None and len(features.shape) != 2:
raise ValueError(
- f"Feature array needs to be of shape (N, M), where N is the number of examples and M is the "
- f"number of features used to represent each example. "
+ "Feature array needs to be of shape (N, M), where N is the number of examples and M is the "
+ "number of features used to represent each example. "
)
def _shared_fit(
@@ -317,8 +311,9 @@ def _shared_fit(
) -> Optional[np.ndarray]:
"""
Shared fit functionality between ``fit()`` and ``fit_score()``.
- For details, refer to :py:func:`fit `
- or :py:func:`fit_score `.
+
+ For details, refer to `~cleanlab.outlier.OutOfDistribution.fit`
+ or `~cleanlab.outlier.OutOfDistribution.fit_score`.
"""
self._assert_valid_inputs(features, pred_probs)
scores = None # If none scores are returned, fit was skipped
@@ -327,7 +322,7 @@ def _shared_fit(
if self.params["knn"] is not None:
# No fitting twice if knn object already fit
warnings.warn(
- f"A KNN estimator has previously already been fit, call score() to apply it to data, or create a new OutOfDistribution object to fit a different estimator.",
+ "A KNN estimator has previously already been fit, call score() to apply it to data, or create a new OutOfDistribution object to fit a different estimator.",
UserWarning,
)
else:
@@ -343,7 +338,7 @@ def _shared_fit(
if self.params["confident_thresholds"] is not None:
# No fitting twice if confident_thresholds object already fit
warnings.warn(
- f"Confident thresholds have previously already been fit, call score() to apply them to data, or create a new OutOfDistribution object to fit a different estimator.",
+ "Confident thresholds have previously already been fit, call score() to apply them to data, or create a new OutOfDistribution object to fit a different estimator.",
UserWarning,
)
else:
@@ -357,8 +352,8 @@ def _shared_fit(
)
if confident_thresholds is None:
warnings.warn(
- f"No estimates need to be be fit under the provided params, so you could directly call "
- f"score() as an alternative.",
+ "No estimates need to be be fit under the provided params, so you could directly call "
+ "score() as an alternative.",
UserWarning,
)
else:
@@ -372,25 +367,28 @@ def _get_ood_features_scores(
k: Optional[int] = None,
t: int = 1,
) -> Tuple[np.ndarray, Optional[NearestNeighbors]]:
- """Returns an outlier score for each example based on its feature values which is computed inversely proportional
- to the average distance between this example and its K nearest neighbors (in feature space).
+ """
+ Return outlier score based on feature values using `k` nearest neighbors.
+
+ The outlier score for each example is computed inversely proportional to
+ the average distance between this example and its K nearest neighbors (in feature space).
Parameters
----------
features : np.ndarray
Feature array of shape ``(N, M)``, where N is the number of examples and M is the number of features used to represent each example.
- For details, `features` in the same format expected by the :py:func:`fit ` function.
+ For details, `features` in the same format expected by the `~cleanlab.outlier.OutOfDistribution.fit` function.
knn : sklearn.neighbors.NearestNeighbors, default = None
- For details, see key `knn` in the params dict arg of :py:class:`OutOfDistribution `.
+ For details, see key `knn` in the params dict arg of `~cleanlab.outlier.OutOfDistribution`.
k : int, default=None
Optional number of neighbors to use when calculating outlier score (average distance to neighbors).
- For details, see key `k` in the params dict arg of :py:class:`OutOfDistribution `.
+ For details, see key `k` in the params dict arg of `~cleanlab.outlier.OutOfDistribution`.
t : int, default=1
Controls transformation of distances between examples into similarity scores that lie in [0,1].
- For details, see key `t` in the params dict arg of :py:class:`OutOfDistribution `.
+ For details, see key `t` in the params dict arg of `~cleanlab.outlier.OutOfDistribution`.
Returns
-------
@@ -403,7 +401,7 @@ def _get_ood_features_scores(
# Make sure both knn and features are not None
if features is None:
raise ValueError(
- f"Both knn and features arguments cannot be None at the same time. Not enough information to compute outlier scores."
+ "Both knn and features arguments cannot be None at the same time. Not enough information to compute outlier scores."
)
if k is None:
k = DEFAULT_K # use default when knn and k are both None
@@ -411,8 +409,15 @@ def _get_ood_features_scores(
raise ValueError(
f"Number of nearest neighbors k={k} cannot exceed the number of examples N={len(features)} passed into the estimator (knn)."
)
- knn = NearestNeighbors(n_neighbors=k, metric="cosine").fit(features)
+
+ if features.shape[1] > 3: # use cosine distance for higher-dimensional feature spaces, euclidean otherwise
+ metric = "cosine"
+ else:
+ metric = "euclidean"
+
+ knn = NearestNeighbors(n_neighbors=k, metric=metric).fit(features)
features = None # features should be None in knn.kneighbors(features) to avoid counting duplicate data points
+
elif k is None:
k = knn.n_neighbors
@@ -428,7 +433,7 @@ def _get_ood_features_scores(
# Fit knn estimator on the features if a non-fitted estimator is passed in
try:
knn.kneighbors(features)
- except NotFittedError as e:
+ except NotFittedError:
knn.fit(features)
# Get distances to k-nearest neighbors Note that the knn object contains the specification of distance metric
@@ -436,11 +441,7 @@ def _get_ood_features_scores(
# neighbor of each point is the point itself, at a distance of zero.
distances, _ = knn.kneighbors(features)
- # Calculate average distance to k-nearest neighbors
- avg_knn_distances = distances[:, :k].mean(axis=1)
-
- # Map ood_features_scores to range 0-1 with 0 = most concerning
- ood_features_scores: np.ndarray = np.exp(-1 * avg_knn_distances * t)
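+ # transform_distances_to_scores is presumed to encapsulate the logic removed above:
+ # average the distances to the k nearest neighbors, then map the averages into (0, 1] via exp(-t * avg_distance)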
+ ood_features_scores = transform_distances_to_scores(distances, cast(int, k), t)
return (ood_features_scores, knn)
@@ -452,27 +453,27 @@ def _get_ood_predictions_scores(
adjust_pred_probs: bool = True,
method: str = "entropy",
) -> Tuple[np.ndarray, Optional[np.ndarray]]:
- """Returns an OOD (out of distribution) score for each example based on it pred_prob values.
+ """Return an OOD (out of distribution) score for each example based on it pred_prob values.
Parameters
----------
pred_probs : np.ndarray
An array of shape ``(N, K)`` of model-predicted probabilities,
- `pred_probs` in the same format expected by the :py:func:`fit ` function.
+ `pred_probs` in the same format expected by the `~cleanlab.outlier.OutOfDistribution.fit` function.
confident_thresholds : np.ndarray, default = None
- For details, see key `confident_thresholds` in the params dict arg of :py:class:`OutOfDistribution `.
+ For details, see key `confident_thresholds` in the params dict arg of `~cleanlab.outlier.OutOfDistribution`.
labels : array_like, optional
- `labels` in the same format expected by the :py:func:`fit ` function.
+ `labels` in the same format expected by the `~cleanlab.outlier.OutOfDistribution.fit` function.
adjust_pred_probs : bool, True
Account for class imbalance in the label-quality scoring.
- For details, see key `adjust_pred_probs` in the params dict arg of :py:class:`OutOfDistribution `.
+ For details, see key `adjust_pred_probs` in the params dict arg of `~cleanlab.outlier.OutOfDistribution`.
method : {"entropy", "least_confidence"}, default="entropy"
OOD scoring method.
- For details see key `method` in the params dict arg of :py:class:`OutOfDistribution `.
+ For details see key `method` in the params dict arg of `~cleanlab.outlier.OutOfDistribution`.
Returns
@@ -480,17 +481,16 @@ def _get_ood_predictions_scores(
ood_predictions_scores : Tuple[np.ndarray, Optional[np.ndarray]]
Returns a tuple. First element is array of `ood_predictions_scores` and second is an np.ndarray of `confident_thresholds` or None is 'confident_thresholds' is not calculated.
"""
-
- valid_methods = [
+ valid_methods = (
"entropy",
"least_confidence",
- ]
+ )
if (confident_thresholds is not None or labels is not None) and not adjust_pred_probs:
warnings.warn(
- f"OOD scores are not adjusted with confident thresholds. If scores need to be adjusted set "
- f"params['adjusted_pred_probs'] = True. Otherwise passing in confident_thresholds and/or labels does not change "
- f"score calculation.",
+ "OOD scores are not adjusted with confident thresholds. If scores need to be adjusted set "
+ "params['adjusted_pred_probs'] = True. Otherwise passing in confident_thresholds and/or labels does not change "
+ "score calculation.",
UserWarning,
)
@@ -498,15 +498,12 @@ def _get_ood_predictions_scores(
if confident_thresholds is None:
if labels is None:
raise ValueError(
- f"Cannot calculate adjust_pred_probs without labels. Either pass in labels parameter or set "
- f"params['adjusted_pred_probs'] = False. "
- )
- else:
- labels = labels_to_array(labels)
- assert_valid_inputs(X=None, y=labels, pred_probs=pred_probs, multi_label=False)
- confident_thresholds = get_confident_thresholds(
- labels, pred_probs, multi_label=False
+ "Cannot calculate adjust_pred_probs without labels. Either pass in labels parameter or set "
+ "params['adjusted_pred_probs'] = False. "
)
+ labels = labels_to_array(labels)
+ assert_valid_inputs(X=None, y=labels, pred_probs=pred_probs, multi_label=False)
+ confident_thresholds = get_confident_thresholds(labels, pred_probs, multi_label=False)
pred_probs = _subtract_confident_thresholds(
None, pred_probs, multi_label=False, confident_thresholds=confident_thresholds
diff --git a/cleanlab/rank.py b/cleanlab/rank.py
index ef2e03b5a3..9e51da9fd6 100644
--- a/cleanlab/rank.py
+++ b/cleanlab/rank.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2017-2022 Cleanlab Inc.
+# Copyright (C) 2017-2023 Cleanlab Inc.
# This file is part of cleanlab.
#
# cleanlab is free software: you can redistribute it and/or modify
@@ -25,6 +25,7 @@
Note: multi-label classification is not supported by most methods in this module,
each example must be labeled as belonging to a single class, e.g. format: ``labels = np.ndarray([1,0,2,1,1,0...])``.
+For multi-label classification, instead see :py:func:`multilabel_classification.get_label_quality_scores `.
Note: Label quality scores are most accurate when they are computed based on out-of-sample `pred_probs` from your model.
To obtain out-of-sample predicted probabilities for every datapoint in your dataset, you can use :ref:`cross-validation `. This is encouraged to get better results.
@@ -36,6 +37,10 @@
import warnings
from cleanlab.internal.validation import assert_valid_inputs
+from cleanlab.internal.constants import (
+ CLIPPING_LOWER_BOUND,
+) # lower-bound clipping threshold to prevent 0 in logs and division
+
from cleanlab.internal.label_quality_utils import (
_subtract_confident_thresholds,
get_normalized_entropy,
@@ -119,15 +124,28 @@ class 0, 1, ..., K-1.
assert_valid_inputs(
X=None, y=labels, pred_probs=pred_probs, multi_label=False, allow_one_class=True
)
+ return _compute_label_quality_scores(
+ labels=labels, pred_probs=pred_probs, method=method, adjust_pred_probs=adjust_pred_probs
+ )
+
- # Available scoring functions to choose from
+def _compute_label_quality_scores(
+ labels: np.ndarray,
+ pred_probs: np.ndarray,
+ *,
+ method: str = "self_confidence",
+ adjust_pred_probs: bool = False,
+ confident_thresholds: Optional[np.ndarray] = None,
+) -> np.ndarray:
+ """Internal implementation of get_label_quality_scores that assumes inputs
+ have already been checked and are valid. This speeds things up.
+ Can also take in pre-computed confident_thresholds to further accelerate things.
+ """
scoring_funcs = {
"self_confidence": get_self_confidence_for_each_label,
"normalized_margin": get_normalized_margin_for_each_label,
"confidence_weighted_entropy": get_confidence_weighted_entropy_for_each_label,
}
-
- # Select scoring function
try:
scoring_func = scoring_funcs[method]
except KeyError:
@@ -137,22 +155,15 @@ class 0, 1, ..., K-1.
Please choose a valid rank_by: self_confidence, normalized_margin, confidence_weighted_entropy
"""
)
-
- # Adjust predicted probabilities
if adjust_pred_probs:
-
- # Check if adjust_pred_probs is supported for the chosen method
if method == "confidence_weighted_entropy":
raise ValueError(f"adjust_pred_probs is not currently supported for {method}.")
+ pred_probs = _subtract_confident_thresholds(
+ labels=labels, pred_probs=pred_probs, confident_thresholds=confident_thresholds
+ )
- pred_probs = _subtract_confident_thresholds(labels, pred_probs)
-
- # Pass keyword arguments for scoring function
- input = {"labels": labels, "pred_probs": pred_probs}
-
- # Calculate scores
- label_quality_scores = scoring_func(**input)
-
+ scoring_inputs = {"labels": labels, "pred_probs": pred_probs}
+ label_quality_scores = scoring_func(**scoring_inputs)
return label_quality_scores
@@ -229,8 +240,6 @@ def get_label_quality_ensemble_scores(
get_label_quality_scores
"""
- MIN_ALLOWED = 1e-6 # lower-bound clipping threshold to prevents 0 in logs and division
-
# Check pred_probs_list for errors
assert isinstance(
pred_probs_list, list
@@ -259,20 +268,18 @@ def get_label_quality_ensemble_scores(
# This weighting scheme performs search of t in log_loss_search_T_values for "best" log loss
if weight_ensemble_members_by == "log_loss_search":
-
# Initialize variables for log loss search
pred_probs_avg_log_loss_weighted = None
neg_log_loss_weights = None
best_eval_log_loss = float("inf")
for t in log_loss_search_T_values:
-
neg_log_loss_list = []
# pred_probs for each model
for pred_probs in pred_probs_list:
pred_probs_clipped = np.clip(
- pred_probs, a_min=MIN_ALLOWED, a_max=None
+ pred_probs, a_min=CLIPPING_LOWER_BOUND, a_max=None
) # lower-bound clipping threshold to prevents 0 in logs when calculating log loss
pred_probs_clipped /= pred_probs_clipped.sum(axis=1)[:, np.newaxis] # renormalize
@@ -299,7 +306,6 @@ def get_label_quality_ensemble_scores(
scores_list = []
accuracy_list = []
for pred_probs in pred_probs_list:
-
# Calculate scores and accuracy
scores = get_label_quality_scores(
labels=labels,
@@ -349,7 +355,6 @@ def get_label_quality_ensemble_scores(
label_quality_scores = (scores_ensemble * weights).sum(axis=1)
elif weight_ensemble_members_by == "custom":
-
# Check custom_weights for errors
assert (
custom_weights is not None
@@ -499,9 +504,9 @@ def get_self_confidence_for_each_label(
Lower scores indicate more likely mislabeled examples.
"""
- # np.mean is used so that this works for multi-labels (list of lists)
- label_quality_scores = np.array([np.mean(pred_probs[i, l]) for i, l in enumerate(labels)])
- return label_quality_scores
+ # To make this work for multi-label (but it will slow down runtime), replace:
+ # pred_probs[i, l] -> np.mean(pred_probs[i, l])
+ return np.array([pred_probs[i, l] for i, l in enumerate(labels)])
def get_normalized_margin_for_each_label(
@@ -571,15 +576,14 @@ def get_confidence_weighted_entropy_for_each_label(
Lower scores indicate more likely mislabeled examples.
"""
- MIN_ALLOWED = 1e-6 # lower-bound clipping threshold to prevents 0 in logs and division
self_confidence = get_self_confidence_for_each_label(labels, pred_probs)
- self_confidence = np.clip(self_confidence, a_min=MIN_ALLOWED, a_max=None)
+ self_confidence = np.clip(self_confidence, a_min=CLIPPING_LOWER_BOUND, a_max=None)
# Divide entropy by self confidence
label_quality_scores = get_normalized_entropy(pred_probs) / self_confidence
# Rescale
- clipped_scores = np.clip(label_quality_scores, a_min=MIN_ALLOWED, a_max=None)
+ clipped_scores = np.clip(label_quality_scores, a_min=CLIPPING_LOWER_BOUND, a_max=None)
label_quality_scores = np.log(label_quality_scores + 1) / clipped_scores
return label_quality_scores
diff --git a/cleanlab/regression/__init__.py b/cleanlab/regression/__init__.py
new file mode 100644
index 0000000000..9928af71eb
--- /dev/null
+++ b/cleanlab/regression/__init__.py
@@ -0,0 +1,2 @@
+from . import rank
+from . import learn
diff --git a/cleanlab/regression/learn.py b/cleanlab/regression/learn.py
new file mode 100644
index 0000000000..d026741f9a
--- /dev/null
+++ b/cleanlab/regression/learn.py
@@ -0,0 +1,880 @@
+# Copyright (C) 2017-2023 Cleanlab Inc.
+# This file is part of cleanlab.
+#
+# cleanlab is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published
+# by the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# cleanlab is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with cleanlab. If not, see <https://www.gnu.org/licenses/>.
+
+"""
+cleanlab can be used for learning with noisy data for any dataset and regression model.
+
+For regression tasks, the :py:class:`regression.learn.CleanLearning `
+class wraps any instance of an sklearn model to allow you to train more robust regression models,
+or use the model to identify corrupted values in the dataset.
+The wrapped model must adhere to the `sklearn estimator API
+<https://scikit-learn.org/stable/developers/develop.html>`_,
+meaning it must define three functions:
+
+* ``model.fit(X, y, sample_weight=None)``
+* ``model.predict(X)``
+* ``model.score(X, y, sample_weight=None)``
+
+where ``X`` contains the data (i.e. features, covariates, independent variables) and ``y`` contains the target
+value (i.e. label, response/dependent variable). The first index of ``X`` and of ``y`` should correspond to the different
+examples in the dataset, such that ``len(X) = len(y) = N`` (sample-size).
+
+Your model should be correctly clonable via
+`sklearn.base.clone <https://scikit-learn.org/stable/modules/generated/sklearn.base.clone.html>`_:
+cleanlab internally creates multiple instances of the model, and if you e.g. manually wrap a
+PyTorch model, ensure that every call to the estimator's ``__init__()`` creates an independent
+instance of the model (for sklearn compatibility, the weights of neural network models should typically
+be initialized inside of ``clf.fit()``).
+
+Example
+-------
+>>> from cleanlab.regression.learn import CleanLearning
+>>> from sklearn.linear_model import LinearRegression
+>>> cl = CleanLearning(model=LinearRegression()) # Pass in any regression model.
+>>> cl.fit(X, y_with_noise)
+>>> # Estimate the predictions as if you had trained without label issues.
+>>> predictions = cl.predict(X)
+
+If your model is not sklearn-compatible by default, it might be the case that standard packages can adapt
+the model. For example, you can adapt PyTorch models using `skorch <https://skorch.readthedocs.io/>`_
+and adapt Keras models using `SciKeras <https://www.adriangb.com/scikeras/>`_.
+
+If an adapter doesn't already exist, you can manually wrap your
+model to be sklearn-compatible. This is made easy by inheriting from
+`sklearn.base.BaseEstimator
+<https://scikit-learn.org/stable/modules/generated/sklearn.base.BaseEstimator.html>`_:
+
+.. code:: python
+
+ from sklearn.base import BaseEstimator
+
+ class YourModel(BaseEstimator):
+ def __init__(self, ):
+ pass
+ def fit(self, X, y):
+ pass
+ def predict(self, X):
+ pass
+ def score(self, X, y):
+ pass
+
+"""
+
+from typing import Optional, Union, Tuple
+import inspect
+import warnings
+
+import math
+import numpy as np
+import pandas as pd
+
+import sklearn.base
+from sklearn.base import BaseEstimator
+from sklearn.model_selection import KFold
+from sklearn.linear_model import LinearRegression
+from sklearn.metrics import r2_score
+
+from cleanlab.typing import LabelLike
+from cleanlab.internal.constants import TINY_VALUE
+from cleanlab.internal.util import train_val_split, subset_X_y
+from cleanlab.internal.regression_utils import assert_valid_regression_inputs
+from cleanlab.internal.validation import labels_to_array
+
+
+class CleanLearning(BaseEstimator):
+ """
+ CleanLearning = Machine Learning with cleaned data (even when training on messy, error-ridden data).
+
+ Automated and robust learning with noisy labels using any dataset and any regression model.
+ For regression tasks, this class trains a ``model`` with error-prone, noisy labels
+ as if the model had been instead trained on a dataset with perfect labels.
+ It achieves this by estimating which labels are noisy (you might solely use CleanLearning for this estimation)
+ and then removing examples estimated to have noisy labels, such that a more robust copy of the same model can be
+ trained on the remaining clean data.
+
+ Parameters
+ ----------
+ model :
+ Any regression model implementing the `sklearn estimator API <https://scikit-learn.org/stable/developers/develop.html>`_,
+ defining the following functions:
+
+ - ``model.fit(X, y)``
+ - ``model.predict(X)``
+ - ``model.score(X, y)``
+
+ Default model used is `sklearn.linear_model.LinearRegression
+ <https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html>`_.
+
+ cv_n_folds :
+ This class needs holdout predictions for every data example and if not provided,
+ uses cross-validation to compute them. This argument sets the number of cross-validation
+ folds used to compute out-of-sample predictions for each example in ``X``. Default is 5.
+ Larger values may produce better results, but require longer to run.
+
+ n_boot :
+ Number of bootstrap resampling rounds used to estimate the model's epistemic uncertainty.
+ Default is 5. Larger values are expected to produce better results but require longer runtimes.
+
+ include_aleatoric_uncertainty :
+ Specifies if the aleatoric uncertainty should be estimated during label error detection.
+ ``True`` by default, which is expected to produce better results but requires longer runtimes.
+
+ verbose :
+ Controls how much output is printed. Set to ``False`` to suppress print statements. Default `False`.
+
+ seed :
+ Set the default state of the random number generator used to split
+ the data. By default, uses ``np.random``'s current random state.
+ """
+
+ def __init__(
+ self,
+ model: Optional[BaseEstimator] = None,
+ *,
+ cv_n_folds: int = 5,
+ n_boot: int = 5,
+ include_aleatoric_uncertainty: bool = True,
+ verbose: bool = False,
+ seed: Optional[int] = None,
+ ):
+ if model is None:
+ # Use linear regression if no model is provided.
+ model = LinearRegression()
+
+ # Make sure the given regression model has the appropriate methods defined.
+ if not hasattr(model, "fit"):
+ raise ValueError("The model must define a .fit() method.")
+ if not hasattr(model, "predict"):
+ raise ValueError("The model must define a .predict() method.")
+
+ if seed is not None:
+ np.random.seed(seed=seed)
+
+ if n_boot < 0:
+ raise ValueError("n_boot cannot be a negative value")
+ if cv_n_folds < 2:
+ raise ValueError("cv_n_folds must be at least 2")
+
+ self.model: BaseEstimator = model
+ self.seed: Optional[int] = seed
+ self.cv_n_folds: int = cv_n_folds
+ self.n_boot: int = n_boot
+ self.include_aleatoric_uncertainty: bool = include_aleatoric_uncertainty
+ self.verbose: bool = verbose
+ self.label_issues_df: Optional[pd.DataFrame] = None
+ self.label_issues_mask: Optional[np.ndarray] = None
+ self.k: Optional[float] = None # frac flagged as issue
+
+ def fit(
+ self,
+ X: Union[np.ndarray, pd.DataFrame],
+ y: LabelLike,
+ *,
+ label_issues: Optional[Union[pd.DataFrame, np.ndarray]] = None,
+ sample_weight: Optional[np.ndarray] = None,
+ find_label_issues_kwargs: Optional[dict] = None,
+ model_kwargs: Optional[dict] = None,
+ model_final_kwargs: Optional[dict] = None,
+ ) -> BaseEstimator:
+ """
+ Train regression ``model`` with error-prone, noisy labels as if the model had been instead trained
+ on a dataset with the correct labels. ``fit`` achieves this by first training ``model`` via
+ cross-validation on the noisy data, using the resulting out-of-sample predictions to identify label issues,
+ pruning the data with label issues, and finally training ``model`` on the remaining clean data.
+
+ Parameters
+ ----------
+ X :
+ Data features (i.e. covariates, independent variables), typically an array of shape ``(N, ...)``,
+ where N is the number of examples (sample-size).
+ Your ``model`` must be able to ``fit()`` and ``predict()`` data of this format.
+
+ y :
+ An array of shape ``(N,)`` of noisy labels (i.e. target/response/dependent variable), where some values may be erroneous.
+
+ label_issues :
+ Optional already-identified label issues in the dataset (if previously estimated).
+ Specify this to avoid re-estimating the label issues if already done.
+ If ``pd.DataFrame``, must be formatted as the one returned by:
+ :py:meth:`self.find_label_issues ` or
+ :py:meth:`self.get_label_issues `. The DataFrame must
+ have a column named ``is_label_issue``.
+
+ If ``np.ndarray``, the input must be a boolean mask of length ``N`` where examples that have label issues
+ have the value ``True``, and the rest of the examples have the value ``False``.
+
+ sample_weight :
+ Optional array of weights with shape ``(N,)`` that are assigned to individual samples. Specifies how to weight the examples in
+ the loss function while training.
+
+ find_label_issues_kwargs:
+ Optional keyword arguments to pass into :py:meth:`self.find_label_issues `.
+
+ model_kwargs :
+ Optional keyword arguments to pass into model's ``fit()`` method.
+
+ model_final_kwargs :
+ Optional extra keyword arguments to pass into the final model's ``fit()`` on the cleaned data,
+ but not the ``fit()`` in each fold of cross-validation on the noisy data.
+ The final ``fit()`` will also receive the arguments in `model_kwargs`, but these may be overwritten
+ by values in `model_final_kwargs`. This can be useful for training differently in the final ``fit()``
+ than during cross-validation.
+
+ Returns
+ -------
+ self : CleanLearning
+ Fitted estimator that has all the same methods as any sklearn estimator.
+
+ After calling ``self.fit()``, this estimator also stores extra attributes such as:
+
+ - ``self.label_issues_df``: a ``pd.DataFrame`` containing label quality scores, boolean flags
+ indicating which examples have label issues, and predicted label values for each example.
+ Accessible via :py:meth:`self.get_label_issues `,
+ of similar format as the one returned by :py:meth:`self.find_label_issues `.
+ See documentation of :py:meth:`self.find_label_issues `
+ for column descriptions.
+ - ``self.label_issues_mask``: a ``np.ndarray`` boolean mask indicating if a particular
+ example has been identified to have issues.
+ """
+ assert_valid_regression_inputs(X, y)
+
+ if find_label_issues_kwargs is None:
+ find_label_issues_kwargs = {}
+ if model_kwargs is None:
+ model_kwargs = {}
+ if model_final_kwargs is None:
+ model_final_kwargs = {}
+ model_final_kwargs = {**model_kwargs, **model_final_kwargs}
+
+ if "sample_weight" in model_kwargs or "sample_weight" in model_final_kwargs:
+ raise ValueError(
+ "sample_weight should be provided directly in fit() rather than in model_kwargs or model_final_kwargs"
+ )
+
+ if sample_weight is not None:
+ if "sample_weight" not in inspect.getfullargspec(self.model.fit).args:
+ raise ValueError(
+ "sample_weight must be a supported fit() argument for your model in order to be specified here"
+ )
+ if len(sample_weight) != len(X):
+ raise ValueError("sample_weight must be a 1D array that has the same length as y.")
+
+ if label_issues is None:
+ if self.label_issues_df is not None and self.verbose:
+ print(
+ "If you already ran self.find_label_issues() and don't want to recompute, you "
+ "should pass the label_issues in as a parameter to this function next time."
+ )
+
+ label_issues = self.find_label_issues(
+ X,
+ y,
+ model_kwargs=model_kwargs,
+ **find_label_issues_kwargs,
+ )
+ else:
+ if self.verbose:
+ print("Using provided label_issues instead of finding label issues.")
+ if self.label_issues_df is not None:
+ print(
+ "These will overwrite self.label_issues_df and will be returned by "
+ "`self.get_label_issues()`. "
+ )
+
+ self.label_issues_df = self._process_label_issues_arg(label_issues, y)
+ self.label_issues_mask = self.label_issues_df["is_label_issue"].to_numpy()
+
+ X_mask = np.invert(self.label_issues_mask)
+ X_cleaned, y_cleaned = subset_X_y(X, y, X_mask)
+ if self.verbose:
+ print(f"Pruning {np.sum(self.label_issues_mask)} examples with label issues ...")
+ print(f"Remaining clean data has {len(y_cleaned)} examples.")
+
+ if sample_weight is not None:
+ model_final_kwargs["sample_weight"] = sample_weight[X_mask]
+ if self.verbose:
+ print("Fitting final model on the clean data with custom sample_weight ...")
+ else:
+ if self.verbose:
+ print("Fitting final model on the clean data ...")
+
+ self.model.fit(X_cleaned, y_cleaned, **model_final_kwargs)
+
+ if self.verbose:
+ print(
+ "Label issues stored in label_issues_df DataFrame accessible via: self.get_label_issues(). "
+ "Call self.save_space() to delete this potentially large DataFrame attribute."
+ )
+ return self
+
+ def predict(self, X: np.ndarray, *args, **kwargs) -> np.ndarray:
+ """
+ Predict target values using your wrapped regression model.
+ Works just like ``model.predict()``.
+
+ Parameters
+ ----------
+ X : np.ndarray or DatasetLike
+ Test data in the same format expected by your wrapped regression model.
+
+ Returns
+ -------
+ predictions : np.ndarray
+ Predictions for the test examples.
+ """
+ return self.model.predict(X, *args, **kwargs)
+
+ def score(
+ self,
+ X: Union[np.ndarray, pd.DataFrame],
+ y: LabelLike,
+ sample_weight: Optional[np.ndarray] = None,
+ ) -> float:
+ """Evaluates your wrapped regression model's score on a test set `X` with target values `y`.
+ Uses your model's default scoring function, or r-squared score if your model has no ``"score"`` attribute.
+
+ Parameters
+ ----------
+ X :
+ Test data in the same format expected by your wrapped model.
+
+ y :
+ Test labels in the same format as labels previously used in ``fit()``.
+
+ sample_weight :
+ Optional array of shape ``(N,)`` or ``(N, 1)`` used to weight each test example when computing the score.
+
+ Returns
+ -------
+ score : float
+ Number quantifying the performance of this regression model on the test data.
+ """
+ if hasattr(self.model, "score"):
+ if "sample_weight" in inspect.getfullargspec(self.model.score).args:
+ return self.model.score(X, y, sample_weight=sample_weight)
+ else:
+ return self.model.score(X, y)
+ else:
+ return r2_score(
+ y,
+ self.model.predict(X),
+ sample_weight=sample_weight,
+ )
+
+ def find_label_issues(
+ self,
+ X: Union[np.ndarray, pd.DataFrame],
+ y: LabelLike,
+ *,
+ uncertainty: Optional[Union[np.ndarray, float]] = None,
+ coarse_search_range: list = [0.01, 0.05, 0.1, 0.15, 0.2],
+ fine_search_size: int = 3,
+ save_space: bool = False,
+ model_kwargs: Optional[dict] = None,
+ ) -> pd.DataFrame:
+ """
+ Identifies potential label issues (corrupted `y`-values) in the dataset, and estimates how noisy each label is.
+
+ Note: this method estimates the label issues from scratch. To access previously-estimated label issues from
+ this :py:class:`CleanLearning ` instance, use the
+ :py:meth:`self.get_label_issues ` method.
+
+ This is the method called to find label issues inside
+ :py:meth:`CleanLearning.fit() `
+ and they share mostly the same parameters.
+
+ Parameters
+ ----------
+ X :
+ Data features (i.e. covariates, independent variables), typically an array of shape ``(N, ...)``,
+ where N is the number of examples (sample-size).
+ Your ``model`` must be able to ``fit()`` and ``predict()`` data of this format.
+
+ y :
+ An array of shape ``(N,)`` of noisy labels (i.e. target/response/dependent variable), where some values may be erroneous.
+
+ uncertainty :
+ Optional estimated uncertainty for each example. Should be passed in as a float (constant uncertainty throughout all examples),
+ or a numpy array of length ``N`` (estimated uncertainty for each example).
+ If not provided, this method will estimate the uncertainty as the sum of the epistemic and aleatoric uncertainty.
+
+ save_space :
+ If True, then returned ``label_issues_df`` will not be stored as an attribute.
+ This means some other methods like :py:meth:`self.get_label_issues ` will no longer work.
+
+ coarse_search_range :
+ The coarse search range to find the value of ``k``, which estimates the fraction of data which have label issues.
+ More values represent a more thorough search (better expected results but longer runtimes).
+
+ fine_search_size :
+ Size of fine-grained search grid to find the value of ``k``, which represents our estimate of the fraction of data which have label issues.
+ A higher number represents a more thorough search (better expected results but longer runtimes).
+
+
+ For info about the **other parameters**, see the docstring of :py:meth:`CleanLearning.fit()
+ `.
+
+ Returns
+ -------
+ label_issues_df : pd.DataFrame
+ DataFrame with info about label issues for each example.
+ Unless `save_space` argument is specified, same DataFrame is also stored as `self.label_issues_df` attribute accessible via
+ :py:meth:`get_label_issues`.
+
+ Each row represents an example from our dataset and the DataFrame may contain the following columns:
+
+ - *is_label_issue*: boolean mask for the entire dataset where ``True`` represents a label issue and ``False`` represents an example
+ that is accurately labeled with high confidence.
+ - *label_quality*: Numeric score that measures the quality of each label (how likely it is to be correct,
+ with lower scores indicating potentially erroneous labels).
+ - *given_label*: Values originally given for this example (same as `y` input).
+ - *predicted_label*: Values predicted by the trained model.
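+
+ Examples
+ --------
+ A minimal illustrative sketch (synthetic data; the variable names and values here are hypothetical):
+
+ >>> import numpy as np
+ >>> from cleanlab.regression.learn import CleanLearning
+ >>> X = np.random.rand(100, 2)
+ >>> y = X.sum(axis=1) + np.random.normal(scale=0.1, size=100)
+ >>> cl = CleanLearning()
+ >>> issues_df = cl.find_label_issues(X, y) # one row per example in X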
+ """
+
+ X, y = assert_valid_regression_inputs(X, y)
+
+ if model_kwargs is None:
+ model_kwargs = {}
+
+ if self.verbose:
+ print("Identifying label issues ...")
+
+ # compute initial values to find best k
+ initial_predictions = self._get_cv_predictions(X, y, model_kwargs=model_kwargs)
+ initial_residual = initial_predictions - y
+ initial_sorted_index = np.argsort(abs(initial_residual))
+ initial_r2 = r2_score(y, initial_predictions)
+
+ self.k, r2 = self._find_best_k(
+ X=X,
+ y=y,
+ sorted_index=initial_sorted_index,
+ coarse_search_range=coarse_search_range,
+ fine_search_size=fine_search_size,
+ )
+
+ # check if initial r2 score (ie. not removing anything) is the best
+ if initial_r2 >= r2:
+ self.k = 0
+
+ # get predictions using the best k
+ predictions = self._get_cv_predictions(
+ X, y, sorted_index=initial_sorted_index, k=self.k, model_kwargs=model_kwargs
+ )
+ residual = predictions - y
+
+ if uncertainty is None:
+ epistemic_uncertainty = self.get_epistemic_uncertainty(X, y, predictions=predictions)
+ if self.include_aleatoric_uncertainty:
+ aleatoric_uncertainty = self.get_aleatoric_uncertainty(X, residual)
+ else:
+ aleatoric_uncertainty = 0
+ uncertainty = epistemic_uncertainty + aleatoric_uncertainty
+ else:
+ if isinstance(uncertainty, np.ndarray) and len(y) != len(uncertainty):
+ raise ValueError(
+ "If uncertainty is passed in as an array, it must have the same length as y."
+ )
+
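+ # label quality decays exponentially with the residual magnitude, normalized by the
+ # estimated uncertainty (TINY_VALUE guards against division by zero)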
+ label_quality_scores = np.exp(-abs(residual) / (uncertainty + TINY_VALUE))
+
+ label_issues_mask = np.zeros(len(y), dtype=bool)
+ num_issues = math.ceil(len(y) * self.k)
+ issues_index = np.argsort(label_quality_scores)[:num_issues]
+ label_issues_mask[issues_index] = True
+
+ # convert predictions to int if input is int
+ if y.dtype == int:
+ predictions = predictions.astype(int)
+
+ label_issues_df = pd.DataFrame(
+ {
+ "is_label_issue": label_issues_mask,
+ "label_quality": label_quality_scores,
+ "given_label": y,
+ "predicted_label": predictions,
+ }
+ )
+
+ if self.verbose:
+ print(f"Identified {np.sum(label_issues_mask)} examples with label issues.")
+
+ if not save_space:
+ if self.label_issues_df is not None and self.verbose:
+ print(
+ "Overwriting previously identified label issues stored at self.label_issues_df. "
+ "self.get_label_issues() will now return the newly identified label issues. "
+ )
+ self.label_issues_df = label_issues_df
+ self.label_issues_mask = label_issues_df["is_label_issue"].to_numpy()
+ elif self.verbose:
+ print("Not storing label_issues as attributes since save_space was specified.")
+
+ return label_issues_df
+
+ def get_label_issues(self) -> Optional[pd.DataFrame]:
+ """
+ Accessor, returns `label_issues_df` attribute if previously computed.
+ This ``pd.DataFrame`` describes the issues identified for each example (each row corresponds to an example).
+ For column definitions, see the documentation of
+ :py:meth:`CleanLearning.find_label_issues`.
+
+ Returns
+ -------
+ label_issues_df : pd.DataFrame
+ DataFrame with (precomputed) info about the label issues for each example.
+ """
+ if self.label_issues_df is None:
+ warnings.warn(
+ "Label issues have not yet been computed. Run `self.find_label_issues()` or `self.fit()` first."
+ )
+ return self.label_issues_df
+
+ def get_epistemic_uncertainty(
+ self,
+ X: np.ndarray,
+ y: np.ndarray,
+ predictions: Optional[np.ndarray] = None,
+ ) -> np.ndarray:
+ """
+ Compute the epistemic uncertainty of the regression model for each example. This uncertainty is estimated using the bootstrapped
+ variance of the model predictions.
+
+ Parameters
+ ----------
+ X :
+ Data features (i.e. training inputs for ML), typically an array of shape ``(N, ...)``, where N is the number of examples.
+
+ y :
+ An array of shape ``(N,)`` of target values (dependent variables), where some values may be erroneous.
+
+ predictions :
+ Model-predicted values of `y`, used as an extra bootstrap iteration when calculating the variance.
+
+ Returns
+ _______
+ epistemic_uncertainty : np.ndarray
+ The estimated epistemic uncertainty for each example.
+ """
+ X, y = assert_valid_regression_inputs(X, y)
+
+ if self.n_boot == 0: # does not estimate epistemic uncertainty
+ return np.zeros(len(y))
+ else:
+ bootstrap_predictions = np.zeros(shape=(len(y), self.n_boot))
+ for i in range(self.n_boot):
+ bootstrap_predictions[:, i] = self._get_cv_predictions(X, y, cv_n_folds=2)
+
+ # add a set of predictions from model that was already trained
+ if predictions is not None:
+ _, predictions = assert_valid_regression_inputs(X, predictions)
+ bootstrap_predictions = np.hstack(
+ [bootstrap_predictions, predictions.reshape(-1, 1)]
+ )
+
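+ # epistemic uncertainty per example = standard deviation of its predictions
+ # across the bootstrapped models (plus the already-trained model, if provided)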
+ return np.sqrt(np.var(bootstrap_predictions, axis=1))
+
+ def get_aleatoric_uncertainty(
+ self,
+ X: np.ndarray,
+ residual: np.ndarray,
+ ) -> float:
+ """
+ Compute the aleatoric uncertainty of the data. This uncertainty is estimated by predicting the standard deviation
+ of the regression error.
+
+ Parameters
+ ----------
+ X :
+ Data features (i.e. training inputs for ML), typically an array of shape ``(N, ...)``, where N is the number of examples.
+
+ residual :
+ The difference between the given value and the model-predicted value for each example, i.e.
+ `predictions - y`.
+
+ Returns
+ -------
+ aleatoric_uncertainty : float
+ The overall estimated aleatoric uncertainty for this dataset.
+ """
+ X, residual = assert_valid_regression_inputs(X, residual)
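+ # aleatoric uncertainty = standard deviation of out-of-sample predictions of the
+ # residual given the features X (an estimate of the irreducible noise in y)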
+ residual_predictions = self._get_cv_predictions(X, residual)
+ return np.sqrt(np.var(residual_predictions))
+
+ def save_space(self):
+ """
+ Clears non-sklearn attributes of this estimator to save space (in-place).
+ This includes the DataFrame attribute that stored label issues which may be large for big datasets.
+ You may want to call this method before deploying this model (i.e. if you just care about producing predictions).
+ After calling this method, certain non-prediction-related attributes/functionality will no longer be available.
+ """
+ if self.label_issues_df is None and self.verbose:
+ print("self.label_issues_df is already empty")
+
+ self.label_issues_df = None
+ self.label_issues_mask = None
+ self.k = None
+
+ if self.verbose:
+ print("Deleted non-sklearn attributes such as label_issues_df to save space.")
+
+ def _get_cv_predictions(
+ self,
+ X: np.ndarray,
+ y: np.ndarray,
+ sorted_index: Optional[np.ndarray] = None,
+ k: float = 0,
+ *,
+ cv_n_folds: Optional[int] = None,
+ seed: Optional[int] = None,
+ model_kwargs: Optional[dict] = None,
+ ) -> np.ndarray:
+ """
+ Helper method to get out-of-fold predictions using cross validation.
+ This method can also exclude the fraction ``k`` of examples deemed most likely to contain label errors before training the cross-validation models
+ (both ``sorted_index`` and ``k`` have to be provided for this).
+
+ Parameters
+ ----------
+ X :
+ Data features (i.e. training inputs for ML), typically an array of shape ``(N, ...)``, where N is the number of examples.
+
+ y :
+ An array of shape ``(N,)`` of target values (dependent variables), where some values may be erroneous.
+
+ sorted_index :
+ Index of each example sorted by their residuals in ascending order.
+
+ k :
+ The fraction of examples to hold out from the training sets. Usually this is the fraction of examples that are
+ deemed to contain errors.
+
+ """
+ # set to default unless specified otherwise
+ if cv_n_folds is None:
+ cv_n_folds = self.cv_n_folds
+
+ if model_kwargs is None:
+ model_kwargs = {}
+
+ if k < 0 or k > 1:
+ raise ValueError("k must be a value between 0 and 1")
+ elif k == 0:
+ if sorted_index is None:
+ sorted_index = np.array(range(len(y)))
+ in_sample_idx = sorted_index
+ else:
+ if sorted_index is None:
+ # TODO: better error message
+ raise ValueError(
+ "You need to pass in the index sorted by prediction quality to use with k"
+ )
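+ # sorted_index is ordered by residual magnitude (ascending), so the last ceil(N * k)
+ # entries are the examples most likely to contain label errors; they are excluded from
+ # the training folds and later receive averaged out-of-sample predictions instead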
+ num_to_drop = math.ceil(len(sorted_index) * k)
+ in_sample_idx = sorted_index[:-num_to_drop]
+ out_of_sample_idx = sorted_index[-num_to_drop:]
+
+ X_out_of_sample = X[out_of_sample_idx]
+ out_of_sample_predictions = np.zeros(shape=[len(out_of_sample_idx), cv_n_folds])
+
+ if len(in_sample_idx) < cv_n_folds:
+ raise ValueError(
+ f"There are too few examples to conduct {cv_n_folds}-fold cross validation. "
+ "You can either reduce cv_n_folds for cross validation, or decrease k to exclude less data."
+ )
+
+ predictions = np.zeros(shape=len(y))
+
+ kf = KFold(n_splits=cv_n_folds, shuffle=True, random_state=seed)
+
+ for k_split, (cv_train_idx, cv_holdout_idx) in enumerate(kf.split(in_sample_idx)):
+ try:
+ model_copy = sklearn.base.clone(self.model) # fresh untrained copy of the model
+ except Exception:
+ raise ValueError(
+ "`model` must be clonable via: sklearn.base.clone(model). "
+ "You can either implement instance method `model.get_params()` to produce a fresh untrained copy of this model, "
+ "or you can implement the cross-validation outside of cleanlab "
+ "and pass in the obtained `pred_probs` to skip cleanlab's internal cross-validation"
+ )
+
+ # map the index to the actual index in the original dataset
+ data_idx_train, data_idx_holdout = (
+ in_sample_idx[cv_train_idx],
+ in_sample_idx[cv_holdout_idx],
+ )
+
+ X_train_cv, X_holdout_cv, y_train_cv, y_holdout_cv = train_val_split(
+ X, y, data_idx_train, data_idx_holdout
+ )
+
+ model_copy.fit(X_train_cv, y_train_cv, **model_kwargs)
+ predictions_cv = model_copy.predict(X_holdout_cv)
+
+ predictions[data_idx_holdout] = predictions_cv
+
+ if k != 0:
+ out_of_sample_predictions[:, k_split] = model_copy.predict(X_out_of_sample)
+
+ if k != 0:
+ out_of_sample_predictions_avg = np.mean(out_of_sample_predictions, axis=1)
+ predictions[out_of_sample_idx] = out_of_sample_predictions_avg
+
+ return predictions
+
+ def _find_best_k(
+ self,
+ X: np.ndarray,
+ y: np.ndarray,
+ sorted_index: np.ndarray,
+ coarse_search_range: list = [0.01, 0.05, 0.1, 0.15, 0.2],
+ fine_search_size: int = 3,
+ ) -> Tuple[float, float]:
+ """
+ Helper method that conducts a coarse and fine grained grid search to determine the best value
+ of k, the fraction of the dataset that contains issues.
+
+ Returns a tuple containing the best value of k (i.e. the one that achieves the best r-squared score),
+ and the corresponding r-squared score obtained when dropping that fraction of the data.
+ """
+ if len(coarse_search_range) == 0:
+ raise ValueError("coarse_search_range must have at least 1 value of k")
+ elif len(coarse_search_range) == 1:
+ curr_k = coarse_search_range[0]
+ num_examples_kept = math.floor(len(y) * (1 - curr_k))
+ if num_examples_kept < self.cv_n_folds:
+ raise ValueError(
+ f"There are too few examples to conduct {self.cv_n_folds}-fold cross validation. "
+ "You can either reduce self.cv_n_folds for cross validation, or decrease k to exclude less data."
+ )
+ predictions = self._get_cv_predictions(
+ X=X,
+ y=y,
+ sorted_index=sorted_index,
+ k=curr_k,
+ )
+ best_r2 = r2_score(y, predictions)
+ best_k = coarse_search_range[0]
+ else:
+ # conduct coarse search
+ coarse_search_range = sorted(coarse_search_range) # sort to conduct fine search well
+ r2_coarse = np.full(len(coarse_search_range), np.NaN)
+ for i in range(len(coarse_search_range)):
+ curr_k = coarse_search_range[i]
+ num_examples_kept = math.floor(len(y) * (1 - curr_k))
+ # check if there are too few examples to do cross val
+ if num_examples_kept < self.cv_n_folds:
+ r2_coarse[i] = -1e30 # arbitrary large negative number
+ else:
+ predictions = self._get_cv_predictions(
+ X=X,
+ y=y,
+ sorted_index=sorted_index,
+ k=curr_k,
+ )
+ r2_coarse[i] = r2_score(y, predictions)
+
+ max_r2_ind = np.argmax(r2_coarse)
+
+ # conduct fine search
+ if fine_search_size < 0:
+ raise ValueError("fine_search_size must at least 0")
+ elif fine_search_size == 0:
+ best_k = coarse_search_range[np.argmax(r2_coarse)]
+ best_r2 = np.max(r2_coarse)
+ else:
+ fine_search_range = np.array([])
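+ # build a finer grid of candidate k values around the best coarse candidate,
+ # extending toward its neighbors in coarse_search_range (the coarse points themselves are excluded to avoid re-evaluation)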
+ if max_r2_ind != 0:
+ fine_search_range = np.append(
+ np.linspace(
+ coarse_search_range[max_r2_ind - 1],
+ coarse_search_range[max_r2_ind],
+ fine_search_size + 1,
+ endpoint=False,
+ )[1:],
+ fine_search_range,
+ )
+ if max_r2_ind != len(coarse_search_range) - 1:
+ fine_search_range = np.append(
+ fine_search_range,
+ np.linspace(
+ coarse_search_range[max_r2_ind],
+ coarse_search_range[max_r2_ind + 1],
+ fine_search_size + 1,
+ endpoint=False,
+ )[1:],
+ )
+
+ r2_fine = np.full(len(fine_search_range), np.NaN)
+ for i in range(len(fine_search_range)):
+ curr_k = fine_search_range[i]
+ num_examples_kept = math.floor(len(y) * (1 - curr_k))
+ # check if there are too few examples to do cross val
+ if num_examples_kept < self.cv_n_folds:
+ r2_fine[i] = -1e30 # arbitrary large negative number
+ else:
+ predictions = self._get_cv_predictions(
+ X=X,
+ y=y,
+ sorted_index=sorted_index,
+ k=curr_k,
+ )
+ r2_fine[i] = r2_score(y, predictions)
+
+ # check the max between coarse and fine search
+ if max(r2_coarse) > max(r2_fine):
+ best_k = coarse_search_range[np.argmax(r2_coarse)]
+ best_r2 = np.max(r2_coarse)
+ else:
+ best_k = fine_search_range[np.argmax(r2_fine)]
+ best_r2 = np.max(r2_fine)
+
+ return best_k, best_r2
+
+ def _process_label_issues_arg(
+ self,
+ label_issues: Union[pd.DataFrame, pd.Series, np.ndarray],
+ y: LabelLike,
+ ) -> pd.DataFrame:
+ """
+ Helper method to process the label_issues input into a well-formatted DataFrame.
+ """
+ y = labels_to_array(y)
+
+ if isinstance(label_issues, pd.DataFrame):
+ if "is_label_issue" not in label_issues.columns:
+ raise ValueError(
+ "DataFrame label_issues must contain column: 'is_label_issue'. "
+ "See CleanLearning.fit() documentation for label_issues column descriptions."
+ )
+ if len(label_issues) != len(y):
+ raise ValueError("label_issues and labels must have same length")
+ if "given_label" in label_issues.columns and np.any(
+ label_issues["given_label"].to_numpy() != y
+ ):
+ raise ValueError("labels must match label_issues['given_label']")
+ return label_issues
+
+ elif isinstance(label_issues, (pd.Series, np.ndarray)):
+ if label_issues.dtype is not np.dtype("bool"):
+ raise ValueError("If label_issues is numpy.array, dtype must be 'bool'.")
+ if label_issues.shape != y.shape:
+ raise ValueError("label_issues must have same shape as labels")
+ return pd.DataFrame({"is_label_issue": label_issues, "given_label": y})
+
+ else:
+ raise ValueError(
+ "label_issues must be either pandas.DataFrame, pandas.Series or numpy.ndarray"
+ )
diff --git a/cleanlab/regression/rank.py b/cleanlab/regression/rank.py
new file mode 100644
index 0000000000..d73efdfba2
--- /dev/null
+++ b/cleanlab/regression/rank.py
@@ -0,0 +1,186 @@
+# Copyright (C) 2017-2023 Cleanlab Inc.
+# This file is part of cleanlab.
+#
+# cleanlab is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published
+# by the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# cleanlab is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with cleanlab. If not, see <https://www.gnu.org/licenses/>.
+
+
+"""
+Methods to score the quality of each label in a regression dataset. These can be used to rank the examples whose Y-value is most likely erroneous.
+
+Note: Label quality scores are most accurate when they are computed based on out-of-sample `predictions` from your regression model.
+To obtain out-of-sample predictions for every datapoint in your dataset, you can use :ref:`cross-validation `. This is encouraged to get better results.
+
+If you have a sklearn-compatible regression model, consider using `cleanlab.regression.learn.CleanLearning` instead, which can more accurately identify noisy label values.
+"""
+
+from typing import Dict, Callable
+import numpy as np
+from numpy.typing import ArrayLike
+from sklearn.neighbors import NearestNeighbors
+
+from cleanlab.outlier import OutOfDistribution
+from cleanlab.internal.regression_utils import assert_valid_prediction_inputs
+
+from cleanlab.internal.constants import TINY_VALUE
+
+
+def get_label_quality_scores(
+ labels: ArrayLike,
+ predictions: ArrayLike,
+ *,
+ method: str = "outre",
+) -> np.ndarray:
+ """
+ Returns label quality score for each example in the regression dataset.
+
+ Each score is a continuous value in the range [0,1]:
+
+ * 1 - clean label (given label is likely correct).
+ * 0 - dirty label (given label is likely incorrect).
+
+ Parameters
+ ----------
+ labels : array_like
+ Raw labels from original dataset.
+ 1D array of shape ``(N, )`` containing the given labels for each example (aka. Y-value, response/target/dependent variable), where N is number of examples in the dataset.
+
+ predictions : np.ndarray
+ 1D array of shape ``(N,)`` containing the predicted label for each example in the dataset. These should be out-of-sample predictions from a trained regression model, which you can obtain for every example in your dataset via :ref:`cross-validation `.
+
+ method : {"residual", "outre"}, default="outre"
+ String specifying which method to use for scoring the quality of each label and identifying which labels appear most noisy.
+
+ Returns
+ -------
+ label_quality_scores:
+ Array of shape ``(N, )`` of scores between 0 and 1, one per example in the dataset.
+
+ Lower scores indicate examples more likely to contain a label issue.
+
+ Examples
+ --------
+ >>> import numpy as np
+ >>> from cleanlab.regression.rank import get_label_quality_scores
+ >>> labels = np.array([1,2,3,4])
+ >>> predictions = np.array([2,2,5,4.1])
+ >>> label_quality_scores = get_label_quality_scores(labels, predictions)
+ >>> label_quality_scores
+ array([0.00323821, 0.33692597, 0.00191686, 0.33692597])
+ """
+
+ # Check if inputs are valid
+ labels, predictions = assert_valid_prediction_inputs(
+ labels=labels, predictions=predictions, method=method
+ )
+
+ scoring_funcs: Dict[str, Callable[[np.ndarray, np.ndarray], np.ndarray]] = {
+ "residual": _get_residual_score_for_each_label,
+ "outre": _get_outre_score_for_each_label,
+ }
+
+ scoring_func = scoring_funcs.get(method, None)
+ if not scoring_func:
+ raise ValueError(
+ f"""
+ {method} is not a valid scoring method.
+ Please choose a valid scoring technique: {scoring_funcs.keys()}.
+ """
+ )
+
+ # Calculate scores
+ label_quality_scores = scoring_func(labels, predictions)
+ return label_quality_scores
+
+
+def _get_residual_score_for_each_label(
+ labels: np.ndarray,
+ predictions: np.ndarray,
+) -> np.ndarray:
+ """Returns a residual label-quality score for each example.
+
+ This function computes label-quality scores for regression datasets,
+ where a lower score indicates labels that are less likely to be correct.
+
+ Residual-based scores can work better for datasets whose independent variables
+ follow a normal distribution.
+
+ Parameters
+ ----------
+ labels: np.ndarray
+ Labels in the same format expected by the :py:func:`get_label_quality_scores ` function.
+
+ predictions: np.ndarray
+ Predicted labels in the same format expected by the :py:func:`get_label_quality_scores ` function.
+
+ Returns
+ -------
+ label_quality_scores: np.ndarray
+ Contains one score (between 0 and 1) per example.
+ Lower scores indicate more likely mislabeled examples.
+
+ """
+ residual = predictions - labels
+ label_quality_scores = np.exp(-abs(residual))
+ return label_quality_scores
+
+
+def _get_outre_score_for_each_label(
+ labels: np.ndarray,
+ predictions: np.ndarray,
+ *,
+ residual_scale: float = 5,
+ frac_neighbors: float = 0.5,
+ neighbor_metric: str = "euclidean",
+) -> np.ndarray:
+ """Returns OUTRE based label-quality scores.
+
+ This function computes label-quality scores for regression datasets,
+ where a lower score indicates labels that are less likely to be correct.
+
+ Parameters
+ ----------
+ labels: np.ndarray
+ Labels in the same format as expected by the :py:func:`get_label_quality_scores ` function.
+
+ predictions: np.ndarray
+ Predicted labels in the same format as expected by the :py:func:`get_label_quality_scores ` function.
+
+ residual_scale: float, default = 5
+ Multiplicative factor to adjust scale (standard deviation) of the residuals relative to the labels.
+
+ frac_neighbors: float, default = 0.5
+ Fraction of examples in dataset that should be considered as `n_neighbors` in the ``NearestNeighbors`` object used internally to assess outliers.
+
+ neighbor_metric: str, default = "euclidean"
+ This parameter is passed to sklearn's ``NearestNeighbors``. # TODO add reference to sklearn.NearestNeighbors?
+
+ Returns
+ -------
+ label_quality_scores: np.ndarray
+ Contains one score (between 0 and 1) per example.
+ Lower scores indicate more likely mislabeled examples.
+ """
+ residual = predictions - labels
+ labels = (labels - labels.mean()) / (labels.std() + TINY_VALUE)
+ residual = residual_scale * ((residual - residual.mean()) / (residual.std() + TINY_VALUE))
+
+ # 2D features by combining labels and residual
+ features = np.array([labels, residual]).T
+
+ neighbors = int(np.ceil(frac_neighbors * labels.shape[0]))
+ knn = NearestNeighbors(n_neighbors=neighbors, metric=neighbor_metric).fit(features)
+ ood = OutOfDistribution(params={"knn": knn})
+
+ label_quality_scores = ood.score(features=features)
+ return label_quality_scores
diff --git a/cleanlab/segmentation/__init__.py b/cleanlab/segmentation/__init__.py
new file mode 100644
index 0000000000..fbc2eb7eac
--- /dev/null
+++ b/cleanlab/segmentation/__init__.py
@@ -0,0 +1,3 @@
+from . import rank
+from . import filter
+from . import summary
diff --git a/cleanlab/segmentation/filter.py b/cleanlab/segmentation/filter.py
new file mode 100644
index 0000000000..9c93409dfd
--- /dev/null
+++ b/cleanlab/segmentation/filter.py
@@ -0,0 +1,168 @@
+# Copyright (C) 2017-2023 Cleanlab Inc.
+# This file is part of cleanlab.
+#
+# cleanlab is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published
+# by the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# cleanlab is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with cleanlab. If not, see <https://www.gnu.org/licenses/>.
+
+"""
+Methods to find label issues in image semantic segmentation datasets, where each pixel in an image receives its own class label.
+
+"""
+
+from cleanlab.experimental.label_issues_batched import find_label_issues_batched
+import numpy as np
+from typing import Tuple, Optional
+
+from cleanlab.internal.segmentation_utils import _get_valid_optional_params, _check_input
+
+
+def find_label_issues(
+ labels: np.ndarray,
+ pred_probs: np.ndarray,
+ *,
+ batch_size: Optional[int] = None,
+ n_jobs: Optional[int] = None,
+ verbose: bool = True,
+ **kwargs,
+) -> np.ndarray:
+ """
+ Returns a boolean mask for the entire dataset, with one entry per pixel, where ``True`` indicates
+ a pixel whose label is likely erroneous and ``False`` indicates a correctly labeled pixel.
+
+ * N - Number of images in the dataset
+ * K - Number of classes in the dataset
+ * H - Height of each image
+ * W - Width of each image
+
+ Tip: if you encounter the error "pred_probs is not defined", try setting ``n_jobs=1``.
+
+ Parameters
+ ----------
+ labels:
+ A discrete array of shape ``(N,H,W,)`` of noisy labels for a semantic segmentation dataset, i.e. some labels may be erroneous.
+ *Format requirements*: for a dataset with K classes, each pixel must be labeled using an integer in 0, 1, ..., K-1.
+ Tip: If your labels are one-hot encoded, you can obtain properly formatted `labels` via ``labels = np.argmax(labels_one_hot, axis=1)``, assuming that `labels_one_hot` has dimension ``(N,K,H,W)``.
+
+ pred_probs:
+ An array of shape ``(N,K,H,W,)`` of model-predicted class probabilities,
+ ``P(label=k|x)`` for each pixel ``x``. The prediction for each pixel is an array corresponding to the estimated likelihood that this pixel belongs to each of the ``K`` classes. The 2nd dimension of `pred_probs` must be ordered such that these probabilities correspond to class 0, 1, ..., K-1.
+
+ batch_size:
+ Optional size of image mini-batches used for computing the label issues in a streaming fashion (does not affect results, just the runtime and memory requirements).
+ To maximize efficiency, try to use the largest `batch_size` your memory allows. If not provided, a good default is used.
+
+ n_jobs:
+ Optional number of processes for multiprocessing (default value = 1). Only used on Linux.
+ If `n_jobs=None`, the number of physical cores will be used if psutil is installed, or the number of logical cores otherwise.
+
+ verbose:
+ Set to ``False`` to suppress all print statements.
+
+ **kwargs:
+ * downsample: int,
+ Optional factor to shrink labels and pred_probs by. Default ``1``.
+ The height and width of `labels` and `pred_probs` must both be divisible by this factor. Larger values of `downsample` produce faster runtimes but potentially less accurate results due to over-compression. Set to 1 to avoid any downsampling.
+
+ Returns
+ -------
+ label_issues: np.ndarray
+ Returns a boolean **mask** for the entire dataset of shape ``(N,H,W)``
+ where ``True`` represents a pixel label issue and ``False`` represents an example that is correctly labeled.
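+
+ Examples
+ --------
+ A minimal illustrative sketch (random data; shapes follow the format described above, values are hypothetical):
+
+ >>> import numpy as np
+ >>> from cleanlab.segmentation.filter import find_label_issues
+ >>> N, K, H, W = 2, 3, 8, 8
+ >>> labels = np.random.randint(0, K, size=(N, H, W))
+ >>> pred_probs = np.random.rand(N, K, H, W)
+ >>> pred_probs /= pred_probs.sum(axis=1, keepdims=True) # renormalize over classes
+ >>> issues = find_label_issues(labels, pred_probs, verbose=False) # boolean mask of shape (N, H, W)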
+ """
+ batch_size, n_jobs = _get_valid_optional_params(batch_size, n_jobs)
+ downsample = kwargs.get("downsample", 1)
+
+ def downsample_arrays(
+ labels: np.ndarray, pred_probs: np.ndarray, factor: int = 1
+ ) -> Tuple[np.ndarray, np.ndarray]:
+ if factor == 1:
+ return labels, pred_probs
+
+ num_image, num_classes, h, w = pred_probs.shape
+
+ # Check if possible to downsample
+ if h % downsample != 0 or w % downsample != 0:
+ raise ValueError(
+ f"Height {h} and width {w} not divisible by downsample value of {downsample}. Set kwarg downsample to 1 to avoid downsampling."
+ )
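+ # block-average labels over factor x factor patches and round back to the nearest class index;
+ # pred_probs are block-averaged per class and renormalized below so class probabilities still sum to 1 at each pixel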
+ small_labels = np.round(
+ labels.reshape((num_image, h // factor, factor, w // factor, factor)).mean(4).mean(2)
+ )
+ small_pred_probs = (
+ pred_probs.reshape((num_image, num_classes, h // factor, factor, w // factor, factor))
+ .mean(5)
+ .mean(3)
+ )
+
+ # We want to make sure that pred_probs are renormalized
+ row_sums = small_pred_probs.sum(axis=1)
+ renorm_small_pred_probs = small_pred_probs / np.expand_dims(row_sums, 1)
+
+ return small_labels, renorm_small_pred_probs
+
+ def flatten_and_preprocess_masks(
+ labels: np.ndarray, pred_probs: np.ndarray
+ ) -> Tuple[np.ndarray, np.ndarray]:
+ _, num_classes, _, _ = pred_probs.shape
+ labels_flat = labels.flatten().astype(int)
+ pred_probs_flat = np.moveaxis(pred_probs, 0, 1).reshape(num_classes, -1)
+
+ return labels_flat, pred_probs_flat.T
+
+ ##
+ _check_input(labels, pred_probs)
+
+ # Optionally downsample labels and pred_probs before finding issues
+ pre_labels, pre_pred_probs = downsample_arrays(labels, pred_probs, downsample)
+
+ num_image, num_classes, h, w = pre_pred_probs.shape
+ # Flatten the (possibly downsampled) labels and pred_probs for batched processing
+
+ pre_labels, pre_pred_probs = flatten_and_preprocess_masks(pre_labels, pre_pred_probs)
+
+ ranked_label_issues = find_label_issues_batched(
+ pre_labels, pre_pred_probs, batch_size=batch_size, n_jobs=n_jobs, verbose=verbose
+ )
+
+ # Find the corresponding image and pixel indices
+ relative_index = ranked_label_issues % (h * w)
+ pixel_coor_i, pixel_coor_j = np.unravel_index(relative_index, (h, w))
+ image_number = ranked_label_issues // (h * w)
+
+ # Upsample carefully, maintaining the original indices
+ label_issues = np.full((num_image, h, w), False)
+
+ for num, ii, jj in zip(image_number, pixel_coor_i, pixel_coor_j):
+ # Only flag an issue if the predicted class does not match the label at that pixel
+ label_issues[num, ii, jj] = True
+ if downsample == 1:
+ # check if pred_probs matches the label at that pixel
+ if np.argmax(pred_probs[num, :, ii, jj]) == labels[num, ii, jj]:
+ label_issues[num, ii, jj] = False
+
+ if downsample != 1:
+ label_issues = label_issues.repeat(downsample, axis=1).repeat(downsample, axis=2)
+
+ for num, ii, jj in zip(image_number, pixel_coor_i, pixel_coor_j):
+ # Upsample the coordinates
+ upsampled_ii = ii * downsample
+ upsampled_jj = jj * downsample
+ # Iterate over the upsampled region
+ for row in range(upsampled_ii, upsampled_ii + downsample):
+ for col in range(upsampled_jj, upsampled_jj + downsample):
+ # Check if the predicted class (argmax) at the identified issue location matches the true label
+ if np.argmax(pred_probs[num, :, row, col]) == labels[num, row, col]:
+ # If they match, set the corresponding entry in the label_issues array to False
+ label_issues[num, row, col] = False
+
+ return label_issues
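As context for reviewers (not part of the patch), here is a minimal usage sketch of the new `cleanlab.segmentation.filter.find_label_issues` API added above; the synthetic labels and probabilities are illustrative placeholders, and `downsample=2` assumes the image height and width are divisible by 2.

```python
# Illustrative sketch only -- synthetic data, shapes follow the docstring above.
import numpy as np
from cleanlab.segmentation.filter import find_label_issues

N, K, H, W = 4, 3, 64, 64  # images, classes, height, width
labels = np.random.randint(0, K, size=(N, H, W))
logits = np.random.rand(N, K, H, W)
pred_probs = logits / logits.sum(axis=1, keepdims=True)  # normalize over the class axis

# Boolean mask of shape (N, H, W); True marks pixels estimated to be mislabeled.
issues = find_label_issues(labels, pred_probs, downsample=2, batch_size=10000, verbose=False)
print("flagged pixels:", issues.sum())
```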
diff --git a/cleanlab/segmentation/rank.py b/cleanlab/segmentation/rank.py
new file mode 100644
index 0000000000..2d130e8b32
--- /dev/null
+++ b/cleanlab/segmentation/rank.py
@@ -0,0 +1,246 @@
+# Copyright (C) 2017-2023 Cleanlab Inc.
+# This file is part of cleanlab.
+#
+# cleanlab is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published
+# by the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# cleanlab is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with cleanlab. If not, see <https://www.gnu.org/licenses/>.
+
+"""
+Methods to rank and score images in a semantic segmentation dataset based on how likely they are to contain mislabeled pixels.
+"""
+import numpy as np
+import warnings
+from typing import Optional, Union, Tuple
+from cleanlab.segmentation.filter import find_label_issues
+
+from cleanlab.internal.segmentation_utils import _get_valid_optional_params, _check_input
+
+
+def get_label_quality_scores(
+ labels: np.ndarray,
+ pred_probs: np.ndarray,
+ *,
+ method: str = "softmin",
+ batch_size: Optional[int] = None,
+ n_jobs: Optional[int] = None,
+ verbose: bool = True,
+ **kwargs,
+) -> Tuple[np.ndarray, np.ndarray]:
+ """Returns a label quality score for each image.
+
+ This is a function to compute label quality scores for semantic segmentation datasets,
+ where lower scores indicate labels less likely to be correct.
+
+ * N - Number of images in the dataset
+ * K - Number of classes in the dataset
+ * H - Height of each image
+ * W - Width of each image
+
+ Parameters
+ ----------
+ labels:
+ A discrete array of noisy labels for a semantic segmentation dataset, of shape ``(N,H,W,)``,
+ where each pixel must be labeled with an integer in 0, 1, ..., K-1.
+ Refer to the documentation for this argument in :py:func:`segmentation.filter.find_label_issues <cleanlab.segmentation.filter.find_label_issues>` for further details.
+
+ pred_probs:
+ An array of shape ``(N,K,H,W,)`` of model-predicted class probabilities.
+ Refer to the documentation for this argument in :py:func:`segmentation.filter.find_label_issues <cleanlab.segmentation.filter.find_label_issues>` for further details.
+
+ method: {"softmin", "num_pixel_issues"}, default="softmin"
+ Label quality scoring method.
+ - "softmin" - Calculates the inner product between scores and softmax(1-scores). For efficiency, use instead of "num_pixel_issues".
+ - "num_pixel_issues" - Uses the number of pixels with label issues for each image using :py:func:find_label_issues
+
+ batch_size:
+ Optional size of mini-batches to use for estimating the label issues; used for 'num_pixel_issues' only, not 'softmin'.
+ To maximize efficiency, try to use the largest `batch_size` your memory allows. If not provided, a good default is used.
+
+ n_jobs:
+ Optional number of processes for multiprocessing (default value = 1). Only used on Linux, and for 'num_pixel_issues' only, not 'softmin'.
+ If `n_jobs=None`, the number of physical cores is used if psutil is installed, otherwise the number of logical cores.
+
+ verbose:
+ Set to ``False`` to suppress all print statements.
+
+ **kwargs:
+ * downsample : int,
+ Factor to shrink `labels` and `pred_probs` by; used for 'num_pixel_issues' only, not 'softmin'. Default ``16``.
+ The image height and width must both be divisible by this factor. Larger values of `downsample` produce faster runtimes but potentially less accurate results due to over-compression. Set to 1 to avoid any downsampling.
+ * temperature : float,
+ Temperature for softmin. Default ``0.1``
+
+
+ Returns
+ -------
+ image_scores:
+ Array of shape ``(N, )`` of scores between 0 and 1, one per image in the dataset.
+ Lower scores indicate images more likely to contain a label issue.
+ pixel_scores:
+ Array of shape ``(N,H,W)`` of scores between 0 and 1, one per pixel in the dataset.
+ """
+ batch_size, n_jobs = _get_valid_optional_params(batch_size, n_jobs)
+ _check_input(labels, pred_probs)
+
+ softmin_temperature = kwargs.get("temperature", 0.1)
+ downsample_num_pixel_issues = kwargs.get("downsample", 1)
+
+ if method == "num_pixel_issues":
+ _, K, _, _ = pred_probs.shape
+ labels_expanded = labels[:, np.newaxis, :, :]
+ mask = np.arange(K)[np.newaxis, :, np.newaxis, np.newaxis] == labels_expanded
+ # Calculate pixel_scores
+ masked_pred_probs = np.where(mask, pred_probs, 0)
+ pixel_scores = masked_pred_probs.sum(axis=1)
+ scores = find_label_issues(
+ labels,
+ pred_probs,
+ downsample=downsample_num_pixel_issues,
+ n_jobs=n_jobs,
+ verbose=verbose,
+ batch_size=batch_size,
+ )
+ img_scores = 1 - np.mean(scores, axis=(1, 2))
+ return (img_scores, pixel_scores)
+
+ if downsample_num_pixel_issues != 1:
+ warnings.warn(
+ f"Ignoring downsample={downsample_num_pixel_issues}: downsampling is only applied for method 'num_pixel_issues', not '{method}'."
+ )
+
+ num_im, num_class, h, w = pred_probs.shape
+ image_scores = []
+ pixel_scores = []
+ if verbose:
+ from tqdm.auto import tqdm
+
+ pbar = tqdm(desc=f"images processed using {method}", total=num_im)
+ for image in range(num_im):
+ image_probs = pred_probs[image][
+ labels[image],
+ np.arange(h)[:, None],
+ np.arange(w),
+ ]
+ pixel_scores.append(image_probs)
+ image_scores.append(
+ _get_label_quality_per_image(
+ np.array(image_probs.flatten()), method=method, temperature=softmin_temperature
+ )
+ )
+ if verbose:
+ pbar.update(1)
+ return np.array(image_scores), np.array(pixel_scores)
+
+
+def issues_from_scores(
+ image_scores: np.ndarray, pixel_scores: Optional[np.ndarray] = None, threshold: float = 0.1
+) -> Union[list, np.ndarray]:
+ """
+ Converts scores output by :py:func:`segmentation.rank.get_label_quality_scores <cleanlab.segmentation.rank.get_label_quality_scores>`
+ to a list of issues of similar format as output by :py:func:`segmentation.filter.find_label_issues <cleanlab.segmentation.filter.find_label_issues>`.
+
+ Issues are sorted by label quality score, from most to least severe.
+
+ Only considers as issues those pixels with label quality score lower than `threshold`,
+ so this parameter determines the number of issues that are returned.
+
+ Note: This method is intended for converting the most severely mislabeled examples to a format compatible with
+ ``summary`` methods like :py:func:`segmentation.summary.display_issues <cleanlab.segmentation.summary.display_issues>`.
+ This method does not estimate the number of label errors since the `threshold` is arbitrary,
+ for that instead use :py:func:`segmentation.filter.find_label_issues <cleanlab.segmentation.filter.find_label_issues>`,
+ which estimates the label errors via Confident Learning rather than score thresholding.
+
+ Parameters
+ ----------
+ image_scores:
+ Array of shape `(N, )` of overall image scores, where `N` is the number of images in the dataset.
+ Same format as the `image_scores` returned by :py:func:`segmentation.rank.get_label_quality_scores <cleanlab.segmentation.rank.get_label_quality_scores>`.
+
+ pixel_scores:
+ Optional array of shape ``(N,H,W)`` of scores between 0 and 1, one per pixel in the dataset.
+ Same format as the `pixel_scores` returned by :py:func:`segmentation.rank.get_label_quality_scores <cleanlab.segmentation.rank.get_label_quality_scores>`.
+
+ threshold:
+ Optional quality score threshold that determines which pixels are included in the result. Pixels with quality scores above the `threshold` are not
+ included in the result.
+
+ Returns
+ ---------
+ issues:
+ Returns a boolean **mask** for the entire dataset
+ where ``True`` represents a pixel label issue and ``False`` represents an example that is
+ accurately labeled according to the threshold provided by the user.
+ Use :py:func:`segmentation.summary.display_issues <cleanlab.segmentation.summary.display_issues>`
+ to view these issues within the original images.
+
+ If `pixel_scores` is not provided, returns array of integer indices (rather than boolean mask) of the images whose label quality score
+ falls below the `threshold` (also sorted by overall label quality score of each image).
+
+ """
+
+ if image_scores is None:
+ raise ValueError("image_scores must be provided")
+ if threshold is None or threshold < 0 or threshold > 1:
+ raise ValueError("threshold must be between 0 and 1")
+
+ if pixel_scores is not None:
+ issues = np.where(pixel_scores < threshold, True, False)
+ else:
+ ranking = np.argsort(image_scores)
+ cutoff = np.searchsorted(image_scores[ranking], threshold)
+ issues = ranking[: cutoff + 1]
+ return issues
+
+
+def _get_label_quality_per_image(pixel_scores, method=None, temperature=0.1):
+ """
+ Computes a label quality score for an image from its per-pixel label quality scores, currently using the "softmin" method.
+
+ Parameters
+ ----------
+ pixel_scores:
+ Per-pixel label quality scores in flattened array of shape ``(N, )``, where N is the number of pixels in the image.
+
+ method: default "softmin"
+ Method to use to calculate the image's label quality score.
+ Currently only supports "softmin".
+ temperature: default 0.1
+ Temperature of the softmax function. Too small values may cause numerical underflow and NaN scores.
+
+ Lower values encourage this method to converge toward the label quality score of the pixel with the lowest quality label in the image.
+
+ Higher values encourage this method to converge toward the average label quality score of all pixels in the image.
+
+ Returns
+ ---------
+ image_score:
+ Float of the image's label quality score from 0 to 1, 0 being the lowest quality and 1 being the highest quality.
+
+ """
+ from cleanlab.internal.multilabel_scorer import softmin
+
+ if pixel_scores is None or pixel_scores.size == 0:
+ raise Exception("Invalid Input: pixel_scores cannot be None or an empty list")
+
+ if temperature == 0 or temperature is None:
+ raise Exception("Invalid Input: temperature cannot be zero or None")
+
+ pixel_scores_64 = pixel_scores.astype("float64")
+ if method == "softmin":
+ if len(pixel_scores_64) > 0:
+ return softmin(
+ np.expand_dims(pixel_scores_64, axis=0), axis=1, temperature=temperature
+ )[0]
+ else:
+ raise Exception("Invalid Input: pixel_scores is empty")
+ else:
+ raise Exception("Invalid Method: Specify correct method. Currently only supports 'softmin'")
diff --git a/cleanlab/segmentation/summary.py b/cleanlab/segmentation/summary.py
new file mode 100644
index 0000000000..c578f71cd3
--- /dev/null
+++ b/cleanlab/segmentation/summary.py
@@ -0,0 +1,354 @@
+# Copyright (C) 2017-2023 Cleanlab Inc.
+# This file is part of cleanlab.
+#
+# cleanlab is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published
+# by the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# cleanlab is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with cleanlab. If not, see <https://www.gnu.org/licenses/>.
+
+"""
+Methods to display images and their label issues in a semantic segmentation dataset, as well as summarize the overall types of issues identified.
+"""
+
+from typing import Any, Dict, List, Optional
+
+import numpy as np
+import pandas as pd
+from tqdm import tqdm
+
+from cleanlab.internal.segmentation_utils import _get_summary_optional_params
+
+
+def display_issues(
+ issues: np.ndarray,
+ *,
+ labels: Optional[np.ndarray] = None,
+ pred_probs: Optional[np.ndarray] = None,
+ class_names: Optional[List[str]] = None,
+ exclude: Optional[List[int]] = None,
+ top: Optional[int] = None,
+) -> None:
+ """
+ Display semantic segmentation label issues, showing images with problematic pixels highlighted.
+
+ Can also show given and predicted masks for each image identified to have label issue.
+
+ Parameters
+ ----------
+ issues:
+ Boolean **mask** for the entire dataset
+ where ``True`` represents a pixel label issue and ``False`` represents an example that is
+ accurately labeled.
+
+ Same format as output by :py:func:`segmentation.filter.find_label_issues <cleanlab.segmentation.filter.find_label_issues>`
+ or :py:func:`segmentation.rank.issues_from_scores <cleanlab.segmentation.rank.issues_from_scores>`.
+
+ labels:
+ Optional discrete array of noisy labels for a semantic segmentation dataset, of shape ``(N,H,W,)``,
+ where each pixel must be labeled with an integer in 0, 1, ..., K-1.
+ If `labels` is provided, this function also displays the given label of each pixel identified with an issue.
+ Refer to the documentation for this argument in :py:func:`find_label_issues <cleanlab.segmentation.filter.find_label_issues>` for more information.
+
+ pred_probs:
+ Optional array of shape ``(N,K,H,W,)`` of model-predicted class probabilities.
+ If `pred_probs` is provided, this function also displays the predicted label of each pixel identified with an issue.
+ Refer to the documentation for this argument in :py:func:`find_label_issues <cleanlab.segmentation.filter.find_label_issues>` for more information.
+
+ Tip: If your labels are one-hot encoded with shape ``(N,K,H,W)``, apply ``labels = np.argmax(labels_one_hot, axis=1)``
+ before passing them into this function.
+
+ class_names:
+ Optional list of strings, where each string represents the name of a class in the semantic segmentation problem.
+ The order of the names should correspond to the numerical order of the classes. The list length should be
+ equal to the number of unique classes present in the labels.
+ If provided, this function will generate a legend
+ showing the color mapping of each class in the provided colormap.
+
+ Example:
+ If there are three classes in your labels, represented by 0, 1, 2, then class_names might look like this:
+ class_names = ['background', 'person', 'dog']
+
+ top:
+ Optional maximum number of issues to be printed. If not provided, a good default is used.
+
+ exclude:
+ Optional list of label classes that can be ignored in the errors; each element must be in 0, 1, ..., K-1.
+
+ """
+ class_names, exclude, top = _get_summary_optional_params(class_names, exclude, top)
+ if labels is None and len(exclude) > 0:
+ raise ValueError("Provide labels to allow class exclusion")
+
+ top = min(top, len(issues))
+
+ correct_ordering = np.argsort(-np.sum(issues, axis=(1, 2)))[:top]
+
+ try:
+ import matplotlib.pyplot as plt
+ import matplotlib.patches as mpatches
+ except ImportError as error:
+ raise ImportError('This functionality requires matplotlib. Install it via: "pip install matplotlib"') from error
+
+ output_plots = (pred_probs is not None) + (labels is not None) + 1
+
+ # Colormap for errors
+ error_cmap = plt.cm.colors.ListedColormap(["none", "red"])
+ _, h, w = issues.shape
+ if output_plots > 1:
+ if pred_probs is not None:
+ _, num_classes, _, _ = pred_probs.shape
+ cmap = _generate_colormap(num_classes)
+ elif labels is not None:
+ num_classes = max(np.unique(labels)) + 1
+ cmap = _generate_colormap(num_classes)
+ else:
+ cmap = None
+
+ # Show a legend
+ if class_names is not None and cmap is not None:
+ patches = [
+ mpatches.Patch(color=cmap[i], label=class_names[i]) for i in range(len(class_names))
+ ]
+ legend = plt.figure() # adjust figsize for larger legend
+ legend.legend(
+ handles=patches, loc="center", ncol=len(class_names), facecolor="white", fontsize=20
+ ) # adjust fontsize for larger text
+ plt.axis("off")
+ plt.show()
+
+ for i in correct_ordering:
+ # Show images
+ fig, axes = plt.subplots(1, output_plots, figsize=(5 * output_plots, 5))
+ plot_index = 0
+
+ # First image - Given truth labels
+ if labels is not None:
+ axes[plot_index].imshow(cmap[labels[i]])
+ axes[plot_index].set_title("Given Labels")
+ plot_index += 1
+
+ # Second image - Argmaxed pred_probs
+ if pred_probs is not None:
+ axes[plot_index].imshow(cmap[np.argmax(pred_probs[i], axis=0)])
+ axes[plot_index].set_title("Argmaxed Prediction Probabilities")
+ plot_index += 1
+
+ # Third image - Errors
+ if output_plots == 1:
+ ax = axes
+ else:
+ ax = axes[plot_index]
+
+ mask = np.full((h, w), True)
+ if labels is not None and len(exclude) != 0:
+ mask = ~np.isin(labels[i], exclude)
+ ax.imshow(issues[i] & mask, cmap=error_cmap, vmin=0, vmax=1)
+ ax.set_title(f"Image {i}: Suggested Errors (in Red)")
+ plt.show()
+
+ return None
+
+
+def common_label_issues(
+ issues: np.ndarray,
+ labels: np.ndarray,
+ pred_probs: np.ndarray,
+ *,
+ class_names: Optional[List[str]] = None,
+ exclude: Optional[List[int]] = None,
+ top: Optional[int] = None,
+ verbose: bool = True,
+) -> pd.DataFrame:
+ """
+ Display how frequently each given label is swapped with each predicted label across the dataset.
+
+ These may correspond to pixels that are ambiguous or systematically misunderstood by the data annotators.
+
+ * N - Number of images in the dataset
+ * K - Number of classes in the dataset
+ * H - Height of each image
+ * W - Width of each image
+
+ Parameters
+ ----------
+ issues:
+ Boolean **mask** for the entire dataset
+ where ``True`` represents a pixel label issue and ``False`` represents an example that is
+ accurately labeled.
+
+ Same format as output by :py:func:`segmentation.filter.find_label_issues <cleanlab.segmentation.filter.find_label_issues>`
+ or :py:func:`segmentation.rank.issues_from_scores <cleanlab.segmentation.rank.issues_from_scores>`.
+
+ labels:
+ A discrete array of noisy labels for a semantic segmentation dataset, of shape ``(N,H,W,)``,
+ where each pixel must be labeled with an integer in 0, 1, ..., K-1.
+ Refer to the documentation for this argument in :py:func:`find_label_issues <cleanlab.segmentation.filter.find_label_issues>` for more information.
+
+ pred_probs:
+ An array of shape ``(N,K,H,W,)`` of model-predicted class probabilities.
+ Refer to the documentation for this argument in :py:func:`find_label_issues <cleanlab.segmentation.filter.find_label_issues>` for more information.
+
+ Tip: If your labels are one-hot encoded with shape ``(N,K,H,W)``, apply ``labels = np.argmax(labels_one_hot, axis=1)``
+ before passing them into this function.
+
+ class_names:
+ Optional length K list of names of each class, such that `class_names[i]` is the string name of the class corresponding to `labels` with value `i`.
+ If `class_names` is provided, display these string names for predicted and given labels, otherwise display the integer index of classes.
+
+ exclude:
+ Optional list of label classes that can be ignored in the errors, each element must be in 0, 1, ..., K-1.
+
+ top:
+ Optional maximum number of label swaps to print information for. If not provided, a good default is used.
+
+ verbose:
+ Set to ``False`` to suppress all print statements.
+
+ Returns
+ -------
+ issues_df:
+ DataFrame `issues_df` contains columns ``['given_label', 'predicted_label', 'num_pixel_issues']``; each row contains information for a
+ given/predicted label swap, ordered by the number of pixel label issues inferred for that type of label swap.
+ """
+ try:
+ N, K, H, W = pred_probs.shape
+ except Exception as error:
+ raise ValueError("pred_probs must be of shape (N, K, H, W)") from error
+
+ assert labels.shape == (N, H, W), "labels must be of shape (N, H, W)"
+
+ class_names, exclude, top = _get_summary_optional_params(class_names, exclude, top)
+ # Find issues by pixel coordinates
+ issue_coords = np.column_stack(np.where(issues))
+
+ # Count issues per class (given label)
+ count: Dict[int, Any] = {}
+ for i, j, k in tqdm(issue_coords):
+ label = labels[i, j, k]
+ pred = pred_probs[i, :, j, k].argmax()
+ if label not in count:
+ count[label] = np.zeros(K, dtype=int)
+ if pred not in exclude:
+ count[label][pred] += 1
+
+ # Prepare output DataFrame
+ if class_names is None:
+ class_names = [str(i) for i in range(K)]
+
+ info = []
+ for given_label, class_name in enumerate(class_names):
+ if given_label in count:
+ for pred_label, num_issues in enumerate(count[given_label]):
+ if num_issues > 0:
+ info.append([class_name, class_names[pred_label], num_issues])
+
+ info = sorted(info, key=lambda x: x[2], reverse=True)[:top]
+ issues_df = pd.DataFrame(info, columns=["given_label", "predicted_label", "num_pixel_issues"])
+
+ if verbose:
+ for idx, row in issues_df.iterrows():
+ print(
+ f"Class '{row['given_label']}' is potentially mislabeled as class for '{row['predicted_label']}' "
+ f"{row['num_pixel_issues']} pixels in the dataset"
+ )
+
+ return issues_df
+
+
+def filter_by_class(
+ class_index: int, issues: np.ndarray, labels: np.ndarray, pred_probs: np.ndarray
+) -> np.ndarray:
+ """
+ Return label issues involving a particular class. Note that this includes errors where either the given label or the predicted label is the class of interest.
+
+ Parameters
+ ----------
+ class_index:
+ The specific class you are interested in.
+
+ issues:
+ Boolean **mask** for the entire dataset where ``True`` represents a pixel label issue and ``False`` represents an example that is
+ accurately labeled.
+
+ Same format as output by :py:func:`segmentation.filter.find_label_issues <cleanlab.segmentation.filter.find_label_issues>`
+ or :py:func:`segmentation.rank.issues_from_scores <cleanlab.segmentation.rank.issues_from_scores>`.
+
+ labels:
+ A discrete array of noisy labels for a semantic segmentation dataset, of shape ``(N,H,W,)``,
+ where each pixel must be labeled with an integer in 0, 1, ..., K-1.
+ Refer to the documentation for this argument in :py:func:`find_label_issues <cleanlab.segmentation.filter.find_label_issues>` for further details.
+
+ pred_probs:
+ An array of shape ``(N,K,H,W,)`` of model-predicted class probabilities.
+ Refer to the documentation for this argument in :py:func:`find_label_issues <cleanlab.segmentation.filter.find_label_issues>` for further details.
+
+ Returns
+ ----------
+ issues_subset:
+ Boolean **mask** over the dataset where ``True`` represents a pixel label issue involving the class of interest and ``False`` represents a pixel that is
+ either accurately labeled or whose issue does not involve this class.
+
+ The returned mask shows **all** instances that involve the particular class of interest, whether as the given or the predicted label.
+
+
+ """
+ issues_subset = (issues & np.isin(labels, class_index)) | (
+ issues & np.isin(pred_probs.argmax(1), class_index)
+ )
+ return issues_subset
+
+
+def _generate_colormap(num_colors):
+ """
+ Generates a colormap with `num_colors` visually distinct colors, well suited for semantic segmentation masks.
+
+ Parameters
+ ----------
+ num_colors:
+ Number of distinct colors to generate.
+
+ Returns
+ -------
+ colors:
+ Array of `num_colors` distinct RGBA colors.
+ """
+
+ try:
+ from matplotlib.cm import hsv
+ except ImportError as error:
+ raise ImportError('This functionality requires matplotlib. Install it via: "pip install matplotlib"') from error
+
+ num_shades = 7
+ num_colors_with_shades = -(-num_colors // num_shades) * num_shades
+ linear_nums = np.linspace(0, 1, num_colors_with_shades, endpoint=False)
+
+ arr_by_shade_rows = linear_nums.reshape(num_shades, -1)
+ arr_by_shade_columns = arr_by_shade_rows.T
+ num_partitions = arr_by_shade_columns.shape[0]
+ nums_distributed_like_rising_saw = arr_by_shade_columns.flatten()
+
+ initial_cm = hsv(nums_distributed_like_rising_saw)
+ lower_partitions_half = num_partitions // 2
+ upper_partitions_half = num_partitions - lower_partitions_half
+
+ lower_half = lower_partitions_half * num_shades
+ initial_cm[:lower_half, :3] *= np.linspace(0.2, 1, lower_half)[:, np.newaxis]
+
+ upper_half_indices = np.arange(lower_half, num_colors_with_shades).reshape(
+ upper_partitions_half, num_shades
+ )
+ modifier = (
+ (1 - initial_cm[upper_half_indices, :3])
+ * np.arange(upper_partitions_half)[:, np.newaxis, np.newaxis]
+ / upper_partitions_half
+ )
+ initial_cm[upper_half_indices, :3] += modifier
+ colors = initial_cm[:num_colors]
+ return colors
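One more illustrative sketch (not part of the patch) tying the summary utilities together; it assumes `issues`, `labels`, and `pred_probs` already exist in the formats described above, and the three class names are placeholders.

```python
# Illustrative sketch only -- assumes issues, labels, pred_probs from the earlier sketches.
from cleanlab.segmentation.summary import common_label_issues, display_issues, filter_by_class

class_names = ["background", "person", "dog"]  # placeholder names, one per class index

# Plot the most severe images, ignoring issues on background pixels (class 0).
display_issues(issues, labels=labels, pred_probs=pred_probs,
               class_names=class_names, exclude=[0], top=5)

# Tabulate which given -> predicted label swaps are most common among flagged pixels.
swaps_df = common_label_issues(issues, labels, pred_probs,
                               class_names=class_names, verbose=False)

# Restrict the issue mask to errors involving class 1 ("person").
person_issues = filter_by_class(1, issues, labels, pred_probs)
```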
diff --git a/cleanlab/token_classification/filter.py b/cleanlab/token_classification/filter.py
index fcaf25b2fe..698efdd4fc 100644
--- a/cleanlab/token_classification/filter.py
+++ b/cleanlab/token_classification/filter.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2017-2022 Cleanlab Inc.
+# Copyright (C) 2017-2023 Cleanlab Inc.
# This file is part of cleanlab.
#
# cleanlab is free software: you can redistribute it and/or modify
@@ -31,6 +31,7 @@ def find_label_issues(
pred_probs: list,
*,
return_indices_ranked_by: str = "self_confidence",
+ **kwargs,
) -> List[Tuple[int, int]]:
"""Identifies tokens with label issues in a token classification dataset.
@@ -60,6 +61,10 @@ def find_label_issues(
See :py:func:`cleanlab.filter.find_label_issues `
documentation for more details on each label quality scoring method.
+ kwargs:
+ Additional keyword arguments to pass into :py:func:`filter.find_label_issues `
+ which is internally applied at the token level. Can include values like `n_jobs` to control parallel processing, `frac_noise`, etc.
+
Returns
-------
issues:
@@ -87,7 +92,10 @@ def find_label_issues(
pred_probs_flatten = np.array([pred for pred_prob in pred_probs for pred in pred_prob])
issues_main = find_label_issues_main(
- labels_flatten, pred_probs_flatten, return_indices_ranked_by=return_indices_ranked_by
+ labels_flatten,
+ pred_probs_flatten,
+ return_indices_ranked_by=return_indices_ranked_by,
+ **kwargs,
)
lengths = [len(label) for label in labels]
diff --git a/cleanlab/token_classification/rank.py b/cleanlab/token_classification/rank.py
index bd7a3f59d0..be41568aa2 100644
--- a/cleanlab/token_classification/rank.py
+++ b/cleanlab/token_classification/rank.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2017-2022 Cleanlab Inc.
+# Copyright (C) 2017-2023 Cleanlab Inc.
# This file is part of cleanlab.
#
# cleanlab is free software: you can redistribute it and/or modify
@@ -154,9 +154,15 @@ def issues_from_scores(
Converts scores output by :py:func:`token_classification.rank.get_label_quality_scores `
to a list of issues of similar format as output by :py:func:`token_classification.filter.find_label_issues `.
- Only considers as issues those tokens with label quality score lower than `threshold`.
+ Issues are sorted by label quality score, from most to least severe.
- Issues are sorted by label quality score, from most severe to least.
+ Only considers as issues those tokens with label quality score lower than `threshold`,
+ so this parameter determines the number of issues that are returned.
+ This method is intended for converting the most severely mislabeled examples to a format compatible with
+ ``summary`` methods like :py:func:`token_classification.summary.display_issues `.
+ This method does not estimate the number of label errors since the `threshold` is arbitrary,
+ for that instead use :py:func:`token_classification.filter.find_label_issues `,
+ which estimates the label errors via Confident Learning rather than score thresholding.
Parameters
----------
@@ -275,8 +281,10 @@ def _softmin_sentence_score(
return np.array([np.mean(scores) for scores in token_scores])
def softmax(scores: np.ndarray) -> np.ndarray:
- exp_scores = np.exp(scores / temperature)
- return exp_scores / np.sum(exp_scores)
+ scores = scores / temperature
+ scores_max = np.amax(scores, axis=0, keepdims=True)
+ exp_scores_shifted = np.exp(scores - scores_max)
+ return exp_scores_shifted / np.sum(exp_scores_shifted, axis=0, keepdims=True)
def fun(scores: np.ndarray) -> float:
return np.dot(scores, softmax(1 - np.array(scores)))
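For reviewers: the softmax change above shifts the scaled scores by their maximum before exponentiating, so dividing by a small `temperature` can no longer overflow `np.exp`. A standalone sketch of the same max-shift trick (not part of the patch):

```python
# Standalone sketch of the numerically stable softmax used above.
import numpy as np


def stable_softmax(scores: np.ndarray, temperature: float = 0.05) -> np.ndarray:
    scores = scores / temperature
    scores_max = np.amax(scores, axis=0, keepdims=True)
    exp_shifted = np.exp(scores - scores_max)  # largest exponent is exp(0) = 1
    return exp_shifted / np.sum(exp_shifted, axis=0, keepdims=True)


x = np.array([40.0, 20.0, 10.0])
print(stable_softmax(x))  # finite probabilities, approximately [1, 0, 0]
# A naive np.exp(x / 0.05) overflows to inf at 40 / 0.05 = 800 and yields NaN after normalization.
```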
diff --git a/cleanlab/token_classification/summary.py b/cleanlab/token_classification/summary.py
index dc93de720b..bf1fe45580 100644
--- a/cleanlab/token_classification/summary.py
+++ b/cleanlab/token_classification/summary.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2017-2022 Cleanlab Inc.
+# Copyright (C) 2017-2023 Cleanlab Inc.
# This file is part of cleanlab.
#
# cleanlab is free software: you can redistribute it and/or modify
diff --git a/cleanlab/version.py b/cleanlab/version.py
index 7c36fadd30..86e18e5262 100644
--- a/cleanlab/version.py
+++ b/cleanlab/version.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2017-2022 Cleanlab Inc.
+# Copyright (C) 2017-2023 Cleanlab Inc.
# This file is part of cleanlab.
#
# cleanlab is free software: you can redistribute it and/or modify
@@ -15,9 +15,27 @@
# along with cleanlab. If not, see <https://www.gnu.org/licenses/>.
-__version__ = "2.2.1"
+__version__ = "2.4.1"
-# 2.2.1 - Not yet released, you are using developer version. See documentation at: https://docs.cleanlab.ai/master/
+# 2.4.1 - Not yet released, you are using bleeding-edge developer version. See its documentation at: https://docs.cleanlab.ai/master/
+
+# ------------------------------------------------
+# | PREVIOUS MAJOR VERSION RELEASE NOTES SUMMARY |
+# ------------------------------------------------
+
+# 2.4.0 - One line of code to detect all sorts of dataset issues
+#
+# Major new functionalities include:
+# - Datalab: A unified audit to detect different types of issues in your data and labels. This is the primary way most users should apply cleanlab to their dataset.
+# - Nicer APIs for label issues in multi-label classification datasets, including dataset-level issue summaries for multi-label classification.
+# - Updated tutorials with more interesting datasets and ML models.
+
+# 2.3.0 - Extending cleanlab beyond label errors into a complete library for data-centric AI
+#
+# Major new functionalities include:
+# - Active learning with data re-labeling (ActiveLab)
+# - KerasWrapperModel and KerasSequentialWrapper to make arbitrary Keras models compatible with scikit-learn
+# - Computational improvements for detecting label issues (better efficiency and mini-batch estimation that works with lower memory)
# 2.2.0 - Re-invented algorithms for multi-label classification and support for datasets with missing classes
#
@@ -30,12 +48,6 @@
# - cleanlab now works much better for datasets in which some classes happen to not be present.
# - Algorithmic improvements to ensure count.num_label_issues() returns more accurate estimates.
# - For developers: introduction of flake8 code linter and more comprehensive mypy type annotations.
-#
-# See release for a full changelog.
-
-# ------------------------------------------
-# | PREVIOUS VERSION RELEASE NOTES SUMMARY |
-# ------------------------------------------
# 2.1.0 - "Multiannotator, Outlier detection, and Token Classification" - Cleanlab supports several new features
#
diff --git a/docs/requirements.txt b/docs/requirements.txt
index 00db3f8b9e..2772a81f02 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -8,21 +8,20 @@ ipython==8.0.1
ipykernel==6.8.0
ipywidgets==7.6.5
sphinx-multiversion==0.2.4
-torchvision==0.12.0
sphinx-copybutton==0.5.0
sphinxcontrib-katex==0.8.6
-matplotlib==3.5.1
-skorch==0.11.0
-tensorflow-datasets==4.5.2
+sphinx-autodoc-typehints==1.19.2
+matplotlib==3.6.3
+requests==2.28.2
tensorflow==2.9.1
-scikeras==0.9.0
-scikit-learn<1.2.0
-speechbrain==0.5.12
tensorflow-io==0.26.0
-huggingface_hub==0.7
-torchaudio==0.11.0
-fasttext==0.9.2
-timm==0.6.5
-torch==1.11.0
-requests==2.28.0
-sphinx-autodoc-typehints==1.19.2
+sentence-transformers==2.2.2
+speechbrain==0.5.13
+huggingface_hub==0.11.1
+fasttext-wheel==0.9.2
+torch==1.13.1
+skorch==0.12.1
+torchvision==0.14.1
+torchaudio==0.13.1
+timm==0.6.12
+datasets>=2.9.0
diff --git a/docs/source/_static/css/custom.css b/docs/source/_static/css/custom.css
index a3f141448c..365c0171b1 100644
--- a/docs/source/_static/css/custom.css
+++ b/docs/source/_static/css/custom.css
@@ -38,4 +38,4 @@ h5 {
h6 {
font-size: .75em;
-}
\ No newline at end of file
+}
diff --git a/docs/source/cleanlab/datalab/data.rst b/docs/source/cleanlab/datalab/data.rst
new file mode 100644
index 0000000000..6da27efbc7
--- /dev/null
+++ b/docs/source/cleanlab/datalab/data.rst
@@ -0,0 +1,9 @@
+data
+====
+
+.. automodule:: cleanlab.datalab.data
+ :autosummary:
+ :members:
+ :undoc-members:
+ :show-inheritance:
+ :ignore-module-all:
\ No newline at end of file
diff --git a/docs/source/cleanlab/datalab/data_issues.rst b/docs/source/cleanlab/datalab/data_issues.rst
new file mode 100644
index 0000000000..d3a578ae2b
--- /dev/null
+++ b/docs/source/cleanlab/datalab/data_issues.rst
@@ -0,0 +1,9 @@
+data_issues
+===========
+
+.. automodule:: cleanlab.datalab.data_issues
+ :autosummary:
+ :members:
+ :undoc-members:
+ :show-inheritance:
+ :ignore-module-all:
\ No newline at end of file
diff --git a/docs/source/cleanlab/datalab/datalab.rst b/docs/source/cleanlab/datalab/datalab.rst
new file mode 100644
index 0000000000..8a38a27f95
--- /dev/null
+++ b/docs/source/cleanlab/datalab/datalab.rst
@@ -0,0 +1,9 @@
+datalab
+=======
+
+.. automodule:: cleanlab.datalab.datalab
+ :autosummary:
+ :members:
+ :undoc-members:
+ :show-inheritance:
+ :ignore-module-all:
\ No newline at end of file
diff --git a/docs/source/cleanlab/datalab/factory.rst b/docs/source/cleanlab/datalab/factory.rst
new file mode 100644
index 0000000000..d65d1eac5d
--- /dev/null
+++ b/docs/source/cleanlab/datalab/factory.rst
@@ -0,0 +1,9 @@
+factory
+=======
+
+.. automodule:: cleanlab.datalab.factory
+ :autosummary:
+ :members:
+ :undoc-members:
+ :show-inheritance:
+ :ignore-module-all:
\ No newline at end of file
diff --git a/docs/source/cleanlab/datalab/guide/custom_issue_manager.rst b/docs/source/cleanlab/datalab/guide/custom_issue_manager.rst
new file mode 100644
index 0000000000..3ff417ec86
--- /dev/null
+++ b/docs/source/cleanlab/datalab/guide/custom_issue_manager.rst
@@ -0,0 +1,227 @@
+.. _issue_manager_creating_your_own:
+
+Creating Your Own Issues Manager
+================================
+
+
+
+This guide walks through the process of creating your own
+:py:class:`IssueManager `
+to detect a custom-defined type of issue alongside the pre-defined issue types in
+:py:class:`Datalab `.
+
+.. seealso::
+
+ - :py:meth:`register `:
+ You can either use this function at runtime to register a new issue manager:
+
+ .. code-block:: python
+
+ from cleanlab.datalab.factory import register
+ register(MyIssueManager)
+
+ or add as a decorator to the class definition:
+
+ .. code-block:: python
+
+ @register
+ class MyIssueManager(IssueManager):
+ ...
+
+Prerequisites
+-------------
+
+As a starting point for this guide, we'll import the necessary things for the next section and create a dummy dataset.
+
+.. note::
+
+ .. include:: ../optional_dependencies.rst
+
+.. code-block:: python
+
+
+ import numpy as np
+ import pandas as pd
+ from cleanlab import IssueManager
+
+ # Create a dummy dataset
+ N = 20
+ data = pd.DataFrame(
+ {
+ "text": [f"example {i}" for i in range(N)],
+ "label": np.random.randint(0, 2, N),
+ },
+ )
+
+
+Implementing IssueManagers
+--------------------------
+
+.. _basic_issue_manager:
+
+Basic Issue Check
+~~~~~~~~~~~~~~~~~
+
+
+To create a basic issue manager, inherit from the
+:py:class:`IssueManager ` class,
+assign a name to the class as the class-variable, `issue_name`, and implement the
+:py:meth:`find_issues ` method.
+
+The :py:meth:`find_issues `
+method should mark each example in the dataset as an issue or not with a boolean array.
+It should also provide a score for each example in the dataset that quantifies the quality of the example
+with regard to the issue.
+
+.. code-block:: python
+
+ class Basic(IssueManager):
+ # Assign a name to the issue
+ issue_name = "basic"
+ def find_issues(self, **kwargs) -> None:
+ # Compute scores for each example
+ scores = np.random.rand(len(self.datalab.data))
+
+ # Construct a dataframe where examples are marked for issues
+ # and the score for each example is included.
+ self.issues = pd.DataFrame(
+ {
+ f"is_{self.issue_name}_issue" : scores < 0.1,
+ self.issue_score_key : scores,
+ },
+ )
+
+ # Score the dataset as a whole based on this issue type
+ self.summary = self.make_summary(score = scores.mean())
+
+
+.. _intermediate_issue_manager:
+
+Intermediate Issue Check
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+
+To create an intermediate issue:
+
+- Perform the same steps as in the :ref:`basic issue check ` section.
+- Populate the `info` attribute with a dictionary of information about the identified issues.
+
+The information can be included in a report generated by :py:class:`Datalab `,
+if you add any of the keys to the `verbosity_levels` class-attribute.
+Optionally, you can also add a description of the type of issue this issue manager handles to the `description` class-attribute.
+
+.. code-block:: python
+
+ class Intermediate(IssueManager):
+ issue_name = "intermediate"
+ # Add a dictionary of information to include in the report
+ verbosity_levels = {
+ 0: [],
+ 1: ["std"],
+ 2: ["raw_scores"],
+ }
+ # Add a description of the issue
+ description = "Intermediate issues are a bit more involved than basic issues."
+ def find_issues(self, *, intermediate_arg: int, **kwargs) -> None:
+ N = len(self.datalab.data)
+ raw_scores = np.random.rand(N)
+ std = raw_scores.std()
+ threshold = min(0, raw_scores.mean() - std)
+ sin_filter = np.sin(intermediate_arg * np.arange(N) / N)
+ kernel = sin_filter ** 2
+ scores = kernel * raw_scores
+ self.issues = pd.DataFrame(
+ {
+ f"is_{self.issue_name}_issue" : scores < threshold,
+ self.issue_score_key : scores,
+ },
+ )
+ self.summary = self.make_summary(score = scores.mean())
+
+ # Useful information that will be available in the Datalab instance
+ self.info = {
+ "std": std,
+ "raw_scores": raw_scores,
+ "kernel": kernel,
+ }
+
+Advanced Issue Check
+~~~~~~~~~~~~~~~~~~~~
+
+.. note::
+
+ WIP: This section is a work in progress.
+
+
+
+Use with Datalab
+----------------
+
+We can create a
+:py:class:`Datalab `
+instance and run issue checks with the custom issue managers we created like so:
+
+
+.. code-block:: python
+
+ from cleanlab.datalab.factory import register
+ from cleanlab import Datalab
+
+
+ # Register the issue manager
+ for issue_manager in [Basic, Intermediate]:
+ register(issue_manager)
+
+ # Instantiate a datalab instance
+ datalab = Datalab(data, label_name="label")
+
+ # Run the issue check
+ issue_types = {"basic": {}, "intermediate": {"intermediate_arg": 2}}
+ datalab.find_issues(issue_types=issue_types)
+
+ # Print report
+ datalab.report(verbosity=0)
+
+
+The report will look something like this:
+
+.. code-block:: text
+
+ Here is a summary of the different kinds of issues found in the data:
+
+ issue_type score num_issues
+ basic 0.477762 2
+ intermediate 0.286455 0
+
+ (Note: A lower score indicates a more severe issue across all examples in the dataset.)
+
+
+ ------------------------------------------- basic issues -------------------------------------------
+
+ Number of examples with this issue: 2
+ Overall dataset quality in terms of this issue: 0.4778
+
+ Examples representing most severe instances of this issue:
+ is_basic_issue basic_score
+ 13 True 0.003042
+ 8 True 0.058117
+ 11 False 0.121908
+ 15 False 0.169312
+ 17 False 0.229044
+
+
+ --------------------------------------- intermediate issues ----------------------------------------
+
+ About this issue:
+ Intermediate issues are a bit more involved than basic issues.
+
+ Number of examples with this issue: 0
+ Overall dataset quality in terms of this issue: 0.2865
+
+ Examples representing most severe instances of this issue:
+ is_intermediate_issue intermediate_score kernel
+ 0 False 0.000000 0.0
+ 1 False 0.007059 0.009967
+ 3 False 0.010995 0.087332
+ 2 False 0.016296 0.03947
+ 11 False 0.019459 0.794251
diff --git a/docs/source/cleanlab/datalab/guide/index.rst b/docs/source/cleanlab/datalab/guide/index.rst
new file mode 100644
index 0000000000..b3dd678816
--- /dev/null
+++ b/docs/source/cleanlab/datalab/guide/index.rst
@@ -0,0 +1,29 @@
+Datalab guides
+==============
+
+This page contains a list of guides for using Datalab.
+
+.. note::
+
+ .. include:: ../optional_dependencies.rst
+
+
+Types of issues
+---------------
+
+These guides are for users who want to use Datalab with greater control, selecting what issues to search for and what nondefault settings to use for detecting them.
+
+.. toctree::
+ :maxdepth: 3
+
+ issue_type_description
+
+Customizing issue types
+-----------------------
+
+These guides are for developers to create a custom issue type that Datalab can audit for together with the built-in issue types it already detects.
+
+.. toctree::
+ :maxdepth: 3
+
+ custom_issue_manager
\ No newline at end of file
diff --git a/docs/source/cleanlab/datalab/guide/issue_type_description.rst b/docs/source/cleanlab/datalab/guide/issue_type_description.rst
new file mode 100644
index 0000000000..34b411341d
--- /dev/null
+++ b/docs/source/cleanlab/datalab/guide/issue_type_description.rst
@@ -0,0 +1,211 @@
+Datalab Issue Types
+*******************
+
+
+Types of issues Datalab can detect
+===================================
+
+This page describes the various types of issues that Datalab can detect in a dataset.
+For each type of issue, we explain: what it says about your data if detected, why this matters, and what parameters you can optionally specify to control the detection of this issue.
+
+Estimates for Each Issue Type
+------------------------------
+
+Datalab produces three estimates for **each** type of issue (called say ``<issue_name>`` here):
+
+
+1. A numeric quality score `<issue_name>_score` (between 0 and 1) estimating how severely this issue is exhibited in each example from a dataset. Examples with higher scores are less likely to suffer from this issue. Access these via: the :py:attr:`Datalab.issues <cleanlab.datalab.datalab.Datalab.issues>` attribute or the method :py:meth:`Datalab.get_issues(\<issue_name\>) <cleanlab.datalab.datalab.Datalab.get_issues>`.
+2. A Boolean `is_<issue_name>_issue` flag for each example from a dataset. Examples where this has value `True` are those estimated to exhibit this issue. Access these via: the :py:attr:`Datalab.issues <cleanlab.datalab.datalab.Datalab.issues>` attribute or the method :py:meth:`Datalab.get_issues(\<issue_name\>) <cleanlab.datalab.datalab.Datalab.get_issues>`.
+3. An overall dataset quality score (between 0 and 1), quantifying how severe this issue is overall across the entire dataset. Datasets with higher scores do not exhibit this issue as badly overall. Access these via: the :py:attr:`Datalab.issue_summary