Skip to content

Commit d762673

Browse files
committed
Merge remote-tracking branch 'dask/main' into 734-pca-skip-centering
2 parents 4c82add + db2e7d5 commit d762673

File tree

7 files changed

+178
-25
lines changed

7 files changed

+178
-25
lines changed

README.rst

+31-8
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,35 @@
1-
dask-ml
1+
Dask-ML
22
=======
33

4-
``dask-ml`` is a library for distributed and parallel machine learning using `dask`_.
5-
See the `documentation`_ for more.
4+
|Build Status| |Azure Pipelines| |Coverage| |Doc Status| |Gitter| |Version Status| |NumFOCUS|
65

7-
.. image:: https://dev.azure.com/dask-dev/dask/_apis/build/status/dask.dask-ml?branchName=main
8-
:target: https://dev.azure.com/dask-dev/dask/_build/latest?definitionId=1&branchName=main
9-
:alt: CI Status
6+
Dask-ML provides scalable machine learning in Python using `Dask <https://dask.org/>`__ alongside popular machine learning libraries like `Scikit-Learn <http://scikit-learn.org/>`__, `XGBoost <https://ml.dask.org/xgboost.html>`__, and others.
7+
8+
You can try Dask-ML on a small cloud instance by clicking the following button:
9+
10+
.. image:: https://mybinder.org/badge.svg
11+
:target: https://mybinder.org/v2/gh/dask/dask-examples/main?filepath=machine-learning.ipynb
1012

11-
.. _dask: https://dask.org
12-
.. _documentation: http://ml.dask.org
13+
LICENSE
14+
-------
15+
16+
New BSD. See `License File <https://github.com/dask/dask-ml/blob/main/LICENSE.txt>`__.
17+
18+
.. _documentation: https://dask.org
19+
.. |Build Status| image:: https://github.com/dask/dask-ml/workflows/CI/badge.svg?branch=main
20+
:target: https://github.com/dask/dask-ml/actions?query=workflow%3A%22CI%22
21+
.. |Azure Pipelines| image:: https://dev.azure.com/dask-dev/dask/_apis/build/status/dask.dask-ml?branchName=main
22+
:target: https://dev.azure.com/dask-dev/dask/_build/latest?definitionId=1&branchName=main
23+
.. |Coverage| image:: https://codecov.io/gh/dask/dask-ml/branch/main/graph/badge.svg
24+
:target: https://codecov.io/gh/dask/dask-ml/branch/main
25+
:alt: Coverage status
26+
.. |Doc Status| image:: https://readthedocs.org/projects/ml/badge/?version=latest
27+
:target: https://ml.dask.org/
28+
:alt: Documentation Status
29+
.. |Gitter| image:: https://badges.gitter.im/Join%20Chat.svg
30+
:alt: Join the chat at https://gitter.im/dask/dask
31+
:target: https://gitter.im/dask/dask?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge
32+
.. |Version Status| image:: https://img.shields.io/pypi/v/dask-ml.svg
33+
:target: https://pypi.python.org/pypi/dask-ml/
34+
.. |NumFOCUS| image:: https://img.shields.io/badge/powered%20by-NumFOCUS-orange.svg?style=flat&colorA=E1523D&colorB=007D8A
35+
:target: https://www.numfocus.org/

ci/posix.yaml

+3-3
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,13 @@ jobs:
1010
matrix:
1111
linux37:
1212
envFile: 'ci/environment-3.7.yaml'
13-
SKLARN_DEV: "no"
13+
SKLEARN_DEV: "no"
1414
linux38:
1515
envFile: 'ci/environment-3.8.yaml'
16-
SKLARN_DEV: "no"
16+
SKLEARN_DEV: "no"
1717
earliest:
1818
envFile: 'ci/environment-3.6.yaml'
19-
SKLARN_DEV: "no"
19+
SKLEARN_DEV: "no"
2020
sklearnDev:
2121
envFile: 'ci/environment-3.7.yaml'
2222
SKLEARN_DEV: "yes"

dask_ml/metrics/__init__.py

+1
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
)
77
from .regression import ( # noqa
88
mean_absolute_error,
9+
mean_absolute_percentage_error,
910
mean_squared_error,
1011
mean_squared_log_error,
1112
r2_score,

dask_ml/metrics/regression.py

+75-12
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ def _check_sample_weight(sample_weight: Optional[ArrayLike]):
1616
def _check_reg_targets(
1717
y_true: ArrayLike, y_pred: ArrayLike, multioutput: Optional[str]
1818
):
19-
if multioutput != "uniform_average":
19+
if multioutput is not None and multioutput != "uniform_average":
2020
raise NotImplementedError("'multioutput' must be 'uniform_average'")
2121

2222
if y_true.ndim == 1:
@@ -40,12 +40,12 @@ def mean_squared_error(
4040
_check_sample_weight(sample_weight)
4141
output_errors = ((y_pred - y_true) ** 2).mean(axis=0)
4242

43-
if isinstance(multioutput, str):
43+
if isinstance(multioutput, str) or multioutput is None:
4444
if multioutput == "raw_values":
45-
return output_errors
46-
elif multioutput == "uniform_average":
47-
# pass None as weights to np.average: uniform mean
48-
multioutput = None
45+
if compute:
46+
return output_errors.compute()
47+
else:
48+
return output_errors
4949
else:
5050
raise ValueError("Weighted 'multioutput' not supported.")
5151
result = output_errors.mean()
@@ -67,12 +67,75 @@ def mean_absolute_error(
6767
_check_sample_weight(sample_weight)
6868
output_errors = abs(y_pred - y_true).mean(axis=0)
6969

70-
if isinstance(multioutput, str):
70+
if isinstance(multioutput, str) or multioutput is None:
7171
if multioutput == "raw_values":
72-
return output_errors
73-
elif multioutput == "uniform_average":
74-
# pass None as weights to np.average: uniform mean
75-
multioutput = None
72+
if compute:
73+
return output_errors.compute()
74+
else:
75+
return output_errors
76+
else:
77+
raise ValueError("Weighted 'multioutput' not supported.")
78+
result = output_errors.mean()
79+
if compute:
80+
result = result.compute()
81+
return result
82+
83+
84+
def mean_absolute_percentage_error(
85+
y_true: ArrayLike,
86+
y_pred: ArrayLike,
87+
sample_weight: Optional[ArrayLike] = None,
88+
multioutput: Optional[str] = "uniform_average",
89+
compute: bool = True,
90+
) -> ArrayLike:
91+
"""Mean absolute percentage error regression loss.
92+
93+
Note here that we do not represent the output as a percentage in range
94+
[0, 100]. Instead, we represent it in range [0, 1/eps]. Read more in
95+
https://scikit-learn.org/stable/modules/model_evaluation.html#mean-absolute-percentage-error
96+
97+
Parameters
98+
----------
99+
y_true : array-like of shape (n_samples,) or (n_samples, n_outputs)
100+
Ground truth (correct) target values.
101+
y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs)
102+
Estimated target values.
103+
sample_weight : array-like of shape (n_samples,), default=None
104+
Sample weights.
105+
multioutput : {'raw_values', 'uniform_average'} or array-like
106+
Defines aggregating of multiple output values.
107+
Array-like value defines weights used to average errors (not currently supported: passing weights raises ``ValueError``).
108+
If input is list then the shape must be (n_outputs,).
109+
'raw_values' :
110+
Returns a full set of errors in case of multioutput input.
111+
'uniform_average' :
112+
Errors of all outputs are averaged with uniform weight.
113+
compute : bool
114+
Whether to compute this result (default ``True``)
115+
116+
Returns
117+
-------
118+
loss : float or array-like of floats in the range [0, 1/eps]
119+
If multioutput is 'raw_values', then mean absolute percentage error
120+
is returned for each output separately.
121+
If multioutput is 'uniform_average' or ``None``, then the
122+
equally-weighted average of all output errors is returned.
123+
MAPE output is non-negative floating point. The best value is 0.0.
124+
But note the fact that bad predictions can lead to arbitrarily large
125+
MAPE values, especially if some y_true values are very close to zero.
126+
Note that we return a large value instead of `inf` when y_true is zero.
127+
"""
128+
_check_sample_weight(sample_weight)
129+
epsilon = np.finfo(np.float64).eps
130+
mape = abs(y_pred - y_true) / da.maximum(y_true, epsilon)
131+
output_errors = mape.mean(axis=0)
132+
133+
if isinstance(multioutput, str) or multioutput is None:
134+
if multioutput == "raw_values":
135+
if compute:
136+
return output_errors.compute()
137+
else:
138+
return output_errors
76139
else:
77140
raise ValueError("Weighted 'multioutput' not supported.")
78141
result = output_errors.mean()
@@ -90,7 +153,7 @@ def r2_score(
90153
compute: bool = True,
91154
) -> ArrayLike:
92155
_check_sample_weight(sample_weight)
93-
_, y_true, y_pred, multioutput = _check_reg_targets(y_true, y_pred, multioutput)
156+
_, y_true, y_pred, _ = _check_reg_targets(y_true, y_pred, multioutput)
94157
weight = 1.0
95158

96159
numerator = (weight * (y_true - y_pred) ** 2).sum(axis=0, dtype="f8")

docs/source/index.rst

+2-1
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ Dask-ML
55
=======
66

77
Dask-ML provides scalable machine learning in Python using Dask_ alongside
8-
popular machine learning libraries like Scikit-Learn_, XGBoost, and others.
8+
popular machine learning libraries like Scikit-Learn_, XGBoost_, and others.
99

1010
You can try Dask-ML on a small cloud instance by clicking the following button:
1111

@@ -132,3 +132,4 @@ See :doc:`Dask-ML + XGBoost <xgboost>` for more information.
132132

133133
.. _Dask: https://dask.org/
134134
.. _Scikit-Learn: http://scikit-learn.org/
135+
.. _XGBoost: https://ml.dask.org/xgboost.html

docs/source/modules/api.rst

+1
Original file line numberDiff line numberDiff line change
@@ -245,6 +245,7 @@ Regression Metrics
245245
:toctree: generated/
246246

247247
metrics.mean_absolute_error
248+
metrics.mean_absolute_percentage_error
248249
metrics.mean_squared_error
249250
metrics.mean_squared_log_error
250251
metrics.r2_score

tests/metrics/test_regression.py

+65-1
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,32 @@
11
import numbers
22

33
import dask.array as da
4+
import numpy as np
45
import pytest
56
import sklearn.metrics
7+
from dask.array.utils import assert_eq
68

79
import dask_ml.metrics
10+
from dask_ml._compat import SK_024
811

12+
_METRICS_TO_TEST = [
13+
"mean_squared_error",
14+
"mean_absolute_error",
15+
"r2_score",
16+
]
917

10-
@pytest.fixture(params=["mean_squared_error", "mean_absolute_error", "r2_score"])
18+
# mean_absolute_percentage_error() was added in scikit-learn 0.24.0
19+
if SK_024:
20+
_METRICS_TO_TEST.append("mean_absolute_percentage_error")
21+
22+
23+
@pytest.fixture(params=_METRICS_TO_TEST)
1124
def metric_pairs(request):
1225
"""Pairs of (dask-ml, sklearn) regression metrics.
1326
1427
* mean_squared_error
1528
* mean_absolute_error
29+
* mean_absolute_percentage_error (if scikit-learn >= 0.24.0)
1630
* r2_score
1731
"""
1832
return (
@@ -60,3 +74,53 @@ def test_mean_squared_log_error():
6074
result = m1(a, b)
6175
expected = m2(a, b)
6276
assert abs(result - expected) < 1e-5
77+
78+
79+
@pytest.mark.parametrize("multioutput", ["uniform_average", None])
80+
def test_regression_metrics_unweighted_average_multioutput(metric_pairs, multioutput):
81+
m1, m2 = metric_pairs
82+
83+
a = da.random.uniform(size=(100,), chunks=(25,))
84+
b = da.random.uniform(size=(100,), chunks=(25,))
85+
86+
result = m1(a, b, multioutput=multioutput)
87+
expected = m2(a, b, multioutput=multioutput)
88+
assert abs(result - expected) < 1e-5
89+
90+
91+
@pytest.mark.parametrize("compute", [True, False])
92+
def test_regression_metrics_raw_values(metric_pairs, compute):
93+
m1, m2 = metric_pairs
94+
95+
if m1.__name__ == "r2_score":
96+
pytest.skip("r2_score does not support multioutput='raw_values'")
97+
98+
a = da.random.uniform(size=(100, 3), chunks=(25, 3))
99+
b = da.random.uniform(size=(100, 3), chunks=(25, 3))
100+
101+
result = m1(a, b, multioutput="raw_values", compute=compute)
102+
expected = m2(a, b, multioutput="raw_values")
103+
104+
if compute:
105+
assert isinstance(result, np.ndarray)
106+
else:
107+
assert isinstance(result, da.Array)
108+
109+
assert_eq(result, expected)
110+
assert result.shape == (3,)
111+
112+
113+
def test_regression_metrics_do_not_support_weighted_multioutput(metric_pairs):
114+
m1, _ = metric_pairs
115+
116+
a = da.random.uniform(size=(100, 3), chunks=(25, 3))
117+
b = da.random.uniform(size=(100, 3), chunks=(25, 3))
118+
weights = da.random.uniform(size=(3,))
119+
120+
if m1.__name__ == "r2_score":
121+
error_msg = "'multioutput' must be 'uniform_average'"
122+
else:
123+
error_msg = "Weighted 'multioutput' not supported."
124+
125+
with pytest.raises((NotImplementedError, ValueError), match=error_msg):
126+
_ = m1(a, b, multioutput=weights)

0 commit comments

Comments
 (0)