Skip to content

Commit b2689d3

Browse files
committed
Improve docstrings and remove redundant checks (fixes dask#734) (dask#808)
1 parent d762673 commit b2689d3

File tree

2 files changed

+26
-29
lines changed

2 files changed

+26
-29
lines changed

dask_ml/decomposition/incremental_pca.py

+1-2
Original file line numberDiff line numberDiff line change
@@ -128,11 +128,11 @@ def __init__(
128128
n_components=None,
129129
whiten=False,
130130
center=True,
131-
copy=True,
132131
batch_size=None,
133132
svd_solver="auto",
134133
iterated_power=0,
135134
random_state=None,
135+
copy=True,
136136
):
137137
self.n_components = n_components
138138
self.whiten = whiten
@@ -144,7 +144,6 @@ def __init__(
144144
self.random_state = random_state
145145

146146
def _check_params(self):
147-
super()._check_params()
148147
if self.center is False:
149148
raise ValueError("IncrementalPCA with center=False is not supported.")
150149

dask_ml/decomposition/pca.py

+25-27
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ class PCA(sklearn.decomposition.PCA):
5151
ignored
5252
5353
whiten : bool, optional (default False)
54-
When True (False by default) the `components_` vectors are multiplied
54+
When True (False by default) the ``components_`` vectors are multiplied
5555
by the square root of n_samples and then divided by the singular values
5656
to ensure uncorrelated outputs with unit component-wise variances.
5757
@@ -60,25 +60,10 @@ class PCA(sklearn.decomposition.PCA):
6060
improve the predictive accuracy of the downstream estimators by
6161
making their data respect some hard-wired assumptions.
6262
63-
center : bool, optional (default True)
64-
When False (True by default), the underlying data gets centered at zero
65-
by subtracting the mean of the data from the data itself.
66-
67-
PCA is performed on centered data due to its being a regression model,
68-
without an intercept. As such, its pricipal components originate at the
69-
origin of the transformed space.
70-
71-
`center` set to False may be employed when performing PCA on already
72-
centered data.
73-
74-
Since centering is a required step as part of whitening, `center` set
75-
to False and `whiten` set to True is a combination which may result in
76-
unexpected behavior, if performed on not previously centered data.
77-
7863
svd_solver : string {'auto', 'full', 'tsqr', 'randomized'}
7964
auto :
80-
the solver is selected by a default policy based on `X.shape` and
81-
`n_components`: if the input data is larger than 500x500 and the
65+
the solver is selected by a default policy based on ``X.shape`` and
66+
``n_components``: if the input data is larger than 500x500 and the
8267
number of components to extract is lower than 80% of the smallest
8368
dimension of the data, then the more efficient 'randomized'
8469
method is enabled. Otherwise the exact full SVD is computed and
@@ -99,7 +84,22 @@ class PCA(sklearn.decomposition.PCA):
9984
If int, random_state is the seed used by the random number generator;
10085
If RandomState instance, random_state is the random number generator;
10186
If None, the random number generator is the RandomState instance used
102-
by `da.random`. Used when ``svd_solver`` == 'randomized'.
87+
by ``da.random``. Used when ``svd_solver`` == 'randomized'.
88+
89+
center : bool, optional (default True)
90+
When True (the default), the underlying data gets centered at zero
91+
by subtracting the mean of the data from the data itself.
92+
93+
PCA is performed on centered data because it is a regression model
94+
without an intercept. As such, its principal components originate at the
95+
origin of the transformed space.
96+
97+
``center=False`` may be employed when performing PCA on already
98+
centered data.
99+
100+
Since centering is a required step as part of whitening, ``center`` set
101+
to False and ``whiten`` set to True is a combination which may result in
102+
unexpected behavior if performed on data that was not previously centered.
103103
104104
Attributes
105105
----------
@@ -128,7 +128,7 @@ class PCA(sklearn.decomposition.PCA):
128128
mean_ : array, shape (n_features,)
129129
Per-feature empirical mean, estimated from the training set.
130130
131-
Equal to `X.mean(axis=0)`.
131+
Equal to ``X.mean(axis=0)``.
132132
133133
n_components_ : int
134134
The estimated number of components. When n_components is set
@@ -197,20 +197,22 @@ class PCA(sklearn.decomposition.PCA):
197197
``dask.linalg.svd_compressed``.
198198
* n_components : ``n_components='mle'`` is not allowed.
199199
Fractional ``n_components`` between 0 and 1 is not allowed.
200-
* center : defaults to ``True`` and enables control over whether centering
201-
gets implicitly performed as part of the PCA model steps.
200+
* center : if ``True`` (the default), automatically center input data before
201+
performing PCA.
202+
Set this parameter to ``False`` if the input data have already been
203+
centered before running ``fit()``.
202204
"""
203205

204206
def __init__(
205207
self,
206208
n_components=None,
207209
copy=True,
208210
whiten=False,
209-
center=True,
210211
svd_solver="auto",
211212
tol=0.0,
212213
iterated_power=0,
213214
random_state=None,
215+
center=True,
214216
):
215217
self.n_components = n_components
216218
self.copy = copy
@@ -221,14 +223,10 @@ def __init__(
221223
self.iterated_power = iterated_power
222224
self.random_state = random_state
223225

224-
def _check_params(self):
225-
pass
226-
227226
def fit(self, X, y=None):
228227
if not dask.is_dask_collection(X):
229228
raise TypeError(_TYPE_MSG.format(type(X)))
230229

231-
self._check_params()
232230
self._fit(X)
233231
self.n_features_in_ = X.shape[1]
234232
return self

0 commit comments

Comments
 (0)