-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstats.py
526 lines (453 loc) · 20 KB
/
stats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
#!/usr/bin/env python3
"""
Assorted statistical methods, e.g. for fitting non-linear functions.
"""
from __future__ import division
import itertools
import numpy as np
from scipy import stats
from scipy.optimize import curve_fit, minimize
#### Function definitions ####
def sigmoid(x, x0, k):
"""
Sigmoid (logistic) function.
Arguments
---------
x : array like, required
Values to plot over.
x0 : float, required
Mid-point of function (y=0.5 when x=x0).
k : float, required
Slope parameter of function.
Returns
-------
y : numpy array
y-values of function.
"""
return 1 / ( 1 + np.exp(-k*(x-x0)) )
def weibullCDF(x, gamma, k):
"""
Weibull cumulative density function.
Arguments
---------
x : array like, required
Values to plot over.
gamma : float, required
Scale parameter. Parameter is often referred to as lambda, but that
name has a special meaning in Python so we use gamma instead here.
k : float, required
Shape parameter.
Returns
-------
y : numpy array
y-values of function.
"""
y = np.zeros_like(x, dtype=float)
y[x>=0] = 1 - np.exp(-(x[x>=0]/gamma)**k)
return y
def gaussCDFdiff(x, mu1, sigma1, mu2, sigma2):
"""
Takes difference between 2 Gaussian CDFs - result is like a Gaussian PDF
but with the possibility of a sustained response in the central region.
Could be used to fit responses from a 2IFC task where one interval is
fixed across trials and the other is variable.
Arguments
---------
x : array like, required
x values to plot over.
mu1, sigma1 : float, required
Mean and standard deviation for the first (leftmost) CDF.
mu2, sigma2 : float, required
Mean and standard deviation for the second (rightmost) CDF.
Returns
-------
f : numpy array
Resutling function, defined as CDF1 - CDF2
"""
return stats.norm.cdf(x, mu1, sigma1) - stats.norm.cdf(x, mu2, sigma2)
##### Class definitions #####
class _BaseFitFunction(object):
"""
Base class arguments
--------------------
func : function instance, optional
Non-linear function to fit. Must accept an array of x-values as its
first argument, and then any further arguments should be function
parameters that are to be estimated via the fitting process. Default
is to use a Gaussian CDF.
invfunc : function instance or None, optional
Inverse of main function. Only necessary if wanting to use the
.getXForY method.
fit_method : 'mle' or 'lsq', optional
Fitting method to use. Pass 'mle' to use maximum-likelihood estimation
(default), or 'lsq' to use non-linear least squares.
ymin : float, optional
Expected minimum value of y (e.g. use to adjust for chance level)
lapse : float | 'optim', optional
Lapse parameter (expected ymax = 1 - lapse). Can also specify as str
'optim' to add this as an additional optimisation parameter - note
that in this case the lapse parameter must then be appended as the
final value in any starting parameters, bounds, etc.
Base class methods
------------------
.doFit
Performs function fitting.
.doInterp
Returns interpolated values for x and y variables, e.g. for plotting.
.getFittedParams
Return fitted parameters, assuming .doFit has already been run.
.getXForY
Use inverse function to get x-value for given y-value.
"""
"""
Separate docstring so it's not inherited by child classes. Base class
provides methods for performing function fits. May pass to child classes.
If wanting to do MLE, child class must implement a method .negLogLik,
taking an array of parameters as its only argument, and returning
a single-value giving the negative log-likelihood.
"""
def __init__(self, func=stats.norm.cdf, invfunc=stats.norm.ppf,
fit_method='mle', ymin=0, lapse=0):
# Error check
if fit_method not in ['lsq','mle']:
raise ValueError("fit_method must be one of 'lsq' or 'mle', " \
"but received: {}".format(fit_method))
if not ( (isinstance(lapse, str) and lapse == 'optim') \
or isinstance(lapse, (int, float)) ):
raise ValueError("lapse must be numeric or 'optim', " \
"but received: {}".format(lapse))
# Assign args to class
self.fit_method = fit_method
self.ymin = ymin
self.lapse = lapse
self.fit = None # place holder for fit results
self.selected_x0 = None # place holder for grid search results
# Assign funcs last as setter methods need access to other attributes
self.func = func
self.invfunc = invfunc
@property
def func(self):
return self._func
@func.setter
def func(self, fun):
"""Setter adjusts forward func for ymin and lapse params"""
a = self.ymin
b = self.lapse
if isinstance(b, (int, float)):
self._func = lambda x, *p: a + (1 - a - b) * np.asarray(fun(x, *p))
else: # optimise
self._func = lambda x, *p: a + (1 - a - p[-1]) \
* np.asarray(fun(x, *p[:-1]))
@property
def invfunc(self):
return self._invfunc
@invfunc.setter
def invfunc(self, fun):
"""Setter adjusts inverse func for ymin and lapse params"""
if fun is None:
self._invfunc = None
else:
a = self.ymin
b = self.lapse
if isinstance(b, (int, float)):
self._invfunc = lambda y, *p: fun((np.asarray(y) - a) \
/ (1-a-b), *p)
else: # optimise
self._invfunc = lambda y, *p: fun((np.asarray(y) - a) \
/ (1 - a - p[-1]), *p[:-1])
def _assertFitDone(self):
if self.fit is None:
raise RuntimeError('Must call .doFit method first')
def doFit(self, *args, **kwargs):
"""
Performs the function fit. May include further positional and / or
keyword arguments to be passed to the relevant optimization function.
These vary depending on the fitting method chosen:
* If using MLE, these should be additional arguments for the
scipy.optimize.minimize function. Note that this requires a first
positional argument or keyword argument 'x0' containing initial
guesses for the non-linear function parameters.
* If using non-linear least squares, should be additional arguments for
the scipy.optimize.curve_fit function. Although not required, it is
recommended that you supply a first positional argument or keyword
argument 'p0' containing initial guesses for the non-linear
function parameters.
* See the documentation for the minimize / curve_fit functions for
more details on other available arguments.
* If optimising lapse parameter (self.lapse == 'optim') then this
parameter must be included as the final value in any relevant args.
* Starting parameters can also be specified as a list of lists, where
each inner list contains a range of values for a given parameter
(i.e. each inner list represents a function parameter in turn).
In this case a grid search is performed over all parameter
combinations, and the best performing set is selected for the
optimisation procedure. The selected values will be stored in the
.selected_x0 attribute.
Results are stored within the .fit attribute of this class, and can
also be accessed with the .getFittedParams method.
"""
# Pop out starting params (so we can check for grid search)
if self.fit_method == 'mle' and 'x0' in kwargs:
x0 = kwargs.pop('x0')
elif self.fit_method == 'lsq' and 'p0'in kwargs:
x0 = kwargs.pop('p0')
else:
args = list(args)
x0 = args.pop(0)
# Grid search?
if all(hasattr(p, '__iter__') for p in x0):
x0grid = list(itertools.product(*x0))
errs = []
for p in x0grid:
if self.fit_method == 'mle':
err = self.negLogLik(p)
elif self.fit_method == 'lsq':
err = sum((self.func(self.x, *p) - self.y)**2)
errs.append(err)
self.selected_x0 = x0 = x0grid[np.argmin(errs)]
# Main fit
with np.errstate(divide='ignore', invalid='ignore'):
if self.fit_method == 'mle':
self.fit = minimize(self.negLogLik, x0, *args, **kwargs)
elif self.fit_method == 'lsq':
self.fit = curve_fit(self.func, self.x, self.y, x0,
*args, **kwargs)
def getFittedParams(self):
"""
Return fitted parameters if .doFit has been run.
"""
self._assertFitDone()
if self.fit_method == 'mle':
return self.fit.x
elif self.fit_method == 'lsq':
return self.fit[0]
def getXForY(self, y):
"""
Use inverse function to get corresponding x-value for given y-value.
Requires that an inverse function has been supplied and .doFit method
has been run.
"""
params = self.getFittedParams()
if self.invfunc is None:
raise RuntimeError('Inverse function was not supplied')
return self.invfunc(y, *params)
def doInterp(self, npoints=100, interpX=None):
"""
Returns an (x,y) tuple of values interpolated along x-dimension(s),
which can be used for plotting. If original x was 1D then both
interpolated x and y will be <npoints> long 1D vectors. If original x
was > 1D then interpolated x will be an <ndims> long list of <npoints>
1D vectors, and interpolated y will be an ([npoints] * ndims)
n-dimensional array.
Arguments
---------
npoints : int or list of ints, optional
Number of points to interpolate. For multivariate cases, can also
specify a list of values for each dimension (i.e. column of
original x array) to interpolate a different number of points
for each one. Ignored if <interpX> is not None.
interpX : array-like or None, optional
Interpolated x-values to calculate y-values over. If None, will
create a default range (see <npoints>). For multivariate cases,
should be an (N * M) array, where M is the number of dimensions,
and N is the number of pairwise combinations between x-values
across those dimensions.
Returns
-------
interpX, interpY
Interpoalted x- and y-values respectively.
"""
params = self.getFittedParams()
# For 1D problems, can just interpolate x as is
if self.x.ndim == 1:
if interpX is None:
interpX = np.linspace(self.x.min(), self.x.max(), npoints)
interpY = self.func(interpX, *params)
# else > 1D, needs a bit more work
else:
dims = self.x.shape[1]
# Ensure have npoints for each dim
if not hasattr(npoints, '__iter__'):
npoints = [npoints] * dims
if interpX is None:
# Construct interpolated vectors for each dim
interpX = [np.linspace(xx.min(), xx.max(), n) for (n, xx) in \
zip(npoints, self.x.T)]
# Take all pairwise combinations of vectors - easy way is to
# make a meshgrid of all dims, then flatten and zip each one
interp_mesh = np.meshgrid(*interpX)
interpX2 = np.asarray(list(zip(*map(np.ravel, interp_mesh))))
# Pass through function, reshape to array
interpY = self.func(interpX2, *params).reshape(npoints[::-1])
# Return
return (interpX, interpY)
class FitFunction(_BaseFitFunction):
"""
Class provides functions for fitting a non-linear function via maximum
likelihood (using a normal PDF) or non-linear least squares.
Arguments
---------
x, y : array like, required
Predictor and outcome variables, respectively, which the function is to
be fit to. x can either be an (nsamples,) 1D array for univariate
cases, or an (nsamples * ndims) 2D array for multivariate cases.
y should always be an (nsamples,) 1D array.
mle_func : function instance
Function for evaluating log-likelihood of residuals. Ignored if
<fit_method> is 'lsq'. Should accept array of y-axis residual values
as its only argument. Default is a Gaussian PDF.
*args, **kwargs :
Futher arguments passed to base class (see below)
Methods
-------
.negLogLik
Return negative log-likelihood for a given set of params
See also
--------
* MLEBinomFitFunction - Class for fitting a non-linear function to data
derived from Bernoulli trials via MLE using a Binomial PMF.
Example usage
-------------
Generate some (slightly) noisy data from a Gaussian CDF.
>>> from scipy import stats
>>> true_mu, true_sigma = -3, 1.5
>>> x = np.linspace(-10, 10, 30)
>>> y = stats.norm.cdf(x, true_mu, true_sigma)
>>> ynoise = y + (np.random.rand(len(y)) - 0.5) * 0.3
Fit the function using MLE. For both the initial guess (x0) and the bound
values, the parameters refer to the Gaussian CDF parameters (mu, sigma).
Bounds are specified as (min, max) tuples for each of parameter in turn.
>>> mle_fit = FitFunction(x, ynoise, fit_method='mle')
>>> mle_x0 = (0, 1)
>>> mle_bounds = ( (min(x), max(x)), (1e-5, None))
>>> mle_fit.doFit(x0=mle_x0, bounds=mle_bounds, method='L-BFGS-B')
>>> mle_params = mle_fit.getFittedParams()
Fit the function using non-linear least squares. For both the initial
guess (p0) and the bound values, the parameters refer to the Gaussian CDF
parameters (mu, sigma). Bounds are specified as two tuples giving the
minimum and maximum values respectively, each containing values for each
of the function parameters in turn.
>>> lsq_fit = FitFunction(x, ynoise, fit_method='lsq')
>>> lsq_p0 = (0, 1)
>>> lsq_bounds = ( (min(x), 1e-5), (max(x), np.inf) )
>>> lsq_fit.doFit(p0=lsq_p0, bounds=lsq_bounds, method='trf')
>>> lsq_params = lsq_fit.getFittedParams()
Inspect function fits.
>>> print('True parameters: mu = {0}, sigma = {1}' \\
... .format(true_mu, true_sigma))
>>> print('MLE parameter estimates: mu = {0}, sigma = {1}' \\
... .format(*mle_params))
>>> print('LSQ parameter estimates: mu = {0}, sigma = {1}' \\
... .format(*lsq_params))
Make a plot.
>>> mle_interpX, mle_interpY = mle_fit.doInterp()
>>> lsq_interpX, lsq_interpY = lsq_fit.doInterp()
>>> plt.figure()
>>> plt.plot(x, y, 'k-', label='True fit')
>>> plt.plot(x, ynoise, 'go', label='Noisy data')
>>> plt.plot(mle_interpX, mle_interpY, 'r--', label='MLE fit')
>>> plt.plot(lsq_interpX, lsq_interpY, 'b:', label='LSQ fit')
>>> plt.legend()
>>> plt.show()
"""
__doc__ += _BaseFitFunction.__doc__
def __init__(self, x, y, mle_func=stats.norm.pdf, *args, **kwargs):
self.x = np.asarray(list(x))
self.y = np.asarray(list(y))
self.mle_func = mle_func
super(FitFunction, self).__init__(*args, **kwargs)
def negLogLik(self, params):
"""
Computes negative log likelihood for a given set of function params.
"""
ypred = self.func(self.x, *params)
resid = self.y - ypred
return -np.log(self.mle_func(resid).clip(min=1e-10)).sum()
class MLEBinomFitFunction(_BaseFitFunction):
"""
Class contains functions for fitting a non-linear function in the special
case where the data are derived from Bernoulli trials, i.e. where an event
either happens or doesn't. Fit is done via MLE using a binomial PMF.
Arguments
---------
x : array-like, required
The values along which the predictor variable varies. May be bin values
in the case where x is discrete, or just a list of the individual
x-values where x is continuous. Can be either an (nsamples,) 1D
array for univariate cases, or an (nsamples * ndims) 2D array for
multivarariate cases.
counts : array-like of ints, required
The counts (sums) of the number of occurences of the measured event
happening for each level of x. These can be the 0 and 1 values from the
Bernoulli trials themselves if x is continous.
n : int or array-like of ints, required
The total number of trials. Can be a single integer in which case the
same number is assumed for all levels of x, or a list of separate
integers for each level of x. This can be set to 1 if x is continuous.
*args, **kwargs :
Further arguments passed to base class (see below). Note that the
<fit_method> argument will be overidden and set to 'mle'.
Methods
-------
.negLogLik
Return negative log-likelihood for a given set of params
Example usage
-------------
Generate some random data. We will have x-values in 11 discrete bins
ranging from -10 to +10. For each bin we will simulate a random number of
trials (between 5 and 8), generating a random set of binary responses
(i.e. as per a Bernoulli trial) with the probability of responding 0 or 1
at the given x-value determined by a Gaussian CDF.
>>> from scipy import stats
>>> true_mu, true_sigma = -2, 3
>>> x = np.linspace(-10, 10, 11)
>>> responses = []
>>> for this_x in x:
... this_n = np.random.choice((5,6,7,8))
... p = stats.norm.cdf(this_x, true_mu, true_sigma)
... these_resps = np.random.choice((0,1), size=this_n, p=(1-p,p))
... responses.append(these_resps)
>>> counts = list(map(sum, responses))
>>> n = list(map(len, responses))
Fit the function. For both the inital guess (x0) and the bound values,
parameters refer to those of the Gaussian CDF parameters (mu, sigma).
Bounds are specified as (min, max) tuples for each of the parameters in
turn.
>>> fit = MLEBinomFitFunction(x, counts, n)
>>> x0 = (0, 1)
>>> bounds = ( (min(x), max(x)), (1e-5, None) )
>>> fit.doFit(x0=x0, bounds=bounds, method='L-BFGS-B')
>>> mle_params = fit.getFittedParams()
Inspect function fits.
>>> print('True parameters: mu = {0}, sigma = {1}' \\
... .format(true_mu, true_sigma))
>>> print('MLE parameter estimates: mu = {0}, sigma = {1}' \\
... .format(*mle_params))
Make a plot.
>>> interpX, interpY = fit.doInterp()
>>> plt.figure()
>>> plt.plot(interpX, stats.norm.cdf(interpX, true_mu, true_sigma),
... 'k-', label='True fit')
>>> plt.plot(x, list(map(np.mean, responses)), 'bo', label='Mean response')
>>> plt.plot(interpX, interpY, 'r--', label='MLE fit')
>>> plt.legend()
>>> plt.show()
"""
__doc__ += _BaseFitFunction.__doc__
def __init__(self, x, counts, n, *args, **kwargs):
self.x = np.asarray(list(x))
self.counts = np.asarray(list(counts))
if hasattr(n, '__iter__'):
self.n = np.asarray(list(n))
else:
self.n = n
kwargs['fit_method'] = 'mle' # force MLE
super(MLEBinomFitFunction, self).__init__(*args, **kwargs)
def negLogLik(self, params):
"""
Computes negative log likelihood via fitting a Binomial PMF, using
probability values generated by the given non-linear function.
Params should be for the non-linear function.
"""
p = self.func(self.x, *params)
return -np.log(stats.binom.pmf(self.counts, self.n, p).clip(min=1e-10)).sum()