-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathzmll.py
201 lines (169 loc) · 6.89 KB
/
zmll.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
from scipy.optimize import minimize
import numpy as np
from scipy import stats
from statsmodels.formula.api import ols
import pandas as pd
import sympy as sym
from scipy.misc import derivative
import scipy.optimize as opt
import warnings
warnings.filterwarnings('ignore')
def length(x):
if type(x) == float or type(x) == int or type(x) == np.int32 or type(x) == np.float64 or type(x) == np.float32 or type(x) == np.int64:
return 1
return len(x)
def zetafun(x):
'''
Zeta Function
Description
Zeta function, internal
Usage
zetafun(x)
Parameters
----------
x:
For zetafun, a vector or matrix whose real values must be greater than
or equal to 1.
Details
-------
This functions are not intended to be called by the user. zetafun is a
condensed version of the Riemann's zeta function given in R's VGAM package.
Please use that reference if looking to directly implement Riemann's zeta
function. The function we have included is done so out of convenience.
References
----------
Derek S. Young (2010). tolerance: An R Package for Estimating Tolerance
Intervals. Journal of Statistical Software, 36(5), 1-39.
URL http://www.jstatsoft.org/v36/i05/.
Yee, T. (2010), The VGAM Package for Categorical Data Analysis, Journal of
Statistical Software, 32, 1–34.
Example
-------
zetafun([2,3,4,5,6])
'''
x = np.array(x)
if any(x < 1):
return "Invalid input for Riemann's zeta function."
a = 12
k = 8
B = [1/6, -1/30, 1/42, -1/30, 5/66, -691/2730, 7/6, -3617/510]
ans = []
for i in range(1,a):
ans.append(1/i**x[i%length(x)-1])
ans = np.array(ans)
ans = np.sum(ans)
ans = ans + 1/((x - 1) * a**(x - 1)) + 1/(2 * a**x)
term = (x/2)/a**(x + 1)
ans = ans + term * B[0]
for mm in range(1,k):
term = term * (x + 2 * mm - 2) * (x + 2 * mm - 3)/(a * a * 2 * mm * (2 * mm - 1))
ans = ans + term * B[mm]
return ans
def zmll(x, N = None, s = 1, b = 1, dist = 'Zipf'):
'''
Maximum Likelihood Estimation for Zipf-Mandelbrot Models
Description
Performs maximum likelihood estimation for the parameters of the Zipf,
Zipf-Mandelbrot, and zeta distributions.
Usage
zmll(x, N = None, s = 1, b = 1, dist = ["Zipf", "Zipf-Man", "Zeta"])
Parameters
----------
x:
A vector of raw data or a table of counts which is distributed
according to a Zipf, Zipf-Mandelbrot, or zeta distribution. Do not
supply a vector of counts!
N:
The number of categories when dist = "Zipf" or dist = "Zipf-Man". This
is not used when dist = "Zeta". If N = None, then N is estimated based
on the number of categories observed in the data.
s:
The initial value to estimate the shape parameter, which is set to 1
by default. If a poor initial value is specified, then a warning
message is returned.
b:
The initial value to estimate the second shape parameter when
dist = "Zipf-Man", which is set to 1 by default. If a poor initial
value is specified, then a warning message is returned.
dist:
Options are dist = "Zipf", dist = "Zipf-Man", or dist = "Zeta" if the
data is distributed according to the Zipf, Zipf-Mandelbrot, or zeta
distribution, respectively.
Details
Zipf-Mandelbrot models are commonly used to model phenomena where the
frequencies of categorical data are approximately inversely proportional
to its rank in the frequency table.
Returns
-------
zmll returns a dataframe with coefficients
Note
This function may be updated in a future version of the package so as to
allow greater flexibility with the inputs.
References
Derek S. Young (2010). tolerance: An R Package for Estimating Tolerance
Intervals. Journal of Statistical Software, 36(5), 1-39.
URL http://www.jstatsoft.org/v36/i05/.
Mandelbrot, B. B. (1965), Information Theory and Psycholinguistics. In B.
B. Wolman and E. Nagel, editors. Scientific Psychology, Basic Books.
Zipf, G. K. (1949), Human Behavior and the Principle of Least Effort,
Hafner.
Zornig, P. and Altmann, G. (1995), Unified Representation of Zipf
Distributions, Computational Statistics and Data Analysis, 19, 461–473.
Examples
## Maximum likelihood estimation for randomly generated data
## from the Zipf, Zipf-Mandelbrot, and zeta distributions.
N = 30
s = 2
b = 5
Zdata = [6, 2, 1, 4, 8, 3, 3, 14, 2, 1, 21, 5, 18, 2, 30, 10, 8, 2,
11, 4, 16, 13, 17, 1, 7, 1, 1, 28, 19, 27, 2, 7, 7, 13, 1,
15, 1, 16, 9, 9, 7, 29, 3, 10, 3, 1, 20, 8, 12, 6, 11, 5, 1,
5, 23, 3, 3, 14, 6, 9, 1, 24, 5, 11, 15, 1, 5, 5, 4, 10, 1,
12, 1, 3, 4, 2, 9, 2, 1, 25, 6, 8, 2, 1, 1, 1, 4, 6, 7, 26,
10, 2, 1, 2, 17, 4, 3, 22, 8, 2]
## Zipf
zmll(x = Zdata, N = N, s = s, b = b, dist = 'Zipf')
## Zipf-Mandelbrot
zmll(x = Zdata, N = N, s = s, b = b, dist = 'Zipf-Man')
# Zeta
zmll(x = Zdata, N = np.inf, s = s, b = b, dist = 'Zeta')
'''
x = pd.DataFrame(x)
x = pd.DataFrame(x.value_counts()).T
x.columns = list(range(0,length(x.iloc[0])))
Ntemp = length(x.iloc[0])
x = x.reindex(np.argsort(x.columns),axis=1)
if dist == 'Zeta':
N = Ntemp
if N == None:
N = Ntemp
if N < Ntemp:
return "N cannot be smaller than the maximun number of categories in x!"
Nseq = np.array(list(range(1,N+1)))
zeros = np.zeros(N-length(x.iloc[0]))
zeros = [int(z) for z in zeros]
zeros = pd.DataFrame(zeros).T
zeros.columns = ['']*length(zeros.columns)
x.iloc[0] = x.iloc[0]
x = pd.concat([x,zeros],axis=1)
x = x.iloc[0].to_numpy()
if dist == 'Zipf':
def llzipf(s):
return sum(x*(s*np.log(Nseq)+np.log(sum(1/(Nseq)**s))))
fit = opt.minimize(llzipf, x0=0, method = 'BFGS')['x']
fit = pd.DataFrame(fit,columns = ['s'])
fit.index = ['Coefficients']
if dist == "Zipf-Man":
def llzima(params = [s,b]):
return sum(x*(params[0]*np.log(Nseq+params[1])+np.log(sum(1/(Nseq+params[1])**params[0]))))
fit = opt.minimize(llzima, x0=[0,0],method = 'L-BFGS-B')['x']
fit = pd.DataFrame(fit).T
fit.index = ['Coefficients']
fit.columns = ['s','b']
if dist == "Zeta":
def llzeta(s):
return sum(x*(s*np.log(Nseq)+np.log(zetafun(s))))
fit = opt.minimize(llzeta, x0=1+1e-14, method = 'BFGS')['x']
fit = pd.DataFrame(fit,columns = ['s'])
fit.index = ['Coefficients']
return fit