-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathMultiple imputation.R
74 lines (57 loc) · 2.35 KB
/
Multiple imputation.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
################
# use mice packge to do multiple imputations
################
library(mice)
print(nhanes)
## examining missing patterns
md.pattern(nhanes)
p <- md.pairs(nhanes)
p
## multiple imputations
imp <- mice(nhanes, seed = 23109)
print(imp)
## diagnostic analysis - skipped
## modeling
fit <- with(imp, lm(chl ~ age + bmi))
print(fit) ## one model per imputation
pool(fit) ## pool to form one solution
summary(pool(fit)) ## lambda is the proportion of the total variance that is attributable to the missing data
## increase the number of imputations
imp50 <- mice(nhanes, m = 50, seed = 23109)
fit50 <- with(imp50, lm(chl ~ age + bmi))
summary(pool(fit50))
## choose approriate models to imputate different types of variables
imp_meth <- mice(nhanes, method = c("", "norm", "pmm", "mean"))
## for more information, see
## Van Buuren, S., Groothuis-Oudshoorn, K. (2011). mice: Multivariate Imputation by Chained Equations in R. Journal of Statistical Software, 45(3), 1-67. http://www.jstatsoft.org/v45/i03/
################
# use mi packge to do multiple imputations
################
library(mi)
### load a dataset into R
data(nlsyV, package = "mi")
## convert it to an enchanced_data_classs called missing_data.frame
mdf <- missing_data.frame(nlsyV)
## show details about missing patterns
show(mdf)
## change variable types to approriate ones
mdf <- change(mdf, y = c("income", "momrace"), what = "type", to = c("non", "un")) ## un = unordered-categorical; non = nonnegative-continuous
## use class?missing_variable to see more details
show(mdf) ## check again
summary(mdf)
image(mdf)
hist(mdf)
## multiple imputation
imputations <- mi(mdf, n.iter = 30, n.chains = 4, max.minutes = 20) ## 迭代次数; 链数; 最长运算时间
show(imputations)
## 如果迭代次数足够大,那么运算收敛,每个运算链结果类似,否则不收敛。可以检查每个运算链得到结果的均值是否类似。
round( mipply(imputations, mean, to.matrix = TRUE), 3)
## if not converged, keep going
imputations <- mi(imputations, n.iter = 5)
## can use visualization to examine the result
# plot(imputations, y=c("ppvtr.36"))
## modeling
mod <- pool(ppvtr.36 ~ first + b.marr + income + momage + momed + momrace, data = imputations, m = 4) ## use 4 imputed datasets
summary(mod)
display(mod) ## bayesglm
## for more information, see > vignette("mi_vignette", package="mi")