-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathresid.qmd
211 lines (185 loc) · 4.9 KB
/
resid.qmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
---
title: "Residual Benchmarking"
subtitle: Benchmarking different ways to calculate residuals in R using a number of different packages.
format: html
df-print: paged
---
## Functions for generating matrices
These functions generate test X and Y vectors and matrices.
```{r}
make_dense <- function(nrow, ncol){
rnorm(nrow * ncol, mean = 1, sd = 5) |> matrix(nrow=nrow)
}
make_dense(10, 4)
```
```{r}
make_sparse <- function(nrow, ncol){
zeroes <- sample(c(TRUE, FALSE), size=nrow * ncol * 0.9, replace=TRUE)
`[<-`(make_dense(nrow, ncol), zeroes, 0) |> Matrix::Matrix(sparse=TRUE)
}
```
```{r}
make_y_vector <- function(x){
beta <- ncol(x) |> runif(max=100)
as.numeric(as.matrix(x) %*% beta)
}
make_dense(4, 4) |>
make_y_vector()
```
```{r}
make_y_matrix <- function(x, cols=2){
beta <- (cols * ncol(x)) |> runif(max=100) |> matrix(ncol=2)
x %*% beta
}
make_dense(4, 4) |>
make_y_matrix()
```
## Functions for calculating residuals
These functions take the vector $\vec{y}$ and matrix $X$, and return the residuals resulting from estimating $\vec{y}$ from $X$ using ordinary least squares regression:
$\mathbf{y} - \left(\mathbf {X} ^{\operatorname {T} }\mathbf {X} \right)^{-1}\mathbf {X} ^{\operatorname {T} }\mathbf {y}$
Using standard matrix operations in R:
```{r}
formula_resid <- function(x, y){
y - x %*% solve(t(x) %*% x) %*% t(x) %*% y
}
```
Base R's QR decomposition:
```{r}
qr_resid <- function(x, y){
qr(x) |> qr.resid(as.matrix(y))
}
```
Base R's `lm` function:
```{r}
lm_resid <- function(x, y){
lm(y ~ x) |> resid()
}
```
Base R's `.lm.fit`, which is a lower level function underlying `lm`:
```{r}
lm_fit_resid <- function(x, y){
.lm.fit(x, y)$residuals
}
```
Algebraically calculating the residuals, but using Eigen C++ instead of base R:
```{r}
eigen_resid <- Rcpp::cppFunction("
SEXP fastResidop(const Eigen::Map<Eigen::MatrixXd> X, Eigen::Map<Eigen::MatrixXd> Y){
Eigen::MatrixXd T1 = X.transpose() * X ;
Eigen::MatrixXd T2 = T1.inverse();
Eigen::MatrixXd T3 = X * T2;
Eigen::MatrixXd T4 = X.transpose() * Y;
Eigen::MatrixXd T5 = T3 * T4;
Eigen::MatrixXd T6 = Y - T5;
return Rcpp::wrap(T6);
}
", depends = "RcppEigen")
```
A custom function that uses Eigen + QR decomposition.
This relies on some C++ code:
```{Rcpp, file="resid.cpp"}
```
```{r}
qr_eigen <- function(x, y){
if (inherits(x, "sparseMatrix"))
qr_sparse_residop(x, as.matrix(y))
else
qr_dense_residop(x, y)
}
```
A custom function that uses QR decomposition, using Eigen only if the matrix is sparse:
```{r}
qr_eigen_partial <- function(x, y){
if (inherits(x, "sparseMatrix"))
qr(x) |> qr.resid(as.matrix(y))
else
qr_dense_residop(x, y)
}
```
A built-in function that uses C++ but comes with the `RcppEigen` package:
```{r}
eigen_fastlm_resid <- function(...){
RcppEigen::fastLm(...) |> resid()
}
```
The Rfast package's version of `lm`:
```{r}
rfast_resid <- function(...){
Rfast::lmfit(...)$residuals
}
```
## Benchmark
This function evaluates a given residual calculation and returns the time and memory usage:
```{r}
benchmark <- function(x, y, calculate, repetition, ...){
time <- bench::mark({
result <- try(calculate(x, y), silent = TRUE)
}, min_iterations = repetition)
time |> dplyr::mutate(
result = list(result),
...
)
}
```
Compile all the functions to benchmark:
```{r}
all_funcs = list(
qr=qr_resid,
formula=formula_resid,
lm=lm_resid,
eigen_cpp=eigen_resid,
qr_eigen=qr_eigen,
qr_eigen_partial=qr_eigen_partial,
.lm.fit=lm_fit_resid,
# eigen_fastlm has a number of modes
eigen_fastlm_colpiv_qr = \(x, y) eigen_fastlm_resid(x, y, 0),
eigen_fastlm_unpiv_qr = \(x, y) eigen_fastlm_resid(x, y, 1),
eigen_fastlm_llt_chol = \(x, y) eigen_fastlm_resid(x, y, 2),
eigen_fastlm_ldlt_chol = \(x, y) eigen_fastlm_resid(x, y, 3),
eigen_fastlm_jacobi_svd = \(x, y) eigen_fastlm_resid(x, y, 4),
eigen_fastlm_eig = \(x, y) eigen_fastlm_resid(x, y, 4),
rfast = rfast_resid
)
```
Run the benchmark:
```{r, error=FALSE, message=FALSE, warning=FALSE, eval=FALSE}
tibble::tibble(
name = names(all_funcs),
calculate = all_funcs,
) |>
dplyr::cross_join(
tibble::tibble(
x = list(
make_dense(nrow = 10000, ncol = 500),
make_sparse(nrow = 10000, ncol = 500)
),
x_type = c("dense", "sparse")
)
) |>
dplyr::cross_join(
tibble::tibble(
make_y = list(
make_y_matrix,
make_y_vector
),
y_type = c("y_matrix", "y_vector")
)
) |>
dplyr::mutate(
y = purrr::map2(x, make_y, function(x, make_y) make_y(x)),
make_y = NULL,
repetition = 2,
) |>
dplyr::filter(
# eigen_cpp will crash with a sparse matrix
!(x_type == "sparse" & name == "eigen_cpp")
) |>
purrr::pmap(benchmark) |>
purrr::list_rbind() |>
dplyr::mutate(
success = result |> purrr::map_lgl(function(x){
!(x[[1]] |> attr("condition") |> inherits("error"))
})
) |>
dplyr::select(name, success, x_type, y_type, median, mem_alloc)
```