-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathGotham City Covid 19.Rmd
485 lines (381 loc) · 29.6 KB
/
Gotham City Covid 19.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
---
title: "COVID 19 in Gotham City"
author: "Amy Kim"
output: github_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
```{r include=FALSE,results='hide'}
library(astsa)
library(forecast)
library(knitr)
library(TSA)
```
```{r, echo = FALSE}
covid <- read.csv("~/Desktop/school/stat153/project/projectdata_covid.csv")
Date <- as.Date(covid$Date, "%m/%d/%y")
cases <- covid$New.Cases
month <- format(as.Date(covid$Date, tryFormats = "%m/%d/%y"), "%m")
year <- format(as.Date(covid$Date, tryFormats = "%m/%d/%y"), "%y")
day <- format(as.Date(covid$Date, tryFormats = "%m/%d/%y"), "%d")
weekday <- weekdays(as.Date(covid$Date, tryFormats = "%m/%d/%y"))
covid <- cbind(covid, year, month, day, weekday)
covid$X <- 1:nrow(covid)
X <- covid$X
```
<!-- The following script adds the PACF to sarima() -->
```{r, echo= FALSE}
sarima_wPACF = function (xdata, p, d, q, P = 0, D = 0, Q = 0, S = -1, details = TRUE,
xreg = NULL, Model = TRUE, fixed = NULL, tol = sqrt(.Machine$double.eps),
no.constant = FALSE, max.lag = -1)
{
layout = graphics::layout
par = graphics::par
plot = graphics::plot
grid = graphics::grid
title = graphics::title
polygon = graphics::polygon
abline = graphics::abline
lines = graphics::lines
frequency = stats::frequency
coef = stats::coef
dnorm = stats::dnorm
ppoints = stats::ppoints
qnorm = stats::qnorm
time = stats::time
na.pass = stats::na.pass
trans = ifelse(is.null(fixed), TRUE, FALSE)
trc = ifelse(details, 1, 0)
n = length(xdata)
if (is.null(xreg)) {
constant = 1:n
xmean = rep(1, n)
if (no.constant == TRUE)
xmean = NULL
if (d == 0 & D == 0) {
fitit = stats::arima(xdata, order = c(p, d, q), seasonal = list(order = c(P,
D, Q), period = S), xreg = xmean, include.mean = FALSE,
fixed = fixed, trans = trans, optim.control = list(trace = trc,
REPORT = 1, reltol = tol))
}
else if (xor(d == 1, D == 1) & no.constant == FALSE) {
fitit = stats::arima(xdata, order = c(p, d, q), seasonal = list(order = c(P,
D, Q), period = S), xreg = constant, fixed = fixed,
trans = trans, optim.control = list(trace = trc,
REPORT = 1, reltol = tol))
}
else fitit = stats::arima(xdata, order = c(p, d, q),
seasonal = list(order = c(P, D, Q), period = S),
include.mean = !no.constant, fixed = fixed, trans = trans,
optim.control = list(trace = trc, REPORT = 1, reltol = tol))
}
if (!is.null(xreg)) {
fitit = stats::arima(xdata, order = c(p, d, q), seasonal = list(order = c(P,
D, Q), period = S), xreg = xreg, fixed = fixed, trans = trans,
optim.control = list(trace = trc, REPORT = 1, reltol = tol))
}
if (details) {
old.par <- par(no.readonly = TRUE)
layout(matrix(c(1,2,3), ncol = 3))
par(mfrow = c(1,3))
## Standardized residuals
rs <- fitit$residuals
stdres <- rs/sqrt(fitit$sigma2)
num <- sum(!is.na(rs))
## ACF
alag <- max(10 + sqrt(num), 3 * S, max.lag)
ACF = stats::acf(rs, alag, plot = FALSE, na.action = na.pass)$acf[-1]
LAG = 1:alag/frequency(xdata)
L = 2/sqrt(num)
plot(LAG, ACF, type = "h"
, ylim = c(min(ACF) - 0.1, min(1, max(ACF + 0.4))), cex.axis = 1.5, cex.lab = 1.5)
title("ACF of Residuals", line = 1, cex.main = 2)
abline(h = c(0, -L, L), lty = c(1, 2, 2), col = c(1,4, 4))
## PACF
alag <- max(10 + sqrt(num), 3 * S, max.lag)
PACF = stats::pacf(rs, alag, plot = FALSE, na.action = na.pass)$acf
LAG = 1:alag/frequency(xdata)
L = 2/sqrt(num)
plot(LAG, PACF, type = "h", ylim = c(min(PACF) - 0.1, min(1,max(PACF + 0.4))), cex.axis = 1.5, cex.lab = 1.5)
title("PACF of Residuals", line = 1, cex.main = 2)
abline(h = c(0, -L, L), lty = c(1, 2, 2), col = c(1,4, 4))
##?
nlag <- ifelse(S < 7, 20, 3 * S)
ppq <- p + q + P + Q - sum(!is.na(fixed))
if (nlag < ppq + 8) {
nlag = ppq + 8
}
pval <- numeric(nlag)
for (i in (ppq + 1):nlag) {
u <- stats::Box.test(rs, i, type = "Ljung-Box")$statistic
pval[i] <- stats::pchisq(u, i - ppq, lower.tail = FALSE)
}
plot((ppq + 1):nlag, pval[(ppq + 1):nlag], xlab = "LAG (H)",
ylab = "p value", ylim = c(-0.1, 1), cex.axis = 1.5, cex.lab = 1.5)
title("p values for Ljung-Box statistic", line = 1, cex.main = 2)
abline(h = 0.05, lty = 2, col = "blue")
on.exit(par(old.par))
}
if (is.null(fixed)) {
coefs = fitit$coef
}
else {
coefs = fitit$coef[is.na(fixed)]
}
dfree = fitit$nobs - length(coefs)
t.value = coefs/sqrt(diag(fitit$var.coef))
p.two = stats::pf(t.value^2, df1 = 1, df2 = dfree, lower.tail = FALSE)
ttable = cbind(Estimate = coefs, SE = sqrt(diag(fitit$var.coef)),
t.value, p.value = p.two)
ttable = round(ttable, 4)
k = length(coefs)
n = n - (d + D)
BIC = stats::BIC(fitit)/n
AIC = stats::AIC(fitit)/n
AICc = (n * AIC + ((2 * k^2 + 2 * k)/(n - k - 1)))/n
list(fit = fitit, degrees_of_freedom = dfree, ttable = ttable,
AIC = AIC, AICc = AICc, BIC = BIC)
}
```
# 1 Executive Summary
Gotham City has been fighting the notorious COVID 19 for the last few months and has only recently tracking the number of new cases, from June 20, 2020 to August 18, 2020. There dataset contains a strong weekly seasonal pattern with peaks in the beginning of the week and heteroscedasticity. According to our parametric model with ARMA(0,0)x(1,0)[3] noise, new cases found will spike towards the weekend and drop at the start of the following week. There will also be an overall increase of new COVID 19 cases found in Gotham City for the next 10 day; days with low number of new cases found will be higher than that of the past 60 days.
# 2 Exploratory Data Analysis
The newly found COVID 19 cases in Gotham City (hereafter referred to simply as "cases") has heteroskedasticity, increasing in variance, as seen in the left panel of Figure 1, which prevents the data from being stable and stationary. In addition, there is a trend of the cases generally decreased until the beginning of July and increased since mid-July, forming a parabolic trend.
Also note that there exists a strong weekly seasonal pattern: there are spikes of cases generally during the weekdays and smaller number of cases during the weekends. Since COVID-19 tests shows accurate results after two weeks, those who got COVID-19 during large events such as the Fourth of July and the Black Lives Matter Movement would show positive results around 14 days from the events.
```{r time, echo= F, fig.cap= "Figure 1: New COVID-19 Cases Found in Gotham City. The right panels shows the distribution of cases by days of week.", fig.height = 3.5, fig.width= 12}
par(mfrow=c(1,2), mgp = c(2.5, 1, 0))
plot(Date, cases, ylab = "New COVID-19 Cases", type = "l")
title("New COVID-19 Cases in Gotham City", line = 1, cex.main = 1.5)
boxplot(cases ~ weekday, data = covid, ylab = "Cases", xlab = "Day of Week", cex.axis = 0.8)
title("Disbribution of Cases by Days of Week", line = 1, cex.main = 1.5)
```
When looking closer, as shown through the boxplot of Figure 1, weekends (Saturdays and Sundays) have the lowest medians while on Mondays and Tuesdays, the start of the week, have the highest medians. Possible explanations for these observations are that people social distancing during weekends but are forced to work and leave the house/socialize during the weekdays.
# 3 Models Considered
```{r, echo = FALSE}
data <- read.csv("~/Desktop/school/stat153/project/projectdata_covid.csv")
data$Date <- as.Date(covid$Date, "%m/%d/%y")
data$X <- 1:nrow(data)
```
Two models will be considered to model the natural signal in the dataset, a parametric model and a differencing approach, to pursue stationarity. Both of these models of the signal will be complimented with ARMA models for the remaining noise.
## 3.1 Parametric Signal Model + Variable Stabilizing Transformation (VST)
```{r density, echo= F, fig.width = 6 , fig.height = 3.5, out.width= "50%", out.extra='style="float:left; padding:1px"'}
par(mgp = c(2.5, 1, 0))
mvspec(log(data$New.Cases), ylab = "Spectrum", xlab = "Frequency", main = "")
title("Estimated Spectral Density", line = 1)
title(sub = "Figure 2: Estimated spectral density of COVID-19 cases", line = -12)
```
First, the parametric model is considered. The estimated spectral density is shown in Figure 2. There are several significant peaks, implying multiple sinusoids. In order to get rid of the quadratically increasing heteroskedasticity of the residuals, the logarithm variance stabilizing transformation (VST) will be incorporated.
The first peak of estimated spectral density at frequency close to 0 can be assumed due to the trend of the dataset, thus disregarded. As per the seasonal patterns that were observed in the data, sinusoids of three different frequencies are included. The highest peak occur at frequency around 0.14, which corresponds to period of around 7. This observation alligns with the weekly seasonality mentioned above in the Exploratory Data Analysis. The remaining two significant peaks occur at frequency around 0.08 and 0.28, which corresponds to a period of around 4 and 12 respectively.
The fitted values are created by three sinusoids (detected in Figure 2) with frequencies 0.08, 0.14, and 0.28, capturing seasonality. The sinusoid with the dominant frequency, 9/60, which determines the weekly seasonality, is interacted with time to capture the increasing amplitude over time. In order to capture the parabolic trend, indicator of $t^ 2$ , where t stands for day, is included in the parametric signal model.
```{r parametric, echo = FALSE, fig.cap= "Figure 3: Parametric signal model fit and residuals. The left panel shows this model's fitted values in red, plotted on top of the COVID 19 data in black. The right panel shows the residuals of this model.", fig.height = 3.5, fig.width= 12}
par(mfrow=c(1,2), mgp = c(2.5, 1, 0))
mod1 = lm(log(cases) ~ cos(2*pi*X*(0.14))*X +
sin(2*pi*X*(0.14))*X +
cos(2*pi*X*(0.28)) + sin(2*pi*X*(0.28)) +
cos(2*pi*X*(0.08)) + sin(2*pi*X*(0.08)) +
I(X^2), data = covid)
plot(log(data$New.Cases) ~ as.Date(data$Date, tryFormats - "%m/%d/%Y"), type = "l", ylab = "Log(Cases)", xlab = "Date")
lines(as.Date(data$Date, "%m/%d/%y"), mod1$fitted.values, col = "red")
title("Parametric Signal Model", line = 1, cex.main = 1.5)
plot(mod1$residuals ~ as.Date(data$Date, "%m/%d/%y"), ylab='Residuals', type = "l", xlab = "Date")
title("Residuals of Parametric Signal Model", line = 1, cex.main = 1.5)
```
The signal model shown in the left panel of Figure 3 addresses the seasonality and trend, though the amplitude may not be perfectly alligned. The residual of the signal model seems stationary in the right panel of Figure 3; there are no signs of heteroskedasticity with a constant mean of around 0.
### 3.1.1 Parametric Signal with ARMA(0,0)
The R function `auto.arima()`, with no differencing option specified, suggests ARMA(0,0). This is reasonable because even the significant ACF and PACF value appear to be white noise in Figure 4, thus Model 1 is ARMA(0,0) i.e. p = P = 0 and q = Q = 0.
```{r model1, echo = F, results='hide', fig.keep='all', fig.height = 2, fig.cap= "Figure 4: The left panel is autocorrelation function (ACF) and the middle panel is the partial autocorrelation function (PACF) of both the parametric model residuals and Model 1: ARMA(0,0). The right panel indicates the p values of the Ljung box statistic, with a significance level of 0.05.", fig.height = 3, fig.width= 12}
model1 <- sarima_wPACF(mod1$residuals, p = 0, d = 0, q = 0, S = 0, P = 0, D = 0, Q = 0)
```
The right panel of Figure 4 suggests that although p value of lag 1 is barely above the 0.05 line, all p values of the Ljung-Box statistics are above the 0.05 suggesting that Model 1 can be a fairly good model fit.
### 3.1.2 Parametric Signal with ARMA(0,0)x(1,0)[3]
After pursuing stationary with the parametric signal model, the magnitudes of autocorrelation function (ACF) and partial autocorrelation function (PACF) suggests white noise. However, there is a magnitude slightly beyond the confidence intervals for both ACF and PACF at lag 3 as shown in Figure 5. Thus, Model 2 proposes Q = 1 for S = 3. Q = 1 is chosen instead of P = 1 because the value of PACF in the PACF plot was larger at lag 3 than that of the ACF plot.
```{r model2, echo = F,results='hide',fig.keep='all', fig.height= 2, fig.cap= "Figure 5: The left panel is autocorrelation function (ACF) and the middle panel is the partial autocorrelation function (PACF) of the parametric model residuals and Model 2: ARMA(0,0)x(1,0)[3]. The right panel indicates the p values of the Ljung box statistic, with a significance level of 0.05." , fig.height = 3, fig.width= 12}
model2 <- sarima_wPACF(mod1$residuals, p = 0, d = 0, q = 0, S = 3, P = 0, D = 0, Q = 1)
```
The ACF and PACF plots of the residuals of Model 2 in Figure 5 display that that there are no significant values beyond the confidence intervals. The right panel of Figure 5 suggests that the p values of the Ljung-Box statistics are all above the 0.05 value, implying that Model 2 can be a good model fit.
## 3.2 Differencing Model
Secondly, the differencing approach is considered to pursue stationary. As previously addressed, there are weekly seasonal effects, thus lag-7 differencing will be helpful. In order to capture the quadratic trend of the original datset, second differencing will also be beneficiary.
```{r diff, echo = FALSE, fig.cap= "Figure 6: Diagnostics for differencing signal model. The left panel shows this model's fitted values in red, plotted on top of the COVID 19 data in black. The right panel shows the differences themselves, to be assessed for stationarity.", fig.height = 3.5, fig.width= 12}
par(mfrow=c(1,2))
d = diff(diff(data$New.Cases,7), 1)
covid$impliedmodel = NA
for (i in 9 : nrow(covid)){
covid$impliedmodel[i] = mean(d) + covid$New.Cases[i-7] +
covid$New.Cases[i-1] - covid$New.Cases[i-7-1]
}
#fitted values
plot(covid$New.Cases~as.Date(data$Date, "%m/%d/%y"), type = "l", ylab = "Cases", xlab = "Date", ylim = c(200,1450))
lines(as.Date(data$Date, "%m/%d/%y"),covid$impliedmodel,col='red')
title("Differencing Fitted Values", line = 1, cex.main = 1.5)
#residuals
plot(d~ as.Date(data$Date[-c(1:8)]), type = "l", ylab = "Cases", xlab = "Date")
title(main = expression(paste(nabla[7],nabla[1],"Cases"[t])), line = 1, cex.main =1.5)
```
```{r dacfpacf, echo = F,fig.cap= "Figure 7: Autocorrelation function (ACF) and partial autocorrelation function (PACF) of Differenced Data", fig.height = 3.5, fig.width= 12}
par(mfrow=c(1,2))
acf(d, lag.max = 60, main = "")
title("ACF of Differencing Model", line = 1, cex.main = 1.5)
pacf(d, lag.max = 60, main = "")
title("PACF of Differencing Model", line = 1, cex.main = 1.5)
```
The second order differencing helped residuals seem more stationary because the trend seemed more quadratic (as a slight parabolic concave up form). Though the fitted values on the left panel of Figure 6 are not as concise with the original data compared to the parametric model, the peaks and drops are captured at a fairly accurate date. The right panel of Figure 6 seem fairly stationary, with a constant mean at around 0 and little to no heteroskedasticity visible.
### 3.2.1 Differencing with ARMA(0,1)x(1,0)[7]
From the ACF and PACF plots of the differenced data shown in Figure 7, there is a significant value at the seventh lag beyond the confidence interval for both plots, suggesting S = 7, Q = 1, and P = 1. However, adding both P = 1 and Q = 1 does not make much difference in the fit, so the simpler model, S = 7 and P = 1, is chosen. P = 1 is chosen instead of Q = 1 for Model 3 because the magnitude of PACF value was larger than that of the ACF value. Trial and error showed that not all values of ACF and PACF are not within the confidence intervals unless q = 1 as well. Thus, ARMA(0,1)x(1,0)[7] is proposed as Model 3.
```{r model3, echo = F,results='hide',fig.keep='all', fig.cap = "Figure 8: The left panel is autocorrelation function (ACF) and the middle panel is the partial autocorrelation function (PACF) of Model 3: ARMA(0,1)x(1,0)[7]. The right panel indicates the p values of the Ljung box statistic, with a significance level of 0.05.", fig.height = 3, fig.width= 12}
model3 <- sarima_wPACF(d, p = 0, d = 0, q = 1, S = 7, P = 1, D = 0, Q = 0)
```
The ACF and PACF plots of Model 3, shown in Figure 8, display that that there are no significant values beyond the confidence intervals. The right panel of Figure 8 shows that although the p value of lag 5 is barely above the 0.05 line, the p values of the Ljung-Box statistics are all above the 0.05, suggesting that Model 3 can be a fairly good model fit.
### 3.2.2 Differencing with ARMA(0,1)x(0,1)[7]
```{r, echo = FALSE, results= "hide"}
datats = ts(d, frequency =7)
auto.arima(datats)
```
The R function `auto.arima()`, with no differencing option specified, suggests ARMA(0,1)x(0,1)[7], ie. q = 1 and Q = 1 for S = 7, is Model 4. Model 4 is another alternative to Model 3 as there is a significant value at the seventh lag in the ACF plot of Figure 7.
```{r model4, echo = F, results='hide',fig.keep='all', fig.cap = "Figure 9: The left panel is autocorrelation function (ACF) and the middle panel is the partial autocorrelation function (PACF) of Model 4: ARMA(0,1)x(0,1)[7]. The right panel indicates the p values of the Ljung box statistic, with a significance level of 0.05.", fig.height = 3, fig.width= 12}
model4 <- sarima_wPACF(d, p = 0, d = 0, q = 1, S = 7, P = 0, D = 0, Q = 1)
```
The ACF and PACF plots of the residuals of Model 4, shown in Figure 9, display that that there are no significant values beyond the confidence intervals. The right panel of Figure 9 shows that the p values of the Ljung-Box statistics are all above the 0.05 value, suggesting that Model 4 can be a good model fit.
# 4 Model Comparison and Selection
These four model options are compared through time series cross validation and information criterions (IC) to determine the best model to predict the dataset.
Cross validation was done on 4 different (non)overlaping testing datasets, each consisting 10 days of data, beginning with the 7/10/2020 to 7/19/2020. The training set consist of all data that occur before the appropriate testing set. Each model's forecasting performances will be compared through root-mean-square prediction error (RMSPE). The model with the least RMSPE will be used for testing the IC values.
Because the original dataset of new cases of COVID 19 in Gotham City is a small sample size (of only 60 days), the signal model with the lower RMSPE will be chosen to perform Akaike Information Criterion (AIC), Akaike Information Criterion adjusted for small samples (AICc), and Bayesian Information Criterion (BIC), which is only appropriate when likelihoods are the same. The model chosen by the information criterions will be used for predicting cases for the next 10 days in August 2020.
```{r, echo = FALSE, include=FALSE}
sum_squared_errors <- c(model1 = 0 , model2 = 0, model3 = 0, model4 = 0)
for (i in c(20,30,40,50)) {
train_set <- covid[1:i,]
test_set <- covid[(i+1):(i +10),]
N = nrow(train_set)
# Signal model 1
mod1 = lm(log(cases) ~ cos(2*pi*X*(0.14))*X +
sin(2*pi*X*(0.14))*X+
cos(2*pi*X*(0.28)) + sin(2*pi*X*(0.28)) +
cos(2*pi*X*(0.08)) + sin(2*pi*X*(0.08)) +
I(X^2) , data = covid)
signal.forecast1 = exp(predict(mod1,test_set))
noise.forecast1 = exp(sarima.for(mod1$residuals, n.ahead=10, p = 0, d = 0, q = 0, S = 0, P = 0, D = 0, Q = 0)$pred)
noise.forecast2 = exp(sarima.for(mod1$residuals, n.ahead=10, p = 0, d = 0, q = 0, S = 3, P = 1, D = 0, Q = 0)$pred)
forecast1 = signal.forecast1 + noise.forecast1
forecast2 = signal.forecast1 + noise.forecast2
# Signal model 2 - Differencing
d = diff(diff(covid$New.Cases,7), 1)
noise.forecast3 = sarima.for(d,n.ahead=10,p = 0, d = 0, q = 1, S = 7, P = 1, D = 0, Q = 0)$pred
noise.forecast4 = sarima.for(d,n.ahead=10,p = 0, d = 0, q = 1, S = 7, P = 0, D = 0, Q = 1)$pred
forecast3 = numeric(10)
forecast4 = numeric(10)
for(i in 1:7){
forecast3[i] = noise.forecast3[i] + train_set$New.Cases[N+i-7]
+ train_set$New.Cases[N+i-1] - train_set$New.Cases[N+i-7-1]
forecast4[i] = noise.forecast4[i] + train_set$New.Cases[N+i-7]
+ train_set$New.Cases[N+i-1] - train_set$New.Cases[N+i-1-7]
}
for(i in 8:10){
forecast3[i] = noise.forecast3[i] + forecast3[i-7] #this is hat(Y)_[N+i-7]
+ train_set$New.Cases[N+i-1] - train_set$New.Cases[N+i-1-7]
forecast4[i] = noise.forecast4[i] + forecast4[i-7] #this is hat(Y)_[N+i-7]
+ train_set$New.Cases[N+i-1] - train_set$New.Cases[N+i-1-7]
}
#
sum_squared_errors[1] = sum_squared_errors[1] + sum((forecast1 - test_set$New.Cases)^2)
sum_squared_errors[2] = sum_squared_errors[2] + sum((forecast2 - test_set$New.Cases)^2)
sum_squared_errors[3] = sum_squared_errors[3] + sum((forecast3 - test_set$New.Cases)^2)
sum_squared_errors[4] = sum_squared_errors[4] + sum((forecast4 - test_set$New.Cases)^2)
}
```
```{r rmsetable, echo = FALSE}
#RMSE table
rmse = matrix(sqrt(sum_squared_errors/40), nrow=4, ncol = 1)
colnames(rmse) = "RMSPE"
rownames(rmse) = c(
"Parametric Model + ARMA(0,0)",
"Parametric Model + ARMA(0,0)x(1,0)[3]",
"Differencing + ARMA(0,1)x(1,0)[7]",
"Differencing + ARMA(0,1)x(0,1)[7]"
)
knitr::kable(rmse,caption = "Table 1: Cross-validated out-of-sample root mean squared prediction error for the four models under consideration.")
```
Table 1 shows that the parametric model with ARMA(0,0)x(1,0)[3] has the lowest cross-validated forecast error, as measured by RMSPE, though the parametric model with ARMA(0,0) is a close second.
The Akaike Information Criterion (AIC), Akaike Information Criterion adjusted for small samples (AICc), and Bayesian Information Criterion (BIC) will be calculated because the dataset consists of only 60 data points in total. Because the data is small, the training and testing sets are inevitably small. Less data for training and testing set makes the cross validation results more uncertain, thus the Information Criterion, specifically the AIC will be examined.
```{r, echo = FALSE}
selection <- sapply(list(model1, model2), function(mdl){
c(mdl$AIC, mdl$AICc, mdl$BIC)
})
selection <- cbind(selection,apply(selection,1,which.min))
colnames(selection) <- c("Model 1", "Model 2","Selection")
rownames(selection) <- c("AIC", "AICc", "BIC")
knitr::kable(selection, caption = "Table 2: Akaike Information Criterion (AIC), Akaike Information Criterion adjusted for small samples (AICc), and Bayesian Information Criterion (BIC) for Models with Parametric Singal Model")
```
The AIC, AICc, and BIC all suggests model 2 to be the forecasted model; it has the lower AIC, AICc, and BIC values. Through the cross validation and information criteria, Model 2, Parametric model with ARMA(0,0)x(1,0)[3] is chosen as the forecast model.
# 5 Result
The forecast model to predict new COVID 19 cases from 8/19/2020 to 8/28/2020 is the Parametric model with ARMA(0,0)x(1,0)[3]. This can be also represented as a mathematical equation.
Let $Y_t$ be the new COVID 19 cases found in Gotham city on daty $t$ with the additive noise term $X_t$. $X_t$ is a stationary process defined by ARMA(0,0)x(1,0)[3], where $W_t$ is white noise with variance $\sigma ^2_W$.
$$log(Y_t) = \beta_0 + \beta_1t + \beta_2 I_{t^2} + \beta_3 cos((0.14)2\pi t) + \beta_4 sin((0.14)2\pi t) + \beta_5 t cos((0.14)2\pi t) + \beta_6 t sin((0.14)2\pi t)\\ + \beta_7 cos((0.08)2\pi t) + \beta_8 sin((0.08)2\pi t) + \beta_9 cos((0.28)2\pi t) + \beta_{10}sin((0.28)2\pi t) + X_t$$
The ARMA model also be represented as:
$X_t = \Phi X_{t-7} + W_t$.
There is one indcator $I_{t^2}$ that indicates if day t is quadratic, capturing the quadratic trend of the original dataset.$\Phi$ and $\beta$'s are coefficients (also known as parameters) that will be estimated in the next subsection. As mentioned previously in Section 3.1.2, the model displays a constant variance and mean value of 0 (white noise).
## 5.1 Estimation of Model Parameters
Estimates of the model parameters are given in Table 3 in Appendix 1. There is not a particular nominal observation since the only indicator included in the forecast model is $t^2$. However, by looking at the numerical values, it is particularly interesting to note sinusoid (cosine and sine) interacted with day (time) with frequencies of around 0.14 has a parameter estimate far closer to 0 than sinusoids of frequency 0.08 and 0.28. Also, it can be recognized that the parameter estimate of the intercept and sine with frequency 0.14 has the largest/smallest estimate along with a high Standard Error (SE) value.
## 5.2 Prediction
Figure 10 shows the forecasted values of new COVID 19 cases in Gotham city for 8/19/2020 to 8/28/2020. The model predicts that overall, more people will be tested positive for COVID 19, thus an increases in new COVID 19 cases found. This is highly plausible because of COVID 19 being an extremely contagious virus. If individuals are not careful and not follow quarantine regulations, it is only a matter of time before the virus spreads exponentially.
Figure 10 predicts that there are high expected new COVID cases found on 8/20/2020 (Thursday), 8/22/2020 (Saturday), and 8/28/2020 (Friday).
The number of cases during peak days are generally higher, above 1000 cases on peak days, than the previous days which were mostly below 1000 cases on peak days before August 2020.
In addition, lower expected cases on 8/19/2020 (Wednesday), 8/25/2020 (Tuesday), and 8/26/2020 (Wednesday) are higher in value compared to previous weekly "drops", which were mostly below 600 cases.
The increasing cases found is highly due to an increase in exposure of the virus. It may be that an individual will be more exposed to COVID 19 when more people has the virus or because of more social events that happened in the United States towards August 2020 such as the Black Lives Matter Movement and presidential elections. These crowds, no matter how good the intention may be, could be severely detrimental to the health of residents in Gotham City. There may have to be more strict regulations for quarantine such as fining and shutdowns as well as more awareness on COVID 19 and the importance of wearing masks.
```{r forecasts, echo = FALSE, warning = FALSE, fig.cap='Figure 10: Forecasts of New COVID Cases in Gotham City for the next 10 days, 8/19/2020 to 8/28/2020. The black vertical line indicates 8/20/2020, the first forecast.', fig.height= 3.5, fig.width= 8.5, fig.align = 'center'}
signal1 = lm(cases ~ I(X^2) +
cos(2*pi*X*(0.14))*X +
sin(2*pi*X*(0.14))*X+
cos(2*pi*X*(0.08)) + sin(2*pi*X*(0.08)) +
cos(2*pi*X*(0.28)) + sin(2*pi*X*(0.28)), data = covid)
c <- as.Date(c("8/19/20", "8/20/20", "8/21/20", "8/22/20", "8/23/20", "8/24/20", "8/25/20", "8/26/20", "8/27/20", "8/28/20"), tryFormats = "%m/%d/%Y")
# convert forecast object into data frame
ts_values <- data.frame(
Date = as.Date(covid$Date, tryFormats = "%m/%d/%Y"),
d = covid$New.Cases,
Cases = signal1$fitted.values)
ts_forecast <- data.frame(
Date = c,
Cases = signal.forecast1)
# combine fitted data and forecast mean
ts_values <- rbind(ts_values, transform(ts_forecast[c("Date", "Cases")], d = NA))
# plot it all
library(ggplot2)
ggplot(NULL, aes(x = Date)) +
ggtitle("Forecasts of New COVID Cases in Gotham City for the Next 10 Days, 8/19/2020 to 8/28/2020") +
geom_line(data = ts_values, aes(y = Cases), colour = "red", size = 1) +
geom_vline(xintercept = as.Date("8/19/20", tryFormats = "%m/%d/%Y", colour = "black")) +
theme_linedraw()
```
```{r, echo = FALSE, results = "hide", fig.show = "FALSE"}
Cases = ts(log(data$New.Cases), start = c(6,20), frequency = 30)
attempt = exp(sarima.for(Cases, n.ahead = 10, p = 0, d = 0, q = 0, S = 3, P = 0, D = 0, Q = 1, xreg = signal1$fitted.values, newxreg = signal.forecast1)$pred)
write.table(x = attempt,file = "~/Desktop/covid_3031902888.csv", sep=",",row.names=FALSE, col.names=FALSE)
```
# Appendix 1 - Table of Parameter Estimates
```{r, echo = FALSE, results = "hide"}
signal1 = lm(cases ~ cos(2*pi*X*(0.14))*X +
sin(2*pi*X*(0.14))*X+
cos(2*pi*X*(0.28)) + sin(2*pi*X*(0.28)) +
cos(2*pi*X*(0.08)) + sin(2*pi*X*(0.08)) +
I(X^2), data = covid)
s = summary(signal1)
kable(s$coefficients[,1:2])
kable(model2$ttable[,1:2])
model2$fit$sigma2
```
Table 3: Estimates of the forecasting model parameters of Parametric model with ARMA(0,0)x(1,0)[3] with their standard errors (SE).
|Parameter|Estimate|SE|Coefficient Description|
|:---------|-----------:|---------:|:----------------------------------|
|$\beta_{0}$|738.2592433|53.7373667|Intercept|
|$\beta_{1}$|-11.4045913|4.0470966|Day|
|$\beta_{2}$|0.2892588|0.0642786|Indicator Day$^2$|
|$\beta_{3}$|-13.4065149|49.7695210|Cosine frequency 0.14|
|$\beta_{4}$|-125.7360533|48.5266554|Sine frequency 0.14|
|$\beta_{5}$|-4.6494506|1.4170210|Cosine frequency 0.14 $\times$ Day|
|$\beta_{6}$|1.8685184|1.3852627|Sine frequency 0.14 $\times$ Day|
|$\beta_{7}$|40.3719867|24.6470344|Cosine frequency 0.08|
|$\beta_{8}$|52.0014029|24.0913796|Sine frequency 0.08|
|$\beta_{9}$|-31.2507559|24.2539166|Cosine frequency 0.28|
|$\beta_{10}$|-36.2552937|23.9106809|Sine frequency 0.08|
|$\Phi$|0.4129|0.1278|Seasonal AR coefficient|
|$\sigma^2_W$|0.02441683| |Variance of White Noise|