did a quick test to see if collapse / tidyfast / data.table approach …

…might work for arriaga smoothing. Yes, it does, but it's 8-10 times slower than the current base implementation.
timriffe · Dec 30, 2023 · c6e15b7 · c6e15b7
1 parent 021dcf8
commit c6e15b7
Showing 1 changed file with 157 additions and 0 deletions.
diff --git a/dev/arriaga_tidy.R b/dev/arriaga_tidy.R
@@ -0,0 +1,157 @@
+Ages         <- seq(0, 80, by = 5)
+AMales       <- smooth_age_5_arriaga(Value = pop5m_pasex, Age = Ages, OAG = TRUE)
+# PAS spreadsheet result:
+Atest        <- c(662761, 495126, 345744, 287629, 285919, 261018, 237469, 203277,
+                  161733, 126960, 88586, 67496, 54587, 41257, 28790, 17189, 34729)
+all(round(AMales) - Atest == 0, na.rm = TRUE)
+plot(Ages, pop5m_pasex)
+lines(as.integer(names(AMales)),AMales)
+# Before:
+smooth_age_5_arriaga <- function(Value,
+                                 Age,
+                                 OAG = TRUE) {
+
+  # these values are not used, it's just for lengths, and to make sure we
+  # end on an even 10. Technically we could even provide data in 10-year
+  # age groups and it'd still not break.
+  Value1     <- graduate_uniform(Value = Value, Age = Age, OAG = OAG)
+  Value5     <-
+    groupAges(Value1, Age = as.integer(names(Value1)), N = 5)
+  N          <- length(Value5)
+  Age5       <- as.integer(names(Value5))
+
+  # would need to move this up to ensure?
+  # or in case of 85+ would we want to keep 80-84, 85+ as-is?
+  Value10    <- groupAges(Value, Age = Age, N = 10)
+
+  # what OAG is a strange digit? Then take OAG after grouping.
+  if (OAG) {
+    OAGvalue <- Value5[length(Value5)]
+    Value10[length(Value10)] <- NA
+    Value5[length(Value5)]   <- NA
+  }
+
+  # again get staggered vectors
+  Value10LL   <- shift.vector(Value10,-2, fill = NA)
+  Value10L    <- shift.vector(Value10,-1, fill = NA)
+  Value10R    <- shift.vector(Value10, 1, fill = NA)
+  Value10RR   <- shift.vector(Value10, 2, fill = NA)
+
+  # alternating calc, with differences at tails
+  vevens      <- (-Value10R + 11 * Value10 + 2 * Value10L) / 24
+  # tails different
+  vevens[1]   <-
+    (8 * Value10[1] + 5 * Value10L[1] - Value10LL[1]) / 24
+  lastind     <- which(is.na(vevens))[1]
+  vevens[lastind] <-
+    Value10[lastind] - (8 * Value10[lastind] + 5 * Value10R[lastind] - Value10RR[lastind]) / 24
+  # odds are complement
+  vodds       <- Value10 - vevens
+
+  # prepare output
+  interleaf  <- c(rbind(vodds, vevens))
+  # produce results vector
+  out        <- Value5 * NA
+  n          <- min(c(length(interleaf), N))
+  out[1:n]   <- interleaf[1:n]
+
+  # if OA ends in 5, then we can save penultimate value too
+  na.i       <- is.na(out)
+  out[na.i]  <- Value5[na.i]
+  if (OAG) {
+    out[N] <- OAGvalue
+  }
+
+  out
+}
+library(tidyverse)
+data <- tibble(Age = Ages, Pop = pop5m_pasex)
+
+age2single <- function(Age, OAG = TRUE, AgeInt = NULL){
+  maxA <- ifelse(OAG, max(Age), max(Age) + AgeInt[which.max(Age)])
+  min(Age):maxA
+}
+
+library(collapse)
+library(data.table)
+library(tidyfast)
+smooth_age_5_arriaga_tidy <- function(data, variable, OAG = TRUE){
+
+  if (OAG){
+    OAvalue <- data |> 
+      fsubset(Age == max(Age)) |> 
+      pull(!!sym(variable))
+  }
+
+  data <- 
+    data |> 
+    rename(V = !!variable) |> 
+    # force to single
+    reframe(
+      V = graduate_uniform(Value = V,
+                           Age = Age,
+                           OAG = OAG),
+      Age = age2single(Age))  
+  data |> 
+    # group to 5 (innocuous if 5-year data given)
+    fmutate(Age = Age - Age %% 5,
+            V = fifelse(Age == max(Age) & OAG, NA_real_, V)) |> 
+    fgroup_by(Age) |> 
+    fsummarize(V = sum(V))  |> 
+    fungroup() |> 
+    fmutate(Age10 = Age - Age %% 10) |> 
+    fmutate(V10 = sum(V), .by = Age10) |> 
+    fmutate(V10LL = flag(V10,-4),
+           V10L = flag(V10,-2),
+           V10R = flag(V10, 2),
+           V10RR = flag(V10,4),
+           vevens = dt_case_when(Age10 == min(Age10) ~           (8 * V10 + 5 * V10L - V10LL) / 24,
+                          Age10 == (max(Age10) - 10) ~ V10 - (8 * V10 + 5 * V10R - V10RR) / 24,
+                          TRUE ~ (-V10R + 11 * V10 + 2 * V10L) / 24),
+           vodds = V10 - vevens,
+           V5out = dt_case_when(
+                        Age == max(Age)~ OAvalue,
+                        Age %% 10 == 0 ~ vodds, 
+                        Age %% 5 == 0  ~ vevens,
+                        TRUE ~ V),
+           V5out = fifelse(is.na(V5out), V, V5out)) |> 
+    fselect(Age, Age10, V5out) |> 
+    rename(!!variable := V5out)
+
+  }
+# ---------------- #
+# test equivalency #
+# ---------------- #
+# for OAG divisible by 10
+smooth_age_5_arriaga_tidy(tibble(Age = Ages, 
+                                 Pop = pop5m_pasex), 
+                          variable = "Pop", 
+                          OAG = TRUE) 
+  mutate(V5orig = smooth_age_5_arriaga(pop5m_pasex, Ages, TRUE))
+
+
+# for OAG divisible by 5
+Age75 <- seq(0,75,by=5)
+v75 <- pop5m_pasex
+v75[16] <- sum(pop5m_pasex[16:17])
+v75 <- v75[-17]
+
+data <- tibble(Age = Age75, 
+               Pop = v75)
+smooth_age_5_arriaga_tidy(data, 
+                          variable = "Pop", 
+                          OAG = TRUE) |> 
+  mutate(V5orig = smooth_age_5_arriaga(v75, Age75, TRUE))
+
+# ----------------------- #
+# Compare execution speed #
+# ----------------------- #
+library(rbenchmark)
+benchmark(smooth_age_5_arriaga_tidy(data, 
+                                    variable = "Pop", 
+                                    OAG = TRUE)) #  .522
+benchmark(smooth_age_5_arriaga(v75, Age75, TRUE)) # .054, 10 times faster...
+
+# Conclusion:
+# Possibly data.table could do the same a tic faster than collapse/tidyfast, 
+# but only marginally do, whereas base-powered DemoTools arithmetic is 10x faster