fixed arg handling for lifetables; handling other package issues, not…

… done yet
timriffe · Dec 29, 2023 · ab2b2ad · ab2b2ad
1 parent 9b62861
commit ab2b2ad
Show file tree

Hide file tree

Showing 11 changed files with 285 additions and 10 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,8 +1,8 @@
 Package: DemoTools
 Type: Package
 Title: Standardize, Evaluate, and Adjust Demographic Data
-Version: 01.13.79
-Date: 2023-12-20
+Version: 01.13.80
+Date: 2023-12-29
 Authors@R: c(
    person("Tim", "Riffe", role = c("aut", "cre"), 
           email = "[email protected]", comment = c(ORCID = "0000-0002-2673-4622")),
@@ -22,7 +22,7 @@ License: file LICENSE
 LazyLoad: yes
 LazyData: true
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.2.0
+RoxygenNote: 7.2.3
 Depends:
     R (>= 3.6),
     Rcpp (>= 0.12.0),

diff --git a/NAMESPACE b/NAMESPACE
@@ -161,6 +161,7 @@ importFrom(data.table,rbindlist)
 importFrom(data.table,setDT)
 importFrom(data.table,uniqueN)
 importFrom(demogR,cdmltw)
+importFrom(dplyr,case_when)
 importFrom(dplyr,group_by)
 importFrom(dplyr,mutate)
 importFrom(dplyr,rename)

diff --git a/R/lt_abridged.R b/R/lt_abridged.R
@@ -42,6 +42,7 @@
 #' @param extrapLaw character. If extrapolating, which parametric mortality law should be invoked? Options include
 #'   \code{"Kannisto", "Kannisto_Makeham", "Makeham", "Gompertz", "GGompertz", "Beard",	"Beard_Makeham", "Quadratic"}. Default \code{"Kannisto"} if the highest age is at least 90, otherwise `"makeham"`. See details.
 #' @inheritParams lt_a_closeout
+#' @importFrom dplyr case_when
 #' @export
 #' @return Lifetable in data.frame with columns
 #' \itemize{
@@ -183,9 +184,30 @@ lt_abridged <- function(Deaths = NULL,
                   extrapFrom = max(Age),
                   extrapFit = NULL,
                   ...) {
+
+  # some handy name coercion
+  a0rule <- case_when(a0rule == "Andreev-Kingkade" ~ "ak",
+                      a0rule == "Coale-Demeny" ~ "cd",
+                      TRUE ~ a0rule)
+  axmethod <- case_when(axmethod == "UN (Greville)" ~ "un",
+                        axmethod == "PASEX" ~ "pas",
+                        TRUE ~ axmethod)
+  Sex <- substr(Sex, 1, 1) |> 
+    tolower()
+  Sex <- ifelse(Sex == "t", "b", Sex)
+
+  region <-  substr(region, 1, 1) |> 
+    tolower()
+  if (!is.null(extrapLaw)){
+    extrapLaw <- tolower(extrapLaw)
+  }
+
+
+  # now we check args
   axmethod <- match.arg(axmethod, choices = c("pas","un"))
   Sex      <- match.arg(Sex, choices = c("m","f","b"))
   a0rule   <- match.arg(a0rule, choices = c("ak","cd"))
+
   if (!is.null(extrapLaw)){
     extrapLaw      <- tolower(extrapLaw)
     extrapLaw      <- match.arg(extrapLaw, choices = c("kannisto",

diff --git a/R/lt_regroup_age.R b/R/lt_regroup_age.R
@@ -99,6 +99,7 @@ lt_single2abridged <- function(lx,
 #' 
 #' @export
 #' @importFrom ungroup pclm
+#' @importFrom dplyr case_when
 #' @examples
 #'  Mx <- c(.23669,.04672,.00982,.00511,.00697,.01036,.01169,
 #'          .01332,.01528,.01757,.02092,.02517,.03225,.04241,.06056,
@@ -151,6 +152,23 @@ lt_abridged2single <- function(
   NN <- length(Age)
   #stopifnot(length(nMx) == NN)
 
+  # some handy name coercion
+  a0rule <- case_when(a0rule == "Andreev-Kingkade" ~ "ak",
+                      a0rule == "Coale-Demeny" ~ "cd",
+                      TRUE ~ a0rule)
+  axmethod <- case_when(axmethod == "UN (Greville)" ~ "un",
+                        axmethod == "PASEX" ~ "pas",
+                        TRUE ~ axmethod)
+  Sex <- substr(Sex, 1, 1) |> 
+    tolower()
+  Sex <- ifelse(Sex == "t", "b", Sex)
+
+  region <-  substr(region, 1, 1) |> 
+    tolower()
+  if (!is.null(extrapLaw)){
+    extrapLaw <- tolower(extrapLaw)
+  }
+
   if (!is.null(extrapLaw)){
     extrapLaw      <- tolower(extrapLaw)
     extrapLaw      <- match.arg(extrapLaw, choices = c("kannisto",

diff --git a/R/lt_single.R b/R/lt_single.R
@@ -19,6 +19,7 @@
 #'   \item{ex}{numeric. Age-specific remaining life expectancy.}
 #' }
 #' @export
+#' @importFrom dplyr case_when
 lt_single_mx <- function(nMx,
                         Age = 1:length(nMx) - 1,
                         radix = 1e5,
@@ -36,6 +37,21 @@ lt_single_mx <- function(nMx,
                         ...) {
 
   stopifnot(extrapFrom <= max(Age))
+
+  # some handy name coercion
+  a0rule <- case_when(a0rule == "Andreev-Kingkade" ~ "ak",
+                      a0rule == "Coale-Demeny" ~ "cd",
+                      TRUE ~ a0rule)
+  Sex <- substr(Sex, 1, 1) |> 
+    tolower()
+  Sex <- ifelse(Sex == "t", "b", Sex)
+
+  region <-  substr(region, 1, 1) |> 
+    tolower()
+  if (!is.null(extrapLaw)){
+    extrapLaw <- tolower(extrapLaw)
+  }
+
   Sex      <- match.arg(Sex, choices = c("m","f","b"))
   a0rule   <- match.arg(a0rule, choices = c("ak","cd"))
   if (!is.null(extrapLaw)){

diff --git a/R/utils.R b/R/utils.R
@@ -196,7 +196,9 @@ getModelLifeTable <- function(ModelName, Sex) {
 
 avg_adj <- function(x) {
   n <- length(x)
-  (shift.vector(x,-1, NA) + shift.vector(x, 1, NA)) / 2
+  out <- (shift.vector(x,-1, NA) + shift.vector(x, 1, NA)) / 2
+  names(out) <- names(x)
+  out
 }
 
 #' convert strings to concatenation of lower case alphabet

diff --git a/R/utilsAge.R b/R/utilsAge.R
@@ -735,7 +735,7 @@ rescaleAgeGroups <- function(Value1,
 #' Age       <- c(0,1,3,seq(5,100,5))
 #' AgeInt    <- c(1,2,2,rep(5,19),1)
 #' Value     <- tapply(V1,rep(Age,times=AgeInt), sum)
-#'
+#' 
 #' is_abridged(Age)
 #' age_abridge_force(Value, Age)
 age_abridge_force <- function(Value, Age) {
@@ -744,7 +744,7 @@ age_abridge_force <- function(Value, Age) {
                       Value,
                       Age = Age)
   #a1     <- min(Age):(length(v1) - 1)
-  a1     <- 1:length(v1) - 1 + min(Age)
+  a1     <- (1:length(v1) - 1 + min(Age)) |> as.integer()
   AgeAbr <- calcAgeAbr(a1)
   vabr   <- tapply(v1, AgeAbr, sum)
   vabr

diff --git a/dev/ediev.R b/dev/ediev.R
@@ -0,0 +1,217 @@
+smooth_age_zigzag_dd <- function(
+    Value, #population counts by single year of age
+    Age, #age index for Value: should be a continuous range of integer values (we will not check that)
+    OAG = TRUE, #is the last age group an open interval?
+    ret_ggPlot = TRUE, #if TRUE, we return a list(data=ValueOut, ggplot=myPlot) with a plot for checking results
+    #if FALSE, we return the smoothed series ValueOut
+    legendPlot = NULL #legend for the ggPlot
+)
+{
+  #Extension of Feeney (2013) "Removing Zig-Zag from Age Data" for single age data
+  #in the spirit of Dalkhat M. Ediev (2021) "A model of age heaping with applications to population graduation that retains informative demographic variation"
+  #Main differences with Feeney's algorithm is that, apart for working with single age data
+  #to suppose that age heaping can start 2 years before (and symmetrically 2 years after)
+  #and not only 1 year before and after.
+  #For example for age 40, we suppose that heaping occurs symmetrically, in the same proportion, at
+  #ages 39 and 41, as Feeney supposed for five-years age groups with adjacent age groups,
+  #but in the same spirit we suppose that
+  #heaping can occur also at ages 38 and 42 towards attracting age 40.
+  #We also suppose that heaping can only occurs for ages multiple of 5 and 10
+  if (OAG) {
+    #if the last age is an open interval, we drop it
+    AgeRangeOut <- Age[1:(length(Age)-1)]
+  }
+  #determine the ages with the variable parameters
+  #For example we start with ages 3 and 4 for heaping at age 5 and ages 6 and 7 use symmetrical values
+  #generally speaking heaping can start at ages x-2, x-1 and then symmetrically at ages x+1, x+2 towards attracting age x
+  #we suppose that the first and the last age we can treat have at least values for 2 ages before and after
+  #that means that if the first age in Age is 9 or 10, we will skip the first
+  getAgeRange <- function(AgeRangeOut) {
+    ageRangeMin <- max(5, trunc((AgeRangeOut[1] + 2) / 5) * 5)
+    if (ageRangeMin < AgeRangeOut[1] + 2) ageRangeMin <- ageRangeMin + 5
+    ageRangeMax <- trunc((AgeRangeOut[length(AgeRangeOut)] - 2) / 5) * 5
+    if (ageRangeMax > AgeRangeOut[length(AgeRangeOut)] - 2) ageRangeMax <- ageRangeMax - 5
+    return (c(ageRangeMin, ageRangeMax))
+  }
+  ageRange <- getAgeRange (AgeRangeOut)
+  ageRangeMin <- ageRange[1]
+  ageRangeMax <- ageRange[2]
+  #number of ages multiples of 5
+  nAges_mult <- trunc ((ageRangeMax - ageRangeMin) / 5) + 1
+  if (nAges_mult < 2) stop ("Age range should include at least two consecutive ages multiple of 5")
+  #initial values for the parameters
+  #if first attracting age is 5, age 3 has initial proportion of 0.1, age 4 of 0.2
+  #these two parameters will get 'optimized' by R optim
+  #the same parameters are applied, symmetrically, for age 6 (0.2) and age 7 (0.1)
+  param <- rep (c(0.1, 0.2), nAges_mult)
+  param_min <- rep(0, nAges_mult * 2)
+  param_max <- rep(9, nAges_mult * 2)
+
+  computeNewPop <- function(data, par, Age, ageRangeMin, nAges_mult, ret_Props=FALSE) {
+    if (ret_Props) props <- rep(NA, length(data))
+    for (ageIter in (1:nAges_mult)) {
+      currFirstAge <- ageRangeMin + (ageIter - 1) * 5 - 2
+      indexInData <- currFirstAge - Age[1] + 1
+      if (ret_Props) {
+        props[indexInData] <- par[(ageIter - 1) * 2 + 1]
+        props[indexInData+1] <- par[(ageIter - 1) * 2 + 2]
+        props[indexInData+3] <- par[(ageIter - 1) * 2 + 2]
+        props[indexInData+4] <- par[(ageIter - 1) * 2 + 1]
+      } else {
+        data[indexInData + 2] <- data[indexInData + 2] -
+          (data[indexInData] + data[indexInData + 4]) * par[(ageIter - 1) * 2 + 1] -
+          (data[indexInData + 1] + data[indexInData + 3]) * par[(ageIter - 1) * 2 + 2]
+        data[indexInData] <- data[indexInData] * (1 + par[(ageIter - 1) * 2 + 1])
+        data[indexInData+1] <- data[indexInData+1] * (1 + par[(ageIter - 1) * 2 + 2])
+        data[indexInData+3] <- data[indexInData+3] * (1 + par[(ageIter - 1) * 2 + 2])
+        data[indexInData+4] <- data[indexInData+4] * (1 + par[(ageIter - 1) * 2 + 1])
+      }
+    }
+
+    if (ret_Props) {
+      return (props)
+    } else {
+      return (data)
+    }
+  }
+
+  optim_fun <- function(data, par) {
+    popIter <- computeNewPop(data, par, Age, ageRangeMin, nAges_mult)
+    totPop <- sum (popIter)
+    objective_to_min <- 0
+    for (ageIter in (1:nAges_mult)) {
+      currFirstAge <- ageRangeMin + (ageIter - 1) * 5 - 2
+      indexInData <- currFirstAge - Age[1] + 1
+      term1 <- ((popIter[indexInData + 2] - (popIter[indexInData + 1] + popIter[indexInData + 3]) / 2)) / totPop
+      term2 <- ((popIter[indexInData + 2] - (popIter[indexInData] + popIter[indexInData + 4]) / 2)) / totPop
+      objective_to_min <- objective_to_min + term1^2 + term2^2
+    }
+    return (objective_to_min)
+  }
+
+  oldOptimValue <- 10^1000
+
+  for (rep in (1:10)) {
+    #apply optimization up to 10 times, using parameters obtained in the previous step, just in case...
+    parResult <- optim(data=Value, par=param, lower=param_min, upper=param_max, fn=optim_fun, method = "L-BFGS-B")
+    if (parResult$value >= oldOptimValue) break
+    param <- parResult$par
+    oldOptimValue <- parResult$value
+  }
+  valueOut <- computeNewPop(Value, parResult$par, Age, ageRangeMin, nAges_mult)
+
+  if (ret_ggPlot) {
+    library2 <- function(pack) {
+      #pack<-"tcltk2"
+      if( !(pack %in% installed.packages()))
+      {install.packages(pack)}
+      library(pack,character.only = TRUE)
+    }
+    library2("ggplot2")
+    library2("scales")
+
+    getScales <- function(Value, ageRangeMin, ageRangeMax) {
+      maxCount <- max(Value) * 1.0
+      power <- 0
+      while (maxCount > 1) {
+        maxCount <- maxCount / 10.0
+        power <- power + 1
+      }
+      maxCount <- ceiling(maxCount * 10) * 10^(power-1)
+      leftScale <- seq(0, maxCount, maxCount / 4)
+      rightScale <- seq(0, 1, 0.25)
+      xScale <- seq(ageRangeMin-5, ageRangeMax+5, 5)
+      return (list(left=leftScale, right=rightScale, x=xScale))
+    }
+
+    df <- data.frame(Age=Age, val=Value)
+    df$type <- 'Original'
+    temp <- data.frame(Age=Age, val=valueOut)
+    temp$type <- 'Corrected'
+    df <- rbind(df, temp)
+    props <- computeNewPop(Value, parResult$par, Age, ageRangeMin, nAges_mult, ret_Prop = TRUE)
+    temp <- data.frame(Age=Age, val=props)
+    temp$type <- 'Attraction'
+    #transform into proportions
+    temp$val <- temp$val / (1 + temp$val)
+    scale_axes <- getScales (Value, ageRangeMin, ageRangeMax)
+    maxY_left <- scale_axes$left[length(scale_axes$left)]
+    #adjust values in preparation for second axis (we will correct the labels later on in scale_y_continuous)
+    temp$val <- temp$val * maxY_left
+    df <- rbind(df, temp)
+    df$type <- factor (df$type, levels=c("Original", "Corrected", "Attraction"))
+    myLabelNames <- c("Original", "Corrected", "Attraction factors")
+
+    myPlot <- ggplot(df, aes(x=Age, y=val, color=type, alpha=type, size=type)) + geom_line()
+    myPlot <- myPlot + scale_y_continuous(breaks = scale_axes$left, name="population counts",
+                                          sec.axis = sec_axis(as.formula(paste("~./", maxY_left, sep="")), name="attraction proportions",
+                                                              breaks=scale_axes$right),
+                                          labels = scales::label_number())
+    myPlot <- myPlot + scale_x_continuous(breaks = scale_axes$x)
+    myPlot <- myPlot + scale_colour_manual(values=c("red", "blue", "grey40"), labels=myLabelNames)
+    myPlot <- myPlot + scale_size_manual(values=c(1, 2, 2), labels=myLabelNames)
+    myPlot <- myPlot + scale_alpha_manual(values=c(1, 1, 0.2), labels=myLabelNames)
+    myPlot <- myPlot + theme_bw() + theme(legend.key = element_blank(), legend.title=element_blank(), legend.position="bottom")
+    myPlot <- myPlot + theme_text(1)
+    myPlot <- myPlot + labs(color="", alpha="", size="")
+    if (!is.null(legendPlot)) myPlot <- myPlot + ggtitle(legendPlot)
+
+    return (list(data=valueOut, ggplot=myPlot))
+  } else {
+    return (valueOut)
+  }
+}
+
+#data for Colombia 1973
+col1973 <- structure(list(
+  Age = 0:99,
+  Value = c(493538L, 497954L, 580000L, 
+            610204L, 609266L, 609670L, 599184L, 628352L, 624302L, 549158L, 
+            624112L, 527810L, 611266L, 541084L, 490342L, 466650L, 435310L, 
+            423912L, 444468L, 334732L, 345390L, 301686L, 319468L, 312100L, 
+            267828L, 293902L, 230690L, 230770L, 244100L, 178590L, 278284L, 
+            136552L, 201120L, 218514L, 159054L, 217800L, 156786L, 162270L, 
+            203560L, 141258L, 247758L, 94344L, 166104L, 164336L, 116024L, 
+            182858L, 104268L, 102916L, 136222L, 90550L, 180598L, 68132L, 
+            109788L, 108058L, 81438L, 116446L, 77230L, 62844L, 73906L, 49812L, 
+            136486L, 37506L, 61658L, 69702L, 43570L, 72760L, 36930L, 34102L, 
+            40052L, 24126L, 68924L, 16004L, 32988L, 40868L, 18406L, 31836L, 
+            14222L, 11824L, 16206L, 6972L, 24294L, 5016L, 7392L, 6906L, 6244L, 
+            9424L, 3806L, 3286L, 3478L, 1912L, 5260L, 916L, 1540L, 1142L, 
+            1266L, 1674L, 602L, 468L, 752L, 2360L)),
+  class = "data.frame",
+  row.names = c(NA,-100L))
+
+#Alexandropol 1897
+Alex1897 <- structure(list(
+  Age = 0:99,
+  Value = c(376.7579251, 342.1757925, 
+          398.3717579, 337.8530259, 314.0778098, 290.3025937, 355.1440922, 
+          334, 345, 360, 392, 260.0432277, 370, 307.5936599, 290.3025937, 
+          247.074928, 281.6570605, 268.6887608, 298.9481268, 164.9423631, 
+          333.5302594, 1401.253602, 1729.783862, 1643.32853, 1669.26513, 
+          700.9654179, 324.8847262, 273.0115274, 234.1066282, 104.4236311, 
+          471.8587896, 108.7463977, 186.556196, 151.9740634, 113.0691643, 
+          389.7262248, 151.9740634, 126.037464, 156.29683, 48.22766571, 
+          458.8904899, 48.22766571, 108.7463977, 56.87319885, 65.51873199, 
+          277.3342939, 82.80979827, 52.55043228, 91.45533141, 48.22766571, 
+          363.7896254, 26.61383285, 52.55043228, 39.58213256, 39.58213256, 
+          199.5244957, 56.87319885, 39.58213256, 48.22766571, 5, 285.9798271, 
+          17.96829971, 26.61383285, 17.96829971, 22.29106628, 78.4870317, 
+          22.29106628, 22.29106628, 26.61383285, 9.322766571, 121.7146974, 
+          0.677233429, 9.322766571, 5, 5, 30.93659942, 13.64553314, 9.322766571, 
+          9.322766571, 13.64553314, 35.25936599, 9.322766571, 9.322766571, 
+          5, 0.677233429, 17.96829971, 0.677233429, 5, 0.677233429, 9.322766571, 
+          17.96829971, 5, 5, 5, 5, 0.677233429, 0.677233429, 0.677233429, 
+          0.677233429, 0.677233429)),
+  class = "data.frame",
+  row.names = c(NA, -100L))
+
+#call the function for Colombia
+resCol <- smooth_age_zigzag_dd(col1973$Value, col1973$Age, OAG = TRUE, ret_ggPlot = TRUE, legendPlot="Colombia 1973")
+#show the plot for Colombia
+resCol$ggplot
+#call the function for Alexandropol
+resAlex <- smooth_age_zigzag_dd(Alex1897$Value, Alex1897$Age, OAG = TRUE, ret_ggPlot = TRUE, legendPlot="Alexandropol 1897")
+#show the plot
+resAlex$ggplot
diff --git a/man/graduate.Rd b/man/graduate.Rd
diff --git a/vignettes/Age-heaping_quality_with_Demotools.Rmd b/vignettes/Age-heaping_quality_with_Demotools.Rmd
@@ -201,4 +201,4 @@ r5
 
 Values greater than 0.1 coupled with a sawtooth value greater than 0 already suggest that some smoothing is warranted. If there is no detected sawtooth pattern, then five-year-roughness indices should probably need to be higher and visually confirmed before deciding to smooth. Further guidelines can be found in the vignette on smoothing. 
 
-##References
+# References
diff --git a/vignettes/lifetables_with_demotools.Rmd b/vignettes/lifetables_with_demotools.Rmd
@@ -71,4 +71,4 @@ In this output, the `Age` column indicates the lower bound of each age group. Th
 
 
 
-## References
+# References
Original file line number	Diff line number	Diff line change
Expand Up		@@ -201,4 +201,4 @@ r5

		Values greater than 0.1 coupled with a sawtooth value greater than 0 already suggest that some smoothing is warranted. If there is no detected sawtooth pattern, then five-year-roughness indices should probably need to be higher and visually confirmed before deciding to smooth. Further guidelines can be found in the vignette on smoothing.

		##References
		# References
Original file line number	Diff line number	Diff line change
Expand Up		@@ -71,4 +71,4 @@ In this output, the `Age` column indicates the lower bound of each age group. Th



		## References
		# References