cleaning package code

markolalovic · Jun 6, 2024 · 7c437a6 · 7c437a6
1 parent 586b7d1
commit 7c437a6
Show file tree

Hide file tree

Showing 16 changed files with 129 additions and 152 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -14,3 +14,5 @@
 ^data$
 ^man/figures/*\.png$
 ^CODE_OF_CONDUCT.md
+^fix_site.py
+^tests/informal$
diff --git a/.nojekyll b/.nojekyll
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -17,7 +17,7 @@ Description: Provides an easy framework to simulate survey data commonly analyze
     The package also allows for the estimation of parameters from existing survey data to replicate it more accurately.
 URL: https://lalovic.io/responsesR, https://github.com/markolalovic/responsesR
 BugReports: https://github.com/markolalovic/responsesR/issues
-License: MIT + file LICENSE
+License: GPL-3
 Encoding: UTF-8
 LazyData: true
 Depends: R (>= 3.3)

diff --git a/LICENSE b/LICENSE
diff --git a/R/estimation_of_parameters.R b/R/estimation_of_parameters.R
@@ -4,7 +4,7 @@
 #' variables assuming that the latent variables are normally distributed.
 #'
 #' @export
-#' @param data survey responses, where the columns correspond to individual items.
+#' @param data responses, where the columns correspond to individual items.
 #'             Apart from this, `data` can be of almost any class such as
 #'             "data.frame" "matrix" or "array".
 #' @param K number of response categories, a vector or a number
@@ -25,8 +25,8 @@ estimate_parameters <- function(data, K, gamma1=0) {
     if (is.numeric(K)) {
       K <- rep(K, nitems)
     }
-    mat = matrix(data=NA, nrow=nitems, ncol=2)
-    for (i in 1:ncol(data)) {
+    mat <- matrix(data=NA, nrow=nitems, ncol=2)
+    for (i in seq_len(ncol(data))) {
       pk <- prop.table(table(data[,i]))
       estimates <- estimate_mu_sd(pk, K[i], gamma1[i])
       mat[i, ] <- estimates
@@ -62,14 +62,14 @@ estimate_mu_sd <- function(pk, K, gamma1=0, trace=FALSE) {
   cp <- c("mu"=x[1], "sd"=x[2], "gamma1"=gamma1)
   sim <- simulate_responses(K, cp)
   xk <- c(-Inf, sim$xk, Inf)
-  pk <- sapply(as.character(1:K), function(k) { # pad missing levels
+  pk <- vapply(as.character(seq_len(K)), function(k) { # pad missing levels
     ifelse(k %in% names(pk), pk[[k]], 0)
-  })
+  }, numeric(1))
 
   if (abs(gamma1) > 0) { # X ~ skew normal
     if (!requireNamespace("sn", quietly = TRUE)) {
       stop(
-        "Package \"sn\" must be installed estimate parameters for skew responses.
+        "Package \"sn\" must be installed to estimate skew normal parameters.
         Please run:\n\n \tinstall.packages(\"sn\")\n\n",
         call. = FALSE)
     }
@@ -114,16 +114,16 @@ estimate_mu_sd <- function(pk, K, gamma1=0, trace=FALSE) {
     }
   }
   f <- function(x) { # function to find roots
-    matrix(sapply(1:K, function(k) hk(x, k)))
+    matrix(vapply(seq_len(K), function(k) { hk(x, k) }, numeric(1)))
   }
   Df <- function(x) { # Jacobian column wise
-    matrix(c(sapply(1:K, function(k) dhk_u(x, k)),
-             sapply(1:K, function(k) dhk_v(x, k))),
+    matrix(c(vapply(seq_len(K), function(k) { dhk_u(x, k) }, numeric(1)),
+             vapply(seq_len(K), function(k) { dhk_v(x, k) }, numeric(1))),
            ncol=2)
   }
   x_trace <- c(x[1])
   y_trace <- c(x[2])
-  for (i in 1:niters) {
+  for (i in seq_len(niters)) {
     b <- f(x) # evaluate f
     A <- Df(x) # evaluate Jacobian
     A.svd <- svd(A) # can be unstable, use SVD
@@ -163,12 +163,13 @@ plot_contour <- function(f, x_trace, y_trace) {
   xgrid <- seq(-3, 3, length.out = xlen) # -5, 5
   ygrid <- seq(0.1, 3, length.out = ylen) # 0.1, 10
   zvals <- matrix(NA, ncol = xlen, nrow = ylen)
-  for (i in 1:xlen) {
-    for (j in 1:ylen) {
+  for (i in seq_len(xlen)) {
+    for (j in seq_len(ylen)) {
       zvals[i, j] <- norm(f(matrix(c(xgrid[i], ygrid[j]))) , "2")
     }
   }
-  graphics::contour(x = xgrid, y = ygrid, z = zvals, col="gray42", xlab = "u", ylab = "v")
+  graphics::contour(x = xgrid, y = ygrid, z = zvals, 
+                    col="gray42", xlab = "u", ylab = "v")
   graphics::grid(col = "lightgray", lty = "dotted")
   graphics::points(x_trace, y_trace, pch=20, col="blue")
 }
diff --git a/R/helper_functions.R b/R/helper_functions.R
@@ -1,31 +1,13 @@
-#' Calculate variance of probabilities `pk`
-#'
-#' @param pk vector of probabilities
-#' @return variance of `pk`
-get_pk_var <- function(pk) {
-  domain <- (1:length(pk))
-  m <- get_pk_mean(pk)
-  return(sum(pk * (domain - m)^2))
-}
-
-#' Variance of a skew normal distribution
-#'
-#' @param alpha determines the shape
-#' @return variance of a skew-normal distribution
-var_skew_normal <- function(alpha) {
-  return(1 - 2*(delta_skew_normal(alpha)^2)/pi)
-}
-
 #' Pad missing levels with zeros
 #'
 #' @export
 #' @param pk proportions or probabilities across possible responses
 #' @param K number of response categories
 #' @return table of proportions across all possible responses
 pad_levels <- function(pk, K) {
-  pk <- sapply(as.character(1:K), function(k) {
+  pk <- vapply(as.character(seq_len(K)), function(k) {
     ifelse(k %in% names(pk), pk[[k]], 0)
-  })
+  }, numeric(1))
   return(pk)
 }
 

diff --git a/R/simulation_of_responses.R b/R/simulation_of_responses.R
@@ -1,6 +1,6 @@
 #' Get responses
 #'
-#' Generates a sample of random responses based on parameters of latent variables.
+#' Returns a sample of random responses based on parameters of latent variables.
 #'
 #' @export
 #' @param n number of observations
@@ -20,15 +20,17 @@
 #' @examples
 #' data <- get_responses(n = 5, mu = c(0, 1, 2))
 get_responses <- function(n=10, mu=0, sd=1, gamma1=0, K=5, R=0) {
-  inputs <- list(mu, sd, gamma1, K)
-  nitems <- max(lengths(inputs))
+  raw_inputs <- list(mu, sd, gamma1, K)
+  nitems <- max(lengths(raw_inputs))
 
   if (nitems == 1) {
     return(get_univariate_responses(n, mu, sd, gamma1, K))
   }
 
-  # single inputs are repeated to create a vector
-  inputs <- sapply(inputs, function(input) rep(input, length.out = nitems))
+  # single raw_inputs are repeated to create a vector
+  inputs <- vapply(raw_inputs,
+                   function(input) { rep(input, length.out=nitems) },
+                   numeric(nitems))
   colnames(inputs) <- c("mu", "sd", "gamma1", "K")
 
   # conditions to deal with correlation input `R`
@@ -42,7 +44,7 @@ get_responses <- function(n=10, mu=0, sd=1, gamma1=0, K=5, R=0) {
       get_univariate_responses(
         n, input[["mu"]], input[["sd"]], input[["gamma1"]], input[["K"]])
     })
-    colnames(data) <- sapply(1:nitems, function(i) paste("Y", i, sep = ""))
+    colnames(data) <- sapply(seq_len(nitems), function(i) paste("Y", i, sep=""))
     return(data)
   }
   if (case_2(R) == 1) { # random correlation matrix is used
@@ -56,7 +58,8 @@ get_responses <- function(n=10, mu=0, sd=1, gamma1=0, K=5, R=0) {
   } else {
     # a correlation matrix must be provided
     if(case_4(R) != 1)
-      stop("Error: correlation R can be a number, \"random\", or a correlation matrix.")
+      stop("Error: correlation R can be a number, \"random\", 
+           or a correlation matrix.")
   }
   sigma <- cor2cov(R, inputs[, "sd"])
 
@@ -84,10 +87,10 @@ get_responses <- function(n=10, mu=0, sd=1, gamma1=0, K=5, R=0) {
                        c("mu"=0, "sd"=1, "gamma1"=input[["gamma1"]]))
   })
 
-  data <- sapply(1:nitems, function(i) {
+  data <- sapply(seq_len(nitems), function(i) {
     findInterval(lat[, i], c(-Inf, sims[[i]]$xk, Inf))
   })
-  colnames(data) <- sapply(1:nitems, function(i) paste("Y", i, sep = ""))
+  colnames(data) <- sapply(seq_len(nitems), function(i) paste("Y", i, sep = ""))
   return(data)
 }
 
@@ -105,7 +108,7 @@ get_responses <- function(n=10, mu=0, sd=1, gamma1=0, K=5, R=0) {
 #' discrete random variable from which samples are taken.
 get_univariate_responses <- function(n=10, mu=0, sd=1, gamma1=0, K=5) {
   sim <- simulate_responses(K, c("mu" = mu, "sd" = sd, "gamma1" = gamma1))
-  data <- sample(x = 1:K, size = n, replace = TRUE, prob = sim$pk)
+  data <- sample(x = seq_len(K), size = n, replace = TRUE, prob = sim$pk)
   return(data)
 }
 
@@ -117,7 +120,7 @@ get_univariate_responses <- function(n=10, mu=0, sd=1, gamma1=0, K=5) {
 #'
 #' @export
 #' @param K number of response categories
-#' @param params parameters `xi`, `omega` and `alpha` of skew-normal distribution
+#' @param params skew-normal parameters `xi`, `omega` and `alpha`
 #' @return lists of probabilities `pk` and cut points `xk`
 #' @examples
 #' simulate_responses(K = 5, params = c("mu"=0, "sd"=1, "gamma1"=0))
@@ -141,7 +144,7 @@ simulate_responses <- function(K, params) {
 
 #' Implementation of Lloyd's algorithm
 #'
-#' Given a probability density function `fX` of a continuous random variable `X`,
+#' Given a probability density function `fX` of a random variable `X`,
 #' the function returns a discrete random variable with `K` possible outcomes
 #' that best approximates `X` in the mean-square or L2 sense by minimizing MSE.
 #'
@@ -157,7 +160,7 @@ run_Lloyd <- function(fX, K) {
   niters <- 100 # enough for convergence
   pk_means <- c()
   pk_MS_errors <- c()
-  for (i in 1:niters) {
+  for (i in seq_len(niters)) {
     xk <- get_new_xk(rk) # calculate the new cut points
     rk <- get_new_rk(xk, fX) # calculate the new representatives
     ## calculate estimated probabilities, means and distortion measure
@@ -179,7 +182,7 @@ run_Lloyd <- function(fX, K) {
 get_new_xk <- function(rk) {
   K <- (length(rk) - 1) # generalized to arbitrary number of points
   xk <- rep(0, K)
-  for (k in 1:K) {
+  for (k in seq_len(K)) {
     xk[k] <- (rk[k] + rk[k+1])/2
   }
   return(xk)
@@ -200,7 +203,7 @@ get_new_rk <- function(xk, fX) {
   f_below <- function(x) {
     fX(x)
   }
-  for (k in 1:K) {
+  for (k in seq_len(K)) {
     result_above <- stats::integrate(f_above, lower=xk[k], upper=xk[k+1])[[1]]
     result_below <- stats::integrate(f_below, lower=xk[k], upper=xk[k+1])[[1]]
     rk[k] <- result_above/result_below
@@ -217,7 +220,7 @@ get_pk <- function(xk, fX) {
   K <- length(xk) + 1
   xk <- c(-Inf, xk, Inf)
   pk <- rep(0, K)
-  for (k in 1:K) {
+  for (k in seq_len(K)) {
     lower_bound <- xk[k]
     upper_bound <- xk[k+1]
     pk[k] <- stats::integrate(fX, lower=lower_bound, upper=upper_bound)[[1]]
@@ -230,7 +233,7 @@ get_pk <- function(xk, fX) {
 #' @param pk vector of probabilities `pk`
 #' @return mean of `pk`
 get_pk_mean <- function(pk) {
-  domain <- (1:length(pk))
+  domain <- seq_len(length(pk))
   return(sum(pk * domain))
 }
 
@@ -244,11 +247,12 @@ get_MSE <- function(xk, rk, fX) {
   K <- length(rk) # generalized for testing
   xk <- c(-Inf, xk, Inf)
   mse <- 0
-  for (k in 1:K) {
+  for (k in seq_len(K)) {
     lower_bound <- xk[k]
     upper_bound <- xk[k+1]
     integrand <- function(x) { ((x - rk[k])^2)*fX(x)  }
-    mse <- mse + stats::integrate(integrand, lower=lower_bound, upper=upper_bound)[[1]]
+    mse <- mse + stats::integrate(integrand, 
+                                  lower=lower_bound, upper=upper_bound)[[1]]
   }
   return(mse)
 }
@@ -263,7 +267,8 @@ get_MSE <- function(xk, rk, fX) {
 #' @return density at `x`
 #' @seealso [sn::dsn()]
 d_skew_normal <- function(x, xi=0, omega=1, alpha=0) {
-  return(2/omega*stats::dnorm((x - xi)/omega)*stats::pnorm(alpha*(x - xi)/omega))
+  return(2/omega*stats::dnorm((x - xi)/omega) * 
+           stats::pnorm(alpha*(x - xi)/omega))
 }
 
 #' The mean of skew normal distribution
@@ -282,6 +287,14 @@ delta_skew_normal <- function(alpha) {
   return(alpha / (sqrt(1 + alpha^2)))
 }
 
+#' Variance of a skew normal distribution
+#'
+#' @param alpha determines the shape
+#' @return variance of a skew-normal distribution
+var_skew_normal <- function(alpha) {
+  return(1 - 2*(delta_skew_normal(alpha)^2)/pi)
+}
+
 #' Convert parameters
 #'
 #' Converts from centered parameters to direct parameters appearing in

diff --git a/README_files/figure-gfm/unnamed-chunk-4-1.svg b/README_files/figure-gfm/unnamed-chunk-4-1.svg
diff --git a/tests/informal/discretization_test.R b/tests/informal/discretization_test.R
@@ -6,7 +6,10 @@ plot_density <- function(K, params) {
 
   params_dp <- convert_params(params)
   x <- seq(-3, 3, length = 200)
-  y <- d_skew_normal(x, params_dp[["xi"]], params_dp[["omega"]], params_dp[["alpha"]])
+  y <- d_skew_normal(x, 
+                     params_dp[["xi"]], 
+                     params_dp[["omega"]], 
+                     params_dp[["alpha"]])
 
   dd <- data.frame(x=x, y=y)
   eps <- 0.008

diff --git a/tests/informal/sn_univariate_test.R b/tests/informal/sn_univariate_test.R
@@ -1,4 +1,3 @@
-setwd("/home/marko/repos/responsesR/R/")
 source("simulation_of_responses.R")
 source("helper_functions.R")
 

diff --git a/tests/testthat/test_discretization.R b/tests/testthat/test_discretization.R
@@ -6,7 +6,7 @@ test_that("discretization of N(0,1) with 5 categories gives expected result", {
   expect_equal(sim$pk, pk, tolerance=0.1)
 })
 
-test_that("discretization of N(0,1)  with 4 categories gives expected results", {
+test_that("discretization of N(0,1) with 4 categories gives expected results", {
   sim <- simulate_responses(K=4, params=c("mu"=0, "sd"=1, "gamma1"=0))
   expected_sim <- list(
     "pk"=c(0.16, 0.34, 0.34, 0.16), # expected probabilities
@@ -34,3 +34,15 @@ test_that("discretization of skew-normal with mu=-1, sd=0.5, gamma1=0.5
   )
   expect_equal(sim, expected_sim, tolerance=0.1)
 })
+
+test_that("scaling and shifting skew normal parameters works", {
+  expect_equal(delta_skew_normal(2), 2 / (sqrt(1 + 4)), tolerance=0.1)
+  expect_equal(mean_skew_normal(2), 
+               delta_skew_normal(2) * sqrt(2/pi), tolerance=0.1)
+  expect_equal(var_skew_normal(2), 0.5, tolerance=0.1)
+
+  x <- c(-0.98, 0, 0.98)
+  dp <- c("xi"=0.5, "omega"=1, "alpha"=-1)
+  x_scaled_and_shifted <- c(-1.48, -0.50, 0.48)
+  expect_equal(scale_and_shift(x, dp), x_scaled_and_shifted)
+})