refactored functions and updated README

markolalovic · Jun 17, 2024 · b1e3a24 · b1e3a24
1 parent 2cb94fe
commit b1e3a24
Show file tree

Hide file tree

Showing 180 changed files with 33,583 additions and 31,856 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -21,3 +21,4 @@
 ^Meta$
 ^revdep$
 ^cran-comments\.md$
+^codemeta\.json$
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -9,9 +9,8 @@ Description: Effectively simulates the discretization process inherent to Likert
     Particularly useful for accurately modeling and analyzing survey data that use Likert scales, especially 
     when applying statistical techniques that require metric data.
 License: MIT + file LICENSE
-URL: https://lalovic.io/responsesR/,
-    https://github.com/markolalovic/responsesR/
-BugReports: https://github.com/markolalovic/responsesR/issues/
+URL: https://lalovic.io/latent2likert/
+BugReports: https://github.com/markolalovic/latent2likert/issues/
 Depends: 
     R (>= 3.5)
 Imports: 

diff --git a/NEWS.md b/NEWS.md
@@ -1,14 +1,13 @@
 # latent2likert 0.1.0 (Release date: 2020-10-17)
 
-- Initial release.
+- Initial release (development version).
 - Tested on platforms: x86_64-pc-linux-gnu (64-bit) and x86_64-w64-mingw32 (64-bit).
 
 # latent2likert 1.0.0 (Release date: 2024-03-28)
 
 - The option to generate correlated Likert scale items was added to the function `rLikert()`.
 - New function `estimate_parameters()` was added that allows for the estimation of parameters from existing survey data to replicate it more accurately.
-- Ensured compatibility with 
-- Issues related to the dependencies have been resolved. 
+- Issues related to the dependencies have been resolved.
 - This version now only imports the standard R packages mvtnorm, stats, and graphics packages stats and graphics, which are typically included in R releases.
 - The package sn is necessary only when generating correlated responses based on skew normal distribution.
 - Added user prompts for installing the sn package when necessary.
@@ -29,6 +28,6 @@
 - Modularized core functions for better readability.
 - Improved the structure and organization of the codebase.
 - Improved error handling of different cases for correlations input.
-- Updated the estimate_parameters function for estimating latent parameters from survey data.
+- Updated the `estimate_parameters()` function for estimating latent parameters from survey data.
 - The codebase is currently in development, finer details may change.
 
diff --git a/R/discretization.R b/R/discretization.R
@@ -1,10 +1,10 @@
 #' Discretize Density
 #'
 #' Transforms the density function of a continuous random variable into a
-#' discrete probability distribution with minimal distortion 
+#' discrete probability distribution with minimal distortion
 #' using the Lloyd-Max algorithm.
 #'
-#' @param density_func The probability density function of the continuous random variable.
+#' @param density_fn The probability density function of the continuous random variable.
 #' @param n_levels The cardinality of the set of all possible outcomes.
 #' @param eps The convergence threshold for the algorithm.
 #'
@@ -16,25 +16,25 @@
 #'   \item{dist}{Distortion measured as the mean-squared error (MSE).}
 #' }
 #' @export
-discretize_density <- function(density_func, n_levels, eps = 1e-6) {
-    # Initial set of representation points
-    repr <- seq(-5, 5, length.out = n_levels)
-    midp <- calc_midpoints(repr)
-    dist <- compute_distortion(density_func, midp, repr)
+discretize_density <- function(density_fn, n_levels, eps = 1e-6) {
+  # Initial set of representation points
+  repr <- seq(-5, 5, length.out = n_levels)
+  midp <- calc_midpoints(repr)
+  dist <- compute_distortion(density_fn, midp, repr)
 
-    diff <- Inf
-    while (diff > eps) {
-        repr <- find_representatives(density_func, midp)
-        midp <- calc_midpoints(repr)
-        newd <- compute_distortion(density_func, midp, repr)
+  diff <- Inf
+  while (diff > eps) {
+    repr <- find_representatives(density_fn, midp)
+    midp <- calc_midpoints(repr)
+    newd <- compute_distortion(density_fn, midp, repr)
 
-        diff <- dist - newd
-        dist <- newd
-    }
+    diff <- dist - newd
+    dist <- newd
+  }
 
-    endp <- c(-Inf, midp, Inf)
-    prob <- calc_probs(density_func, endp)
-    return(list(prob = prob, endp = endp, repr = repr, dist = dist))
+  endp <- c(-Inf, midp, Inf)
+  prob <- calc_probs(density_fn, endp)
+  return(list(prob = prob, endp = endp, repr = repr, dist = dist))
 }
 
 #' Calculate Midpoints
@@ -45,61 +45,61 @@ discretize_density <- function(density_func, n_levels, eps = 1e-6) {
 #'
 #' @return A vector of midpoints.
 calc_midpoints <- function(repr) {
-    n_levels <- length(repr)
-    midp <- (repr[2:n_levels] + repr[1:(n_levels - 1)]) / 2
-    return(midp)
+  n_levels <- length(repr)
+  midp <- (repr[2:n_levels] + repr[1:(n_levels - 1)]) / 2
+  return(midp)
 }
 
 #' Find Representatives
 #'
 #' Find the representation points for the intervals defined by the midpoints.
 #'
-#' @param density_func The probability density function of the continuous random variable.
+#' @param density_fn The probability density function of the continuous random variable.
 #' @param midp The midpoints of the intervals.
 #'
 #' @return A vector of representation points.
-find_representatives <- function(density_func, midp) {
-    n_levels <- length(midp) + 1
-    endp <- c(-Inf, midp, Inf)
-    repr <- numeric(n_levels)
-    for (k in seq_len(n_levels)) {
-        a <- stats::integrate(function(x) x * density_func(x), endp[k], endp[k + 1])
-        b <- stats::integrate(density_func, endp[k], endp[k + 1])
-        repr[k] <- a[[1]] / b[[1]]
-    }
-    return(repr)
+find_representatives <- function(density_fn, midp) {
+  n_levels <- length(midp) + 1
+  endp <- c(-Inf, midp, Inf)
+  repr <- numeric(n_levels)
+  for (k in seq_len(n_levels)) {
+    a <- stats::integrate(function(x) x * density_fn(x), endp[k], endp[k + 1])
+    b <- stats::integrate(density_fn, endp[k], endp[k + 1])
+    repr[k] <- a[[1]] / b[[1]]
+  }
+  return(repr)
 }
 
 #' Compute Distortion
 #'
 #' Compute the distortion (mean-squared error) for the given representation points.
 #'
-#' @param density_func The probability density function of the continuous random variable \code{X}.
+#' @param density_fn The probability density function of the continuous random variable \code{X}.
 #' @param midp The midpoints of the intervals.
 #' @param repr The representation points of the intervals.
 #'
 #' @return The distortion (mean-squared error).
-compute_distortion <- function(density_func, midp, repr) {
-    n_levels <- length(repr)
-    endp <- c(-Inf, midp, Inf)
-    mse <- vapply(seq_len(n_levels), function(k) {
-        stats::integrate(function(x) density_func(x) * (x - repr[k])^2, endp[k], endp[k + 1])[[1]]
-    }, numeric(1))
-    return(sum(mse))
+compute_distortion <- function(density_fn, midp, repr) {
+  n_levels <- length(repr)
+  endp <- c(-Inf, midp, Inf)
+  mse <- vapply(seq_len(n_levels), function(k) {
+    stats::integrate(function(x) density_fn(x) * (x - repr[k])^2, endp[k], endp[k + 1])[[1]]
+  }, numeric(1))
+  return(sum(mse))
 }
 
 #' Calculate Probabilities
 #'
 #' Calculate the discrete probabilities for the given cut points.
 #'
-#' @param density_func The probability density function of the continuous random variable.
+#' @param density_fn The probability density function of the continuous random variable.
 #' @param endp The end points of the intervals.
 #'
 #' @return A vector of probabilities for the intervals.
-calc_probs <- function(density_func, endp) {
-    n_levels <- length(endp) - 1
-    prob <- vapply(seq_len(n_levels), function(k) {
-        stats::integrate(function(x) density_func(x), endp[k], endp[k + 1])[[1]]
-    }, numeric(1))
-    return(prob)
+calc_probs <- function(density_fn, endp) {
+  n_levels <- length(endp) - 1
+  prob <- vapply(seq_len(n_levels), function(k) {
+    stats::integrate(function(x) density_fn(x), endp[k], endp[k + 1])[[1]]
+  }, numeric(1))
+  return(prob)
 }
diff --git a/R/estimation.R b/R/estimation.R
@@ -10,7 +10,7 @@
 #' @param skew Marginal skewness of latent variables, defaults to 0.
 #' @return A table of estimated parameters for each latent variable.
 #' @seealso See also [latent2likert::estimate_mean_and_sd()].
-estimate_parameters <- function(data, n_levels, skew=0) {
+estimate_params <- function(data, n_levels, skew = 0) {
   if (is.vector(data)) {
     prob <- prop.table(table(data))
     estimates <- estimate_mean_and_sd(prob, n_levels, skew)
@@ -22,9 +22,9 @@ estimate_parameters <- function(data, n_levels, skew=0) {
     if (is.numeric(n_levels)) {
       n_levels <- rep(n_levels, nitems)
     }
-    mat <- matrix(data=NA, nrow=nitems, ncol=2)
+    mat <- matrix(data = NA, nrow = nitems, ncol = 2)
     for (i in seq_len(ncol(data))) {
-      prob <- prop.table(table(data[,i]))
+      prob <- prop.table(table(data[, i]))
       estimates <- estimate_mean_and_sd(prob, n_levels[i], skew[i])
       mat[i, ] <- estimates
     }
@@ -36,71 +36,73 @@ estimate_parameters <- function(data, n_levels, skew=0) {
 
 # Solve reparameterized system for stability
 estimate_mean_and_sd <- function(prob, n_levels, skew = 0, eps = 1e-6, maxit = 100) {
-    prob <- as.vector(pad_levels(prob, 5))
-    endp <- calc_endpoints(n_levels, skew)
-    dist_funcs <- initialize_distributions(skew)
-    x <- matrix(c(0, 1))  # Initial guess
-    trace <- matrix(rep(x, maxit), ncol = maxit)
+  prob <- as.vector(pad_levels(prob, n_levels))
+  endp <- calc_endpoints(n_levels, skew)
+  dist_funcs <- initialize_distributions(skew)
+  x <- matrix(c(0, 1)) # Initial guess
+  trace <- matrix(rep(x, maxit), ncol = maxit)
 
-    for (i in seq_len(maxit)) {
-        b <- fn(x, endp, prob, dist_funcs$cdf_X)
-        A <- jac(x, endp, dist_funcs$pdf_X)
-        A_svd <- svd(A)
-        dx <- A_svd$v %*% diag(1 / A_svd$d) %*% t(A_svd$u) %*% (-b)
-
-        # Prevent stepping into negative values for v = 1 / sd
-        while ((x + dx)[2] < 0) {
-            dx <- dx / 2
-        }
+  for (i in seq_len(maxit)) {
+    b <- fn(x, endp, prob, dist_funcs$cdf_X)
+    A <- jac(x, endp, dist_funcs$pdf_X)
+    A_svd <- svd(A)
+    dx <- A_svd$v %*% diag(1 / A_svd$d) %*% t(A_svd$u) %*% (-b)
 
-        x <- x + 0.2 * dx
-        trace[, i] <- x
-
-        if (norm(dx, "2") < eps) {
-            break
-        }
+    # Prevent stepping into negative values for v = 1 / sd
+    while ((x + dx)[2] < 0) {
+      dx <- dx / 2
     }
-
-    mean <- x[1]
-    sd <- 1 / x[2]
-    # list("mean"=mean, "sd"=sd, "trace"=trace)
-    return(c(mean, sd))
+
+    x <- x + 0.2 * dx
+    trace[, i] <- x
+
+    if (norm(dx, "2") < eps) {
+      break
+    }
+  }
+
+  mean <- x[1]
+  sd <- 1 / x[2]
+  # list("mean"=mean, "sd"=sd, "trace"=trace)
+  return(c(mean, sd))
 }
 
 # Initialize CDF and PDF functions based on skew
 initialize_distributions <- function(skew) {
-    if (abs(skew) > 0) {
-        check_package("sn")
-        cp <- c("mu" = 0, "sd" = 1, "skew" = skew)
-        dp <- sn::cp2dp(cp, family = "SN")
-        return(list(cdf_X = function(x) sn::psn(x, dp = dp), 
-                    pdf_X = function(x) sn::dsn(x, dp = dp)))
-    } else {
-        return(list(cdf_X = stats::pnorm, pdf_X = stats::dnorm))
-    }
+  if (abs(skew) > 0) {
+    check_package("sn")
+    cp <- c("mu" = 0, "sd" = 1, "skew" = skew)
+    dp <- sn::cp2dp(cp, family = "SN")
+    return(list(
+      cdf_X = function(x) sn::psn(x, dp = dp),
+      pdf_X = function(x) sn::dsn(x, dp = dp)
+    ))
+  } else {
+    return(list(cdf_X = stats::pnorm, pdf_X = stats::dnorm))
+  }
 }
 
 # Function to find roots
 fn <- function(x, endp, prob, cdf_X) {
-    u <- x[1]
-    v <- x[2]
-    y <- cdf_X(v * endp - u * v)
-    return(matrix(tail(y, -1) - head(y, -1) - prob))
+  u <- x[1]
+  v <- x[2]
+  y <- cdf_X(v * endp - u * v)
+  return(matrix(tail(y, -1) - head(y, -1) - prob))
 }
-             
+
 # Jacobian column wise
 jac <- function(x, endp, pdf_X) {
-    u <- x[1]
-    v <- x[2]
-    midp <- head(tail(endp, -1), -1)
-    
-    du <- pdf_X(v * endp - u * v) * (-v)
-    dv <- pdf_X(v * midp - u * v) * (midp - u)
-    
-    du <- tail(du, -1) - head(du, -1)
-    dv <- c(head(dv, 1), tail(dv, -1) - head(dv, -1), -tail(dv, 1))
-    
-    return(cbind(du, dv))
+  u <- x[1]
+  v <- x[2]
+  midp <- head(tail(endp, -1), -1)
+
+  du <- pdf_X(v * endp - u * v) * (-v)
+  dv <- pdf_X(v * midp - u * v) * (midp - u)
+
+  du <- tail(du, -1) - head(du, -1)
+  dv <- c(head(dv, 1), tail(dv, -1) - head(dv, -1), -tail(dv, 1))
+
+  return(cbind(du, dv))
 }
 
 plot_contour <- function(fn, endp, prob, cdf_X, trace) {
@@ -111,11 +113,13 @@ plot_contour <- function(fn, endp, prob, cdf_X, trace) {
   zvals <- matrix(NA, ncol = xlen, nrow = ylen)
   for (i in seq_len(xlen)) {
     for (j in seq_len(ylen)) {
-      zvals[i, j] <- norm(fn(matrix(c(xgrid[i], ygrid[j])), endp, prob, cdf_X) , "2")
+      zvals[i, j] <- norm(fn(matrix(c(xgrid[i], ygrid[j])), endp, prob, cdf_X), "2")
     }
   }
-  graphics::contour(x = xgrid, y = ygrid, z = zvals, 
-                    col="gray42", xlab = "u = mu", ylab = "v = 1/sd")
+  graphics::contour(
+    x = xgrid, y = ygrid, z = zvals,
+    col = "gray42", xlab = "u = mu", ylab = "v = 1/sd"
+  )
   graphics::grid(col = "lightgray", lty = "dotted")
-  graphics::points(trace[1, ], trace[2, ], pch=20, col="blue")
-}
+  graphics::points(trace[1, ], trace[2, ], pch = 20, col = "blue")
+}