Skip to content

Commit

Permalink
refactored functions and updated README
Browse files Browse the repository at this point in the history
  • Loading branch information
markolalovic committed Jun 17, 2024
1 parent 2cb94fe commit b1e3a24
Show file tree
Hide file tree
Showing 180 changed files with 33,583 additions and 31,856 deletions.
1 change: 1 addition & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,4 @@
^Meta$
^revdep$
^cran-comments\.md$
^codemeta\.json$
5 changes: 2 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,8 @@ Description: Effectively simulates the discretization process inherent to Likert
Particularly useful for accurately modeling and analyzing survey data that use Likert scales, especially
when applying statistical techniques that require metric data.
License: MIT + file LICENSE
URL: https://lalovic.io/responsesR/,
https://github.com/markolalovic/responsesR/
BugReports: https://github.com/markolalovic/responsesR/issues/
URL: https://lalovic.io/latent2likert/
BugReports: https://github.com/markolalovic/latent2likert/issues/
Depends:
R (>= 3.5)
Imports:
Expand Down
7 changes: 3 additions & 4 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,14 +1,13 @@
# latent2likert 0.1.0 (Release date: 2020-10-17)

- Initial release.
- Initial release (development version).
- Tested on platforms: x86_64-pc-linux-gnu (64-bit) and x86_64-w64-mingw32 (64-bit).

# latent2likert 1.0.0 (Release date: 2024-03-28)

- The option to generate correlated Likert scale items was added to the function `rLikert()`.
- New function `estimate_parameters()` was added that allows for the estimation of parameters from existing survey data to replicate it more accurately.
- Ensured compatibility with
- Issues related to the dependencies have been resolved.
- Issues related to the dependencies have been resolved.
- This version now only imports the standard R packages mvtnorm, stats, and graphics packages stats and graphics, which are typically included in R releases.
- The package sn is necessary only when generating correlated responses based on skew normal distribution.
- Added user prompts for installing the sn package when necessary.
Expand All @@ -29,6 +28,6 @@
- Modularized core functions for better readability.
- Improved the structure and organization of the codebase.
- Improved error handling of different cases for correlations input.
- Updated the estimate_parameters function for estimating latent parameters from survey data.
- Updated the `estimate_parameters()` function for estimating latent parameters from survey data.
- The codebase is currently in development, finer details may change.

94 changes: 47 additions & 47 deletions R/discretization.R
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
#' Discretize Density
#'
#' Transforms the density function of a continuous random variable into a
#' discrete probability distribution with minimal distortion
#' discrete probability distribution with minimal distortion
#' using the Lloyd-Max algorithm.
#'
#' @param density_func The probability density function of the continuous random variable.
#' @param density_fn The probability density function of the continuous random variable.
#' @param n_levels The cardinality of the set of all possible outcomes.
#' @param eps The convergence threshold for the algorithm.
#'
Expand All @@ -16,25 +16,25 @@
#' \item{dist}{Distortion measured as the mean-squared error (MSE).}
#' }
#' @export
discretize_density <- function(density_func, n_levels, eps = 1e-6) {
# Initial set of representation points
repr <- seq(-5, 5, length.out = n_levels)
midp <- calc_midpoints(repr)
dist <- compute_distortion(density_func, midp, repr)
discretize_density <- function(density_fn, n_levels, eps = 1e-6) {
# Initial set of representation points
repr <- seq(-5, 5, length.out = n_levels)
midp <- calc_midpoints(repr)
dist <- compute_distortion(density_fn, midp, repr)

diff <- Inf
while (diff > eps) {
repr <- find_representatives(density_func, midp)
midp <- calc_midpoints(repr)
newd <- compute_distortion(density_func, midp, repr)
diff <- Inf
while (diff > eps) {
repr <- find_representatives(density_fn, midp)
midp <- calc_midpoints(repr)
newd <- compute_distortion(density_fn, midp, repr)

diff <- dist - newd
dist <- newd
}
diff <- dist - newd
dist <- newd
}

endp <- c(-Inf, midp, Inf)
prob <- calc_probs(density_func, endp)
return(list(prob = prob, endp = endp, repr = repr, dist = dist))
endp <- c(-Inf, midp, Inf)
prob <- calc_probs(density_fn, endp)
return(list(prob = prob, endp = endp, repr = repr, dist = dist))
}

#' Calculate Midpoints
Expand All @@ -45,61 +45,61 @@ discretize_density <- function(density_func, n_levels, eps = 1e-6) {
#'
#' @return A vector of midpoints.
calc_midpoints <- function(repr) {
n_levels <- length(repr)
midp <- (repr[2:n_levels] + repr[1:(n_levels - 1)]) / 2
return(midp)
n_levels <- length(repr)
midp <- (repr[2:n_levels] + repr[1:(n_levels - 1)]) / 2
return(midp)
}

#' Find Representatives
#'
#' Find the representation points for the intervals defined by the midpoints.
#'
#' @param density_func The probability density function of the continuous random variable.
#' @param density_fn The probability density function of the continuous random variable.
#' @param midp The midpoints of the intervals.
#'
#' @return A vector of representation points.
find_representatives <- function(density_func, midp) {
n_levels <- length(midp) + 1
endp <- c(-Inf, midp, Inf)
repr <- numeric(n_levels)
for (k in seq_len(n_levels)) {
a <- stats::integrate(function(x) x * density_func(x), endp[k], endp[k + 1])
b <- stats::integrate(density_func, endp[k], endp[k + 1])
repr[k] <- a[[1]] / b[[1]]
}
return(repr)
find_representatives <- function(density_fn, midp) {
n_levels <- length(midp) + 1
endp <- c(-Inf, midp, Inf)
repr <- numeric(n_levels)
for (k in seq_len(n_levels)) {
a <- stats::integrate(function(x) x * density_fn(x), endp[k], endp[k + 1])
b <- stats::integrate(density_fn, endp[k], endp[k + 1])
repr[k] <- a[[1]] / b[[1]]
}
return(repr)
}

#' Compute Distortion
#'
#' Compute the distortion (mean-squared error) for the given representation points.
#'
#' @param density_func The probability density function of the continuous random variable \code{X}.
#' @param density_fn The probability density function of the continuous random variable \code{X}.
#' @param midp The midpoints of the intervals.
#' @param repr The representation points of the intervals.
#'
#' @return The distortion (mean-squared error).
compute_distortion <- function(density_func, midp, repr) {
n_levels <- length(repr)
endp <- c(-Inf, midp, Inf)
mse <- vapply(seq_len(n_levels), function(k) {
stats::integrate(function(x) density_func(x) * (x - repr[k])^2, endp[k], endp[k + 1])[[1]]
}, numeric(1))
return(sum(mse))
compute_distortion <- function(density_fn, midp, repr) {
n_levels <- length(repr)
endp <- c(-Inf, midp, Inf)
mse <- vapply(seq_len(n_levels), function(k) {
stats::integrate(function(x) density_fn(x) * (x - repr[k])^2, endp[k], endp[k + 1])[[1]]
}, numeric(1))
return(sum(mse))
}

#' Calculate Probabilities
#'
#' Calculate the discrete probabilities for the given cut points.
#'
#' @param density_func The probability density function of the continuous random variable.
#' @param density_fn The probability density function of the continuous random variable.
#' @param endp The end points of the intervals.
#'
#' @return A vector of probabilities for the intervals.
calc_probs <- function(density_func, endp) {
n_levels <- length(endp) - 1
prob <- vapply(seq_len(n_levels), function(k) {
stats::integrate(function(x) density_func(x), endp[k], endp[k + 1])[[1]]
}, numeric(1))
return(prob)
calc_probs <- function(density_fn, endp) {
n_levels <- length(endp) - 1
prob <- vapply(seq_len(n_levels), function(k) {
stats::integrate(function(x) density_fn(x), endp[k], endp[k + 1])[[1]]
}, numeric(1))
return(prob)
}
122 changes: 63 additions & 59 deletions R/estimation.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
#' @param skew Marginal skewness of latent variables, defaults to 0.
#' @return A table of estimated parameters for each latent variable.
#' @seealso See also [latent2likert::estimate_mean_and_sd()].
estimate_parameters <- function(data, n_levels, skew=0) {
estimate_params <- function(data, n_levels, skew = 0) {
if (is.vector(data)) {
prob <- prop.table(table(data))
estimates <- estimate_mean_and_sd(prob, n_levels, skew)
Expand All @@ -22,9 +22,9 @@ estimate_parameters <- function(data, n_levels, skew=0) {
if (is.numeric(n_levels)) {
n_levels <- rep(n_levels, nitems)
}
mat <- matrix(data=NA, nrow=nitems, ncol=2)
mat <- matrix(data = NA, nrow = nitems, ncol = 2)
for (i in seq_len(ncol(data))) {
prob <- prop.table(table(data[,i]))
prob <- prop.table(table(data[, i]))
estimates <- estimate_mean_and_sd(prob, n_levels[i], skew[i])
mat[i, ] <- estimates
}
Expand All @@ -36,71 +36,73 @@ estimate_parameters <- function(data, n_levels, skew=0) {

# Solve reparameterized system for stability
estimate_mean_and_sd <- function(prob, n_levels, skew = 0, eps = 1e-6, maxit = 100) {
prob <- as.vector(pad_levels(prob, 5))
endp <- calc_endpoints(n_levels, skew)
dist_funcs <- initialize_distributions(skew)
x <- matrix(c(0, 1)) # Initial guess
trace <- matrix(rep(x, maxit), ncol = maxit)
prob <- as.vector(pad_levels(prob, n_levels))
endp <- calc_endpoints(n_levels, skew)
dist_funcs <- initialize_distributions(skew)
x <- matrix(c(0, 1)) # Initial guess
trace <- matrix(rep(x, maxit), ncol = maxit)

for (i in seq_len(maxit)) {
b <- fn(x, endp, prob, dist_funcs$cdf_X)
A <- jac(x, endp, dist_funcs$pdf_X)
A_svd <- svd(A)
dx <- A_svd$v %*% diag(1 / A_svd$d) %*% t(A_svd$u) %*% (-b)

# Prevent stepping into negative values for v = 1 / sd
while ((x + dx)[2] < 0) {
dx <- dx / 2
}
for (i in seq_len(maxit)) {
b <- fn(x, endp, prob, dist_funcs$cdf_X)
A <- jac(x, endp, dist_funcs$pdf_X)
A_svd <- svd(A)
dx <- A_svd$v %*% diag(1 / A_svd$d) %*% t(A_svd$u) %*% (-b)

x <- x + 0.2 * dx
trace[, i] <- x

if (norm(dx, "2") < eps) {
break
}
# Prevent stepping into negative values for v = 1 / sd
while ((x + dx)[2] < 0) {
dx <- dx / 2
}

mean <- x[1]
sd <- 1 / x[2]
# list("mean"=mean, "sd"=sd, "trace"=trace)
return(c(mean, sd))

x <- x + 0.2 * dx
trace[, i] <- x

if (norm(dx, "2") < eps) {
break
}
}

mean <- x[1]
sd <- 1 / x[2]
# list("mean"=mean, "sd"=sd, "trace"=trace)
return(c(mean, sd))
}

# Initialize CDF and PDF functions based on skew
initialize_distributions <- function(skew) {
if (abs(skew) > 0) {
check_package("sn")
cp <- c("mu" = 0, "sd" = 1, "skew" = skew)
dp <- sn::cp2dp(cp, family = "SN")
return(list(cdf_X = function(x) sn::psn(x, dp = dp),
pdf_X = function(x) sn::dsn(x, dp = dp)))
} else {
return(list(cdf_X = stats::pnorm, pdf_X = stats::dnorm))
}
if (abs(skew) > 0) {
check_package("sn")
cp <- c("mu" = 0, "sd" = 1, "skew" = skew)
dp <- sn::cp2dp(cp, family = "SN")
return(list(
cdf_X = function(x) sn::psn(x, dp = dp),
pdf_X = function(x) sn::dsn(x, dp = dp)
))
} else {
return(list(cdf_X = stats::pnorm, pdf_X = stats::dnorm))
}
}

# Function to find roots
fn <- function(x, endp, prob, cdf_X) {
u <- x[1]
v <- x[2]
y <- cdf_X(v * endp - u * v)
return(matrix(tail(y, -1) - head(y, -1) - prob))
u <- x[1]
v <- x[2]
y <- cdf_X(v * endp - u * v)
return(matrix(tail(y, -1) - head(y, -1) - prob))
}

# Jacobian column wise
jac <- function(x, endp, pdf_X) {
u <- x[1]
v <- x[2]
midp <- head(tail(endp, -1), -1)
du <- pdf_X(v * endp - u * v) * (-v)
dv <- pdf_X(v * midp - u * v) * (midp - u)
du <- tail(du, -1) - head(du, -1)
dv <- c(head(dv, 1), tail(dv, -1) - head(dv, -1), -tail(dv, 1))
return(cbind(du, dv))
u <- x[1]
v <- x[2]
midp <- head(tail(endp, -1), -1)

du <- pdf_X(v * endp - u * v) * (-v)
dv <- pdf_X(v * midp - u * v) * (midp - u)

du <- tail(du, -1) - head(du, -1)
dv <- c(head(dv, 1), tail(dv, -1) - head(dv, -1), -tail(dv, 1))

return(cbind(du, dv))
}

plot_contour <- function(fn, endp, prob, cdf_X, trace) {
Expand All @@ -111,11 +113,13 @@ plot_contour <- function(fn, endp, prob, cdf_X, trace) {
zvals <- matrix(NA, ncol = xlen, nrow = ylen)
for (i in seq_len(xlen)) {
for (j in seq_len(ylen)) {
zvals[i, j] <- norm(fn(matrix(c(xgrid[i], ygrid[j])), endp, prob, cdf_X) , "2")
zvals[i, j] <- norm(fn(matrix(c(xgrid[i], ygrid[j])), endp, prob, cdf_X), "2")
}
}
graphics::contour(x = xgrid, y = ygrid, z = zvals,
col="gray42", xlab = "u = mu", ylab = "v = 1/sd")
graphics::contour(
x = xgrid, y = ygrid, z = zvals,
col = "gray42", xlab = "u = mu", ylab = "v = 1/sd"
)
graphics::grid(col = "lightgray", lty = "dotted")
graphics::points(trace[1, ], trace[2, ], pch=20, col="blue")
}
graphics::points(trace[1, ], trace[2, ], pch = 20, col = "blue")
}
Loading

0 comments on commit b1e3a24

Please sign in to comment.