# R for simulation study ############################# ### Data Generation ############################# ############################# ### Multiple Regression # Simple Regression set.seed(123321) n <- 100 x <- rnorm(n, 0, 1) e <- rnorm(n, 0, sqrt(0.75)) y <- 2 + 0.5*x + e # Multiple Regression set.seed(123321) n <- 100 MX <- rep(0, 3) SX <- matrix(1, 3, 3) SX[1, 2] <- SX[2, 1] <- 0.5 SX[1, 3] <- SX[3, 1] <- 0.2 SX[2, 3] <- SX[3, 2] <- 0.3 library(MASS) X <- mvrnorm(n, MX, SX) e <- rnorm(n, 0, sqrt(0.546)) y <- 1 + 0.2 * X[,1] + 0.3 * X[,2] + 0.4 * X[,3] + e # Simple Regression with Matrix Notation set.seed(123321) X <- matrix(1, n, 2) X[,2] <- rnorm(n, 0, 1) beta <- matrix(c(2, 0.5)) e <- matrix(rnorm(n, 0, sqrt(0.75))) y <- X %*% beta + e # Multiple Regression with Matrix Notation set.seed(123321) n <- 100 MX <- rep(0, 3) SX <- matrix(1, 3, 3) SX[1, 2] <- SX[2, 1] <- 0.5 SX[1, 3] <- SX[3, 1] <- 0.2 SX[2, 3] <- SX[3, 2] <- 0.3 library(MASS) X <- cbind(1, mvrnorm(n, MX, SX)) e <- matrix(rnorm(n, 0, sqrt(0.546))) beta <- matrix(c(1, 0.2, 0.3, 0.4)) y <- X %*% beta + e ############################# ### Logistic Regression # Logistic Regression with Binary Response set.seed(123321) n <- 100 MX <- rep(0, 3) SX <- matrix(1, 3, 3) SX[1, 2] <- SX[2, 1] <- 0.5 SX[1, 3] <- SX[3, 1] <- 0.2 SX[2, 3] <- SX[3, 2] <- 0.3 library(MASS) X <- cbind(1, mvrnorm(n, MX, SX)) pred <- 1 + 0.2 * X[,1] + 0.3 * X[,2] + 0.4 * X[,3] predprob <- 1/(1 + exp(-pred)) y <- rbinom(n, 1, predprob) # Logistic Regression with Multinomial Responses set.seed(123321) n <- 100 x <- rnorm(n, 0, 1) yhat1 <- 0.5 + 0.5*x yhat2 <- 1 + 0.3*x p1overp3 <- exp(-yhat1) p2overp3 <- exp(-yhat2) p1 <- p1overp3 / (p1overp3 + p2overp3 + 1) p2 <- p2overp3 / (p1overp3 + p2overp3 + 1) p3 <- 1 / (p1overp3 + p2overp3 + 1) probs <- cbind(p1, p2, p3) drawmult <- function(p) which(as.logical(rmultinom(1, size = 1, prob = p))) y <- apply(probs, 1, drawmult) # Logistic Regression with Ordinal Responses set.seed(123321) n <- 100 x <- rnorm(n, 0, 1) yhat1 <- -2 + 0.5*x yhat2 <- -1 + 0.5*x yhat3 <- 0.5 + 0.5*x yhat4 <- 1.5 + 0.5*x pleq1 <- 1 / (1 + exp(-yhat1)) pleq2 <- 1 / (1 + exp(-yhat2)) pleq3 <- 1 / (1 + exp(-yhat3)) pleq4 <- 1 / (1 + exp(-yhat4)) p1 <- pleq1 p2 <- pleq2 - pleq1 p3 <- pleq3 - pleq2 p4 <- pleq4 - pleq3 p5 <- 1 - pleq4 probs <- cbind(p1, p2, p3, p4, p5) drawmult <- function(p) which(as.logical(rmultinom(1, size = 1, prob = p))) y <- apply(probs, 1, drawmult) ############################# ### Probit Model # Probit model for binary response set.seed(123321) n <- 100 MX <- rep(0, 3) SX <- matrix(1, 3, 3) SX[1, 2] <- SX[2, 1] <- 0.5 SX[1, 3] <- SX[3, 1] <- 0.2 SX[2, 3] <- SX[3, 2] <- 0.3 library(MASS) X <- cbind(1, mvrnorm(n, MX, SX)) pred <- 0.2 * X[,1] + 0.3 * X[,2] + 0.4 * X[,3] p1 <- 1 - pnorm(1, mean = pred, sd = 1) y <- rbinom(n, 1, p1) # Probit model for ordinal response set.seed(123321) n <- 100 x <- rnorm(n, 0, 1) pred <- 0.5*x pleq1 <- pnorm(-2, mean = pred, sd = 1) pleq2 <- pnorm(-1, mean = pred, sd = 1) pleq3 <- pnorm(0.5, mean = pred, sd = 1) pleq4 <- pnorm(1.5, mean = pred, sd = 1) p1 <- pleq1 p2 <- pleq2 - pleq1 p3 <- pleq3 - pleq2 p4 <- pleq4 - pleq3 p5 <- 1 - pleq4 probs <- cbind(p1, p2, p3, p4, p5) drawmult <- function(p) which(as.logical(rmultinom(1, size = 1, prob = p))) y <- apply(probs, 1, drawmult) ############################# ### Multilevel Model # Random ANOVA model with equal group size n <- 5 k <- 10 gamma00 <- 3 u <- rnorm(k, 0, 0.5) beta0 <- gamma00 + u beta0 <- rep(beta0, each = n) r <- rnorm(n * k, 0, 1) y <- beta0 + r g <- rep(1:k, each = n) dat <- data.frame(y, g) # Random ANOVA model with unequal group size n <- c(4, 5, 6, 4, 5, 7, 4, 5, 6, 7) k <- 10 gamma00 <- 3 u <- rnorm(k, 0, 0.5) beta0 <- gamma00 + u beta0 <- mapply(rep, beta0, n) beta0 <- unlist(beta0) r <- rnorm(sum(n), 0, 1) y <- beta0 + r g <- mapply(rep, 1:k, n) g <- unlist(g) dat <- data.frame(y, g) # Classroom-level covariate n <- 5 k <- 10 u <- rnorm(k, 0, 0.5) w <- rep(c(1, 0), each = k/2) beta0 <- 3 + 1.2 * w + u beta0 <- rep(beta0, each = n) r <- rnorm(n * k, 0, 1) y <- beta0 + r g <- rep(1:k, each = n) w <- rep(w, each = n) dat <- data.frame(y, g, w) # Random coefficient regression n <- 5 k <- 10 library(MASS) TAU <- matrix(c(0.5, 0.01, 0.01, 0.02), 2, 2) U <- mvrnorm(k, rep(0, 2), TAU) beta0 <- 3 + U[,1] beta1 <- 0.2 + U[,2] beta0 <- rep(beta0, each = n) beta1 <- rep(beta1, each = n) r <- rnorm(n * k, 0, 1) mux <- rnorm(k, 0, 0.5) mux <- rep(mux, each = n) x <- rnorm(n * k, mux, 1) y <- beta0 + beta1 * x + r g <- rep(1:k, each = n) dat <- data.frame(y, g, x) # Full multilevel model n <- 5 k <- 10 w <- rep(c(1, 0), each = k/2) library(MASS) TAU <- matrix(c(0.5, 0.01, 0.01, 0.02), 2, 2) U <- mvrnorm(k, rep(0, 2), TAU) beta0 <- 2.6 + 0.8*w + U[,1] beta1 <- 0.18 + 0.04*w + U[,2] beta0 <- rep(beta0, each = n) beta1 <- rep(beta1, each = n) r <- rnorm(n * k, 0, 1) mux <- rnorm(k, 0, 0.5) mux <- rep(mux, each = n) x <- rnorm(n * k, mux, 1) y <- beta0 + beta1 * x + r g <- rep(1:k, each = n) w <- rep(w, each = n) dat <- data.frame(y, g, x, w) ############################# ### Impose Missing Data ############################# # MCAR set.seed(123321) library(MASS) n <- 100 S <- matrix(0.5, 4, 4) S[4,] <- 0.4 S[,4] <- 0.4 diag(S) <- 1 dat <- mvrnorm(n, rep(0, 4), S) colnames(dat) <- c("x1", "x2", "x3", "z") mcardat <- dat mcardat[as.logical(rbinom(n, 1, 0.2)), 1] <- NA summary(mcardat) # MAR mardat <- dat z <- dat[,4] pred <- -1 + 0.4 * z predprob <- 1/(1 + exp(-pred)) mardat[as.logical(rbinom(n, 1, predprob)), 1] <- NA summary(mardat) # NMAR nmardat <- dat pred <- -1 + 0.4 * nmardat[,1] predprob <- 1/(1 + exp(-pred)) nmardat[as.logical(rbinom(n, 1, predprob)), 1] <- NA summary(nmardat) # Attrition MCAR t1 <- rnorm(n, 0, 1) t2 <- 0.5*t1 + rnorm(n, 0, sqrt(0.75)) t3 <- 0.5*t1 + rnorm(n, 0, sqrt(0.75)) t4 <- 0.5*t1 + rnorm(n, 0, sqrt(0.75)) m2 <- as.logical(rbinom(n, 1, 0.05)) m3 <- as.logical(rbinom(n, 1, 0.05)) | m2 m4 <- as.logical(rbinom(n, 1, 0.05)) | m3 t2[m2] <- NA t3[m3] <- NA t4[m4] <- NA dat <- data.frame(t1, t2, t3, t4) summary(dat) # Attrition MAR with time-invariant covariate t1 <- rnorm(n, 0, 1) t2 <- 0.5*t1 + rnorm(n, 0, sqrt(0.75)) t3 <- 0.5*t1 + rnorm(n, 0, sqrt(0.75)) t4 <- 0.5*t1 + rnorm(n, 0, sqrt(0.75)) x <- rnorm(n, 0, 1) pred <- -2.944 + 0.374 * x predprob <- 1/(1 + exp(-pred)) m2 <- as.logical(rbinom(n, 1, predprob)) m3 <- as.logical(rbinom(n, 1, predprob)) | m2 m4 <- as.logical(rbinom(n, 1, predprob)) | m3 t2[m2] <- NA t3[m3] <- NA t4[m4] <- NA dat <- data.frame(t1, t2, t3, t4) summary(dat) # Attrition MAR with time-varying covariate t1 <- rnorm(n, 0, 1) t2 <- 0.5*t1 + rnorm(n, 0, sqrt(0.75)) t3 <- 0.5*t1 + rnorm(n, 0, sqrt(0.75)) t4 <- 0.5*t1 + rnorm(n, 0, sqrt(0.75)) z1 <- rnorm(n, 0, 1) z2 <- 0.3*t1 + rnorm(n, 0, sqrt(0.91)) z3 <- 0.3*t1 + rnorm(n, 0, sqrt(0.91)) z4 <- 0.3*t1 + rnorm(n, 0, sqrt(0.91)) pred1 <- -2.944 + 0.374 * z1 pred2 <- -2.944 + 0.374 * z2 pred3 <- -2.944 + 0.374 * z3 pred4 <- -2.944 + 0.374 * z4 predprob1 <- 1/(1 + exp(-pred1)) predprob2 <- 1/(1 + exp(-pred2)) predprob3 <- 1/(1 + exp(-pred3)) predprob4 <- 1/(1 + exp(-pred4)) m1 <- as.logical(rbinom(n, 1, predprob1)) m2 <- as.logical(rbinom(n, 1, predprob2)) | m1 m3 <- as.logical(rbinom(n, 1, predprob3)) | m2 m4 <- as.logical(rbinom(n, 1, predprob4)) | m3 t1[m1] <- NA t2[m2] <- NA t3[m3] <- NA t4[m4] <- NA dat <- data.frame(t1, t2, t3, t4) summary(dat) ############################# ### Missing Data Handling ############################# # Data generation for this example n <- 100 x1 <- apply(rmultinom(n, size = 1, prob = c(0.2, 0.3, 0.4, 0.1)) == 1, 2, which) # Nominal x2 <- apply(rmultinom(n, size = 1, prob = c(0.2, 0.3, 0.4, 0.1)) == 1, 2, which) # Ordinal x3 <- rnorm(n, 0, 1) x4 <- rchisq(n, 4) x5 <- rbinom(n, 1, 0.3) z1 <- rnorm(n, 0, 1) z2 <- rnorm(n, 0, 1) predz1 <- -1 + 0.2 * z1 predprobz1 <- 1/(1 + exp(-predz1)) predz2 <- -2 + 0.3 * z2 predprobz2 <- 1/(1 + exp(-predz2)) predx4 <- -1 + 0.2 * x4 predprobx4 <- 1/(1 + exp(-predx4)) mx1 <- as.logical(rbinom(n, 1, 0.1)) mx2 <- as.logical(rbinom(n, 1, predprobz1)) mx3 <- as.logical(rbinom(n, 1, predprobz2)) mx4 <- as.logical(rbinom(n, 1, predprobx4)) mx5 <- as.logical(rbinom(n, 1, 0.05)) mz1 <- as.logical(rbinom(n, 1, 0.05)) mz2 <- as.logical(rbinom(n, 1, 0.20)) x1[mx1] <- NA; x2[mx2] <- NA; x3[mx3] <- NA; x4[mx4] <- NA; x5[mx5] <- NA z1[mz1] <- NA; z2[mz2] <- NA dat <- data.frame(x1, x2, x3, x4, x5, z1, z2) summary(dat) library(mice) md.pattern(dat) # Listwise rowna <- apply(is.na(dat), 1, any) datlistwise <- dat[!rowna, ] dim(datlistwise) # Pairwise cor(dat, use = "pairwise.complete.obs") # Mean substitution datmeansub <- dat targetvar <- 2:7 for(i in targetvar) { datmeansub[is.na(datmeansub[, i]), i] <- mean(datmeansub[, i], na.rm = TRUE) } # Group-mean substitution datgroupsub <- dat groupvar <- 1 catgroupvar <- setdiff(unique(dat[,groupvar]), NA) targetvar <- 2:7 for(i in targetvar) { for(j in 1:length(catgroupvar)) { subrow <- is.na(datgroupsub[, i]) & sapply(datgroupsub[,groupvar] == catgroupvar[j], isTRUE) calcrow <- datgroupsub[,groupvar] == catgroupvar[j] datgroupsub[subrow, i] <- mean(datgroupsub[calcrow, i], na.rm = TRUE) } } # Row-mean substitution f <- rnorm(n, 0, 1) y1 <- 0.7*f + rnorm(n, 0, sqrt(0.51)) y2 <- 0.7*f + rnorm(n, 0, sqrt(0.51)) y3 <- 0.7*f + rnorm(n, 0, sqrt(0.51)) y4 <- 0.7*f + rnorm(n, 0, sqrt(0.51)) a <- rnorm(n, 0, 1) dat2 <- data.frame(y1, y2, y3, y4, a) dat2[as.logical(rbinom(n, 1, 0.1)), 1] <- NA dat2[as.logical(rbinom(n, 1, 0.2)), 2] <- NA dat2[as.logical(rbinom(n, 1, 0.05)), 3] <- NA dat2[as.logical(rbinom(n, 1, 0.1)), 4] <- NA dat2[as.logical(rbinom(n, 1, 0.05)), 5] <- NA FUN <- function(x) { out <- mean(x[1:4], na.rm = TRUE) if(is.na(out)) { return(x) } else { impose <- is.na(x) impose[setdiff(1:length(x), 1:4)] <- FALSE x[impose] <- out return(x) } } dat2 <- t(apply(dat2, 1, FUN)) # Full information maximum likelihood library(lavaan) script <- " x3 ~~ x4 + z1 + z2 x4 ~~ z1 + z2 z1 ~~ z2 " out <- sem(script, data = dat, missing = "ml") summary(out, std = TRUE) # Multiple imputation library(mice) m <- 10 imp <- mice(dat, m = m) fit <- with(imp, lm(x4 ~ x1 + x2 + x3 + x5)) summary(pool(fit)) newdat <- as.data.frame(dat) newdat[,"x1"] <- as.factor(newdat[,"x1"]) newdat[,"x2"] <- ordered(newdat[,"x2"]) newdat[,"x5"] <- ordered(newdat[,"x5"]) newimp <- mice(newdat, m = m) dat.l <- NULL for(i in 1:m) dat.l[[i]] <- complete(newimp, action = i) FUN <- function(x) { result <- t.test(x4 ~ x5, data = x) means <- result$estimate diff <- means[1] - means[2] se <- diff / result$statistic c(diff, se) } stat <- sapply(dat.l, FUN) est <- mean(stat[1,]) bvar <- var(stat[1,]) wvar <- sum(stat[2,]^2)/m tvar <- wvar + ((m+1)/m)*bvar se <- sqrt(tvar) z <- est/se p <- 2 * (1 - pnorm(abs(z))) ############################# ### Debugging ############################# # Problematic function 1 addition <- function(a, b) a + b # Problematic function 2 findmaxmi <- function(n) { if(n < 100) warning("n might be too low for categorical CFA.") f <- rnorm(n, 0, 1) y1star <- 0.5 + 0.9*f + rnorm(n, 0, 1) y2star <- -0.5 + 0.7*f + rnorm(n, 0, 1) y3star <- 0.2 + 0.5*f + rnorm(n, 0, 1) y4star <- 0.3 + 0.3*f + rnorm(n, 0, 1) y1 <- y1star > 0 y2 <- y2star > 0 y3 <- y3star > 0 y4 <- y4star > 0 dat <- data.frame(y1, y2, y3, y4) script <- "f =~ y1 + y2 + y3 + y4" out <- cfa(script, data = dat, ordered = c("y1", "y2", "y3", "y4"), std.lv = TRUE) return(which.max(inspect(out, "mi")$mi)) } # Use try addition <- function(a, b) { result <- NA try(result <- a + b) result } # Check the try object addition <- function(a, b) { result <- NA tryout <- try(result <- a + b) if(is(tryout, "try-error")) { print("Two things cannot be added.") return(NA) } else { return(result) } } findmaxmi <- function(n) { if(n < 100) warning("n might be too low for categorical CFA.") f <- rnorm(n, 0, 1) y1star <- 0.5 + 0.9*f + rnorm(n, 0, 1) y2star <- -0.5 + 0.7*f + rnorm(n, 0, 1) y3star <- 0.2 + 0.5*f + rnorm(n, 0, 1) y4star <- 0.3 + 0.3*f + rnorm(n, 0, 1) y1 <- y1star > 0 y2 <- y2star > 0 y3 <- y3star > 0 y4 <- y4star > 0 dat <- data.frame(y1, y2, y3, y4) script <- "f =~ y1 + y2 + y3 + y4" out <- cfa(script, data = dat, ordered = c("y1", "y2", "y3", "y4"), std.lv = TRUE) tryout <- try(result <- return(which.max(inspect(out, "mi")$mi))) if(is(tryout, "try-error")) { print("Cannot calculate the modification indices because the model is not covergent.") return(NA) } else { return(result) } } # Use tryCatch addition <- function(a, b) { result <- NA tryCatch(result <- a + b, error = function(e) print("Two things cannot be added.")) result } findmaxmi <- function(n) { f <- rnorm(n, 0, 1) y1star <- 0.5 + 0.9*f + rnorm(n, 0, 1) y2star <- -0.5 + 0.7*f + rnorm(n, 0, 1) y3star <- 0.2 + 0.5*f + rnorm(n, 0, 1) y4star <- 0.3 + 0.3*f + rnorm(n, 0, 1) y1 <- y1star > 0 y2 <- y2star > 0 y3 <- y3star > 0 y4 <- y4star > 0 dat <- data.frame(y1, y2, y3, y4) script <- "f =~ y1 + y2 + y3 + y4" out <- cfa(script, data = dat, ordered = c("y1", "y2", "y3", "y4"), std.lv = TRUE) result <- NA tryCatch(result <- return(which.max(inspect(out, "mi")$mi)), error = function(e) print("Cannot calculate modification indices")) return(result) } # Set the warning or stop addition <- function(a, b) { if(!is.numeric(a)) warning("a is not numeric") if(!is.numeric(b)) warning("b is not numeric") result <- NA tryCatch(result <- a + b, error = function(e) print("Two things cannot be added together.")) result } addition <- function(a, b) { if(!is.numeric(a)) stop("a is not numeric") if(!is.numeric(b)) stop("b is not numeric") result <- NA tryCatch(result <- a + b, error = function(e) print("Two things cannot be added together.")) result } findmaxmi <- function(n) { if(n < 100) warning("n might be too low for categorical CFA.") f <- rnorm(n, 0, 1) y1star <- 0.5 + 0.9*f + rnorm(n, 0, 1) y2star <- -0.5 + 0.7*f + rnorm(n, 0, 1) y3star <- 0.2 + 0.5*f + rnorm(n, 0, 1) y4star <- 0.3 + 0.3*f + rnorm(n, 0, 1) y1 <- y1star > 0 y2 <- y2star > 0 y3 <- y3star > 0 y4 <- y4star > 0 dat <- data.frame(y1, y2, y3, y4) script <- "f =~ y1 + y2 + y3 + y4" out <- cfa(script, data = dat, ordered = c("y1", "y2", "y3", "y4"), std.lv = TRUE) result <- NA tryCatch(result <- return(which.max(inspect(out, "mi")$mi)), error = function(e) print("Cannot calculate modification indices")) return(result) } ############################# ### Simulation Example ############################# library(mice) set.seed(123321) pms <- c(0.05, 0.1, 0.2, 0.4) mps <- c(1, 2) # MCAR and MAR n <- 200 nrep <- 1000 S <- matrix(c(1, 0.5, 0.5, 1), 2, 2) ovresult <- NULL for(k in 1:length(pms)) { pm <- pms[k] for(j in 1:length(mps)) { mp <- mps[j] result <- matrix(NA, nrep, 6) for(i in 1:nrep) { XS <- mvrnorm(n, rep(0, 2), S) x <- XS[,1] y <- 0.5 * x + rnorm(n, 0, sqrt(0.75)) z <- XS[,2] if(mp == 1) { mx <- as.logical(rbinom(n, 1, pm)) } else { mhat <- log(pm/(1 - pm)) + 0.5 * z phat <- 1 / (1 + exp(-mhat)) mx <- as.logical(rbinom(n, 1, phat)) } compdat <- data.frame(x, y) result[i, 1:2] <- summary(lm(y ~ x, data = compdat))[["coefficients"]][2, 1:2] x[mx] <- NA dat <- data.frame(x, y, z) # Listwise result[i, 3:4] <- summary(lm(y ~ x, data = dat))[["coefficients"]][2, 1:2] # mice, MI imps <- mice(dat, m = 5, print = FALSE) poolthing <- with(imps, lm(y ~ x)) result[i, 5:6] <- summary(pool(poolthing))[2, 1:2] } result <- cbind(pm, mp, result) ovresult <- rbind(ovresult, result) } } colnames(ovresult) <- c("pm", "mp", "b", "seb", "blist", "seblist", "bmi", "sebmi") ave <- aggregate(cbind(b, seb, blist, seblist, bmi, sebmi) ~ pm + mp, data = ovresult, FUN = mean) empse <- aggregate(cbind(b, blist, bmi) ~ pm + mp, data = ovresult, FUN = sd) conds <- ave[,1:2] relbiasest <- cbind(conds, blist = (ave[,"blist"] - 0.5)/0.5, bmi = (ave[,"bmi"] - 0.5)/0.5) relbiasest2 <- cbind(conds, blist = (ave[,"blist"] - ave[,"b"])/ave[,"b"], bmi = (ave[,"bmi"] - ave[,"b"])/ave[,"b"]) round(relbiasest, 4) round(relbiasest2, 4) relbiasse <- cbind(conds, blist = (ave[,"seblist"] - empse[,"blist"])/empse[,"blist"], bmi = (ave[,"sebmi"] - empse[,"bmi"])/empse[,"bmi"]) relbiasse2 <- cbind(conds, blist = (ave[,"seblist"] - empse[,"b"])/empse[,"b"], bmi = (ave[,"sebmi"] - empse[,"b"])/empse[,"b"]) round(relbiasse, 4) round(relbiasse2, 4)