# R for simulation study ### Generate a data set and analyze it using multiple regression library(MASS) set.seed(123321) n <- 100 miv <- rep(0, 3) siv <- matrix(0.5, 3, 3) diag(siv) <- 1 dat <- mvrnorm(n, miv, siv) y <- 0.3*dat[,1] + 0.4*dat[,2] + 0.2*dat[,3] + rnorm(n, 0, sqrt(0.45)) dat <- cbind(dat, y) colnames(dat) <- c("x1", "x2", "x3", "y") dat <- as.data.frame(dat) out <- lm(y ~ x1 + x2 + x3, data = dat) sumout <- summary(out) names(sumout) sumout[["coefficients"]] out2 <- lm(y ~ x1 + x2, data = dat) anova(out2, out) ### Generate multiple data sets nrep <- 3 n <- 100 miv <- rep(0, 3) siv <- matrix(0.5, 3, 3) diag(siv) <- 1 dat.l <- list() set.seed(123321) for(i in 1:nrep) { dat <- mvrnorm(n, miv, siv) y <- 0.3*dat[,1] + 0.4*dat[,2] + 0.2*dat[,3] + rnorm(n, 0, sqrt(0.45)) dat <- cbind(dat, y) colnames(dat) <- c("x1", "x2", "x3", "y") dat.l[[i]] <- as.data.frame(dat) } ### Analyze them using multiple regression by for loop resultcoef <- matrix(NA, nrep, 4) for(i in 1:nrep) { out <- lm(y ~ x1 + x2 + x3, data = dat.l[[i]]) sumout <- summary(out) resultcoef[i,] <- sumout$coefficients[,1] } apply(resultcoef, 2, mean) ### sapply function sapply(dat.l, nrow) sapply(dat.l, colMeans) dvstat <- function(dat) c(mean(dat[,4]), sd(dat[,4])) sapply(dat.l, dvstat) sapply(dat.l, function(dat) c(mean(dat[,4]), sd(dat[,4]))) analysisfun <- function(dat) { out <- lm(y ~ x1 + x2 + x3, data = dat) sumout <- summary(out) sumout$coefficients[,1] } sapply(dat.l, analysisfun) ### Time it ########## Data Generation ########## nrep2 <- 10000 miv <- rep(0, 3) siv <- matrix(0.5, 3, 3) diag(siv) <- 1 dat.l2 <- list() set.seed(123321) for(i in 1:nrep2) { dat <- mvrnorm(n, miv, siv) y <- 0.3*dat[,1] + 0.4*dat[,2] + 0.2*dat[,3] + rnorm(n, 0, sqrt(0.45)) dat <- cbind(dat, y) colnames(dat) <- c("x1", "x2", "x3", "y") dat.l2[[i]] <- as.data.frame(dat) } ########## For loop ########## system.time({ resultcoef <- matrix(NA, nrep2, 4) for(i in 1:nrep2) { out <- lm(y ~ x1 + x2 + x3, data = dat.l2[[i]]) sumout <- summary(out) resultcoef[i,] <- sumout$coefficients[,1] } }) ########## sapply ########## system.time(resultcoef2 <- sapply(dat.l2, analysisfun)) ### lapply lapply(dat.l, colMeans) lapply(dat.l, cor) lapply(dat.l, cor, method = "spearman") output.l <- lapply(dat.l, lm, formula = y ~ x1 + x2 + x3) output.l sumoutput.l <- lapply(output.l, summary) sumoutput.l extractcoef <- function(out) out[["coefficients"]][,2] coef.l <- lapply(sumoutput.l, extractcoef) coef.l do.call(rbind, coef.l) coef.l <- lapply(sumoutput.l, "[[", "coefficients") sapply(coef.l, "[", 1:4, 1) residual.l <- lapply(sumoutput.l, "[[", "residuals") ### mapply output1.l <- lapply(dat.l, lm, formula = y ~ x1 + x2 + x3) output2.l <- lapply(dat.l, lm, formula = y ~ x1 + x2) mapply(anova, output1.l, output2.l, SIMPLIFY = FALSE) diffrsquared <- function(out1, out2, adjust = FALSE) { name <- "r.squared" if(adjust) name <- "adj.r.squared" rsquared1 <- summary(out1)[[name]] rsquared2 <- summary(out2)[[name]] abs(rsquared1 - rsquared2) } mapply(diffrsquared, output1.l, output2.l) mapply(diffrsquared, output1.l, output2.l, MoreArgs = list(adjust = TRUE)) corpredict <- function(out1, out2) { cor(predict(out1), predict(out2)) } mapply(corpredict, output1.l, output2.l) ### Parallel Processing output.l <- lapply(dat.l, lm, formula = y ~ x1 + x2 + x3) library(parallel) output.l <- mclapply(dat.l, lm, formula = y ~ x1 + x2 + x3, mc.cores = 2) cl <- makeCluster(rep("localhost", 2), type = "PSOCK") output.l <- parLapply(cl, dat.l, lm, formula = y ~ x1 + x2 + x3) stopCluster(cl) detectCores() ### Exercises g1 <- cbind(1, rnorm(100, 0, 1)) g2 <- cbind(2, rnorm(100, 0.5, 1)) dat <- data.frame(rbind(g1, g2)) colnames(dat) <- c("group", "y") outeq <- t.test(y ~ group, data = dat, var.equal = TRUE) outneq <- t.test(y ~ group, data = dat) cieq <- outeq[["conf.int"]] cineq <- outneq[["conf.int"]] widtheq <- cieq[2] - cieq[1] widthneq <- cineq[2] - cineq[1] diffwidth <- widtheq - widthneq