# R for simulation study

### Generate a data set and analyze it using multiple regression
library(MASS)
# Simulate one data set: three predictors drawn from a multivariate normal
# with zero means, unit variances and 0.5 covariances, plus an outcome that
# is a weighted sum of the predictors and normal noise.
set.seed(123321)
n <- 100
miv <- numeric(3)
siv <- diag(0.5, 3) + 0.5
dat <- mvrnorm(n = n, mu = miv, Sigma = siv)
colnames(dat) <- c("x1", "x2", "x3")

# The noise variance is 0.45, so rnorm() receives its square root as the SD.
y <- 0.3 * dat[, "x1"] + 0.4 * dat[, "x2"] + 0.2 * dat[, "x3"] +
  rnorm(n, mean = 0, sd = sqrt(0.45))
dat <- data.frame(dat, y = y)

# Full model: regress y on all three predictors.
out <- lm(formula = y ~ x1 + x2 + x3, data = dat)
sumout <- summary(object = out)
names(sumout)  # components stored in the summary object
coef(sumout)   # the coefficient table held in the "coefficients" component

# Reduced model without x3, followed by an F test of reduced vs. full.
out2 <- lm(formula = y ~ x1 + x2, data = dat)
anova(out2, out)

### Generate multiple data sets

# Simulation settings: number of replications, sample size per data set,
# predictor means, and predictor covariance matrix (unit variances with
# 0.5 covariances).
nrep <- 3
n <- 100
miv <- rep(0, 3)
siv <- matrix(0.5, 3, 3)
diag(siv) <- 1

# Preallocate the list rather than growing it inside the loop, and iterate
# with seq_len() so that nrep = 0 gives zero iterations (1:nrep would give
# c(1, 0)). The seed is set once, before the loop, so the whole sequence of
# data sets is reproducible.
dat.l <- vector("list", nrep)
set.seed(123321)
for (i in seq_len(nrep)) {
  dat <- mvrnorm(n, miv, siv)
  # Outcome: weighted sum of the predictors plus noise with variance 0.45.
  y <- 0.3*dat[,1] + 0.4*dat[,2] + 0.2*dat[,3] + rnorm(n, 0, sqrt(0.45))
  dat <- cbind(dat, y)
  colnames(dat) <- c("x1", "x2", "x3", "y")
  dat.l[[i]] <- as.data.frame(dat)
}

### Analyze them using multiple regression by for loop

# One row per data set, one column per regression coefficient
# (intercept and three slopes).
resultcoef <- matrix(NA, nrow = nrep, ncol = 4)
for (i in seq_len(nrep)) {
  out <- lm(formula = y ~ x1 + x2 + x3, data = dat.l[[i]])
  sumout <- summary(out)
  # Keep only the estimates (first column of the coefficient table).
  resultcoef[i, ] <- coef(sumout)[, 1]
}
# Average each coefficient across the replications.
apply(X = resultcoef, MARGIN = 2, FUN = mean)

### sapply function

# Number of rows in each simulated data set (one value per list element).
sapply(X = dat.l, FUN = nrow)

# Column means of every data set; sapply() binds the per-data-set vectors
# into a matrix with one column per data set.
sapply(X = dat.l, FUN = colMeans)

# Mean and standard deviation of the fourth column of `dat`, returned as a
# length-2 numeric vector c(mean, sd).
dvstat <- function(dat) {
  dv <- dat[, 4]
  c(mean(dv), sd(dv))
}
# Apply the named helper to every data set.
sapply(X = dat.l, FUN = dvstat)

# Same computation with the helper written inline as an anonymous function.
sapply(X = dat.l, FUN = function(d) {
  dv <- d[, 4]
  c(mean(dv), sd(dv))
})

# Fit the three-predictor regression of y on x1, x2 and x3 to one data set
# and return the coefficient estimates (first column of the summary's
# coefficient table) as a vector named by term.
analysisfun <- function(dat) {
  fit <- lm(formula = y ~ x1 + x2 + x3, data = dat)
  coef(summary(fit))[, 1]
}
# Run the regression helper on every data set; the per-data-set vectors are
# bound into a matrix with one column per data set.
sapply(X = dat.l, FUN = analysisfun)

### Time it


########## Data Generation ##########
# Settings for the timing study. The sample size is defined here, next to
# the other generation parameters, so this section no longer depends on a
# value of `n` left over from an earlier section.
nrep2 <- 10000
n <- 100
miv <- rep(0, 3)
siv <- matrix(0.5, 3, 3)
diag(siv) <- 1

# Preallocate the list: growing it one element at a time is wasteful at this
# number of replications. seq_len() keeps the loop empty when nrep2 is 0.
dat.l2 <- vector("list", nrep2)
set.seed(123321)
for (i in seq_len(nrep2)) {
  dat <- mvrnorm(n, miv, siv)
  # Outcome: weighted sum of the predictors plus noise with variance 0.45.
  y <- 0.3*dat[,1] + 0.4*dat[,2] + 0.2*dat[,3] + rnorm(n, 0, sqrt(0.45))
  dat <- cbind(dat, y)
  colnames(dat) <- c("x1", "x2", "x3", "y")
  dat.l2[[i]] <- as.data.frame(dat)
}
########## For loop ##########
# Time the explicit for loop: fit the three-predictor regression to each of
# the nrep2 data sets and store the four estimates (column 1 of the
# coefficient table) in row i of resultcoef, an nrep2 x 4 matrix.
system.time({
resultcoef <- matrix(NA, nrep2, 4)
for(i in 1:nrep2) {
  out <- lm(y ~ x1 + x2 + x3, data = dat.l2[[i]])
  sumout <- summary(out)
  resultcoef[i,] <- sumout$coefficients[,1]
}
})
########## sapply ##########
# Time the sapply() version of the same analysis. analysisfun returns the
# four estimates for one data set, so resultcoef2 holds one column per data
# set (the transpose of resultcoef's layout).
system.time(resultcoef2 <- sapply(dat.l2, analysisfun))

### lapply

# Column means, returned as a list with one vector per data set.
lapply(X = dat.l, FUN = colMeans)

# Correlation matrix of each data set (cor()'s default method).
lapply(X = dat.l, FUN = cor)

# Arguments after FUN are forwarded to it: here, Spearman correlations.
lapply(X = dat.l, FUN = cor, method = "spearman")

# The formula is supplied by name, so each data set is matched to lm()'s
# next unfilled argument, `data`.
output.l <- lapply(X = dat.l, FUN = lm, formula = y ~ x1 + x2 + x3)
output.l

# Summarize every fitted model.
sumoutput.l <- lapply(X = output.l, FUN = summary)
sumoutput.l

# Extract the coefficient estimates from a model summary.
#
# out: an object whose "coefficients" element is a coefficient table with
#      one row per term and a column named "Estimate", e.g. the result of
#      summary() on an lm fit.
# Returns the "Estimate" column as a numeric vector named by term.
#
# The column is selected by name. The previous version took column 2, which
# in a summary.lm coefficient table is "Std. Error"; that contradicted the
# function's name and the other coefficient extractions in this file, which
# all use column 1.
extractcoef <- function(out) out[["coefficients"]][, "Estimate"]
# Apply the extractor to every model summary, giving one vector per data set.
coef.l <- lapply(X = sumoutput.l, FUN = extractcoef)
coef.l

# Stack the per-data-set vectors into a matrix with one row per data set.
do.call(what = rbind, args = coef.l)

# Alternative without a helper: pull out each full coefficient table, then
# take rows 1 to 4 of its first column.
coef.l <- lapply(sumoutput.l, function(s) s[["coefficients"]])
sapply(coef.l, function(tab) tab[1:4, 1])

# The "residuals" component of each summary, kept as a list.
residual.l <- lapply(sumoutput.l, function(s) s[["residuals"]])

### mapply

# Fit the full (x1, x2, x3) and reduced (x1, x2) models to every data set.
output1.l <- lapply(X = dat.l, FUN = lm, formula = y ~ x1 + x2 + x3)
output2.l <- lapply(X = dat.l, FUN = lm, formula = y ~ x1 + x2)

# mapply() walks both lists in parallel, calling anova() on the i-th full
# and i-th reduced model; SIMPLIFY = FALSE keeps the tables in a list.
# NOTE(review): the full model is passed first here, whereas the
# single-data-set comparison earlier in this file passes the reduced model
# first -- confirm which order is intended.
mapply(FUN = anova, output1.l, output2.l, SIMPLIFY = FALSE)

# Absolute difference between the R-squared values of two fitted models.
#
# out1, out2: fitted models whose summary() has "r.squared" and
#             "adj.r.squared" components.
# adjust:     if TRUE, compare adjusted R-squared instead of R-squared.
diffrsquared <- function(out1, out2, adjust = FALSE) {
  stat <- if (adjust) "adj.r.squared" else "r.squared"
  abs(summary(out1)[[stat]] - summary(out2)[[stat]])
}
# R-squared difference between the paired models of every data set.
mapply(FUN = diffrsquared, out1 = output1.l, out2 = output2.l)
# MoreArgs passes the same extra argument to every call: here it switches
# the comparison to adjusted R-squared.
mapply(
  FUN = diffrsquared,
  out1 = output1.l,
  out2 = output2.l,
  MoreArgs = list(adjust = TRUE)
)

# Correlation between the predictions of two fitted models, each obtained
# by calling predict() on the model with no new data.
corpredict <- function(out1, out2) {
  pred1 <- predict(out1)
  pred2 <- predict(out2)
  cor(pred1, pred2)
}
# Correlation of the two models' predictions, one value per data set.
mapply(FUN = corpredict, out1 = output1.l, out2 = output2.l)

### Parallel Processing

# Serial baseline: fit the three-predictor model to every data set.
output.l <- lapply(dat.l, lm, formula = y ~ x1 + x2 + x3)

library(parallel)

# Fork-based parallel version of lapply(). mclapply() relies on forking,
# which is unavailable on Windows; there any mc.cores > 1 is an error, so
# fall back to a single core on that platform.
output.l <- mclapply(
  dat.l, lm,
  formula = y ~ x1 + x2 + x3,
  mc.cores = if (.Platform$OS.type == "windows") 1L else 2L
)

# Socket (PSOCK) cluster with two local workers. The cluster is stopped in
# `finally`, so the worker processes are shut down even if parLapply()
# fails.
cl <- makeCluster(rep("localhost", 2), type = "PSOCK")
output.l <- tryCatch(
  parLapply(cl, dat.l, lm, formula = y ~ x1 + x2 + x3),
  finally = stopCluster(cl)
)

# Number of CPU cores detected on this machine.
detectCores()

### Exercises

# Two groups of 100 normal draws (SD 1) with means 0 and 0.5; the first
# column of each matrix is the group label.
g1 <- cbind(1, rnorm(n = 100, mean = 0, sd = 1))
g2 <- cbind(2, rnorm(n = 100, mean = 0.5, sd = 1))
dat <- setNames(data.frame(rbind(g1, g2)), c("group", "y"))

# Two-sample t tests of y by group: once assuming equal variances, once
# with t.test()'s default setting.
outeq <- t.test(y ~ group, data = dat, var.equal = TRUE)
outneq <- t.test(y ~ group, data = dat)

# Width of each confidence interval (upper limit minus lower limit) and the
# difference between the two widths.
cieq <- outeq[["conf.int"]]
cineq <- outneq[["conf.int"]]
widtheq <- diff(as.numeric(cieq))
widthneq <- diff(as.numeric(cineq))
diffwidth <- widtheq - widthneq