library(car)
library(psych)

sat.act

mod1 <- lm(ACT ~ SATV, data = sat.act)  
mod2 <- lm(cbind(SATV, SATQ) ~ ACT, data = sat.act) 

summary(mod1)
summary(mod2)

# Y-hat = 13.87 + 0.024(SATV)

coef(mod1)
vc <- vcov(mod1)
sqrt(diag(vc))

tvalue <- coef(mod1)/sqrt(diag(vc))
pt(tvalue, df = 698)

confint(mod1)

# Confidence interval of R-squared -> Bootstrap (required a function to extract R-squared)

predict(mod1)
predict(mod1, data.frame(SATV = 499))
pval <- predict(mod1, data.frame(SATV = 200:800))
result <- data.frame(SATV = 200:800, predicted = pval)

# Confidence interval of predicted values
pval <- predict(mod1, data.frame(SATV = 200:800), interval = "prediction")

anova(mod1)

plot(ACT ~ SATV, data = sat.act)
abline(mod1, col = "red", lty = 2, lwd = 3)


mod3 <- lm(ACT ~ SATV + SATQ, data = sat.act)
mod4 <- lm(ACT ~ SATV + SATQ + age + education, data = sat.act)
summary(mod3)
summary(mod4)

anova(mod3, mod4)

# Find delta R-squared
summary(mod4)[["r.squared"]] - summary(mod3)[["r.squared"]]


sat.act$gender <- factor(sat.act$gender, labels=c("male","female"))
levels(sat.act$gender)
contrasts(sat.act$gender)

mod5 <- lm(SATQ ~ gender, data = sat.act)
summary(mod5)

sat.act$gender.new <- relevel(sat.act$gender, "female") # Treat female as baseline
contrasts(sat.act$gender.new)

mod6 <- lm(SATQ ~ gender.new, data = sat.act)
summary(mod6)

mod7 <- lm(SATQ ~ gender.new + education, data = sat.act)
summary(mod7)

mod8 <- lm(SATQ ~ education, data = sat.act)
anova(mod8, mod7)

# Moderators

modint <- lm(ACT ~ SATV*SATQ, data = sat.act)

modnoint <- lm(ACT ~ SATV + SATQ, data = sat.act)

summary(modint)
summary(modnoint)
anova(modnoint, modint)

# Simple slope
# Probing interaction

library(rockchalk)
plotSlopes(modint, plotx="SATV", modx="SATQ")
plotSlopes(modint, plotx="SATV", modx="SATQ")
plotSlopes(modint, plotx="SATV", modx="SATQ", modxVals ="std.dev.")

obj <- plotSlopes(modint, plotx="SATV", modx="SATQ", modxVals=c(200,400,600,800))
testSlopes(obj)
plot(testSlopes(obj))

# Interaction between categorical and continuous variables
modint2 <- lm(ACT ~ SATV * gender + age + education, data = sat.act)
summary(modint2)

modnoint2 <- lm(ACT ~ SATV + gender + age + education, data = sat.act)
summary(modnoint2)
anova(modnoint2, modint2)

obj2 <- plotSlopes(modint2, plotx="SATV", modx="gender")
testSlopes(obj2)

# Centering

mod8 <- lm(ACT ~ SATQ + SATV, data = sat.act)
sat.act$SATQC <- sat.act$SATQ - mean(sat.act$SATQ)
sat.act$SATQC <- scale(sat.act$SATQ, center = TRUE, scale = FALSE)
sat.act$SATVC <- sat.act$SATV - mean(sat.act$SATV)
sat.act$SATVC <- scale(sat.act$SATV, center = TRUE, scale = FALSE)
mod9 <- lm(ACT ~ SATQC + SATVC, data = sat.act)
summary(mod9)

mod10 <- lm(ACT ~ I(SATQ - 500) + I(SATV - 500), data = sat.act)

std <- lm(scale(ACT) ~  scale(SATV) + scale(SATQ), data = sat.act)
summary(std)

mod8 <- lm(ACT ~ SATQ + SATV, data = sat.act)
standardize(mod8)

# Nonlinear regression

mod11 <- lm(ACT ~ SATQ + I(SATQ^2), data = sat.act)
summary(mod11)

# Simple slope
mod12 <- lm(ACT ~ I(SATQ - 500) + I((SATQ - 500)^2), data = sat.act)
summary(mod12)

x <- 200:800
pred <- predict(mod11, data.frame(SATQ = x))
plot(x, pred)
plot(x, pred, type = "l")


# log
sat.act$logSATQ <- log(sat.act$SATQ) 
sat.act$logACT <- log(sat.act$ACT) 
nReg3 <- lm(logACT ~ logSATQ, data = sat.act)
summary(nReg3)


###################################################################
## Regression: Checking Assumptions about the Residuals
mod1 <- lm(ACT ~ SATV * SATQ + age, data = sat.act)
mod1$residuals

## check the normality assumption
qqnorm(mod1$residuals)
qqline(mod1$residuals)
library(QuantPsyc)
plotNormX(mod1$residuals)

## check the heteroscedasticity assumption:

## check that the residuals are random across predicted values
plot(mod1$residuals ~ mod1$fitted.values)
## check that the residuals are random across predictors
plot(mod1$residuals ~ mod1$model$SATQ)
plot(mod1$residuals ~ mod1$model$SATV)
plot(mod1$residuals ~ mod1$model$age)


# Regression: Checking for Linearity and Multicollinearity

## a scatterplot matrix is a quick way to see several scatterplots at once
pairs(sat.act[ , 3:6])

## add a loess curve and 95% confidence lines using the "car" package
scatterplot(sat.act$SATQ, sat.act$SATV)


scatterplotMatrix(sat.act[ , 3:6])


?pairs.panels
## as does "psych"
pairs.panels(sat.act[ , 3:6])

# Collinearity

## check correlation matrix
cor(sat.act[ , 3:6], use = "pairwise.complete")

mod2 <- lm(ACT ~ gender + age + SATQ + SATV, data = sat.act)

vif(mod2) # any values > 10 are problematic

## tolerance is the reciprocal of the variance inflation factor
1 / vif(mod2) # any values < 0.1 are problematic


# Outliers

mReg1 <- lm(ACT ~ SATV + SATQ, data = sat.act) 
summary(mReg1)

dat <- influence.measures(mReg1)

newdata <- sat.act[,c("ACT", "SATV", "SATQ")]
meaniv <- colMeans(newdata, na.rm = TRUE)
coviv <- cov(newdata, use = "complete.obs")
d2 <- mahalanobis(newdata, meaniv, coviv)

d2 <- d2[!is.na(d2)]
plot(density(d2, bw = 0.5),
     main="Squared Mahalanobis distances, n=700, p=3") ; 
rug(d2) # Show the data points