x <- 1:5
y <- seq(1, 100, length.out = 5)
z <- c("p", "f", "p", "p", "f")

dat <- data.frame(x, y, z)
summary(dat)

install.packages("psych")
library(psych)
describe(dat)

class(dat)

airquality
summary(airquality)

dim(airquality)

n <- dim(airquality)[1]
p <- dim(airquality)[2]

n <- nrow(airquality)
p <- ncol(airquality)

cat("\"airquality\" is a data frame with", n, "rows and", p, "columns.")
cat("\"airquality\" is a data frame with", n, "rows and", p, "columns.\n")

head(airquality)
tail(airquality)

head(airquality, 5)
tail(airquality, 10)

airquality[4, 2]
airquality[4:6, 2:4]
airquality[,2]
airquality[1,]

airquality$Ozone
airquality[,1]

colnames(airquality)
rownames(airquality)

airquality$Ozone
airOzone <- airquality$Ozone

mean(airquality)
colMeans(airquality, na.rm = TRUE)
colSums(airquality, na.rm = TRUE)
apply(airquality, MARGIN = 2, FUN = sd)
apply(airquality, 2, range)

result <- lm(Temp ~ Wind, data = airquality)
summary(result)


# Alt + 126

# 2.64e-09 --> 2.64 * 10^(-9)

#############################
# Mathematical operations, Boolean Operations

50 + 70
(50 - 2) * (10 ^ 3) + (2 * 4)
(airquality$Temp - 32) * 5 / 9

2^3
sqrt(2)
2^(1/6)
sqrt(airquality$Temp)

x <- 7
x <- x + 1

pi
exp(1)

20 > 5
5 > 9
x <= 8
y <- 25
x == y
x != y
x <- 5:15
y <- 15:5
x > y

(x > 6) & (x < 12)
(x < 7) | (x > 13)
!(x < 8)
select <- (airquality$Temp > 90) | (airquality$Wind > 15)
airquality[select, ]

1:5
8:-2

seq(1, 20, 5)
seq(100, 1, -7)

rep(20, 7)
rep(NA, 4)

c(3, 5)
x <- c(5, 2, 7, -9)
y <- c(rep(1,5), rep(2,5))

new <- c(18, 150, 10.2, 70, 10, 1) 
airquality2 <- rbind(airquality, new)

firsthalf <- airquality$Day < 16
airquality3 <- cbind(airquality, firsthalf)
airquality4 <- data.frame(airquality, firsthalf)





############################
# Exercise 2.2
############################

# 1. Save a number 5 in the object a. Create a new object b that is equal to 1-((a^2)/5).
# 2. Check whether the object a is in between 2 and 4 (include 2 and 4 too).
# 3. Select attitude data that have rating over 50.
# 4. Select attitude data that have rating and complaints over their means.
# 5. Select attitude data that the number of rows is the multiplicity of 3.
# 6. Add a new variable in the attitude data indicating whether rating score was greater than 50 or not.
# 7. Create a vector vec1 containing values 3, 6, 8, and 9. Create a vector vec2 containing values 1 to 4. Then, add them together.
# 8. Let’s check ?attitude, ?seq, and ?sum.
# 9. Search “Centering R” in any search engines and try the example from the help page of the function you found. (?scale)

######################
# Importing Data
##########################


getwd()  # show us the working directory
list.files() # show us what files are in the current working directory

setwd("C:/Users/Sunthud/Dropbox/Teaching/RINTRO")

data.csv <- read.table(file = "Ch2.csv", header = TRUE, sep = ",",
                       na.strings = c(" ", "-999"))
data.txt <- read.table(file = "Ch2.txt", sep = "\t")

install.packages("foreign")
library(foreign)

?read.spss

data.spss <- read.spss(file = "Ch2.sav", to.data.frame = TRUE)

dim(data.csv)
colnames(data.csv)
data.csv[1:10,]  # show the first 10 rows
data.csv[,1:5]   # show the first 5 columns

# ch2.xlsx
# Read excel data, write to csv
# Read spss data, save it in csv

######################
# Exporting Data
##########################

write.table(data.txt, file = "ch2out.txt", sep = "\t", row.names = FALSE, col.names = TRUE)
write.table(data.csv, file = "ch2out.csv", sep = ",", na = "-999", row.names = FALSE)

# R binary

save(data.spss, file = "dat.Rdata")

rm(data.spss)
data.spss <- load("dat.Rdata")

############################
# Exercise 2.3
############################

# 1. Export the attitude data to file "ex1_5.csv" using comma separated value and set NA equal to 999. Then, read the file back in and save to an object, RSem.

######################
# Exploring Data
##########################

hist(airquality$Temp)
hist(airquality$Temp, breaks = 15)
qqnorm(airquality$Temp)
qqline(airquality$Temp)
hist(airquality$Temp, ylab = "Frequency", xlab = "Air Temperature", 
   main="New York Air Quality in 1973", sub = "Figure 1", col = "yellow")
boxplot(airquality$Temp)
boxplot(airquality$Temp ~ airquality$Month)
plot(airquality$Ozone, airquality$Wind)
fit <- lm(airquality$Wind ~ airquality$Ozone)	# Create linear model object (Y ~ X)
abline(fit, col = "red")			# Put the fit line on the plot

fit

######################
# Packages
##########################

install.packages("coefplot")
require(coefplot)
coefplot(fit)

install.packages("arm")
arm::coefplot(fit)
coefplot::coefplot(fit)

# sem and lavaan packages have the sem function

######################
# Factor
##########################

g <- rep(c("toyota", "honda", "mazda", "ford"), each = 5)
y <- c(9, 8, 6, 7, 5, 8, 9, 6, 5, 4, 8, 6, 4, 5, 5, 5, 6, 7, 4, 3)
summary(lm(y ~ g))

g <- c(rep(1:4, each = 5))
y <- c(9, 8, 6, 7, 5, 8, 9, 6, 5, 4, 8, 6, 4, 5, 5, 5, 6, 7, 4, 3)
summary(lm(y ~ g))

g <- as.factor(rep(1:4, each = 5))
y <- c(9, 8, 6, 7, 5, 8, 9, 6, 5, 4, 8, 6, 4, 5, 5, 5, 6, 7, 4, 3)
summary(lm(y ~ g))

g <- factor(rep(4:1, each = 5), labels = c("toyota", "honda", "mazda", "ford"))
y <- c(9, 8, 6, 7, 5, 8, 9, 6, 5, 4, 8, 6, 4, 5, 5, 5, 6, 7, 4, 3)
summary(lm(y ~ g))

g <- relevel(g, ref = "toyota")
y <- c(9, 8, 6, 7, 5, 8, 9, 6, 5, 4, 8, 6, 4, 5, 5, 5, 6, 7, 4, 3)
summary(lm(y ~ g))

as.numeric(g)

ses <- c(3, 1, 2, 3, 1, 2, 3, 1, 2, 1, 1, 3)
ses <- factor(ses, labels = c("low", "medium", "high"), ordered = TRUE)
att <- c(52, 45, 44, 43, 56, 57, 45, 44, 50, 43, 55, 46)
summary(lm(att ~ ses)) # Quantitative factor -> Use polynomial

######################
# list
##########################

list(1, 2, 3)
list(c(1, 2, 3))
names(list)
aa <- list(abd = 1, see = 1, meo = 1)
aa$abd
aa[[1]]
length(aa)
aa[[5]] <- 123
result <- summary(lm(att ~ ses))
is.list(result)
names(result)
result$sigma

######################
# NULL
##########################

is.null(aa[[1]])
is.null(aa[[4]])

c(NULL, 1)
c(aa, NULL)

######################
# Matrices
##########################

A <- matrix(1:10, nrow = 5)
B <- matrix(21:30, nrow = 5)
C <- matrix(21:40, nrow = 2)

nrow(A)
ncol(B)
dim(C)

A + B

A * B

A == B

all.equal(A, B)

A %*% t(B)

L <- matrix(0, 6, 2)
L[1:3, 1] <- 0.7
L[4:6, 2] <- 0.7
P <- matrix(0.5, 2, 2)
diag(P) <- 1
T <- diag(rep(0.51, 6))
C <- L %*% P %*% t(L) + T

solve(C)


######################
# Group Statistics
##########################

aggregate(Temp ~ Month, data = airquality, mean)
aggregate(Temp ~ Month, data = airquality, sd)
FirstHalf <- airquality$Day <= 15
airquality2 <- cbind(airquality, FirstHalf)
aggregate(Temp ~ Month + FirstHalf, data = airquality2, mean)
aggregate(Temp ~ Month + FirstHalf, data = airquality2, sd)
aggregate(cbind(Temp, Ozone, Wind) ~ Month, data = airquality, mean)

############################
# Exercise 2.4
############################

# 1. Create a histogram of rating variable in the attitude data.
# 2. Create a scatterplot of complaints (on X axis) and rating (on Y axis) variables and impose the regression line in the scatterplot.
# 3. Create a new variable representing a median split of complaints (check ?median). Then, create a boxplot of rating by two groups (high vs. low rating on handling employee complaints).
# 4. Create a new variable dividing raing into three groups. Then, use the new variable to predict raises as qualitative factors and quantitative factors. Create a list saving the results of qualitative factor and quantitative factor
# 5. Check the new dataset, Orange. Find group means and standard deviations of circumference by Tree.
# 6. Replicate the following matrix results in R


######################
# Basic Inferential Statistics
##########################

##### One-sample t-test

# We will use the t.test function for all t-test for comparing means. The function 
# attributes are different across type of analysis. First, one-sample t-test

t.test(airquality$Temp, mu = 70)

# This analysis would like to see whether the population mean of the temperature
# is different from 70. The result showed that the average temperature is 77.88.
# t(152) = 10.30, p < .001, in the result p < 2.2 * 10^(-16).

##### Independent t-test

# For the indepedent samples t-test, I would like to create the "summer" variable in
# the airquality data. May, June, July will be true in the summer variable and August,
# September will be false in the variable. Next, compare the temperature between true
# and false in the summer variable, by independent t-test

summer <- airquality$Month <= 7
t.test(airquality$Temp ~ summer)

# CAUTION: the default independent t-test assumes that the variance are not equal across groups.
# Thus, R uses the corrected t-test. To use the traditional independent t-test, the var.equal
# attribute should be specified as TRUE

t.test(airquality$Temp ~ summer, var.equal = TRUE)

# Assuming equal variances, the mean of the true and false group is 76.15 and 80.49, respectively.
# The difference is significant, t(151) = 2.84, p = .005.

##### Paired-sample t-test

# I will introduce new data, called attitude

?attitude

# In the attitude data, I would like to compare the average attitudes toward opportunity to learn
# and advancement. The paired sample t-test will be used by specifying the paired attribute as TRUE.

t.test(attitude$learning, attitude$advance, paired = TRUE)

# The average difference is 13.43. The difference is significant, t(29) = 6.85, p < .001.

##### Correlation, Covariance

# We can analyze correlation test by the cor.test function.

cor.test(attitude$learning, attitude$advance)

# The attitudes toward opportunity to learn and advancement are significantly positively correlated,
# r(28) = .53, p = .003.

# Sometimes, we do not want to test the correlation. We only want to create a correlation matrix for
# further analysis. We can use the cor function and apply for a data frame.

cor(attitude)
# just the lower diagonal
as.dist(cor(attitude))

# Missing data handling #################

# "all.obs"
# "complete.obs"
# "pairwise.complete.obs"


# We can do the same thing for variance-covariance matrix by

cov(attitude)

##### Chi-square: Goodness-of-Fit

# For the chi-square test, we may analyze for the Goodness-of-Fit and contingency table. For the
# Goodness-of-Fit test, we have a variable and would like to see the observed frequencies in each
# cell in the population are different from expected frequencies. For example, we obtained 90 males
# and 83 females in our sample. We want to test the bias in the sampling process, assumed that
# the population proportions of each sex are equal.

chisq.test(c(90, 83))

# The result found that the sampling process is not biased, X^2(1, N = 173) = 0.28, p = .59
# We may change the expected proportion

chisq.test(c(90, 83), p = c(0.6, 0.4))

# Sometimes, we do not have frequencies. We have raw data instead, such as Month in the airquality
# data. We need to transform the raw data to frequecies before using the Goodness-of-fit test,
# by the table function.

chisq.test(table(airquality$Month))

##### Chi-square: Contingency table

# For the contingency table, I will introduce a new dataset, HairEyeColor data.

?HairEyeColor

# This data is arranged in three-dimension crosstabs. The first dimension is on row, Hair. The second
# dimension is on column, Eye. The third dimension is on layer, Sex. I would like to ignore row
# dimension, hair. Let's say that I am interested in people who have black hair only. So, I make the
# new data by

black <- HairEyeColor[1, , ]
black

# Then, the chi-square test is used to see whether, in the sample with black hair, the proportions of 
# eye color are different across sex.

chisq.test(black)

# The result found that the proportions of eye color is not significantly different across sex, 
# X^2(3, N = 108) = 2.16, p = .54. Note that some cells will have expected values less than 5. 
# The approximate chi-square may be not correct. We may use the Fisher's Exact test by

fisher.test(black)

# Sometimes, we have raw data. We need to make the contingency table before running the chi-square test.
# The table function is still helpful. For example, I would like to see whether the missing value of the
# Ozone variable in the airquality data is systematically related to Month. First, we create a new
# variable called Ozone.NA to be true when the Ozone is missing in a particular case

Ozone.NA <- is.na(airquality$Ozone)	# is.na is the function to see whether the value is missing
Ozone.NA

# Next, make the contingency tables of Month and the missingness of Ozone

MonthOzoneNA <- table(airquality$Month, Ozone.NA)
MonthOzoneNA

# Finally, analyze by the chi-square test

chisq.test(MonthOzoneNA)

# The result found that the missingness of the Ozone variable and month when the data obtained are
# significantly related, X^2(4, N = 153) = 44.75, p < .001.

############################
# Exercise 1.7
############################

# 1. XXXXXXXXXXXXXXXXXXXX

############################
# Assignment 2
############################

# 1. XXXXXXXXXXXXXXXXXXXX