x <- 1:5 y <- seq(1, 100, length.out = 5) z <- c("p", "f", "p", "p", "f") dat <- data.frame(x, y, z) summary(dat) install.packages("psych") library(psych) describe(dat) class(dat) airquality summary(airquality) dim(airquality) n <- dim(airquality)[1] p <- dim(airquality)[2] n <- nrow(airquality) p <- ncol(airquality) cat("\"airquality\" is a data frame with", n, "rows and", p, "columns.") cat("\"airquality\" is a data frame with", n, "rows and", p, "columns.\n") head(airquality) tail(airquality) head(airquality, 5) tail(airquality, 10) airquality[4, 2] airquality[4:6, 2:4] airquality[,2] airquality[1,] airquality$Ozone airquality[,1] colnames(airquality) rownames(airquality) airquality$Ozone airOzone <- airquality$Ozone mean(airquality) colMeans(airquality, na.rm = TRUE) colSums(airquality, na.rm = TRUE) apply(airquality, MARGIN = 2, FUN = sd) apply(airquality, 2, range) result <- lm(Temp ~ Wind, data = airquality) summary(result) # Alt + 126 # 2.64e-09 --> 2.64 * 10^(-9) ############################# # Mathematical operations, Boolean Operations 50 + 70 (50 - 2) * (10 ^ 3) + (2 * 4) (airquality$Temp - 32) * 5 / 9 2^3 sqrt(2) 2^(1/6) sqrt(airquality$Temp) x <- 7 x <- x + 1 pi exp(1) 20 > 5 5 > 9 x <= 8 y <- 25 x == y x != y x <- 5:15 y <- 15:5 x > y (x > 6) & (x < 12) (x < 7) | (x > 13) !(x < 8) select <- (airquality$Temp > 90) | (airquality$Wind > 15) airquality[select, ] 1:5 8:-2 seq(1, 20, 5) seq(100, 1, -7) rep(20, 7) rep(NA, 4) c(3, 5) x <- c(5, 2, 7, -9) y <- c(rep(1,5), rep(2,5)) new <- c(18, 150, 10.2, 70, 10, 1) airquality2 <- rbind(airquality, new) firsthalf <- airquality$Day < 16 airquality3 <- cbind(airquality, firsthalf) airquality4 <- data.frame(airquality, firsthalf) ############################ # Exercise 2.2 ############################ # 1. Save a number 5 in the object a. Create a new object b that is equal to 1-((a^2)/5). # 2. Check whether the object a is in between 2 and 4 (include 2 and 4 too). # 3. Select attitude data that have rating over 50. # 4. Select attitude data that have rating and complaints over their means. # 5. Select attitude data that the number of rows is the multiplicity of 3. # 6. Add a new variable in the attitude data indicating whether rating score was greater than 50 or not. # 7. Create a vector vec1 containing values 3, 6, 8, and 9. Create a vector vec2 containing values 1 to 4. Then, add them together. # 8. Let’s check ?attitude, ?seq, and ?sum. # 9. Search “Centering R” in any search engines and try the example from the help page of the function you found. (?scale) ###################### # Importing Data ########################## getwd() # show us the working directory list.files() # show us what files are in the current working directory setwd("C:/Users/Sunthud/Dropbox/Teaching/RINTRO") data.csv <- read.table(file = "Ch2.csv", header = TRUE, sep = ",", na.strings = c(" ", "-999")) data.txt <- read.table(file = "Ch2.txt", sep = "\t") install.packages("foreign") library(foreign) ?read.spss data.spss <- read.spss(file = "Ch2.sav", to.data.frame = TRUE) dim(data.csv) colnames(data.csv) data.csv[1:10,] # show the first 10 rows data.csv[,1:5] # show the first 5 columns # ch2.xlsx # Read excel data, write to csv # Read spss data, save it in csv ###################### # Exporting Data ########################## write.table(data.txt, file = "ch2out.txt", sep = "\t", row.names = FALSE, col.names = TRUE) write.table(data.csv, file = "ch2out.csv", sep = ",", na = "-999", row.names = FALSE) # R binary save(data.spss, file = "dat.Rdata") rm(data.spss) data.spss <- load("dat.Rdata") ############################ # Exercise 2.3 ############################ # 1. Export the attitude data to file "ex1_5.csv" using comma separated value and set NA equal to 999. Then, read the file back in and save to an object, RSem. ###################### # Exploring Data ########################## hist(airquality$Temp) hist(airquality$Temp, breaks = 15) qqnorm(airquality$Temp) qqline(airquality$Temp) hist(airquality$Temp, ylab = "Frequency", xlab = "Air Temperature", main="New York Air Quality in 1973", sub = "Figure 1", col = "yellow") boxplot(airquality$Temp) boxplot(airquality$Temp ~ airquality$Month) plot(airquality$Ozone, airquality$Wind) fit <- lm(airquality$Wind ~ airquality$Ozone) # Create linear model object (Y ~ X) abline(fit, col = "red") # Put the fit line on the plot fit ###################### # Packages ########################## install.packages("coefplot") require(coefplot) coefplot(fit) install.packages("arm") arm::coefplot(fit) coefplot::coefplot(fit) # sem and lavaan packages have the sem function ###################### # Factor ########################## g <- rep(c("toyota", "honda", "mazda", "ford"), each = 5) y <- c(9, 8, 6, 7, 5, 8, 9, 6, 5, 4, 8, 6, 4, 5, 5, 5, 6, 7, 4, 3) summary(lm(y ~ g)) g <- c(rep(1:4, each = 5)) y <- c(9, 8, 6, 7, 5, 8, 9, 6, 5, 4, 8, 6, 4, 5, 5, 5, 6, 7, 4, 3) summary(lm(y ~ g)) g <- as.factor(rep(1:4, each = 5)) y <- c(9, 8, 6, 7, 5, 8, 9, 6, 5, 4, 8, 6, 4, 5, 5, 5, 6, 7, 4, 3) summary(lm(y ~ g)) g <- factor(rep(4:1, each = 5), labels = c("toyota", "honda", "mazda", "ford")) y <- c(9, 8, 6, 7, 5, 8, 9, 6, 5, 4, 8, 6, 4, 5, 5, 5, 6, 7, 4, 3) summary(lm(y ~ g)) g <- relevel(g, ref = "toyota") y <- c(9, 8, 6, 7, 5, 8, 9, 6, 5, 4, 8, 6, 4, 5, 5, 5, 6, 7, 4, 3) summary(lm(y ~ g)) as.numeric(g) ses <- c(3, 1, 2, 3, 1, 2, 3, 1, 2, 1, 1, 3) ses <- factor(ses, labels = c("low", "medium", "high"), ordered = TRUE) att <- c(52, 45, 44, 43, 56, 57, 45, 44, 50, 43, 55, 46) summary(lm(att ~ ses)) # Quantitative factor -> Use polynomial ###################### # list ########################## list(1, 2, 3) list(c(1, 2, 3)) names(list) aa <- list(abd = 1, see = 1, meo = 1) aa$abd aa[[1]] length(aa) aa[[5]] <- 123 result <- summary(lm(att ~ ses)) is.list(result) names(result) result$sigma ###################### # NULL ########################## is.null(aa[[1]]) is.null(aa[[4]]) c(NULL, 1) c(aa, NULL) ###################### # Matrices ########################## A <- matrix(1:10, nrow = 5) B <- matrix(21:30, nrow = 5) C <- matrix(21:40, nrow = 2) nrow(A) ncol(B) dim(C) A + B A * B A == B all.equal(A, B) A %*% t(B) L <- matrix(0, 6, 2) L[1:3, 1] <- 0.7 L[4:6, 2] <- 0.7 P <- matrix(0.5, 2, 2) diag(P) <- 1 T <- diag(rep(0.51, 6)) C <- L %*% P %*% t(L) + T solve(C) ###################### # Group Statistics ########################## aggregate(Temp ~ Month, data = airquality, mean) aggregate(Temp ~ Month, data = airquality, sd) FirstHalf <- airquality$Day <= 15 airquality2 <- cbind(airquality, FirstHalf) aggregate(Temp ~ Month + FirstHalf, data = airquality2, mean) aggregate(Temp ~ Month + FirstHalf, data = airquality2, sd) aggregate(cbind(Temp, Ozone, Wind) ~ Month, data = airquality, mean) ############################ # Exercise 2.4 ############################ # 1. Create a histogram of rating variable in the attitude data. # 2. Create a scatterplot of complaints (on X axis) and rating (on Y axis) variables and impose the regression line in the scatterplot. # 3. Create a new variable representing a median split of complaints (check ?median). Then, create a boxplot of rating by two groups (high vs. low rating on handling employee complaints). # 4. Create a new variable dividing raing into three groups. Then, use the new variable to predict raises as qualitative factors and quantitative factors. Create a list saving the results of qualitative factor and quantitative factor # 5. Check the new dataset, Orange. Find group means and standard deviations of circumference by Tree. # 6. Replicate the following matrix results in R ###################### # Basic Inferential Statistics ########################## ##### One-sample t-test # We will use the t.test function for all t-test for comparing means. The function # attributes are different across type of analysis. First, one-sample t-test t.test(airquality$Temp, mu = 70) # This analysis would like to see whether the population mean of the temperature # is different from 70. The result showed that the average temperature is 77.88. # t(152) = 10.30, p < .001, in the result p < 2.2 * 10^(-16). ##### Independent t-test # For the indepedent samples t-test, I would like to create the "summer" variable in # the airquality data. May, June, July will be true in the summer variable and August, # September will be false in the variable. Next, compare the temperature between true # and false in the summer variable, by independent t-test summer <- airquality$Month <= 7 t.test(airquality$Temp ~ summer) # CAUTION: the default independent t-test assumes that the variance are not equal across groups. # Thus, R uses the corrected t-test. To use the traditional independent t-test, the var.equal # attribute should be specified as TRUE t.test(airquality$Temp ~ summer, var.equal = TRUE) # Assuming equal variances, the mean of the true and false group is 76.15 and 80.49, respectively. # The difference is significant, t(151) = 2.84, p = .005. ##### Paired-sample t-test # I will introduce new data, called attitude ?attitude # In the attitude data, I would like to compare the average attitudes toward opportunity to learn # and advancement. The paired sample t-test will be used by specifying the paired attribute as TRUE. t.test(attitude$learning, attitude$advance, paired = TRUE) # The average difference is 13.43. The difference is significant, t(29) = 6.85, p < .001. ##### Correlation, Covariance # We can analyze correlation test by the cor.test function. cor.test(attitude$learning, attitude$advance) # The attitudes toward opportunity to learn and advancement are significantly positively correlated, # r(28) = .53, p = .003. # Sometimes, we do not want to test the correlation. We only want to create a correlation matrix for # further analysis. We can use the cor function and apply for a data frame. cor(attitude) # just the lower diagonal as.dist(cor(attitude)) # Missing data handling ################# # "all.obs" # "complete.obs" # "pairwise.complete.obs" # We can do the same thing for variance-covariance matrix by cov(attitude) ##### Chi-square: Goodness-of-Fit # For the chi-square test, we may analyze for the Goodness-of-Fit and contingency table. For the # Goodness-of-Fit test, we have a variable and would like to see the observed frequencies in each # cell in the population are different from expected frequencies. For example, we obtained 90 males # and 83 females in our sample. We want to test the bias in the sampling process, assumed that # the population proportions of each sex are equal. chisq.test(c(90, 83)) # The result found that the sampling process is not biased, X^2(1, N = 173) = 0.28, p = .59 # We may change the expected proportion chisq.test(c(90, 83), p = c(0.6, 0.4)) # Sometimes, we do not have frequencies. We have raw data instead, such as Month in the airquality # data. We need to transform the raw data to frequecies before using the Goodness-of-fit test, # by the table function. chisq.test(table(airquality$Month)) ##### Chi-square: Contingency table # For the contingency table, I will introduce a new dataset, HairEyeColor data. ?HairEyeColor # This data is arranged in three-dimension crosstabs. The first dimension is on row, Hair. The second # dimension is on column, Eye. The third dimension is on layer, Sex. I would like to ignore row # dimension, hair. Let's say that I am interested in people who have black hair only. So, I make the # new data by black <- HairEyeColor[1, , ] black # Then, the chi-square test is used to see whether, in the sample with black hair, the proportions of # eye color are different across sex. chisq.test(black) # The result found that the proportions of eye color is not significantly different across sex, # X^2(3, N = 108) = 2.16, p = .54. Note that some cells will have expected values less than 5. # The approximate chi-square may be not correct. We may use the Fisher's Exact test by fisher.test(black) # Sometimes, we have raw data. We need to make the contingency table before running the chi-square test. # The table function is still helpful. For example, I would like to see whether the missing value of the # Ozone variable in the airquality data is systematically related to Month. First, we create a new # variable called Ozone.NA to be true when the Ozone is missing in a particular case Ozone.NA <- is.na(airquality$Ozone) # is.na is the function to see whether the value is missing Ozone.NA # Next, make the contingency tables of Month and the missingness of Ozone MonthOzoneNA <- table(airquality$Month, Ozone.NA) MonthOzoneNA # Finally, analyze by the chi-square test chisq.test(MonthOzoneNA) # The result found that the missingness of the Ozone variable and month when the data obtained are # significantly related, X^2(4, N = 153) = 44.75, p < .001. ############################ # Exercise 1.7 ############################ # 1. XXXXXXXXXXXXXXXXXXXX ############################ # Assignment 2 ############################ # 1. XXXXXXXXXXXXXXXXXXXX