#Nathaniel Mark
#Recitation January 27, 2017
#Intro to R
#A Note: For this recitation, I assume we are using RStudio.
#You should use RStudio, as it is much easier for beginners to use.
#If you want to learn more about using R without RStudio, see the
#Professor's R notes.
# PART I: Basics of writing an R script ----
# Everything after a `#` on a line is a comment and is not executed.
# `<-` assigns a value to a named object; typing the name alone prints it.
a <- 2
a
b <- 3
c <- 4
d <- 8
# c(...) combines its arguments into a vector. (R still finds the function
# c() here even though a variable called `c` now exists.)
Vec <- c(a, b)
Vec
# Functions and arithmetic are applied to every element of a vector at once:
Vecsr <- sqrt(Vec)
Vecsr
Vec2 <- Vec * 2
Vec2
# matrix() arranges a vector of values into a matrix, filling column by column.
Mat <- matrix(c(a, b, c, d), nrow = 2, ncol = 2)
Mat
# array() generates an object with more than 2 dimensions.
# Calling functions:
mean(Vec)
cat("Our Vector is", Vec)
# help() opens the documentation page for a function.
help("mean")
# Packages:
# Search online (e.g. on CRAN) for a package if the function you want to use
# is not available in your R installation.
# A package only needs to be installed once per computer, but library() must
# be called in every new R session. The requireNamespace() check skips the
# download when the package is already installed.
if (!requireNamespace("foreign", quietly = TRUE)) {
  install.packages("foreign")
}
library(foreign)
# In RStudio, Ctrl+Enter means "run the highlighted code".
# Removing all objects from the workspace (all.names = TRUE also removes
# hidden objects whose names start with a dot):
rm(list = ls(all.names = TRUE))
# PART II: Loading data and observing data ----
# Usually, in RStudio, a dataset can be imported through the menu in the
# Environment pane (top right): Environment > Import Dataset > auto.dta
# The data for this exercise is a .dta file, however, so we read it with
# read.dta() and give the full path to the file.
auto <- read.dta("C:/users/ndm2125/Downloads/auto.dta")
# Another way: make a folder the working directory, then load the file from
# that folder using just its name.
setwd("C:/users/ndm2125/Downloads/")
auto <- read.dta("auto.dta")
# The result is a data frame: a collection of named vectors (i.e. variables).
# A variable can be obtained in two ways:
# 1) With `$`, naming the data frame explicitly.
auto$price
# 2) By attaching the data frame, after which its variables can be referred to
#    by their bare names.
attach(auto)
price
# While attached you are working "inside" this data frame. Be careful!
detach(auto) # Takes you back out of that data frame
# Use str() or head() to describe the data frame.
# str() lists, for each variable, its name, type and first few data points.
# (The give.attr = FALSE option is not required.)
# The same information is shown in the Environment window in RStudio.
str(auto, give.attr = FALSE)
# head() shows the first few rows, i.e. the first values of every variable.
head(auto)
# PART III: Analyzing the raw data ----
# Most simply, summary() gives summary statistics for each variable in the
# data frame (quartiles and the mean for the numeric ones).
summary(auto)
# summary() can also be used on one specific variable:
summary(auto$mpg)
# More detailed summary statistics can be found with describe():
# ERROR! This call fails because the package that provides describe() has not
# been loaded yet. It is here to remind you about packages. try() still prints
# the error message, but lets the rest of the script keep running when the
# whole file is sourced.
try(describe(auto$mpg))
??describe # searches the help of installed packages, loaded or not
# Install the package only if it is missing, then load it.
if (!requireNamespace("psych", quietly = TRUE)) {
  install.packages("psych")
}
library(psych)
describe(auto$mpg)
# Subsets: say we only want to see the MPG of foreign cars
subset(auto$mpg, auto$foreign == "Foreign")
# Or the MPG of cars that weigh 3500 pounds or more:
subset(auto$mpg, auto$weight >= 3500)
# We can also take a subset of the entire data frame, if we only want to keep
# a few variables:
NewDataFrame <- subset(auto, select = c("mpg", "weight", "foreign"))
str(NewDataFrame)
# PART IV: Graphing ----
# Histogram of a single variable:
hist(auto$mpg)
# Basic scatter plot (first argument on the x axis, second on the y axis):
plot(auto$weight, auto$mpg)
# The same plot with a title, a y-axis label, a different point symbol (pch)
# and a different colour (col):
plot(auto$weight, auto$mpg,
     main = "Relation between MPG and Weight",
     ylab = "Miles Per Gallon", pch = 4, col = 2)
# Or, if we want to be fancy, use ggplot2.
# Install the package only if it is missing, then load it.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
library(ggplot2)
plot1 <- ggplot(data = auto) # Start a plot based on the auto data frame
plot1 <- plot1 + geom_point(aes(x = weight, y = mpg)) # Add a scatter layer
plot1 # Printing the plot object draws it
# PART V: Testing ----
# Testing whether mean MPG is the same for foreign and domestic cars.
# The formula `mpg ~ foreign` splits mpg by the groups in foreign; the `data`
# argument says which data frame the variables come from. R is case-sensitive,
# so it must be written `data`, not `DATA`.
# var.equal = TRUE assumes both groups have the same variance (pooled t-test).
TEST1 <- t.test(mpg ~ foreign, data = auto, var.equal = TRUE)
# OR, naming the data frame explicitly for each variable:
TEST1 <- t.test(auto$mpg ~ auto$foreign, var.equal = TRUE)
TEST1
# This generates a list (something like a data frame, but the objects in it
# are not all of the same dimension) with a number of pertinent values in it.
# Do we reject?
TEST1$p.value
# This is much lower than .05, so it is unlikely the null is correct - reject
# the null! Foreign cars are likely of higher efficiency than domestic cars.
# An aside: let's make sure a manual test would match this.
# Pull out the MPG values of each group once and reuse them below.
mpg_foreign <- subset(auto$mpg, auto$foreign == "Foreign")
mpg_domestic <- subset(auto$mpg, auto$foreign == "Domestic")
# Group means, variances and sample sizes.
Mean1 <- mean(mpg_foreign)
Mean2 <- mean(mpg_domestic)
v1 <- var(mpg_foreign)
v2 <- var(mpg_domestic)
n1 <- length(mpg_foreign)
n2 <- length(mpg_domestic)
# Despite its name, SE holds the pooled variance of the two groups; the
# standard error of the difference in means is formed inside sqrt() below.
SE <- ((n1 - 1) * v1 + (n2 - 1) * v2) / (n1 + n2 - 2)
# NOTE(review): this is foreign minus domestic, so the sign of tstat may be
# the opposite of the one t.test() reports; the p-value is unaffected.
tstat <- (Mean1 - Mean2) / sqrt(SE * (1 / n1 + 1 / n2))
tstat
# Two-sided p-value. Uses the Student t distribution rather than the normal.
pval <- 2 * pt(-abs(tstat), df = n1 + n2 - 2)
pval
# Regression!
# Let's regress MPG on weight to get an estimate of the effect of weight on
# MPG. The `data` argument tells lm() where to find mpg and weight; without
# it the variables are only found while the data frame is attached.
MODEL <- lm(mpg ~ weight, data = auto)
# This generates a list in MODEL.
# Using the function summary() gives us all the values we want:
summary(MODEL)
# The values here are described in the Professor's Intro to R sheet.
# I will describe them here to some extent, but I will not discuss the
# descriptions of overall fit, as we have not learned that yet in class.
# I will discuss these next week.
# Regression extensions:
# After a regression, you can calculate the fitted values,
mpghat <- fitted(MODEL)
# the estimated residuals of the model,
ehat <- residuals(MODEL)
# and the coefficients:
Coefs <- coefficients(MODEL)
# Alternatively, coefficient estimates, fitted values, residuals and other
# values can be pulled directly from the MODEL list, e.g.:
Coefs <- MODEL$coefficients
ehat <- MODEL$residuals
mpghat <- MODEL$fitted.values