# Nathaniel Mark
# Recitation January 27, 2017
# Intro to R
#
# A note: for this recitation, I assume we are using RStudio.
# You should use RStudio, as it is much easier to use as beginners.
# If you want to learn more about using R without RStudio, see the
# Professor's R notes.

# PART I: Basics of writing up an R script ----

# "#" excludes the rest of the line from the code.
# "<-" assigns a value to an object.
a <- 2
a
b <- 3
# NOTE: an object named `c` shares its name with the function c(). Calls such
# as c(a, b) still work because R looks specifically for a function when a
# name is used in call position, but avoid such names in your own code.
c <- 4
d <- 8

# c(, ) combines values to generate a vector.
Vec <- c(a, b)
Vec

# We can transform whole vectors of data at once:
Vecsr <- sqrt(Vec)
Vecsr
Vec2 <- 2 * Vec
Vec2

# matrix() combines values to generate a matrix (filled column by column).
Mat <- matrix(data = c(a, b, c, d), nrow = 2, ncol = 2)
Mat
# array() generates an object with more than 2 dimensions.

# Functions
mean(Vec)
# The trailing "\n" ends the printed line so the next output starts cleanly.
cat("Our Vector is", Vec, "\n")

# help()
help(mean)

# Packages:
# Google for packages if the function you want to use is not installed.
# You only need to install once on your computer, but use library() every time
# you start R. The check below skips the download when the package is already
# installed.
if (!requireNamespace("foreign", quietly = TRUE)) {
  install.packages("foreign")
}
library(foreign)

# Ctrl+Enter is "run the highlighted code".

# Removing all objects:
# CAUTION: this wipes everything in your workspace, not only the objects
# created above. Only run it when you really want a clean slate.
rm(list = ls(all = TRUE))

# PART II: Loading data and observing data ----

# Usually, in RStudio, we can use:
#   Environment > Import Dataset > auto.dta (in the box to the right)
# The data we are using in this exercise is .dta, however, so we have to be a
# little fancier. Change the path to wherever you saved auto.dta.
auto <- read.dta("C:/users/ndm2125/Downloads/auto.dta")

# Another way is to set the working directory to be a certain folder, then
# load the file from that folder.
setwd("C:/users/ndm2125/Downloads/")
auto <- read.dta("auto.dta")

# This generates what is called a dataframe. It is a collection of named
# vectors (i.e. variables).
# Variables can be obtained in two ways:
# 1) With the $ operator:
auto$price
# 2) By attaching the dataframe:
attach(auto)
price
# This makes it so you are working within this specific dataframe. Be careful!
# Attached variables can silently mask, or be masked by, other objects with
# the same name. with(auto, price) is a safer way to get the same result.
detach(auto)  # Takes you out of that dataframe.

# Use the str() or head() function to describe the dataframe:
str(auto, give.attr = FALSE)  # Note: the give.attr option is not required.
# str() gives us, for each variable: the variable name, type and the first few
# data points.
# Also note, this information is in the environment window in RStudio.
head(auto)  # A look at the first couple of values for each variable.

# PART III: Analyzing the raw data ----

# Most simply, summary() gives you summary statistics of each variable that is
# numeric.
summary(auto)
# summary() can also be used to give summary statistics for a specific variable:
summary(auto$mpg)

# More specific summary statistics can be found with describe().
# ERROR! Unless its package is already loaded, this call fails. This is to
# remind you about packages: load the package needed!
# try() prints the error but lets the rest of the script keep running when the
# whole file is sourced.
try(describe(auto$mpg))
??describe  # Searching for functions in a non-loaded package.
# Install only when the package is missing, then load it.
if (!requireNamespace("psych", quietly = TRUE)) {
  install.packages("psych")
}
library(psych)
describe(auto$mpg)

# Subsets: say we only want to see the MPG of foreign cars.
subset(auto$mpg, auto$foreign == "Foreign")
# Or the MPG of cars that weigh at least 3500 pounds:
subset(auto$mpg, auto$weight >= 3500)
# We can also take a subset of the entire dataframe, if we only want to keep a
# few variables:
NewDataFrame <- subset(auto, select = c("mpg", "weight", "foreign"))
str(NewDataFrame)

# PART IV: Graphing ----

hist(auto$mpg)
plot(auto$weight, auto$mpg)
plot(
  auto$weight, auto$mpg,
  main = "Relation between MPG and Weight",
  ylab = "Miles Per Gallon",
  pch = 4,
  col = 2
)

# Or, if we want to be fancy:
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
library(ggplot2)
plot1 <- ggplot(data = auto)
# Scatter plot
plot1 <- plot1 + geom_point(aes(x = weight, y = mpg))
plot1

# PART V: Testing ----

# Testing whether MPG for foreign and domestic cars are the same.
# The argument must be spelled `data` (lower case): R is case-sensitive, so
# `DATA = auto` would not tell t.test() where to find mpg and foreign.
TEST1 <- t.test(mpg ~ foreign, data = auto, var.equal = TRUE)
# OR, referring to the columns directly:
TEST1 <- t.test(auto$mpg ~ auto$foreign, var.equal = TRUE)
TEST1
# This generates a list (something like a dataframe, but the objects in it are
# not of the same dimension) with a number of pertinent values in it.
# Do we reject?
TEST1$p.value
# This is much lower than .05, so it is unlikely the null is correct - reject
# the null! Foreign cars are likely of higher efficiency than domestic cars.
# An aside: let's make sure a manual test would match this.
# Pull each group's MPG values once and reuse them below.
mpg_foreign <- subset(auto$mpg, auto$foreign == "Foreign")
mpg_domestic <- subset(auto$mpg, auto$foreign == "Domestic")

Mean1 <- mean(mpg_foreign)
Mean2 <- mean(mpg_domestic)
v1 <- var(mpg_foreign)
v2 <- var(mpg_domestic)
n1 <- length(mpg_foreign)
n2 <- length(mpg_domestic)

# Despite its name, SE holds the pooled variance of the two groups; the
# standard error of the difference in means is the sqrt() term in tstat.
SE <- ((n1 - 1) * v1 + (n2 - 1) * v2) / (n1 + n2 - 2)
tstat <- (Mean1 - Mean2) / sqrt(SE * (1 / n1 + 1 / n2))
# The sign of tstat depends on which group is subtracted from which, so it may
# be the opposite of the statistic in TEST1; the two-sided p-value is not
# affected.
tstat
pval <- pt(-abs(tstat), df = n1 + n2 - 2) * 2  # Uses student t, not normal.
pval

# Regression! ----

# Let's regress MPG on weight to get an estimate of the effect of weight on MPG.
# data = auto tells lm() to look up mpg and weight inside the dataframe; auto
# is no longer attached, so the bare names would not be found there otherwise.
MODEL <- lm(mpg ~ weight, data = auto)  # This generates a list in MODEL.
# Using the function summary() gives us all the values we want:
summary(MODEL)
# The values here are described in the Professor's Intro to R sheet.
# I will describe them here to some extent, but I will not discuss the
# descriptions of overall fit, as we have not learned that yet in class.
# I will discuss these next week.

# Regression extensions:
# After a regression, you can calculate fitted values by
mpghat <- fitted(MODEL)
# , estimated residuals of the model:
ehat <- residuals(MODEL)
# and coefficients:
Coefs <- coefficients(MODEL)
# Alternatively, coefficient estimates, fitted values, residuals and other
# variables can be pulled directly from the MODEL list, e.g.:
Coefs <- MODEL$coefficients
ehat <- MODEL$residuals
mpghat <- MODEL$fitted.values