#Nathaniel Mark
#Recitation January 27, 2017
#Intro to R

#A Note: For this recitation, I assume we are using RStudio.
#You should use RStudio, as it is much easier for beginners to use.
#If you want to learn more about using R without RStudio, see the
#Professor's R notes.

# PART I: Basics of writing up an R script
#   A '#' turns the rest of the line into a comment, so R does not run it.
#   The arrow '<-' stores a value in a named object.
a <- 2
a
b <- 3
c <- 4
d <- 8
# c(...) combines its arguments into a vector
Vec <- c(a, b)
Vec
# Functions and arithmetic are applied to every element of a vector at once:
Vecsr <- sqrt(Vec)
Vecsr
Vec2 <- Vec * 2
Vec2
# matrix() arranges the values into a matrix (here 2 rows by 2 columns)
Mat <- matrix(c(a, b, c, d), nrow = 2, ncol = 2)
Mat
# array() generates an object with more than 2 dimensions.
# Functions: call them by name with their arguments in parentheses
mean(Vec)
cat("Our Vector is", Vec)
#   help() opens the documentation page of a function
help(mean)
# Packages:
#   Search online for a package if the function you want to use is not installed.
#   You only need to install a package once on your computer, but you must call
#   library() every time you start R. The check below skips the download when
#   the package is already installed, so re-running this script does not
#   reinstall it (and the script still works without an internet connection).
if (!requireNamespace("foreign", quietly = TRUE)) {
  install.packages("foreign")
}
library(foreign)
#   Ctrl+Enter is "run the highlighted code"
# Removing all objects from the workspace (loaded packages stay loaded):
rm(list = ls(all.names = TRUE))

# PART II: Loading data and observing data
# Usually, in RStudio, we can use: Environment>Import Dataset>auto.dta (in the box to the right)
# The data we are using in this exercise is a .dta file, however, so we have to
# be a little fancier and read it with read.dta() from the 'foreign' package.
# NOTE: the path below is specific to one computer; change it to wherever
# auto.dta is saved on yours.
auto <- read.dta("C:/users/ndm2125/Downloads/auto.dta")

# Another way is to set the working directory to be a certain folder, then load
# the file from that folder using only its name.
setwd("C:/users/ndm2125/Downloads/")
auto <- read.dta("auto.dta")

# This generates what is called a dataframe. It is a collection of named
# vectors (i.e. variables).
# Variables can be obtained in two ways:
# 1) With the $ operator: dataframe$variable
auto$price
# 2) By attaching the dataframe, which lets you use its variables by bare name
attach(auto)
price
# While attached, you are working within this specific dataframe. Be careful!
# Objects you create with the same names as its variables can be confused with
# them, so always detach when you are done.
detach(auto)  # Takes you out of that dataframe

# Use the str or head function to describe the dataframe:
str(auto, give.attr = FALSE)  # Note: you do not need the give.attr option at the end.
  # This gives us, for each variable: the variable name, type and the first few data points.
  # Also note, this information is in the environment window in RStudio.
head(auto)
  # This gives us a look at the first few rows of the dataframe.

# PART III: Analyzing the raw data
# Most simply, summary() gives you summary statistics of each variable in the dataframe.
summary(auto)
# summary() can also be used to give summary statistics for a specific variable:
summary(auto$mpg)
# More specific summary statistics can be found with describe():
try(describe(auto$mpg))
# ERROR! This is to remind you about packages. Load the package needed!
# (try() shows the error message but lets the rest of the script keep running
# when the whole file is run at once.)
??describe  # searching for functions in a non-loaded package
# Install psych only if it is missing, then load it for this session.
if (!requireNamespace("psych", quietly = TRUE)) {
  install.packages("psych")
}
library(psych)
describe(auto$mpg)

# Subsets: Say we only want to see the MPG of foreign cars
subset(auto$mpg, auto$foreign == "Foreign")
    # Or the MPG of cars that weigh at least 3500 pounds:
subset(auto$mpg, auto$weight >= 3500)
    # We can also take a subset of the entire dataframe, if we only want to keep a few variables:
NewDataFrame <- subset(auto, select = c("mpg", "weight", "foreign"))
str(NewDataFrame)

# PART IV: GRAPHING
# Histogram of a single variable:
hist(auto$mpg)
# Scatter plot: first argument on the x-axis, second on the y-axis.
plot(auto$weight, auto$mpg)
# The same plot with a title, a y-axis label, a different point symbol (pch)
# and a different point colour (col):
plot(auto$weight, auto$mpg, main = "Relation between MPG and Weight",
     ylab = "Miles Per Gallon", pch = 4, col = 2)

# Or, if we want to be fancy:
# Install ggplot2 only if it is missing, then load it for this session.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
library(ggplot2)
plot1 <- ggplot(data = auto)  # Start a plot that draws from the auto dataframe
plot1 <- plot1 + geom_point(aes(x = weight, y = mpg))  # Add a scatter plot layer
plot1


# PART V: Testing
# Testing whether mean MPG is the same for foreign and domestic cars:
# The argument must be spelled 'data' in lowercase: R is case-sensitive, and
# without it t.test() cannot find mpg and foreign inside the dataframe.
TEST1 <- t.test(mpg ~ foreign, data = auto, var.equal = TRUE)
# OR, referring to the columns directly:
TEST1 <- t.test(auto$mpg ~ auto$foreign, var.equal = TRUE)
TEST1
# This generates a list (something like a dataframe, but the objects in it are
# not of the same dimension) with a number of pertinent values in it.
# Do we reject?
TEST1$p.value
# This is much lower than .05, so we reject the null hypothesis of equal means
# at the 5% level. Foreign cars are likely of higher efficiency than domestic cars.

# An aside: let's make sure a manual test would match this.
# Pull out the MPG values of each group once and reuse them below.
MpgForeign <- subset(auto$mpg, auto$foreign == "Foreign")
MpgDomestic <- subset(auto$mpg, auto$foreign == "Domestic")
Mean1 <- mean(MpgForeign)
Mean2 <- mean(MpgDomestic)
v1 <- var(MpgForeign)
v2 <- var(MpgDomestic)
n1 <- length(MpgForeign)
n2 <- length(MpgDomestic)
# Despite its name, SE holds the pooled variance of the two groups; the
# standard error of the difference in means is sqrt(SE * (1/n1 + 1/n2)).
SE <- (((n1 - 1) * v1) + ((n2 - 1) * v2)) / (n1 + n2 - 2)
# The sign of tstat depends on which group mean is subtracted from which, so it
# may be the opposite of the statistic in TEST1; the two-sided p-value is not
# affected by the sign.
tstat <- (Mean1 - Mean2) / sqrt(SE * ((1 / n1) + (1 / n2)))
tstat
pval <- pt(-abs(tstat), df = (n1 + n2 - 2)) * 2  # Uses student t rather than normal.
pval

# Regression!
# Let's regress MPG on weight to get an estimate of the effect of weight on MPG.
# data = auto tells lm() to look up mpg and weight inside the auto dataframe,
# so this works without attach(auto).
MODEL <- lm(mpg ~ weight, data = auto)
# This generates a list in MODEL
# Using the function summary gives us all the values we want:
summary(MODEL)
# The values here are described in the Professor's Intro to R sheet.
# I will describe them here to some extent, but I will not discuss the descriptions of
# overall fit, as we have not learned that yet in class. I will discuss these next week.

# Regression extensions:
# After a regression, you can calculate fitted values by
mpghat <- fitted(MODEL)
# estimated residuals of the model:
ehat <- residuals(MODEL)
# and coefficients:
Coefs <- coefficients(MODEL)

# Alternatively, coefficient estimates, fitted values, residuals and other
# variables can be pulled directly from the MODEL list, e.g.:
Coefs <- MODEL$coefficients
ehat <- MODEL$residuals
mpghat <- MODEL$fitted.values