#Nathaniel Mark
#Recitation 2 R Code
#Columbia University Introduction to Econometricsi
#Prelinaries:
install.packages("psych")
library(psych)
install.packages("ggplot2")
library(ggplot2)
install.packages("TeachingDemos")
library(TeachingDemos)
set.seed(1)
#First: Any Questions about R?
#In this lesson, some new functions we will learn are:
#i)data.frame
#ii)rm
#iii)qplot
#iv)length
#v)z.test
#vi)pnorm
#vii)qnorm
#viii)predict
#Topic 1) Sample Problem to Help with homework
#-Generating a dataset
#Say people are either healthy or sick. 80% are healthy:
Sick<-rbinom(200,1,.2) #generating random draws of 0 or 1
#Also, people smoke a certain amount, given their health.
Smoke<-20*Sick+runif(200,0,40) #generating random draws between 0 and 40
#Putting these into a dataframe:
SmokeHealthData<-data.frame(Sick,Smoke)
#deleting the other variables:
rm(Sick,Smoke)
#-Looking at the data
describe(SmokeHealthData)
qplot(SmokeHealthData$Smoke,SmokeHealthData$Sick,xlab="# of Cigarettes smoked per Dat",ylab="Sickness Status (1=sick)")
qplot(SmokeHealthData$Smoke,geom="histogram")
#-Confidence interval around the mean of smoking:
attach(SmokeHealthData)
sdS<-sd(Smoke)
meanS<-mean(Smoke)
n<-length(Smoke)
CI<-c(meanS-(sdS/sqrt(n))*1.96,meanS+(sdS/sqrt(n))*1.96)
CI
#Checking/ Doing it another way:
z.test(Smoke,mu=meanS,sd=sdS)
#What is going on here? Recall our discussion earlier in the class.
#-Testing if the mean of smoking is different from 1 pack a day (20)
t.stat<-(meanS-20)/(sdS/sqrt(n))
t.stat
#Recall, there are then two ways to reject/not reject the hypothesis
#Method 1: Is the p-value less than the level of significance (usually .05)
#To calculate p-value, we look at your normal distribution chart OR you can do this in R, by:
p.val<-2*pnorm(-abs(t.stat))
#pnorm(x) gives you the probability the a draw from a std. normal dist. would be less than x.
p.val
#Reject null of meansmoke=20
#Method 2: Is the t-stat larger (in absolute value) than the critical value
#To calculate critical value, welook at your normal distribution chart OR you can do this in R, by:
crit.val<-abs(qnorm(.05/2))
#qnorm(x) gives you the value at which x= the probability a draw from a std. normal dist. is less than x
crit.val #approx 1.96
#Checking/ Doing it another way:
z.test(Smoke, mu=20,sd=sdS)
#-Subsetting the data to test seperately:
detach(SmokeHealthData)
SmokeHealthData.ofsmokers<-subset(SmokeHealthData,Smoke==1)
SmokeHealthData.ofNONsmokers<-subset(SmokeHealthData,Smoke==0)
#Then we can run the same tests above using just these two (show in recitation)
#Topic 2: Regression
#-First, lets add a new variable:
SmokeHealthData$weight<-130-rnorm(200,5,5)*Sick+rnorm(200,0,12)
qplot(SmokeHealthData$weight,geom="histogram")
#-First, writing out the theoretical "true" model
#Here, question is "does smoking associate with lower body weight?"
#weight_i=b0+b1*smoke_i+u_i
#-Second, ask yourself: are the regression assumptions satisfied?
#-Running regression
attach(SmokeHealthData)
Model<-lm(weight~Smoke)
summary(Model)
#Interpretations:
#b0 estimate is the expectation of the weight of a non smoker.
#b1 estimate is the expected increase in someone's weight if they smoke one more cigarette per day.
#Two important questions that are often asked:
#Is b1 large in a statistical sense? i.e. Is it statistically significant?
#Is b1 large in a real-world sense? i.e. Does the size of this "effect" matter to policy/people?
#Does the model explain much?
#low R^2 says there is a very low percent of variation in weight that is explained by variation in smoking.
#-Predictions: Say we want to predict the weight of someone who smokes 200 cigarettes a day.
#Method 1: copy and paste
prediction<-128.999796-0.008579*200
prediction
#Method 2: use values from list
prediction<-Model$coefficients[1]+Model$coefficients[2]*200
prediction
#Method3: predict function
New<-data.frame(Smoke=200)
prediction<-predict(Model,newdata=New)
#Discussion: Is this prediction reliable?
#Thanks for coming!
#-Last note: You may want to check out the binom.test function. This is a specific function for data that take values of 0 or 1.
#-Also, you may want to look into Publishing your R scripts, to make your homeworks look nice and fancy.