#Nathaniel Mark
#Recitation 3 R Code
#Columbia University Introduction to Econometrics

#Prelinaries:
install.packages("psych")
library(psych)
install.packages("ggplot2")
library(ggplot2)
install.packages("lmtest")
library(lmtest)
install.packages("sandwich")
library(sandwich)
set.seed(1)

#-Generating data to work with in recitation
Smoke<-rgeom(2000,.1) #drawing #of cigaretts people smoke per day
Sick<-rbinom(2000,1,Smoke/(max(Smoke))) #If they smoke more, they are more likely to be sick
#Finally, sicker people have yellower teeth:(0 is average yellowness of healthy people)
yellow<-rnorm(2000,8,5)*Sick+rnorm(2000,0,12)
#Putting these into a dataframe:
SmokeHealthData<-data.frame(Sick,Smoke,yellow)
#deleting the other variables:
rm(Sick,Smoke,yellow)

#Looking over the data:
describe(SmokeHealthData)
qplot(SmokeHealthData$Smoke,SmokeHealthData$Sick,xlab="# of Cigarettes smoked per Day",ylab="Sickness Status (1=sick)")
qplot(SmokeHealthData$Smoke,geom="histogram")
qplot(SmokeHealthData$Smoke,SmokeHealthData$yellow,xlab="# of Cigarettes smoked per Day",ylab="Yellowness of Teeth")

#Regression

#-First, writing out the theoretical "true" model
#Here, question is "does smoking associate with yellower teeth?"
#yellow_i=b0+b1*smoke_i+u_i
#What is the interpretation of these variables?

#Interpretations:
#b0 is the expectation of the teeth yellowness of a non smoker.
#b1 is the expected increase in someone's teeth yellowness if they smoke one more cigarette per day.

#-Second, ask yourself: are the regression assumptions satisfied?
#Go over from notes.

#-Running regression
attach(SmokeHealthData)
Model<-lm(yellow~Smoke)
summary(Model)

#Two important questions that are often asked: 
#Is b1 large in a statistical sense? i.e. Is it statistically significant?
#Is b1 large in a real-world sense? i.e. Does the size of this "effect" matter to policy/people?

#Does the model explain much? 
#low R^2 says there is a very low percent of variation in weight that is explained by variation in smoking.

#-Predictions: Say we want to predict the weight of someone who smokes 200 cigarettes a day.
#Method 1:
#The one given in your problem set. Try to use it! I offer other methods:
#Method 2: use values from list
prediction<-Model$coefficients[1]+Model$coefficients[2]*200
prediction
#Method 3: copy and paste
prediction<-0.21816+0.10102*200
prediction
#Method 4: predict function
New<-data.frame(Smoke=200)
prediction<-predict(Model,newdata=New)
prediction

#Discussion: Is this prediction reliable?


#-Confidence intervals and tests:
#To get the matrix that includes coefficient estimates/se/etc., we use:
coeftest(Model,vcovHC(Model,type="HC1"))
#Then, we can use brackets to pull certain values from it.
se.b1.hat<-coeftest(Model,vcovHC(Model,type="HC1"))[2,2]
se.b1.hat

#Asides:
#1) vcovHC gives the heteroskedastic estimate for standard error. If we assume homoskedasticity, we can use:
coeftest(Model)
#2)We can also use "names" as this is a "named matrix":
coeftest(Model,vcovHC(Model,type="HC1"))["Smoke","Std. Error"]

#-Running tests on subsets of data:
MODELSick<-lm(yellow~Smoke,data=subset(SmokeHealthData,Sick==1))
summary(MODELSick)
MODELNotSick<-lm(yellow~Smoke,data=subset(SmokeHealthData,Sick==0))
summary(MODELNotSick)

#-Hint on the difference test: from these regressions, you can collect all of the values you need to construct the relevant t-test.
#You may, but not necessarily, want to use the n used in each subsetted