#Prelinaries:
install.packages("psych")
library(psych)
#install.packages("ggplot2")
library(ggplot2)
#install.packages("lmtest")
library(lmtest)
#install.packages("sandwich")
library(sandwich)
set.seed(1)

#-Generating data to work with in recitation
Smoke<-rgeom(2000,.1) #drawing #of cigaretts people smoke per day
Heavy_Drinker<-rbinom(2000,1,Smoke/(max(Smoke))) #If they smoke more, they are more likely to be heavy drinkers.
#Finally, sicker people have yellower teeth: (0 is average yellowness of healthy people)
Liver_Health<-rnorm(2000,-8,5)*Heavy_Drinker+rnorm(2000,0,12)
#Putting these into a dataframe:
SmokeHealthData<-data.frame(Heavy_Drinker,Smoke,Liver_Health)
#deleting the other variables:
rm(Heavy_Drinker,Smoke,Liver_Health)

#Looking over the data:
describe(SmokeHealthData)
qplot(SmokeHealthData$Smoke, SmokeHealthData$Heavy_Drinker, xlab= "# of Cigarettes smoked per Day", ylab = "Heavy_Drinker Status (1=Heavy_Drinker)")
qplot(SmokeHealthData$Smoke, geom = "histogram", xlab = "# of Cigarettes smoked per Day")
qplot(SmokeHealthData$Smoke, SmokeHealthData$Liver_Health, xlab = "# of Cigarettes smoked per Day", ylab = "Liver Health Index")

#Regression

#-First, writing out the theoretical "true" model
#Here, question is "does smoking cause liver failure?"
#yellow_i=b0+b1*smoke_i+u_i
#What is the interpretation of these variables?

#Interpretations:
#b0 is the expectation of the teeth yellowness of a non smoker.
#b1 is the expected increase in someone's teeth yellowness if they smoke one more cigarette per day.

#-Second, ask yourself: are the regression assumptions satisfied?
#Go over from notes.

#-Running regressions
Model <- lm(Liver_Health ~ Smoke, data = SmokeHealthData)
summary(Model)

Model1 <- lm(Liver_Health ~ Smoke + Heavy_Drinker, data = SmokeHealthData)
summary(Model1)

#Two important questions that are often asked: 
#Is b1 large in a statistical sense? i.e. Is it statistically significant?
#Is b1 large in a real-world sense? i.e. Does the size of this "effect" matter to policy/people?

#Does the model explain much? 
#low R^2 says there is a very low percent of variation in weight that is explained by variation in smoking.

#-Predictions: Say we want to predict the weight of someone who smokes 200 cigarettes a day.
#Method 1:
#The one given in your problem set. Try to use it! I offer other methods:
#Method 2: use values from list
prediction <- Model$coefficients[1] + Model$coefficients[2]*200
prediction
#Method 3: copy and paste
prediction <- 0.21816+0.10102*200
prediction
#Method 4: predict function
New <- data.frame(Smoke=200)
prediction <- predict(Model,newdata=New)
prediction

#Discussion: Is this prediction reliable?


#-Confidence intervals and tests:
#To get the matrix that includes coefficient estimates/se/etc., we use:
coeftest(Model, vcovHC(Model,type="HC1"))
#Then, we can use brackets to pull certain values from it.
se.b1.hat <- coeftest(Model, vcovHC(Model, type= "HC1" ))[2, 2]
se.b1.hat

#Asides:
#1) vcovHC gives the heteroskedastic estimate for standard error. If we assume homoskedasticity, we can use:
coeftest(Model)
#2)We can also use "names" as this is a "named matrix":
coeftest(Model,vcovHC(Model,type="HC1"))["Smoke","Std. Error"]

#-Running tests on subsets of data:
MODELHeavy_Drinker <- lm(Liver_Health ~ Smoke, data = subset(SmokeHealthData, Heavy_Drinker == 1))
summary(MODELHeavy_Drinker)
MODELNotHeavy_Drinker <- lm(Liver_Health ~ Smoke, data = subset(SmokeHealthData, Heavy_Drinker == 0))
summary(MODELNotHeavy_Drinker)

#-Hint on the difference test: from these regressions, you can collect all of the values you need to construct the relevant t-test.
#You may, but not necessarily, want to use the n used in each subsetted