#Prelinaries: install.packages("psych") library(psych) #install.packages("ggplot2") library(ggplot2) #install.packages("lmtest") library(lmtest) #install.packages("sandwich") library(sandwich) set.seed(1) #-Generating data to work with in recitation Smoke<-rgeom(2000,.1) #drawing #of cigaretts people smoke per day Heavy_Drinker<-rbinom(2000,1,Smoke/(max(Smoke))) #If they smoke more, they are more likely to be heavy drinkers. #Finally, sicker people have yellower teeth: (0 is average yellowness of healthy people) Liver_Health<-rnorm(2000,-8,5)*Heavy_Drinker+rnorm(2000,0,12) #Putting these into a dataframe: SmokeHealthData<-data.frame(Heavy_Drinker,Smoke,Liver_Health) #deleting the other variables: rm(Heavy_Drinker,Smoke,Liver_Health) #Looking over the data: describe(SmokeHealthData) qplot(SmokeHealthData$Smoke, SmokeHealthData$Heavy_Drinker, xlab= "# of Cigarettes smoked per Day", ylab = "Heavy_Drinker Status (1=Heavy_Drinker)") qplot(SmokeHealthData$Smoke, geom = "histogram", xlab = "# of Cigarettes smoked per Day") qplot(SmokeHealthData$Smoke, SmokeHealthData$Liver_Health, xlab = "# of Cigarettes smoked per Day", ylab = "Liver Health Index") #Regression #-First, writing out the theoretical "true" model #Here, question is "does smoking cause liver failure?" #yellow_i=b0+b1*smoke_i+u_i #What is the interpretation of these variables? #Interpretations: #b0 is the expectation of the teeth yellowness of a non smoker. #b1 is the expected increase in someone's teeth yellowness if they smoke one more cigarette per day. #-Second, ask yourself: are the regression assumptions satisfied? #Go over from notes. #-Running regressions Model <- lm(Liver_Health ~ Smoke, data = SmokeHealthData) summary(Model) Model1 <- lm(Liver_Health ~ Smoke + Heavy_Drinker, data = SmokeHealthData) summary(Model1) #Two important questions that are often asked: #Is b1 large in a statistical sense? i.e. Is it statistically significant? #Is b1 large in a real-world sense? i.e. Does the size of this "effect" matter to policy/people? #Does the model explain much? #low R^2 says there is a very low percent of variation in weight that is explained by variation in smoking. #-Predictions: Say we want to predict the weight of someone who smokes 200 cigarettes a day. #Method 1: #The one given in your problem set. Try to use it! I offer other methods: #Method 2: use values from list prediction <- Model$coefficients[1] + Model$coefficients[2]*200 prediction #Method 3: copy and paste prediction <- 0.21816+0.10102*200 prediction #Method 4: predict function New <- data.frame(Smoke=200) prediction <- predict(Model,newdata=New) prediction #Discussion: Is this prediction reliable? #-Confidence intervals and tests: #To get the matrix that includes coefficient estimates/se/etc., we use: coeftest(Model, vcovHC(Model,type="HC1")) #Then, we can use brackets to pull certain values from it. se.b1.hat <- coeftest(Model, vcovHC(Model, type= "HC1" ))[2, 2] se.b1.hat #Asides: #1) vcovHC gives the heteroskedastic estimate for standard error. If we assume homoskedasticity, we can use: coeftest(Model) #2)We can also use "names" as this is a "named matrix": coeftest(Model,vcovHC(Model,type="HC1"))["Smoke","Std. Error"] #-Running tests on subsets of data: MODELHeavy_Drinker <- lm(Liver_Health ~ Smoke, data = subset(SmokeHealthData, Heavy_Drinker == 1)) summary(MODELHeavy_Drinker) MODELNotHeavy_Drinker <- lm(Liver_Health ~ Smoke, data = subset(SmokeHealthData, Heavy_Drinker == 0)) summary(MODELNotHeavy_Drinker) #-Hint on the difference test: from these regressions, you can collect all of the values you need to construct the relevant t-test. #You may, but not necessarily, want to use the n used in each subsetted