#Nathaniel Mark #Recitation 3 R Code #Columbia University Introduction to Econometrics #Prelinaries: install.packages("psych") library(psych) install.packages("ggplot2") library(ggplot2) install.packages("lmtest") library(lmtest) install.packages("sandwich") library(sandwich) set.seed(1) #-Generating data to work with in recitation Smoke<-rgeom(2000,.1) #drawing #of cigaretts people smoke per day Sick<-rbinom(2000,1,Smoke/(max(Smoke))) #If they smoke more, they are more likely to be sick #Finally, sicker people have yellower teeth:(0 is average yellowness of healthy people) yellow<-rnorm(2000,8,5)*Sick+rnorm(2000,0,12) #Putting these into a dataframe: SmokeHealthData<-data.frame(Sick,Smoke,yellow) #deleting the other variables: rm(Sick,Smoke,yellow) #Looking over the data: describe(SmokeHealthData) qplot(SmokeHealthData$Smoke,SmokeHealthData$Sick,xlab="# of Cigarettes smoked per Day",ylab="Sickness Status (1=sick)") qplot(SmokeHealthData$Smoke,geom="histogram") qplot(SmokeHealthData$Smoke,SmokeHealthData$yellow,xlab="# of Cigarettes smoked per Day",ylab="Yellowness of Teeth") #Regression #-First, writing out the theoretical "true" model #Here, question is "does smoking associate with yellower teeth?" #yellow_i=b0+b1*smoke_i+u_i #What is the interpretation of these variables? #Interpretations: #b0 is the expectation of the teeth yellowness of a non smoker. #b1 is the expected increase in someone's teeth yellowness if they smoke one more cigarette per day. #-Second, ask yourself: are the regression assumptions satisfied? #Go over from notes. #-Running regression attach(SmokeHealthData) Model<-lm(yellow~Smoke) summary(Model) #Two important questions that are often asked: #Is b1 large in a statistical sense? i.e. Is it statistically significant? #Is b1 large in a real-world sense? i.e. Does the size of this "effect" matter to policy/people? #Does the model explain much? #low R^2 says there is a very low percent of variation in weight that is explained by variation in smoking. #-Predictions: Say we want to predict the weight of someone who smokes 200 cigarettes a day. #Method 1: #The one given in your problem set. Try to use it! I offer other methods: #Method 2: use values from list prediction<-Model$coefficients[1]+Model$coefficients[2]*200 prediction #Method 3: copy and paste prediction<-0.21816+0.10102*200 prediction #Method 4: predict function New<-data.frame(Smoke=200) prediction<-predict(Model,newdata=New) prediction #Discussion: Is this prediction reliable? #-Confidence intervals and tests: #To get the matrix that includes coefficient estimates/se/etc., we use: coeftest(Model,vcovHC(Model,type="HC1")) #Then, we can use brackets to pull certain values from it. se.b1.hat<-coeftest(Model,vcovHC(Model,type="HC1"))[2,2] se.b1.hat #Asides: #1) vcovHC gives the heteroskedastic estimate for standard error. If we assume homoskedasticity, we can use: coeftest(Model) #2)We can also use "names" as this is a "named matrix": coeftest(Model,vcovHC(Model,type="HC1"))["Smoke","Std. Error"] #-Running tests on subsets of data: MODELSick<-lm(yellow~Smoke,data=subset(SmokeHealthData,Sick==1)) summary(MODELSick) MODELNotSick<-lm(yellow~Smoke,data=subset(SmokeHealthData,Sick==0)) summary(MODELNotSick) #-Hint on the difference test: from these regressions, you can collect all of the values you need to construct the relevant t-test. #You may, but not necessarily, want to use the n used in each subsetted