#Nathaniel D. Mark #Introduction to Econometrics #Columbia University, Spring, 2017 #Recitation 4 R Code #Preliminaries install.packages("sandwich") library(sandwich) install.packages("ggplot2") library(ggplot2) set.seed(1000) #Generating Data to work with: #Our Question: Does medicare pay more or less than other insurers? Age<-rbinom(20000, 120,.5) Medicare<-ifelse(Age>=65,1,0) HMO<-ifelse(Age>=65,0,rbinom(20000,1,.3)) PPO<-ifelse(rep(1,20000)-Medicare-HMO==1,rbinom(20000,1,.9),0) Other<-rep(1,20000)-Medicare-HMO-PPO HealthIndex<-rnorm(20000,0,.5)*Age/100+rnorm(20000,0,.75) Cost<-apply(matrix(c(3000+4000*HealthIndex-300*Medicare+100*rgeom(20000,.1),rep(0,20000)),20000,2,byrow = FALSE),1,max) #Looking at the Data: hist(Age) hist(HealthIndex) hist(Cost) pie(c(sum(Medicare),sum(HMO), sum(PPO),sum(Other)), c("Medicare","HMO","PPO","Other"),main="Insurance Policy Make-up") #Topic One: Multicollinearity: First, discuss on board/notes the theory. #Now example in R: CollinearModel<-lm(Cost~Age+Medicare+HMO+PPO+Other) summary(CollinearModel) #What did it do? It fixed perfect multi-collinearity in the model automatically! #To solve, we drop "other" Model1<-lm(Cost~Age+Medicare+HMO+PPO) summary(Model1) #Topic Two: Omitted Variable Bias #In our case, health status is omitted from the above regression. Does this cause omitted variable bias? #Go over omitted variable notes here. #So, in this case, first think about it: Is Health status correlated with Medicare status? Yes, negatively. #Is Health Status correlated with Cost? Yes, negatively. #So, we expect the coefficient on medicare to be an OVERESTIMATE. #How do we look at the dad to see if our regression suffered from OVB (omitted variable bias)? #If we have the data, add the omitted variable and see if the coefficient estimate changes much and in the direction we thought. Model2<-lm(Cost~Age+Medicare+HMO+PPO+HealthIndex) summary(Model2) #The estimate did indeed go down, as we expected. #Additionally, note that R^2 went way up-- implying that health is a very important explanatory variable. #Topic 3: R-squared v adjusted R-squared #Go over R-squared topic on board #Pulling out the R-squared and adjusted R-squared: #r-squared: summary(Model1)$r.squared summary(Model2)$r.squared #adjusted r-squared: summary(Model1)$adj.r.squared summary(Model2)$adj.r.squared #Aside: in general, you can see what you can pull from the summary table in the Value page of: help(summary.lm) #Topic 4: F-tests #Packages needed: install.packages("systemfit") library(systemfit) #Say we want to test the joint hypothesis that all insurance types have the same costs. #Another way to say this is the effect of switching from "Other" insurance to a different type has zero effect. #General Notation: #linearHypothesis(Yourmodel,hypothesis/hypotheses,test=test type,vcov=variance type (homo/heteroskedasticity) ) #Methods for our estimation: #Homoskedastic #Method One: Write out each null in a vector in the second place: linearHypothesis(Model2,c("Medicare=0","HMO=0","PPO=0"),test="F") #Method Two: Write out the hypothesis with the left hand side in second place and #right hand side under rhs linearHypothesis(Model2,c("Medicare","HMO","PPO"),rhs=c(0,0,0),test="F") #Method Three: Explicitly saying homoskedasticity: linearHypothesis(Model2,c("Medicare=0","HMO=0","PPO=0"),test="F",vcov=vcovHC(Model2, type = "const")) #Adding Heteroskedasticity: linearHypothesis(Model2,c("Medicare=0","HMO=0","PPO=0"),test="F",vcov=vcovHC(Model2, type = "HC1")) #How about the test of the null hypothesis "is the cost of an HMO the same as a PPO?": linearHypothesis(Model2,c("HMO=PPO"),test="F",vcov=vcovHC(Model2, type = "HC1"))