#Nathaniel D. Mark
#Introduction to Econometrics
#Columbia University, Spring, 2017
#Recitation 4 R Code
#Preliminaries
install.packages("sandwich")
library(sandwich)
install.packages("ggplot2")
library(ggplot2)
set.seed(1000)
#Generating Data to work with:
#Our Question: Does medicare pay more or less than other insurers?
Age<-rbinom(20000, 120,.5)
Medicare<-ifelse(Age>=65,1,0)
HMO<-ifelse(Age>=65,0,rbinom(20000,1,.3))
PPO<-ifelse(rep(1,20000)-Medicare-HMO==1,rbinom(20000,1,.9),0)
Other<-rep(1,20000)-Medicare-HMO-PPO
HealthIndex<-rnorm(20000,0,.5)*Age/100+rnorm(20000,0,.75)
Cost<-apply(matrix(c(3000+4000*HealthIndex-300*Medicare+100*rgeom(20000,.1),rep(0,20000)),20000,2,byrow = FALSE),1,max)
#Looking at the Data:
hist(Age)
hist(HealthIndex)
hist(Cost)
pie(c(sum(Medicare),sum(HMO), sum(PPO),sum(Other)), c("Medicare","HMO","PPO","Other"),main="Insurance Policy Make-up")
#Topic One: Multicollinearity: First, discuss on board/notes the theory.
#Now example in R:
CollinearModel<-lm(Cost~Age+Medicare+HMO+PPO+Other)
summary(CollinearModel)
#What did it do? It fixed perfect multi-collinearity in the model automatically!
#To solve, we drop "other"
Model1<-lm(Cost~Age+Medicare+HMO+PPO)
summary(Model1)
#Topic Two: Omitted Variable Bias
#In our case, health status is omitted from the above regression. Does this cause omitted variable bias?
#Go over omitted variable notes here.
#So, in this case, first think about it: Is Health status correlated with Medicare status? Yes, negatively.
#Is Health Status correlated with Cost? Yes, negatively.
#So, we expect the coefficient on medicare to be an OVERESTIMATE.
#How do we look at the dad to see if our regression suffered from OVB (omitted variable bias)?
#If we have the data, add the omitted variable and see if the coefficient estimate changes much and in the direction we thought.
Model2<-lm(Cost~Age+Medicare+HMO+PPO+HealthIndex)
summary(Model2)
#The estimate did indeed go down, as we expected.
#Additionally, note that R^2 went way up-- implying that health is a very important explanatory variable.
#Topic 3: R-squared v adjusted R-squared
#Go over R-squared topic on board
#Pulling out the R-squared and adjusted R-squared:
#r-squared:
summary(Model1)$r.squared
summary(Model2)$r.squared
#adjusted r-squared:
summary(Model1)$adj.r.squared
summary(Model2)$adj.r.squared
#Aside: in general, you can see what you can pull from the summary table in the Value page of:
help(summary.lm)
#Topic 4: F-tests
#Packages needed:
install.packages("systemfit")
library(systemfit)
#Say we want to test the joint hypothesis that all insurance types have the same costs.
#Another way to say this is the effect of switching from "Other" insurance to a different type has zero effect.
#General Notation:
#linearHypothesis(Yourmodel,hypothesis/hypotheses,test=test type,vcov=variance type (homo/heteroskedasticity) )
#Methods for our estimation:
#Homoskedastic
#Method One: Write out each null in a vector in the second place:
linearHypothesis(Model2,c("Medicare=0","HMO=0","PPO=0"),test="F")
#Method Two: Write out the hypothesis with the left hand side in second place and
#right hand side under rhs
linearHypothesis(Model2,c("Medicare","HMO","PPO"),rhs=c(0,0,0),test="F")
#Method Three: Explicitly saying homoskedasticity:
linearHypothesis(Model2,c("Medicare=0","HMO=0","PPO=0"),test="F",vcov=vcovHC(Model2, type = "const"))
#Adding Heteroskedasticity:
linearHypothesis(Model2,c("Medicare=0","HMO=0","PPO=0"),test="F",vcov=vcovHC(Model2, type = "HC1"))
#How about the test of the null hypothesis "is the cost of an HMO the same as a PPO?":
linearHypothesis(Model2,c("HMO=PPO"),test="F",vcov=vcovHC(Model2, type = "HC1"))