#Nathaniel D. Mark
#Introduction to Econometrics
#Columbia University, Spring, 2017
#Recitation 5 R Code
#Preliminaries
install.packages("sandwich")
library(sandwich)
install.packages("ggplot2")
library(ggplot2)
install.packages("systemfit")
library(systemfit)
set.seed(1000)
#Generating Data to work with:
#Our Question For the day: How much does Age effect Cost of Care?
Age<-rbinom(20000, 120,.5)
Medicare<-ifelse(Age>=65,1,0)
HMO<-ifelse(Age>=65,0,rbinom(20000,1,.3))
PPO<-ifelse(rep(1,20000)-Medicare-HMO==1,rbinom(20000,1,.9),0)
Other<-rep(1,20000)-Medicare-HMO-PPO
SickIndex<-Age-mean(Age)+rnorm(20000,0,6)
Cost<-apply(matrix(c(.7*SickIndex+.3*sign(SickIndex)*(SickIndex^2)-1.5*Medicare-1*HMO+2*Other+.5*rgeom(20000,.1),rep(0,20000)),20000,2,byrow = FALSE),1,max)
Data<-data.frame(Age=Age,Medicare=Medicare,HMO=HMO,PPO=PPO,Other=Other,Cost=Cost)
#Cost is in thousands of dollars
plot(Age,Cost)
#Looking at the Data:
#Assume we do not observe the SickIndex, just age and insurance type.
ggplot(Data, aes(Age))+geom_histogram()
ggplot(Data, aes(Cost))+geom_histogram()
df<-data.frame(sums=c(sum(Medicare),sum(HMO), sum(PPO),sum(Other)),names=c("Medicare","HMO","PPO","Other"))
ggplot(df, aes(x="", y=sums, fill=names))+geom_bar(width = 1, stat = "identity")+coord_polar("y", start=0)
qplot(Age,Cost)
qplot(SickIndex,Cost)
#TOPIC ONE: Nonlinear models: Techniques and Interpretations
#First, from notes, I go over a general way to look at this problem.
#Model 1: Estimating the model with Age and Insurance Type
model1<-lm(Cost~Age+Medicare+HMO+PPO)
summary(model1)
#What is the estimated expected effect on cost of Age increasing by 1 year?
#Model 2: Estimating the model with Interactions between Age and Insurance Type
#Before: does it make sense to have interactions in this setting?
#Step One: Generate the interactions:
AgeMed<-Age*Medicare
AgeHMO<-Age*HMO
AgePPO<-Age*PPO
#Step Two: Run the model with all variables:
model2<-lm(Cost~Age+Medicare+HMO+PPO+AgeMed+AgeHMO+AgePPO)
summary(model2)
#What is the estimated expected effect on cost of Age increasing by 1 year?
#Is this model better?
#1) Look at if the R^2 went up. Yes, it did!
#2) Test the joint null hypothesis that the model is linear against the null that interactions are jointly significant.
#Performing #2):
linearHypothesis(model2,c("AgeMed=0","AgeHMO=0","AgePPO=0"),test="F",vcov=vcovHC(model2, type = "HC1"))
#Model Three: Adding a quadratic term:
#Before: does it make sense to a quadratic term in this setting?
#Step One:
AgeSq<-Age^2
#Step Two:
model3<-lm(Cost~Age+AgeSq+Medicare+HMO+PPO)
summary(model3)
#What is the estimated expected effect on cost of Age increasing by 1 year?
#Is this model better?
#1) Look at if the R^2 went up.
#2) Test the null that the effect of AgeSq is equal to zero. How do we do this here? T-test from regression is all we need!
#Model Four: Performing a linear-log model
#Before: does it make sense to use a log-linear model (or any other log models) in this setting?
#Step One:
logAge<-log(Age)
#Step Two:
model4<-lm(Cost~logAge+Medicare+HMO+PPO)
summary(model4)
#What is the estimated expected effect on cost of Age increasing by 1 percentage point?
#Is this model better?
#1) Look at if the R^2 went up.
#An IMPORTANT note: If we a using a log-linear model, we can no londer use this step. Why?
#2) There is no null to test, so we cannot use this step.
#TOPIC TWO: Cubic functions and using the predict function.
#Model Five: Cubic Model
#Step 1:
Agecu<-Age^3
#Step 2:
model5<-lm(Cost~Age+AgeSq+Agecu+Medicare+HMO+PPO)
summary(model5)
#Is this model better?
#R^2 much better-- this is the best model so far.
#Test the null that the model is truly linear.
linearHypothesis(model5,c("AgeSq=0","Agecu=0"),test="F",vcov=vcovHC(model5, type = "HC1"))
#Lesson: Do not just stop at the quadratic.
#What is the estimated expected effect on cost of Age increasing by 1 year?
#This is pretty difficult to determine, so we turn to graphing the answer:
#We choose and individual type, here Medicare recipient:
agenew1=seq(50,80,1)
agesqnew1<-agenew1^2
agecunew1<-agenew1^3
agenew2=agenew1+1
agesqnew2<-agenew2^2
agecunew2<-agenew2^3
newdata1<-data.frame(Age=agenew1,AgeSq=agesqnew1,Agecu=agecunew1,Medicare=rep(1,31),HMO=rep(0,31),PPO=rep(0,31))
newdata2<- data.frame(Age=agenew2,AgeSq=agesqnew2,Agecu=agecunew2,Medicare=rep(1,31),HMO=rep(0,31),PPO=rep(0,31))
Pred1<-predict(model5,newdata1)
Pred2<-predict(model5,newdata2)
Difference<-Pred2-Pred1
qplot(seq(50,80,1),Difference,ylab="Effect of Aging by 1 Year on Cost", xlab="Age", main="Effect of Aging 1 Year on Cost of Care At Different Ages",sub="Assuming Patient in Medicare")
#TOPIC THREE: Imposing restrictions.