#Nathaniel D. Mark #Introduction to Econometrics #Columbia University, Spring, 2017 #Recitation 5 R Code #Preliminaries install.packages("sandwich") library(sandwich) install.packages("ggplot2") library(ggplot2) install.packages("systemfit") library(systemfit) set.seed(1000) #Generating Data to work with: #Our Question For the day: How much does Age effect Cost of Care? Age<-rbinom(20000, 120,.5) Medicare<-ifelse(Age>=65,1,0) HMO<-ifelse(Age>=65,0,rbinom(20000,1,.3)) PPO<-ifelse(rep(1,20000)-Medicare-HMO==1,rbinom(20000,1,.9),0) Other<-rep(1,20000)-Medicare-HMO-PPO SickIndex<-Age-mean(Age)+rnorm(20000,0,6) Cost<-apply(matrix(c(.7*SickIndex+.3*sign(SickIndex)*(SickIndex^2)-1.5*Medicare-1*HMO+2*Other+.5*rgeom(20000,.1),rep(0,20000)),20000,2,byrow = FALSE),1,max) Data<-data.frame(Age=Age,Medicare=Medicare,HMO=HMO,PPO=PPO,Other=Other,Cost=Cost) #Cost is in thousands of dollars plot(Age,Cost) #Looking at the Data: #Assume we do not observe the SickIndex, just age and insurance type. ggplot(Data, aes(Age))+geom_histogram() ggplot(Data, aes(Cost))+geom_histogram() df<-data.frame(sums=c(sum(Medicare),sum(HMO), sum(PPO),sum(Other)),names=c("Medicare","HMO","PPO","Other")) ggplot(df, aes(x="", y=sums, fill=names))+geom_bar(width = 1, stat = "identity")+coord_polar("y", start=0) qplot(Age,Cost) qplot(SickIndex,Cost) #TOPIC ONE: Nonlinear models: Techniques and Interpretations #First, from notes, I go over a general way to look at this problem. #Model 1: Estimating the model with Age and Insurance Type model1<-lm(Cost~Age+Medicare+HMO+PPO) summary(model1) #What is the estimated expected effect on cost of Age increasing by 1 year? #Model 2: Estimating the model with Interactions between Age and Insurance Type #Before: does it make sense to have interactions in this setting? #Step One: Generate the interactions: AgeMed<-Age*Medicare AgeHMO<-Age*HMO AgePPO<-Age*PPO #Step Two: Run the model with all variables: model2<-lm(Cost~Age+Medicare+HMO+PPO+AgeMed+AgeHMO+AgePPO) summary(model2) #What is the estimated expected effect on cost of Age increasing by 1 year? #Is this model better? #1) Look at if the R^2 went up. Yes, it did! #2) Test the joint null hypothesis that the model is linear against the null that interactions are jointly significant. #Performing #2): linearHypothesis(model2,c("AgeMed=0","AgeHMO=0","AgePPO=0"),test="F",vcov=vcovHC(model2, type = "HC1")) #Model Three: Adding a quadratic term: #Before: does it make sense to a quadratic term in this setting? #Step One: AgeSq<-Age^2 #Step Two: model3<-lm(Cost~Age+AgeSq+Medicare+HMO+PPO) summary(model3) #What is the estimated expected effect on cost of Age increasing by 1 year? #Is this model better? #1) Look at if the R^2 went up. #2) Test the null that the effect of AgeSq is equal to zero. How do we do this here? T-test from regression is all we need! #Model Four: Performing a linear-log model #Before: does it make sense to use a log-linear model (or any other log models) in this setting? #Step One: logAge<-log(Age) #Step Two: model4<-lm(Cost~logAge+Medicare+HMO+PPO) summary(model4) #What is the estimated expected effect on cost of Age increasing by 1 percentage point? #Is this model better? #1) Look at if the R^2 went up. #An IMPORTANT note: If we a using a log-linear model, we can no londer use this step. Why? #2) There is no null to test, so we cannot use this step. #TOPIC TWO: Cubic functions and using the predict function. #Model Five: Cubic Model #Step 1: Agecu<-Age^3 #Step 2: model5<-lm(Cost~Age+AgeSq+Agecu+Medicare+HMO+PPO) summary(model5) #Is this model better? #R^2 much better-- this is the best model so far. #Test the null that the model is truly linear. linearHypothesis(model5,c("AgeSq=0","Agecu=0"),test="F",vcov=vcovHC(model5, type = "HC1")) #Lesson: Do not just stop at the quadratic. #What is the estimated expected effect on cost of Age increasing by 1 year? #This is pretty difficult to determine, so we turn to graphing the answer: #We choose and individual type, here Medicare recipient: agenew1=seq(50,80,1) agesqnew1<-agenew1^2 agecunew1<-agenew1^3 agenew2=agenew1+1 agesqnew2<-agenew2^2 agecunew2<-agenew2^3 newdata1<-data.frame(Age=agenew1,AgeSq=agesqnew1,Agecu=agecunew1,Medicare=rep(1,31),HMO=rep(0,31),PPO=rep(0,31)) newdata2<- data.frame(Age=agenew2,AgeSq=agesqnew2,Agecu=agecunew2,Medicare=rep(1,31),HMO=rep(0,31),PPO=rep(0,31)) Pred1<-predict(model5,newdata1) Pred2<-predict(model5,newdata2) Difference<-Pred2-Pred1 qplot(seq(50,80,1),Difference,ylab="Effect of Aging by 1 Year on Cost", xlab="Age", main="Effect of Aging 1 Year on Cost of Care At Different Ages",sub="Assuming Patient in Medicare") #TOPIC THREE: Imposing restrictions.