# Nathaniel Mark
# Intro to Econometrics
# Spring 2017
# Recitation Seven
#
# Simulates home-ownership data, fits a linear probability model (LPM) and a
# logit model, and shows how to obtain predicted probabilities and predicted
# effects from them.
#
# NOTE: no random seed is set in this script, so the simulated data and every
# estimate below change from run to run. Call set.seed() before running if
# reproducible output is needed.

# Packages ----
# Install a package only when it is missing, so re-running the script does not
# reinstall everything (and does not need network access) each time.
invisible(lapply(c("lmtest", "sandwich", "ggplot2"), function(pkg) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}))
library(lmtest)
library(sandwich)
library(ggplot2)

# Generating data ----
n <- 500
age <- rnorm(n, 30, 5)
income <- rep(20000, n) + 1000 * age + rnorm(n, 0, 3000)
# College attendance is a Bernoulli draw with success probability
# income / 100000.
college <- rbinom(n, 1, income / 100000)
# Probability of owning a home, as a linear index of age, income and college.
prob <- .6 * (age / 50) + .5 * (income / 100000) - .1 * college
ownhome <- rbinom(n, 1, prob)

# ESTIMATING ----
# Linear probability model, reported with heteroskedasticity-robust (HC0)
# standard errors:
LPM <- lm(ownhome ~ age + income + college)
coeftest(LPM, vcov. = vcovHC(LPM, type = "HC0"))

# Logit model:
Logit <- glm(ownhome ~ age + income + college, family = binomial(link = "logit"))
summary(Logit)

# PREDICTING ----
# Say we want to predict probabilities of owning a home for all deciles
# (i.e. quantiles of values 0, .1, .2, ...) of age, holding income and college
# steady.
# We need to assume values of income and college. To make it easy, use the
# median for all other variables.
# Step one: generate a new data frame of independent vars to predict with
# (11 rows: one per age decile, with income and college repeated at their
# medians):
cpredictdata <- data.frame(
  age = as.vector(quantile(age, probs = seq(0, 1, .1))),
  income = as.vector(quantile(income, probs = rep(.5, 11))),
  college = as.vector(quantile(college, probs = rep(.5, 11)))
)
# Step two: plug into the predict functions. type = "response" makes the logit
# prediction a probability rather than a value on the link (log-odds) scale.
predict.lm(LPM, cpredictdata)
predict.glm(Logit, cpredictdata, type = "response")

# PREDICTED EFFECTS ----
# Now say we want to predict the effect of aging by one year on the
# probability of owning a home for someone of median age, income, college
# attendance (mode):
# Step one: generate new data frames of independent vars to predict with:
cpredictdata <- data.frame(
  age = median(age),
  income = median(income),
  college = median(college)
)
cpredictdataplus1 <- data.frame(
  age = median(age) + 1,
  income = median(income),
  college = median(college)
)
# Step two: plug into the predict functions with plus1 first:
predict.glm(Logit, cpredictdataplus1, type = "response") -
  predict.glm(Logit, cpredictdata, type = "response")

# Since this is a non-linear function, note that if we assume the "base"
# values differ, then this effect differs.
# That is, say we want to predict the effect of aging by one year on the
# probability of owning a home for someone at the 90th quantile of age,
# income, college attendance (mode):
# Step one: generate new data frames of independent vars to predict with:
cpredictdata <- data.frame(
  age = as.vector(quantile(age, .9)),
  income = as.vector(quantile(income, .9)),
  college = as.vector(quantile(college, .9))
)
cpredictdataplus1 <- data.frame(
  age = as.vector(quantile(age, .9)) + 1,
  income = as.vector(quantile(income, .9)),
  college = as.vector(quantile(college, .9))
)
# Step two: plug into the predict functions with plus1 first:
predict.glm(Logit, cpredictdataplus1, type = "response") -
  predict.glm(Logit, cpredictdata, type = "response")
# So, as income and age go up, the effect of aging by one year on the
# probability of owning a home goes down.
# Question: ask yourself: if we did this with the LPM, what would our result
# be?

# PREDICTING IN-SAMPLE ----
# Say we want a prediction of the estimated probability that Y = 1 for all of
# the observations we used in our estimation.
# This is simply using predict with the correct model and type = "response"
# (no new data supplied, so the estimation sample is used):
predictions <- predict.glm(Logit, type = "response")
qplot(predictions, geom = "histogram")

# Final R tricks ----
# If we want to generate a dummy for age < 40, we can use:
dummy1 <- as.numeric(age < 40)
# Or age >= 40 (the complement of dummy1):
dummy2 <- as.numeric(age >= 40)
plot(age, dummy2)