#Nathaniel Mark
#Intro to Econometrics 
#Spring 2017
#Recitation Seven

#Packages
#Install each package only when it is missing, so that re-running the script
#does not re-download everything (or fail when there is no network connection).
for (pkg in c("lmtest", "sandwich", "ggplot2")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
#lmtest supplies coeftest(), sandwich supplies vcovHC(), ggplot2 the plotting.
library(lmtest)
library(sandwich)
library(ggplot2)

#Generating data:
#Fix the random seed so that every run of the script draws the same sample
#and the estimates printed below can be reproduced.
set.seed(2017)
n <- 500
#age: normal draws with mean 30 and standard deviation 5
age <- rnorm(n, 30, 5)
#income: 20000 plus 1000 per year of age, plus normal noise (sd 3000)
income <- rep(20000, n) + 1000 * age + rnorm(n, 0, 3000)
#college: 0/1 draw whose success probability is income/100000
college <- rbinom(n, 1, income / 100000)
#prob: the true probability of owning a home, linear in age, income and college
prob <- .6 * (age / 50) + .5 * (income / 100000) - .1 * college
#ownhome: 0/1 draw with success probability prob
ownhome <- rbinom(n, 1, prob)

#ESTIMATING

#Linear Probability model:
#OLS with the 0/1 outcome as dependent variable; the coefficient tests use
#the HC0 covariance matrix computed by vcovHC().
LPM <- lm(formula = ownhome ~ age + income + college)
coeftest(
  LPM,
  vcov. = vcovHC(LPM, type = "HC0")
)

#Logit model:
#Same regressors, fitted by glm() with a binomial family and logit link.
Logit <- glm(
  formula = ownhome ~ age + income + college,
  family = binomial(link = "logit")
)
summary(Logit)

#PREDICTING

#Say we want to predict probabilities of owning a home for all deciles (i.e. quantiles of values 0,.1,.2,...) of age, holding income and college steady.
#We need to assume values of income and college.  To make it easy, use the median for both of these variables.

#step one: build the data frame of regressor values to predict at:
#age takes its 11 deciles (0, .1, ..., 1); income and college are each fixed
#at their 0.5 quantile, which data.frame() recycles across the 11 rows.
cpredictdata <- data.frame(
  age = unname(quantile(age, probs = seq(0, 1, by = .1))),
  income = unname(quantile(income, probs = .5)),
  college = unname(quantile(college, probs = .5))
)

#step two: hand the new data to predict() for each model:
#the LPM call returns the fitted linear values; type="response" makes the
#logit call return probabilities rather than values on the link scale.
predict(LPM, newdata = cpredictdata)
predict(Logit, newdata = cpredictdata, type = "response")

#PREDICTED EFFECTS

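#Now say we want to predict the effect of aging by one year on the probability of owning a home for someone with median age, median income, and median college attendance (for a 0/1 variable the median equals the mode unless the sample is split exactly evenly):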

#step one: two one-row data frames that differ only in age:
#the baseline sits at the sample medians, the second is one year older.
cpredictdata <- data.frame(
  age = median(age),
  income = median(income),
  college = median(college)
)
cpredictdataplus1 <- transform(cpredictdata, age = age + 1)

#step two: predicted probability one year older minus the baseline one:
predict(Logit, newdata = cpredictdataplus1, type = "response") -
  predict(Logit, newdata = cpredictdata, type = "response")

# Since this is a non-linear function, note that if we assume the "base" values differ, then this effect differs. 
#That is, say we want to predict the effect of aging by one year on the probability of owning a home for someone at the 90th percentile (the 0.9 quantile) of age, income, and college attendance:

#step one: two one-row data frames that differ only in age:
#the baseline sits at the 0.9 quantile of every regressor, the second is one year older.
cpredictdata <- data.frame(
  age = unname(quantile(age, probs = .9)),
  income = unname(quantile(income, probs = .9)),
  college = unname(quantile(college, probs = .9))
)
cpredictdataplus1 <- transform(cpredictdata, age = age + 1)

#step two: predicted probability one year older minus the baseline one:
predict(Logit, newdata = cpredictdataplus1, type = "response") -
  predict(Logit, newdata = cpredictdata, type = "response")

#So, as income and age go up, the effect of aging by one year on the probability of owning a home goes down.
#Question: if we repeated this exercise with the LPM instead of the logit, what would our result be?

#PREDICTING IN-SAMPLE:
#Say we want the estimated probability that Y=1 for every observation used in the estimation.
#Calling predict() on the fitted model without new data and with type="response" returns exactly that.
predictions <- predict(Logit, type = "response")
#Histogram of the fitted probabilities.
#qplot() is deprecated as of ggplot2 3.4.0, so the plot is built with ggplot() directly.
ggplot(data.frame(predictions = predictions), aes(x = predictions)) +
  geom_histogram()

#Final R tricks:
#A comparison such as age<40 returns a TRUE/FALSE vector; as.numeric() turns it into 1/0.
#If we want to generate a dummy for age<40, we can use:
dummy1 <- as.numeric(age < 40)
#Or age>=40 (the complement of dummy1):
dummy2 <- as.numeric(age >= 40)
#Plot the dummy against age: it is 0 below age 40 and 1 from age 40 on.
plot(age, dummy2)