#Nathaniel Mark
#Recitation 2 R Code
#Columbia University Introduction to Econometricsi

#Prelinaries:
install.packages("psych")
library(psych)
install.packages("ggplot2")
library(ggplot2)
install.packages("TeachingDemos")
library(TeachingDemos)
set.seed(1)

#First: Any Questions about R?

#In this lesson, some new functions we will learn are:
#i)data.frame
#ii)rm
#iii)qplot
#iv)length
#v)z.test
#vi)pnorm
#vii)qnorm
#viii)predict

#Topic 1) Sample Problem to Help with homework

#-Generating a dataset
  #Say people are either healthy or sick. 80% are healthy:
Sick<-rbinom(200,1,.2) #generating random draws of 0 or 1
  #Also, people smoke a certain amount, given their health.
Smoke<-20*Sick+runif(200,0,40) #generating random draws between 0 and 40
  #Putting these into a dataframe:
SmokeHealthData<-data.frame(Sick,Smoke)
  #deleting the other variables:
rm(Sick,Smoke)

#-Looking at the data
describe(SmokeHealthData)
qplot(SmokeHealthData$Smoke,SmokeHealthData$Sick,xlab="# of Cigarettes smoked per Dat",ylab="Sickness Status (1=sick)")
qplot(SmokeHealthData$Smoke,geom="histogram")

#-Confidence interval around the mean of smoking:
attach(SmokeHealthData)
sdS<-sd(Smoke)
meanS<-mean(Smoke)
n<-length(Smoke)
CI<-c(meanS-(sdS/sqrt(n))*1.96,meanS+(sdS/sqrt(n))*1.96)
CI
#Checking/ Doing it another way:
z.test(Smoke,mu=meanS,sd=sdS)
  #What is going on here? Recall our discussion earlier in the class.

#-Testing if the mean of smoking is different from 1 pack a day (20)
t.stat<-(meanS-20)/(sdS/sqrt(n))
t.stat
  #Recall, there are then two ways to reject/not reject the hypothesis
  #Method 1: Is the p-value less than the level of significance (usually .05) 
  #To calculate p-value, we look at your normal distribution chart OR you can do this in R, by:
p.val<-2*pnorm(-abs(t.stat))
  #pnorm(x) gives you the probability the a draw from a std. normal dist. would be less than x.
p.val 
  #Reject null of meansmoke=20

  #Method 2: Is the t-stat larger (in absolute value) than the critical value
  #To calculate critical value, welook at your normal distribution chart OR you can do this in R, by:
crit.val<-abs(qnorm(.05/2))
  #qnorm(x) gives you the value at which x= the probability a draw from a std. normal dist. is less than x 
crit.val #approx 1.96

#Checking/ Doing it another way:
z.test(Smoke, mu=20,sd=sdS)

#-Subsetting the data to test seperately:
detach(SmokeHealthData)
SmokeHealthData.ofsmokers<-subset(SmokeHealthData,Smoke==1)
SmokeHealthData.ofNONsmokers<-subset(SmokeHealthData,Smoke==0)
  #Then we can run the same tests above using just these two (show in recitation)

#Topic 2: Regression

#-First, lets add a new variable: 
SmokeHealthData$weight<-130-rnorm(200,5,5)*Sick+rnorm(200,0,12)
qplot(SmokeHealthData$weight,geom="histogram")

#-First, writing out the theoretical "true" model
  #Here, question is "does smoking associate with lower body weight?"
  #weight_i=b0+b1*smoke_i+u_i

#-Second, ask yourself: are the regression assumptions satisfied?

#-Running regression
attach(SmokeHealthData)
Model<-lm(weight~Smoke)
summary(Model)
  #Interpretations:
    #b0 estimate is the expectation of the weight of a non smoker.
    #b1 estimate is the expected increase in someone's weight if they smoke one more cigarette per day.

    #Two important questions that are often asked: 
                         #Is b1 large in a statistical sense? i.e. Is it statistically significant?
                         #Is b1 large in a real-world sense? i.e. Does the size of this "effect" matter to policy/people?

    #Does the model explain much? 
                         #low R^2 says there is a very low percent of variation in weight that is explained by variation in smoking.

#-Predictions: Say we want to predict the weight of someone who smokes 200 cigarettes a day.
  #Method 1: copy and paste
prediction<-128.999796-0.008579*200
prediction
  #Method 2: use values from list
prediction<-Model$coefficients[1]+Model$coefficients[2]*200
prediction
  #Method3: predict function
New<-data.frame(Smoke=200)
prediction<-predict(Model,newdata=New)
  
  #Discussion: Is this prediction reliable?

#Thanks for coming!

#-Last note: You may want to check out the binom.test function.  This is a specific function for data that take values of 0 or 1. 
#-Also, you may want to look into Publishing your R scripts, to make your homeworks look nice and fancy.