## section 9 - multiple testing (f-test)
## gov 2k
## nov 12, 2009
## section notes prepared by msen


#################################
## 1) Working with lm() output ##
#################################

## There is a lot packed into output from the lm function.
## It is often fruitful to write functions that use lm output
## to do some task that we need.

## Let's load the
## Duncan data from the car library for a running example.
## This dataset looks at the prestige of jobs based on their
## income and education levels.

library(car)
data(Duncan)

my.mod <- lm(prestige ~ education + income, data=Duncan)
names(my.mod)
names(summary(my.mod))

## there are a number of useful functions that extract what we
## want from the lm output.

## The beta hats:
coef(my.mod)

## The variance-covariance matrix of the beta-hats:
vcov(my.mod)

## The model matrix [1,X]:
head(model.matrix(my.mod))

## so you can see everything
model.matrix(my.mod)
Duncan


#######################
## 2) F distribution ##
#######################

## Let's start off with a quick demonstration of what the F
## distribution looks like for some parameters:

curve(df(x=x,df1=5,df2=200), from=0, to=15, col="chocolate", lwd=2)

## We can use the same pXXXX functions from the normal and t
## with the F:

pf(q=2, df1=5, df2=200)
pf(q=2, df1=5, df2=200, lower.tail=FALSE)
	## if lower.tail = T, probabilities are P[X <= x], otherwise, P[X > x]

## We can also use the qXXXX functions to find the point where
## x% of the distribution is higher or lower:

qf(p=0.95, df1=5, df2=200)
## or
qf(p=0.05,  df1=5, df2=200, lower.tail=FALSE)

curve(df(x=x,df1=5,df2=200), from=0, to=15, col="chocolate", lwd=2)
abline(v=qf(p=0.95, df1=5, df2=200), lwd=2)

## We can do F-tests in R a few different ways.

#######################
## 2) F Omnibus test ##
#######################

## A special case of the F-test that is often used is the "omnibus" test
## of all of the variables in the regression. R (and most other
## software packages) gives you this in the regression output:

lm.out <- lm(prestige ~ education + income, data=Duncan)
summary(lm.out)

## We can plot this test-statistic against its null distribution
## and we can see that it is very unlikely.

curve(df(x, 2, 42), from=0, to=110, col="chocolate")
abline(v=summary(lm.out)$fstatistic[1])
	## extracting the omnibus f stat from the lm output

## note that df1 = 2 (two restrictions)
## and df2 = n-(k+1) -- > 45 - 2 - 1

## We can also calculate the F-statistic "manually."
## We know from lecture that 

# F test stat = (RSSnull - RSSfull)/q _divided_ by RSSfull/n-(k+1)

lm.null <- lm(prestige ~ 1, data = Duncan)
	## note that this is basically taking the mean of x
lm.full <- lm(prestige ~ education + income, data = Duncan)
	## and this is the fullmodel

RSSnull <- sum(lm.null$residuals^2)
RSSfull <- sum(lm.full$residuals^2)

df1 <- length(lm.full$coef) - 1
df2 <- nrow(model.matrix(lm.full)) - (length(lm.full$coef) -1 + 1)

((RSSnull - RSSfull)/df1)/(RSSfull/df2)
summary(lm.full)$fstatistic[1]

## Ta da!

#########################################
## 2) F General Linear Hypothesis Test ##
#########################################

## general linear hypothesis tests are more complicated.
## You'll be writing a function that does this in your pset.

## let's take a break to discuss

## One function that will help is 
## linear.hypothesis() in the car library

## linear.hypothesis() permits a broad set of restrictions.  
## hint: you can
## use this to test your code this week.

lm.out <- lm(prestige ~ income + education, data=Duncan)
linear.hypothesis(lm.out, matrix(c(0,1,0,0,0,1),2,3, byrow=T), rhs=c(0,0))
summary(lm.out)

## rhs=c(0,0) is our c vector.  
## The matrix() input is a our hypothesis
## matrix.

############################
## 3) Confidence Ellipses ##
############################

## confidence.ellipses are hard to compute on your own and are impossible
## with more than two coefficients, but they can be useful. in the car
## library there is a function called, unsurprisingly, confidence.ellipse():

lm.out <- lm(prestige ~ income + education + type, data=Duncan)
confidence.ellipse(lm.out, which.coef=c("income","education"), col="red")

## however, we often cannot see (0,0),
## which is an important point:

confidence.ellipse(lm.out, which.coef=c("income","education"), col="red",
                    xlim=c(-.1,1), ylim=c(-.1,1))

## we can add an additional point on the graph using the points() function

points(x=0, y=0, col="blue", pch=19, cex=1.5)

## cex=1.5 stands for "character expansion" which makes the point bigger.

## we also might want to add lines for the confidence intervals for each
## of the two coefficients. when you do this, make sure you are plotting
## them in the right direction. in this case the CI for income should be
## vertical and the CI for education should be horizontal.