### Matrix computing for extracting estimates in a multiple regression

## First example: state.x77 data set a helpful data sets R includes on 
## the states of the U.S

state.x77
str(state.x77)                       # clearly not a data frame! (it's a matrix)
st <- as.data.frame(state.x77)        # so we'll make it one
str(st)

colnames(st)[4] = "Life.Exp"         # no spaces in variable names, please
colnames(st)[6] = "HS.Grad"                    
st$Density = st$Population * 1000 / st$Area

summary(st)

round(cor(st), 3) 

pairs(st, col='blue')

par(mfrow=c(3,3))                    # examine the variables for outliers and distribution before proceeding
  for(i in 1:9) qqnorm(st[,i], main=colnames(st)[i])

### simply placing a dot after the tilde, which means "everything else."
model1 = lm(Life.Exp ~ ., data=st) 
summary(model1)

model2 = update(model1, .~. -1)      # a model without intercept
summary(model2)


### Matrix computations in multiple linear regression for computing estimates

X <- as.matrix(st[,-4])
XprimX <- t(X)%*%X
XprimX.inv <- solve(XprimX)
Xprimy <- t(X)%*%st[,4]

coef.est <- XprimX.inv%*%Xprimy

coef.est		# comparing results
coef(model2)

se.estimate <- sqrt(diag(vcov(model2))) # estiamting s.e. of beta estimates
se.estimate.matrix <- sqrt(diag((4.358^2)*XprimX.inv))


#########################################################
### Example 2: Prestige data set in car library

# Load the package that contains the full dataset.
library(car)
library(corrplot) # We'll use corrplot later on in this example too.

head(Prestige,5)

str(Prestige)

summary(Prestige)

newdata = Prestige[,c(1:4)]
summary(newdata)

plot(newdata, pch=16, col="blue", 
	main="Matrix Scatterplot of Income, Education, Women and Prestige")

education.c = scale(newdata$education, center=TRUE, scale=FALSE)
prestige.c = scale(newdata$prestige, center=TRUE, scale=FALSE)
women.c = scale(newdata$women, center=TRUE, scale=FALSE)

# bind these new variables into newdata and display a summary.
new.c.vars = cbind(education.c, prestige.c, women.c)
newdata = cbind(newdata, new.c.vars)
names(newdata)[5:7] = c("education.c", "prestige.c", "women.c" )
summary(newdata)

mod1 = lm(income ~ education.c + prestige.c + women.c, data=newdata)

summary(mod1)

newdatacor = cor(newdata[1:4])
corrplot(newdatacor, method = "number")

######
### Matrix computation again
X.n <- as.matrix(cbind(1,new.c.vars))
XpX <- t(X.n)%*%X.n
XpX.inv <- solve(XpX)
Xpy <- t(X.n)%*%newdata$income
coef.estimate <- XpX.inv%*%Xpy

coef.estimate   		# comparing the results
coef(mod1)





