#R code for homework problems; copy and paste code one problem part at a time.
#The easiest way to get slpack.txt and sldata.txt is to copy and paste the
#following two commands into R.
source("http://parker.ad.siu.edu/Olive/slpack.txt")
source("http://parker.ad.siu.edu/Olive/sldata.txt")
#The following commands are often used.
library(glmnet)
library(pls)
library(leaps)
#If slpack.txt and sldata.txt are on the J drive, get slpack and sldata with
#the following commands:
#source("J:/slpack.txt")
#source("J:/sldata.txt")
#You will often need packages and may need to install packages the first
#time you use a given computer.
install.packages("glmnet")
install.packages("pls")
install.packages("leaps")
install.packages("e1071")
install.packages("randomForest")
install.packages("ISLR")
#install.packages("gbm")
#install.packages("tree")
#install.packages("ROCR")
#These commands can fail if your computer has a firewall.
#Then an administrator may be needed to install the packages.
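#A minimal sketch, not part of the homework: to avoid reinstalling packages
#that are already present, install only the missing ones. The vector below
#just repeats the package names used above.
pkgs <- c("glmnet","pls","leaps","e1071","randomForest","ISLR")
for(p in pkgs)
  if(!requireNamespace(p, quietly=TRUE)) install.packages(p)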
##HW1 A) Problem 1.10 a)
n<-100000; q<-7
b <- 0 * 1:q + 1
x <- matrix(rnorm(n * q), nrow = n, ncol = q)
y <- 1 + x %*% b + rnorm(n)
out<-lsfit(x,y)
res<-out$res
yhat<-y-res
dd<-length(out$coef)
AERplot(yhat,y,res=res,d=dd,alph=1) #usual response plot
#1.10 b)
AERplot(yhat,y,res=res,d=dd,alph=0.01)
#plots data outside the 99% pointwise PIs
##HW1 C) Problem 1.11 a) #n > 10p
library(glmnet)
library(leaps)
library(pls)
x <- matrix(rnorm(300),nrow=100,ncol=3)
z <- x[,1] + rnorm(100,sd=0.1)
Y <- exp(z)
enet(x,Y)
#1.11 b)
tplot2(x,Y,type=2) #right click Stop 7 times
#tplot2(x,Y,type=1) #right click Stop 7 times
#tplot2(x,Y,type=3) #right click Stop 7 times
#tplot2(x,Y,type=4) #right click Stop 7 times
#tplot2(x,Y,type=5) #right click Stop 7 times
#tplot2(x,Y,type=6) #right click Stop 7 times
#tplot2(x,Y,type=7) #right click Stop 7 times
#1.11 d) #n = p-1, sparse model, 100 nontrivial predictors
x <- matrix(rnorm(10000),nrow=100,ncol=100)
z <- x[,1] + rnorm(100,sd=0.1)
Y <- exp(z)
tplot2(x,Y,type=2) #right click Stop 7 times
#1.11 f)
tplot2(x,Y,type=5) #PLS overfit, right click Stop 7 times
#tplot2(x,Y,type=3)
#tplot2(x,Y,type=4) #ridge regression failed
#tplot2(x,Y,type=6) #PCR may have overfit
#tplot2(x,Y,type=7)
#tplot2(x,Y,type=1) #should give an error since n < p
##HW1 D) Problem 1.12 a) #skewed data
x <- matrix(rnorm(3000),nrow=1000,ncol=3)
y <- x%*%c(1,1,1) + 2*rexp(1000)
out<-lsfit(x,y)
FIT <- y - out$resid
plot(FIT,y)
abline(0,1)
lines(lowess(FIT,y))
#Problem 1.12 b) and c) for R
plot(FIT,out$resid)
abline(0,0)
lines(lowess(FIT,out$resid))
out$coef
##HW2 B) Problem 1.13 a)
#Copy and paste the two source commands and the three library commands
#from near the top of this file into R.
out <- cv.glmnet(buxx,buxy,alpha=1) #lasso
lam <- out$lambda.min
fit <- predict(out,s=lam,newx=buxx)
res <- buxy - fit
vars <- 1:4
lcoef <- predict(out,type="coefficients",s=lam)
lcoef<-as.vector(lcoef)[-1]
vin <- vars[lcoef!=0]
pp <- length(vin)+1
AERplot2(yhat=fit,y=buxy,res=res,d=pp)
title("lasso")
#Problem 1.13 b)
ind <-getB(buxx)$indx
yb <- buxy[ind]
xb <- buxx[ind,]
xnotinb <- buxx[-ind,]
ynotinb <- buxy[-ind]
out <- cv.glmnet(xb,yb,alpha=1) #lasso
lam <- out$lambda.min
fit <- predict(out,s=lam,newx=xb)
fit2 <- predict(out,s=lam,newx=xnotinb)
yy <- c(yb,ynotinb)
yhat <- c(fit,fit2)
res <- yy - yhat
vars <- 1:4
lcoef <- predict(out,type="coefficients",s=lam)
lcoef<-as.vector(lcoef)[-1]
vin <- vars[lcoef!=0]
pp <- length(vin)+1
AERplot2(yhat=yhat,y=yy,res=res,d=pp)
title("lasso using covmb set B")
#Problem 1.13 c)
ddplot5(buxx)
##HW2 C) Problem 1.14 a)
out <- cv.glmnet(cbrainx,cbrainy,alpha=1) #lasso
lam <- out$lambda.min
fit <- predict(out,s=lam,newx=cbrainx)
res <- cbrainy - fit
vars <- 1:11
lcoef <- predict(out,type="coefficients",s=lam)
lcoef<-as.vector(lcoef)[-1]
vin <- vars[lcoef!=0]
pp <- length(vin)+1
AERplot2(yhat=fit,y=cbrainy,res=res,d=pp)
title("lasso")
#Problem 1.14 b)
xcontin <- cbrainx[,-c(2,4,10)] #continuous predictors
ind <-getB(xcontin)$indx
yb <- cbrainy[ind]
xb <- cbrainx[ind,]
xnotinb <- cbrainx[-ind,]
ynotinb <- cbrainy[-ind]
out <- cv.glmnet(xb,yb,alpha=1) #lasso
lam <- out$lambda.min
fit <- predict(out,s=lam,newx=xb)
fit2 <- predict(out,s=lam,newx=xnotinb)
yy <- c(yb,ynotinb)
yhat <- c(fit,fit2)
res <- yy - yhat
vars <- 1:11
lcoef <- predict(out,type="coefficients",s=lam)
lcoef<-as.vector(lcoef)[-1]
vin <- vars[lcoef!=0]
pp <- length(vin)+1
AERplot2(yhat=yhat,y=yy,res=res,d=pp)
title("lasso using covmb set B")
#length(cbrainy) - length(ind)
#8 cases with the largest distances are omitted from covmb2 set B
#Problem 1.14 c)
ddplot5(xcontin)
##HW2 D) Problem 1.15 a)
mldsim6(n=100,p=10,outliers=1,gam=0.25,pm=20,osteps=9)
mldsim6(n=100,p=45,outliers=1,gam=0.25,pm=60,osteps=9)
#Problem 1.15 b) #takes a few minutes since p = 1000
mldsim7(n=100,p=1000,outliers=2,pm=900,osteps=0)
mldsim7(n=100,p=1000,outliers=2,pm=900,osteps=9)
#Problem 1.15 c)
mldsim7(n=100,p=100,outliers=3,gam=0.25,pm=8,osteps=0)
mldsim7(n=100,p=100,outliers=3,gam=0.25,pm=8,osteps=9)
##HW3 D) Problem 2.9
predsim()
##HW3 E) Problem 2.10
regbootsim2(nruns=500,psi=0.0)
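#A minimal sketch, not part of Problem 2.10: predsim() and regbootsim2()
#generate random data, so results differ between runs; set a seed first
#(the seed value below is arbitrary) if you want reproducible output.
set.seed(974)
regbootsim2(nruns=500,psi=0.0)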
##HW4 D) Problem 3.18
#Copy and paste the two source commands from near the top of this file into R.
#a)
library(leaps)
n<- 1000
p<-4
q<-p-1
b <- 0 * 1:q
b[1] <- 1 #beta = (1,1,0,0)^T
x <- matrix(rnorm(n * q), nrow = n, ncol = q)
y <- 1 + x %*% b + rnorm(n)
outols <- lsfit(x,y)
outols$coef #Tn = betahat
#b)
tem<-regboot(x,y)
betas <- tem$betas
betas[1:5,]
#c)
apply(betas,2,mean)
#d)
temp<-fselboot(x,y)
vsbetas <- temp$betas
vsbetas[1:5,]
#e)
apply(vsbetas,2,mean)
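#A minimal sketch, not part of Problem 3.18: the column means above estimate
#the center of each bootstrap distribution; the column standard deviations
#estimate the bootstrap standard errors of the coefficient estimators.
apply(betas,2,sd)
apply(vsbetas,2,sd)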
##HW5 D) Problem 3.19
library(leaps)
#person 1
regbootsim2(n=100,p=4,k=1,nruns=1000,type=1,psi=0)
vsbootsim3(n=100,p=4,k=1,nruns=1000,type=1,psi=0)
#person 2
regbootsim2(n=100,p=4,k=1,nruns=1000,type=2,psi=0)
vsbootsim3(n=100,p=4,k=1,nruns=1000,type=2,psi=0)
#person 3
regbootsim2(n=100,p=4,k=1,nruns=1000,type=3,psi=0)
vsbootsim3(n=100,p=4,k=1,nruns=1000,type=3,psi=0)
#person 4
regbootsim2(n=100,p=4,k=1,nruns=1000,type=4,psi=0)
vsbootsim3(n=100,p=4,k=1,nruns=1000,type=4,psi=0)
#person 5
regbootsim2(n=100,p=4,k=1,nruns=1000,type=5,psi=0)
vsbootsim3(n=100,p=4,k=1,nruns=1000,type=5,psi=0)
#person 6
regbootsim2(n=100,p=4,k=1,nruns=1000,type=1,psi=0.9)
vsbootsim3(n=100,p=4,k=1,nruns=1000,type=1,psi=0.9)
#person 7
regbootsim2(n=100,p=4,k=1,nruns=1000,type=2,psi=0.9)
vsbootsim3(n=100,p=4,k=1,nruns=1000,type=2,psi=0.9)
#person 8
regbootsim2(n=100,p=4,k=1,nruns=1000,type=3,psi=0.9)
vsbootsim3(n=100,p=4,k=1,nruns=1000,type=3,psi=0.9)
#person 9
regbootsim2(n=100,p=4,k=1,nruns=1000,type=4,psi=0.9)
vsbootsim3(n=100,p=4,k=1,nruns=1000,type=4,psi=0.9)
#person 10
regbootsim2(n=100,p=4,k=1,nruns=1000,type=5,psi=0.9)
vsbootsim3(n=100,p=4,k=1,nruns=1000,type=5,psi=0.9)
#person 11
regbootsim2(n=100,p=5,k=1,nruns=1000,type=1,psi=0)
vsbootsim3(n=100,p=5,k=1,nruns=1000,type=1,psi=0)
#person 12
regbootsim2(n=100,p=5,k=1,nruns=1000,type=2,psi=0)
vsbootsim3(n=100,p=5,k=1,nruns=1000,type=2,psi=0)
#person 13
regbootsim2(n=100,p=5,k=1,nruns=1000,type=3,psi=0)
vsbootsim3(n=100,p=5,k=1,nruns=1000,type=3,psi=0)
#person 14
regbootsim2(n=100,p=5,k=1,nruns=1000,type=4,psi=0)
vsbootsim3(n=100,p=5,k=1,nruns=1000,type=4,psi=0)
#person 15
regbootsim2(n=100,p=5,k=1,nruns=1000,type=5,psi=0)
vsbootsim3(n=100,p=5,k=1,nruns=1000,type=5,psi=0)
#person 16
regbootsim2(n=100,p=5,k=1,nruns=1000,type=1,psi=0.9)
vsbootsim3(n=100,p=5,k=1,nruns=1000,type=1,psi=0.9)
#person 17
regbootsim2(n=100,p=5,k=1,nruns=1000,type=2,psi=0.9)
vsbootsim3(n=100,p=5,k=1,nruns=1000,type=2,psi=0.9)
#person 18
regbootsim2(n=100,p=5,k=1,nruns=1000,type=3,psi=0.9)
vsbootsim3(n=100,p=5,k=1,nruns=1000,type=3,psi=0.9)
#person 19
regbootsim2(n=100,p=5,k=1,nruns=1000,type=4,psi=0.9)
vsbootsim3(n=100,p=5,k=1,nruns=1000,type=4,psi=0.9)
#person 20
regbootsim2(n=100,p=5,k=1,nruns=1000,type=5,psi=0.9)
vsbootsim3(n=100,p=5,k=1,nruns=1000,type=5,psi=0.9)
##HW6 B) Problem 3.20 ridge regression versus OLS
library(leaps)
library(glmnet)
#person 1
regbootsim2(n=100,p=4,k=1,nruns=1000,type=1,psi=0)
lassobootsim3(n=100,p=4,k=1,nruns=100,type=1,psi=0,regtype=0,BB=250)
#person 2
regbootsim2(n=100,p=4,k=1,nruns=1000,type=2,psi=0)
lassobootsim3(n=100,p=4,k=1,nruns=100,type=2,psi=0,regtype=0,BB=250)
#person 3
regbootsim2(n=100,p=4,k=1,nruns=1000,type=3,psi=0)
lassobootsim3(n=100,p=4,k=1,nruns=100,type=3,psi=0,regtype=0,BB=250)
#person 4
regbootsim2(n=100,p=4,k=1,nruns=1000,type=4,psi=0)
lassobootsim3(n=100,p=4,k=1,nruns=100,type=4,psi=0,regtype=0,BB=250)
#person 5
regbootsim2(n=100,p=4,k=1,nruns=1000,type=5,psi=0)
lassobootsim3(n=100,p=4,k=1,nruns=100,type=5,psi=0,regtype=0,BB=250)
#person 6
regbootsim2(n=100,p=4,k=1,nruns=1000,type=1,psi=0.9)
lassobootsim3(n=100,p=4,k=1,nruns=100,type=1,psi=0.9,regtype=0,BB=250)
#person 7
regbootsim2(n=100,p=4,k=1,nruns=1000,type=2,psi=0.9)
lassobootsim3(n=100,p=4,k=1,nruns=100,type=2,psi=0.9,regtype=0,BB=250)
#person 8
regbootsim2(n=100,p=4,k=1,nruns=1000,type=3,psi=0.9)
lassobootsim3(n=100,p=4,k=1,nruns=100,type=3,psi=0.9,regtype=0,BB=250)
#person 9
regbootsim2(n=100,p=4,k=1,nruns=1000,type=4,psi=0.9)
lassobootsim3(n=100,p=4,k=1,nruns=100,type=4,psi=0.9,regtype=0,BB=250)
#person 10
regbootsim2(n=100,p=4,k=1,nruns=1000,type=5,psi=0.9)
lassobootsim3(n=100,p=4,k=1,nruns=100,type=5,psi=0.9,regtype=0,BB=250)
#person 11
regbootsim2(n=100,p=5,k=1,nruns=1000,type=1,psi=0)
lassobootsim3(n=100,p=5,k=1,nruns=100,type=1,psi=0,regtype=0,BB=250)
#person 12
regbootsim2(n=100,p=5,k=1,nruns=1000,type=2,psi=0)
lassobootsim3(n=100,p=5,k=1,nruns=100,type=2,psi=0,regtype=0,BB=250)
#person 13
regbootsim2(n=100,p=5,k=1,nruns=1000,type=3,psi=0)
lassobootsim3(n=100,p=5,k=1,nruns=100,type=3,psi=0,regtype=0,BB=250)
#person 14
regbootsim2(n=100,p=5,k=1,nruns=1000,type=4,psi=0)
lassobootsim3(n=100,p=5,k=1,nruns=100,type=4,psi=0,regtype=0,BB=250)
#person 15
regbootsim2(n=100,p=5,k=1,nruns=1000,type=5,psi=0)
lassobootsim3(n=100,p=5,k=1,nruns=100,type=5,psi=0,regtype=0,BB=250)
#person 16
regbootsim2(n=100,p=5,k=1,nruns=1000,type=1,psi=0.9)
lassobootsim3(n=100,p=5,k=1,nruns=100,type=1,psi=0.9,regtype=0,BB=250)
#person 17
regbootsim2(n=100,p=5,k=1,nruns=1000,type=2,psi=0.9)
lassobootsim3(n=100,p=5,k=1,nruns=100,type=2,psi=0.9,regtype=0,BB=250)
#person 18
regbootsim2(n=100,p=5,k=1,nruns=1000,type=3,psi=0.9)
lassobootsim3(n=100,p=5,k=1,nruns=100,type=3,psi=0.9,regtype=0,BB=250)
#person 19
regbootsim2(n=100,p=5,k=1,nruns=1000,type=4,psi=0.9)
lassobootsim3(n=100,p=5,k=1,nruns=100,type=4,psi=0.9,regtype=0,BB=250)
#person 20
regbootsim2(n=100,p=5,k=1,nruns=1000,type=5,psi=0.9)
lassobootsim3(n=100,p=5,k=1,nruns=100,type=5,psi=0.9,regtype=0,BB=250)
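#A minimal sketch, not part of Problem 3.20: this section compares ridge
#regression with OLS. With glmnet itself, alpha=0 gives ridge regression and
#alpha=1 gives lasso; for example, on the Buxton data used earlier:
rout <- cv.glmnet(buxx,buxy,alpha=0) #ridge regression
as.vector(predict(rout,type="coefficients",s=rout$lambda.min))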
##HW6 C) Problem 3.21 lasso versus OLS
#library(leaps)
#library(glmnet)
#person 1
regbootsim2(n=100,p=4,k=1,nruns=1000,type=1,psi=0)
lassobootsim4(n=100,p=4,k=1,nruns=100,type=1,psi=0,BB=250)
#person 2
regbootsim2(n=100,p=4,k=1,nruns=1000,type=2,psi=0)
lassobootsim4(n=100,p=4,k=1,nruns=100,type=2,psi=0,BB=250)
#person 3
regbootsim2(n=100,p=4,k=1,nruns=1000,type=3,psi=0)
lassobootsim4(n=100,p=4,k=1,nruns=100,type=3,psi=0,BB=250)
#person 4
regbootsim2(n=100,p=4,k=1,nruns=1000,type=4,psi=0)
lassobootsim4(n=100,p=4,k=1,nruns=100,type=4,psi=0,BB=250)
#person 5
regbootsim2(n=100,p=4,k=1,nruns=1000,type=5,psi=0)
lassobootsim4(n=100,p=4,k=1,nruns=100,type=5,psi=0,BB=250)
#person 6
regbootsim2(n=100,p=4,k=1,nruns=1000,type=1,psi=0.9)
lassobootsim4(n=100,p=4,k=1,nruns=100,type=1,psi=0.9,BB=250)
#person 7
regbootsim2(n=100,p=4,k=1,nruns=1000,type=2,psi=0.9)
lassobootsim4(n=100,p=4,k=1,nruns=100,type=2,psi=0.9,BB=250)
#person 8
regbootsim2(n=100,p=4,k=1,nruns=1000,type=3,psi=0.9)
lassobootsim4(n=100,p=4,k=1,nruns=100,type=3,psi=0.9,BB=250)
#person 9
regbootsim2(n=100,p=4,k=1,nruns=1000,type=4,psi=0.9)
lassobootsim4(n=100,p=4,k=1,nruns=100,type=4,psi=0.9,BB=250)
#person 10
regbootsim2(n=100,p=4,k=1,nruns=1000,type=5,psi=0.9)
lassobootsim4(n=100,p=4,k=1,nruns=100,type=5,psi=0.9,BB=250)
#person 11
regbootsim2(n=100,p=5,k=1,nruns=1000,type=1,psi=0)
lassobootsim4(n=100,p=5,k=1,nruns=100,type=1,psi=0,BB=250)
#person 12
regbootsim2(n=100,p=5,k=1,nruns=1000,type=2,psi=0)
lassobootsim4(n=100,p=5,k=1,nruns=100,type=2,psi=0,BB=250)
#person 13
regbootsim2(n=100,p=5,k=1,nruns=1000,type=3,psi=0)
lassobootsim4(n=100,p=5,k=1,nruns=100,type=3,psi=0,BB=250)
#person 14
regbootsim2(n=100,p=5,k=1,nruns=1000,type=4,psi=0)
lassobootsim4(n=100,p=5,k=1,nruns=100,type=4,psi=0,BB=250)
#person 15
regbootsim2(n=100,p=5,k=1,nruns=1000,type=5,psi=0)
lassobootsim4(n=100,p=5,k=1,nruns=100,type=5,psi=0,BB=250)
#person 16
regbootsim2(n=100,p=5,k=1,nruns=1000,type=1,psi=0.9)
lassobootsim4(n=100,p=5,k=1,nruns=100,type=1,psi=0.9,BB=250)
#person 17
regbootsim2(n=100,p=5,k=1,nruns=1000,type=2,psi=0.9)
lassobootsim4(n=100,p=5,k=1,nruns=100,type=2,psi=0.9,BB=250)
#person 18
regbootsim2(n=100,p=5,k=1,nruns=1000,type=3,psi=0.9)
lassobootsim4(n=100,p=5,k=1,nruns=100,type=3,psi=0.9,BB=250)
#person 19
regbootsim2(n=100,p=5,k=1,nruns=1000,type=4,psi=0.9)
lassobootsim4(n=100,p=5,k=1,nruns=100,type=4,psi=0.9,BB=250)
#person 20
regbootsim2(n=100,p=5,k=1,nruns=1000,type=5,psi=0.9)
lassobootsim4(n=100,p=5,k=1,nruns=100,type=5,psi=0.9,BB=250)
##HW6 D) Problem 4.13 elastic net with outliers
#a)
library(glmnet)
out <- enet2(buxx,buxy) #elastic net
lam <- out$lam
tem <- out$out
fit <- predict(tem,s=lam,newx=buxx)
res <- buxy - fit
vars <- 1:4
lcoef <- predict(tem,type="coefficients",s=lam)
lcoef<-as.vector(lcoef)[-1]
vin <- vars[lcoef!=0]
pp <- length(vin)+1
AERplot2(yhat=fit,y=buxy,res=res,d=pp)
title("a) elastic net")
#b) resistant elastic net
ind <-getB(buxx)$indx
yb <- buxy[ind]
xb <- buxx[ind,]
xnotinb <- buxx[-ind,]
ynotinb <- buxy[-ind]
out <- enet2(xb,yb) #elastic net
lam <- out$lam
tem <- out$out
fit <- predict(tem,s=lam,newx=xb)
fit2 <- predict(tem,s=lam,newx=xnotinb)
yy <- c(yb,ynotinb)
yhat <- c(fit,fit2)
res <- yy - yhat
vars <- 1:4
lcoef <- predict(tem,type="coefficients",s=lam)
lcoef<-as.vector(lcoef)[-1]
vin <- vars[lcoef!=0]
pp <- length(vin)+1
AERplot2(yhat=yhat,y=yy,res=res,d=pp)
title("b) elastic net using covmb set B")
#Also see Problem 1.9 c).
ddplot5(buxx)
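#A minimal sketch, not part of Problem 4.13: as in Problem 1.14 b), the
#cases left out of covmb2 set B are those with the largest distances.
length(buxy) - length(ind) #number of cases omitted from set B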
##Problem 4.7 HW1 B)
#a)
belx <- 50:73
bely <- c(0.44,0.47,0.47,0.59,0.66,0.73,0.81,0.88,1.06,1.2,1.35,1.49,
1.61,2.12,11.9,12.4,14.2,15.9,18.2,21.2,4.3,2.4,2.7,2.9)
out <- lm(bely~belx)
ESP <- predict(out)
Y <- bely
plot(ESP,Y)
abline(0,1)
#b)
library(mgcv)
outgam <- gam(bely~s(belx))
EAP <- predict(outgam)
plot(EAP,Y)
abline(0,1)
##Problem 4.8 HW7 A)
#ha = number of patients with a heart attack in each ck group
#ok = number of patients without a heart attack
#a)
ck <- c(20,60,100,140,180,220,260,300,340,380,420,460)
ha <- c(2,13,30,30,21,19,18,13,19,15,7,8)
ok <- c(88,26,8,5,0,1,1,1,1,0,0,0)
heart <- as.data.frame(cbind(ck,ha,ok))
outw <- glm(cbind(ha,ok)~ck+I(ck^2)+I(ck^3),family=binomial,data=heart)
out <- glm(cbind(ha,ok)~log(ck),family=binomial,data=heart)
par(mfrow=c(2,2))
library(mgcv)
outgam <- gam(cbind(ha,ok)~s(ck),family=binomial,data=heart)
plot(outgam)
title("a) Shat")
outgam2 <- gam(cbind(ha,ok)~s(log(ck)),family=binomial,data=heart)
plot(outgam2)
title("b) Shat for log(ck) GAM")
EAP <- predict(outgam)
ESP <- predict(out)
plot(EAP,ESP)
abline(0,1)
title("c) EE Plot: log(ck) GLM")
Z <- ha/(ha+ok)
plot(ESP,Z)
fit <- exp(ESP)/(1 + exp(ESP))
indx <- sort.list(ESP)
lines(ESP[indx],fit[indx])
title("d) Response Plot")
par(mfrow=c(1,1))
#b)
AIC(outw)
AIC(out)
##Problem 4.9 HW7 B)
pisimspline(n=100,nruns=5000,type=5,modt=3)
##Problem 4.10 HW7 C) binary logistic regression with lasso
#a)
set.seed(1976) #binary regression
library(glmnet)
n<-100
m<-1 #binary regression
q <- 100 #100 nontrivial predictors, 95 inactive
k <- 5 #k_S = 5 population active predictors
y <- 1:n
mv <- m + 0 * y
vars <- 1:q
beta <- 0 * 1:q
beta[1:k] <- beta[1:k] + 1
beta
alpha <- 0
x <- matrix(rnorm(n * q), nrow = n, ncol = q)
SP <- alpha + x[,1:k] %*% beta[1:k]
pv <- exp(SP)/(1 + exp(SP))
y <- rbinom(n,size=m,prob=pv)
y
out<-cv.glmnet(x,y,family="binomial")
lam <- out$lambda.min
bhat <- as.vector(predict(out,type="coefficients",s=lam))
ahat <- bhat[1] #alphahat
bhat<-bhat[-1]
vin <- vars[bhat!=0] #want 1-5, overfit
# [1] 1 2 3 4 5 6 16 59 61 74 75 76 96
ind <- as.data.frame(cbind(y,x[,vin])) #relaxed lasso GLM
tem <- glm(y~.,family="binomial",data=ind)
tem$coef
lrplot3(tem=tem,x=x[,vin]) #binary response plot
#b) now use MLR lasso
outm<-cv.glmnet(x,y)
lamm <- outm$lambda.min
bm <- as.vector(predict(outm,type="coefficients",s=lamm))
am <- bm[1] #alphahat
bm<-bm[-1]
vm <- vars[bm!=0] #1 more variable than GLM lasso
vm
inm <- as.data.frame(cbind(y,x[,vm])) #relaxed lasso GLM
tm <- glm(y~.,family="binomial",data=inm)
lrplot3(tem=tm,x=x[,vm]) #binary response plot
#c)
library(leaps)
out<-fsel(x,y)
vin<-out$vin
vin #severe underfit
#[1] 4
inm <- as.data.frame(cbind(y,x[,vin]))
tm <- glm(y~.,family="binomial",data=inm)
lrplot3(tem=tm,x=x[,vin]) #binary response plot
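#A minimal sketch, not part of Problem 4.10: the apparent error rate of a
#logistic fit such as tm from c) can be computed by classifying with
#ESP > 0, as is done in Problem 5.14 c).
ESP <- predict(tm)
yhat <- as.integer(ESP > 0)
1 - mean(y==yhat) #AER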
##Problem 4.11 HW7 D) lasso for Poisson regression
#a)
set.seed(1976) #Poisson regression
library(glmnet)
n<-100
q <- 100 #100 nontrivial predictors, 95 inactive
k <- 5 #k_S = 5 population active predictors
y <- 1:n
vars <- 1:q
beta <- 0 * 1:q
beta[1:k] <- beta[1:k] + 1
beta
alpha <- 0
x <- matrix(rnorm(n * q), nrow = n, ncol = q)
SP <- alpha + x[,1:k] %*% beta[1:k]
y <- rpois(n,lambda=exp(SP))
out<-cv.glmnet(x,y,family="poisson")
lam <- out$lambda.min
bhat <- as.vector(predict(out,type="coefficients",s=lam))
ahat <- bhat[1] #alphahat
bhat<-bhat[-1]
vin <- vars[bhat!=0] #want 1-5, overfit
vin
ind <- as.data.frame(cbind(y,x[,vin])) #relaxed lasso GLM
out <- glm(y~.,family="poisson",data=ind)
ESP <- predict(out)
prplot2(ESP,x=x[,vin],y) #response and OD plots
#b) now use MLR lasso
outm<-cv.glmnet(x,y)
lamm <- outm$lambda.min
bm <- as.vector(predict(outm,type="coefficients",s=lamm))
am <- bm[1] #alphahat
bm<-bm[-1]
vm <- vars[bm!=0]
vm #less overfit than GLM lasso
inm <- as.data.frame(cbind(y,x[,vm])) #relaxed lasso GLM
out <- glm(y~.,family="poisson",data=inm)
ESP <- predict(out)
prplot2(ESP,x=x[,vm],y) #response and OD plots
#c) Now use MLR forward selection with EBIC since n < 10p.
library(leaps)
out<-fsel(x,y)
vin<-out$vin
vin #severe underfit causes poor fit and overdispersion
#[1] 5
inm <- as.data.frame(cbind(y,x[,vin]))
out <- glm(y~.,family="poisson",data=inm)
ESP <- predict(out)
prplot2(ESP,x=x[,vin],y) #response and OD plots
##Problem 4.12 HW10 C) Regression Tree
library(rpart)
cars <- car90[,-match(c("Rim","Tires","Model2"),names(car90))]
carfit <- rpart(Price/1000 ~ .,data=cars)
plot(carfit)
text(carfit)
##Problem 5.11 HW8 D) needs sldata
#a)
group <- pottery[pottery[,1]!=5,1]
group <- (as.integer(group!=1)) + 1
x <- pottery[pottery[,1]!=5,-1]
#b)
z <- x[,1:9]
library(MASS)
out <- qda(x=z,group)
1-mean(predict(out,z)$class==group)
#c) delete the first variable
out <- qda(x=z[,-c(1)],group)
1-mean(predict(out,z[,-c(1)])$class==group)
#delete the first two variables
out <- qda(x=z[,-c(1,2)],group)
1-mean(predict(out,z[,-c(1,2)])$class==group)
#deleting the second variable caused a classification error,
#so delete the first and third variables
out <- qda(x=z[,-c(1,3)],group)
1-mean(predict(out,z[,-c(1,3)])$class==group)
#deleting the third variable caused a classification error,
#so delete the first and fourth variables
out <- qda(x=z[,-c(1,4)],group)
1-mean(predict(out,z[,-c(1,4)])$class==group)
#do not need the 4th variable, so continue deleting variables
out <- qda(x=z[,-c(1,4,5)],group)
1-mean(predict(out,z[,-c(1,4,5)])$class==group)
##Problem 5.12 HW8 E)
library(MASS)
group <- pottery[pottery[,1]!=5,1]
group <- (as.integer(group!=1)) + 1
x <- pottery[pottery[,1]!=5,-1]
out<-lda(x[,c(6,11,14,18)],group)
1-mean(predict(out,x[,c(6,11,14,18)])$class==group)
##Problem 5.13 HW8 F)
#a)
group <- pottery[pottery[,1]!=5,1]
group <- (as.integer(group!=1)) + 1
x <- pottery[pottery[,1]!=5,-1]
library(class)
out<-knn(x,x,group,k=1)
mean(group!=out) #AER for 1NN is 0
#b)
indx<-sample(1:28,replace=F)
train <- indx[13:28]
test <- indx[1:12]
ztest <- x[test,] #12 test cases
grptest <- group[test]
ztrain <- x[train,]
grptrain <- group[train]
out<-knn(ztrain,ztest,grptrain,k=1)
mean(grptest!=out) #validation ER
#c)
out<-knn(x,x,group,k=2)
mean(group!=out) #AER for 2NN
#d)
out<-knn(ztrain,ztest,grptrain,k=2)
mean(grptest!=out) #validation ER for 2NN
#e)
out <- knn(x[,c(6,11,14,18)],x[,c(6,11,14,18)],group,k=2)
mean(group!=out) #AER for a good subset of variables
#f)
out<-knn(ztrain[,c(6,11,14,18)],ztest[,c(6,11,14,18)],grptrain,k=2)
mean(grptest!=out) #validation ER for a good subset of variables
##Problem 5.14 HW9 B)
#a) backward elimination for logistic regression
y <- cbrainx[,10]
age <- cbrainx[,1]
lnag <- log(age)
brd <- cbrainx[,3]
fcau <- as.factor(cbrainx[,4])
ceph <- cbrainx[,5]
circ <- cbrainx[,6]
hdht <- cbrainx[,7]
ht <- cbrainx[,8]
len <- cbrainx[,9]
siz <- cbrainx[,11]
lnsz <- log(siz)
outf <- glm(y~age+lnag+brd+fcau+ceph+circ+hdht+ht+len+siz+lnsz,family=binomial)
summary(outf) #summary for the full model
back <- step(outf) #backward elimination
outn <- glm(y~1,family=binomial) #null model without predictors
AIC(outn)
AIC(outf)
summary(back)
#step(outf,direction="both") #did backward steps only;
#stepAIC and step would not do forwards
#forw <- step(outn,scope=list(lower~1,
#upper~age+lnag+brd+fcau+ceph+circ+hdht+ht+len+siz+lnsz),direction="forward")
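#A minimal sketch, not part of Problem 5.14: step() uses AIC by default
#(penalty k=2); setting k=log(n) makes backward elimination use BIC instead.
backBIC <- step(outf,k=log(length(y)))
summary(backBIC)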
#b) w has the variables from the submodel
w <- cbind(cbrainx[,1],log(cbrainx[,1]),cbrainx[,c(5,6,7,8,9)],log(cbrainx[,11]))
lrplot3(tem=back,x=w)
#c)
ESP <- predict(back)
yhat <- as.integer(ESP > 0)
AER <- 1 - mean(y==yhat) #1 - proportion of correct classifications
AER
#ESP2 <- w%*%back$coef[-1] + back$coef[1]
#ESP-ESP2 #0 to 10 digits
#d)
set.seed(1)
indx <- sample(1:267,replace=F)
train <- indx[68:267]
test <- indx[1:67] #67 test cases
ytest <- y[test]
ytrain <- y[train]
wtest <- w[test,]
wtrain <- w[train,]
tem <- as.data.frame(cbind(ytrain,wtrain))
outtrain <- glm(ytrain~.,family=binomial,data=tem)
ESPtest <- wtest%*%outtrain$coef[-1] + outtrain$coef[1]
yhat <- as.integer(ESPtest > 0)
VER <- 1 - mean(ytest==yhat) #1 - proportion of correct classifications
VER
#the following code also works
#z <- as.data.frame(cbind(y,w))
#ztest <- z[test,]
#ztrain<- z[train,]
#outtrain<-glm(y~.,family=binomial,data=ztrain)
#ESPtest2 <- predict(outtrain,newdata=ztest)
#this code causes errors:
#wtest <- as.data.frame(wtest)
#ESPtest <- predict(outtrain,newdata=wtest)
#three unnamed variables get the names V1, V2, and V8 in the wtest
#data.frame, and those names do not match the names used to fit
#outtrain, so predict() fails
#e) lasso with the classification criterion
library(glmnet)
y <- cbrainx[,10]
x <- cbind(age,lnag,brd,ceph,circ,hdht,ht,len,siz,lnsz)
out<-cv.glmnet(x,y,family="binomial",type.measure="class")
lam <- out$lambda.min
kfoldER <- min(out$cvm)
kfoldER
#f)
vars <- 1:dim(x)[2]
bhat <- as.vector(predict(out,type="coefficients",s=lam))
ahat <- bhat[1] #alphahat
bhat<-bhat[-1]
vin <- vars[bhat!=0]
#[1] 1 2 4 5 6 7 9 10
ESPlasso <- predict(out,newx=x,s=lam)
#ESP2 <- x%*%bhat + ahat
#ESPlasso - ESP2 = 0 to 10 digits
ind <- as.data.frame(cbind(y,x[,vin])) #relaxed lasso GLM
tem <- glm(y~.,family="binomial",data=ind)
tem$coef
lrplot3(tem=tem,x=x[,vin]) #binary response plot
#g)
ESPRL <- predict(tem)
plot(ESPRL,ESPlasso)
abline(0,1)
title("EE Plot")
##Problem 5.15 HW10 D) Classification Tree
library(rpart)
fit <- rpart(Kyphosis ~ Age + Number + Start,data=kyphosis)
plot(fit)
text(fit)
##Problem 5.16 HW11 A) pottery data with two groups:
#bagging, random forests, SVM
#a)
y <- pottery[pottery[,1]!=5,1]
y <- (as.integer(y!=1)) + 1
y <- 2*(y-1.5) #levels -1 and 1
x <- pottery[pottery[,1]!=5,-1]
shards <- data.frame(y=as.factor(y),x)
set.seed(1)
library(randomForest)
#bagging: use mtry = number of predictors = 20
bagshards <- randomForest(y~.,data=shards,mtry=20,importance=TRUE)
pred<-predict(bagshards)
table(predict=pred,truth=shards$y)
#b) random forests: the default uses mtry approximately sqrt(p)
rfshards <- randomForest(y~.,data=shards,importance=TRUE)
pred<-predict(rfshards)
table(predict=pred,truth=shards$y)
#c) SVM with a fixed cost
library(e1071)
svmfit <- svm(y~.,data=shards,kernel="linear",cost=10,scale=FALSE)
pred <- predict(svmfit)
table(predict=pred,truth=shards$y)
#d) SVM with cost chosen by 10-fold CV
set.seed(1) #tuneout takes a minute
tuneout <- tune(svm,y~.,data=shards,kernel="linear",
ranges=list(cost=c(0.001,0.01,0.1,1,5,10,100)))
summary(tuneout) #10-fold CV on several models
pred<-predict(tuneout$best.mod)
table(predict=pred,truth=shards$y)
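#A minimal sketch, not part of Problem 5.16: the tune object from d) also
#stores the winning cost and its 10-fold CV error rate directly.
tuneout$best.parameters
tuneout$best.performance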
##Problem 5.17 HW11 B) Gladstone data with gender as the response
#a) bagging AER
brainwt <- cbrainy
y <- cbrainx[,10]
y[y==0] <- -1
y<-as.factor(y) #-1 = F, 1 = M
brainx<-cbind(cbrainx[,-10],brainwt)
brainx<-brainx[,-c(2,4)]
brainx<-brainx[-c(230,254:258),] #remove outliers
y<-y[-c(230,254:258)]
brain<-data.frame(y,brainx)
set.seed(1)
train <- sample(1:nrow(brain),200)
library(randomForest)
#bagging: use mtry = number of predictors = 9
bagbrain <- randomForest(y~.,data=brain,mtry=9,importance=TRUE)
pred<-predict(bagbrain)
table(predict=pred,truth=brain$y) #get the AER
#b) bagging with a validation set error rate
bagbrain <- randomForest(y~.,data=brain,subset=train,mtry=9,importance=TRUE)
pred<-predict(bagbrain,newdata=brain[-train,])
table(predict=pred,truth=brain$y[-train])
#c) random forests: the default uses mtry approximately sqrt(p)
rfbrain <- randomForest(y~.,data=brain,importance=TRUE)
pred<-predict(rfbrain)
table(predict=pred,truth=brain$y)
#d) random forests with a validation set error rate
rfbrain <- randomForest(y~.,data=brain,subset=train,importance=TRUE)
pred<-predict(rfbrain,newdata=brain[-train,])
table(predict=pred,truth=brain$y[-train])
#e) SVM with cost chosen by 10-fold CV
library(e1071)
set.seed(1) #tuneout takes a minute
tuneout <- tune(svm,y~.,data=brain,kernel="linear",
ranges=list(cost=c(0.001,0.01,0.1,1,5,10,100)))
summary(tuneout) #10-fold CV on several models
pred<-predict(tuneout$best.mod)
table(predict=pred,truth=brain$y)
#f) SVM with cost chosen by 10-fold CV: validation set error rate
set.seed(1) #tuneout takes a minute
tuneout <- tune(svm,y~.,data=brain,subset=train,kernel="linear",
ranges=list(cost=c(0.001,0.01,0.1,1,5,10,100)))
summary(tuneout) #10-fold CV on several models
pred<-predict(tuneout$best.mod,newdata=brain[-train,])
table(predict=pred,truth=brain$y[-train])
##Problem 7.1 HW10 A)
x <- cbind(buxx,buxy)
out<-kmeans(x,2,nstart=25)
plot(x,col=out$cluster)
points(out$centers,col=1:2,pch=8,cex=2)
##Problem 7.2
x <- cbind(buxx,buxy)
out <- hclust(dist(x),"complete") #complete linkage is the default
plot(out)
#plot(out,hang=-1)
##Problem 7.3 HW10 B)
x1 <- c(-0.6,0.1,-1.5,-1.4,1.1,-0.9,1.4,0.6,0)
x2 <- c(-1,-0.75,-0.4,-1.6,-0.3,-1.2,0,-0.2,0.7)
x <- cbind(x1,x2)
#out<-hclust(x) #errors: hclust needs a distance matrix, not the raw data
out <- hclust(dist(x))
plot(out)
plot(x[,1],x[,2])
library(cluster)
out<-agnes(x)
plot(out) #right click twice
##Problem 7.4
#a)
mbb1415 <- read.csv(file.choose(),header=T)
#select mbb1415 from the F drive, or wherever you stored the file
#b)
out <- hclust(dist(mbb1415[,c(7,13)]))
plot(out) #uses the index as the label
plot(out,labels=as.character(mbb1415[,1])) #uses the position as the label
#c)
quantile(mbb1415[,3])
mcut <- mbb1415[mbb1415[,3]>182,]
out <- hclust(dist(mcut[,c(7,13)]))
plot(out,labels=as.character(mcut[,1]))
##Problem 7.5
#a)
wbb1415 <- read.csv(file.choose(),header=T)
#select wbb1415 from the F drive, or wherever you stored the file
#b)
out <- hclust(dist(wbb1415[,c(7,13)]))
plot(out) #uses the index as the label
plot(out,labels=as.character(wbb1415[,1])) #uses the position as the label
#c)
quantile(wbb1415[,3])
wcut <- wbb1415[wbb1415[,3]>182,]
out <- hclust(dist(wcut[,c(7,13)]))
plot(out,labels=as.character(wcut[,1]))
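#A minimal sketch, not part of Problem 7.5: cutree() cuts an hclust tree
#into a chosen number of clusters; here are the labels for 3 clusters.
grps <- cutree(out,k=3)
table(grps)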