#R code for homework problems; copy and paste code one problem part at a time.
#The easiest way to get slpack.txt and sldata.txt is to copy and paste the
#following two commands into R.
source("http://parker.ad.siu.edu/Olive/slpack.txt")
source("http://parker.ad.siu.edu/Olive/sldata.txt")
#The following commands are often used.
library(glmnet)
library(pls)
library(leaps)
#If slpack.txt and sldata.txt are on the J drive, get slpack and sldata with
#the following commands:
#source("J:/slpack.txt")
#source("J:/sldata.txt")
#You will often need packages and may need to install packages the first
#time you use a given computer.
install.packages("glmnet")
install.packages("pls")
install.packages("leaps")
install.packages("e1071")
install.packages("randomForest")
install.packages("ISLR")
#install.packages("gbm")
#install.packages("tree")
#install.packages("ROCR")
#These commands can fail if your computer has a firewall.
#Then an administrator may be needed to install the packages.
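#A minimal sketch, not part of the homework: to avoid reinstalling packages
#that are already present, install only the missing ones. The vector below
#just repeats the package names used above.
pkgs <- c("glmnet","pls","leaps","e1071","randomForest","ISLR")
for(p in pkgs)
  if(!requireNamespace(p, quietly=TRUE)) install.packages(p)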
##HW1 A) Problem 1.10 a)
n<-100000; q<-7
b <- 0 * 1:q + 1
x <- matrix(rnorm(n * q), nrow = n, ncol = q)
y <- 1 + x %*% b + rnorm(n)
out<-lsfit(x,y)
res<-out$res
yhat<-y-res
dd<-length(out$coef)
AERplot(yhat,y,res=res,d=dd,alph=1) #usual response plot
#1.10 b)
AERplot(yhat,y,res=res,d=dd,alph=0.01)
#plots data outside the 99% pointwise PIs
##HW1 C) Problem 1.11 a) #n > 10p
library(glmnet)
library(leaps)
library(pls)
x <- matrix(rnorm(300),nrow=100,ncol=3)
z <- x[,1] + rnorm(100,sd=0.1)
Y <- exp(z)
enet(x,Y)
#1.11 b)
tplot2(x,Y,type=2) #right click Stop 7 times
#tplot2(x,Y,type=1) #right click Stop 7 times
#tplot2(x,Y,type=3) #right click Stop 7 times
#tplot2(x,Y,type=4) #right click Stop 7 times
#tplot2(x,Y,type=5) #right click Stop 7 times
#tplot2(x,Y,type=6) #right click Stop 7 times
#tplot2(x,Y,type=7) #right click Stop 7 times
#1.11 d) #n = p-1, sparse model, 100 nontrivial predictors
x <- matrix(rnorm(10000),nrow=100,ncol=100)
z <- x[,1] + rnorm(100,sd=0.1)
Y <- exp(z)
tplot2(x,Y,type=2) #right click Stop 7 times
#1.11 f)
tplot2(x,Y,type=5) #PLS overfit, right click Stop 7 times
#tplot2(x,Y,type=3)
#tplot2(x,Y,type=4) #ridge regression failed
#tplot2(x,Y,type=6) #PCR may have overfit
#tplot2(x,Y,type=7)
#tplot2(x,Y,type=1) #should give an error since n < p
##HW1 D) Problem 1.12 a) #skewed data
x <- matrix(rnorm(3000),nrow=1000,ncol=3)
y <- x%*%c(1,1,1) + 2*rexp(1000)
out<-lsfit(x,y)
FIT <- y - out$resid
plot(FIT,y)
abline(0,1)
lines(lowess(FIT,y))
#Problem 1.12 b) and c) for R
plot(FIT,out$resid)
abline(0,0)
lines(lowess(FIT,out$resid))
out$coef
##HW2 B) Problem 1.13 a)
#Copy and paste the two source commands and the three library commands
#from near the top of this file into R.
out <- cv.glmnet(buxx,buxy,alpha=1) #lasso
lam <- out$lambda.min
fit <- predict(out,s=lam,newx=buxx)
res <- buxy - fit
vars <- 1:4
lcoef <- predict(out,type="coefficients",s=lam)
lcoef<-as.vector(lcoef)[-1]
vin <- vars[lcoef!=0]
pp <- length(vin)+1
AERplot2(yhat=fit,y=buxy,res=res,d=pp)
title("lasso")
#Problem 1.13 b)
ind <-getB(buxx)$indx
yb <- buxy[ind]
xb <- buxx[ind,]
xnotinb <- buxx[-ind,]
ynotinb <- buxy[-ind]
out <- cv.glmnet(xb,yb,alpha=1) #lasso
lam <- out$lambda.min
fit <- predict(out,s=lam,newx=xb)
fit2 <- predict(out,s=lam,newx=xnotinb)
yy <- c(yb,ynotinb)
yhat <- c(fit,fit2)
res <- yy - yhat
vars <- 1:4
lcoef <- predict(out,type="coefficients",s=lam)
lcoef<-as.vector(lcoef)[-1]
vin <- vars[lcoef!=0]
pp <- length(vin)+1
AERplot2(yhat=yhat,y=yy,res=res,d=pp)
title("lasso using covmb set B")
#Problem 1.13 c)
ddplot5(buxx)
##HW2 C) Problem 1.14 a)
out <- cv.glmnet(cbrainx,cbrainy,alpha=1) #lasso
lam <- out$lambda.min
fit <- predict(out,s=lam,newx=cbrainx)
res <- cbrainy - fit
vars <- 1:11
lcoef <- predict(out,type="coefficients",s=lam)
lcoef<-as.vector(lcoef)[-1]
vin <- vars[lcoef!=0]
pp <- length(vin)+1
AERplot2(yhat=fit,y=cbrainy,res=res,d=pp)
title("lasso")
#Problem 1.14 b)
xcontin <- cbrainx[,-c(2,4,10)] #continuous predictors
ind <-getB(xcontin)$indx
yb <- cbrainy[ind]
xb <- cbrainx[ind,]
xnotinb <- cbrainx[-ind,]
ynotinb <- cbrainy[-ind]
out <- cv.glmnet(xb,yb,alpha=1) #lasso
lam <- out$lambda.min
fit <- predict(out,s=lam,newx=xb)
fit2 <- predict(out,s=lam,newx=xnotinb)
yy <- c(yb,ynotinb)
yhat <- c(fit,fit2)
res <- yy - yhat
vars <- 1:11
lcoef <- predict(out,type="coefficients",s=lam)
lcoef<-as.vector(lcoef)[-1]
vin <- vars[lcoef!=0]
pp <- length(vin)+1
AERplot2(yhat=yhat,y=yy,res=res,d=pp)
title("lasso using covmb set B")
#length(cbrainy) - length(ind)
#8 cases with the largest distances are omitted from covmb2 set B
#Problem 1.14 c)
ddplot5(xcontin)
##HW2 D) Problem 1.15 a)
mldsim6(n=100,p=10,outliers=1,gam=0.25,pm=20,osteps=9)
mldsim6(n=100,p=45,outliers=1,gam=0.25,pm=60,osteps=9)
#Problem 1.15 b) #takes a few minutes since p = 1000
mldsim7(n=100,p=1000,outliers=2,pm=900,osteps=0)
mldsim7(n=100,p=1000,outliers=2,pm=900,osteps=9)
#Problem 1.15 c)
mldsim7(n=100,p=100,outliers=3,gam=0.25,pm=8,osteps=0)
mldsim7(n=100,p=100,outliers=3,gam=0.25,pm=8,osteps=9)
##HW3 D) Problem 2.9
predsim()
##HW3 E) Problem 2.10
regbootsim2(nruns=500,psi=0.0)
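#A minimal sketch, not part of Problem 2.10: predsim() and regbootsim2()
#generate random data, so results differ between runs; set a seed first
#(the seed value below is arbitrary) if you want reproducible output.
set.seed(974)
regbootsim2(nruns=500,psi=0.0)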
##HW4 D) Problem 3.18
#Copy and paste the two source commands from near the top of this file into R.
#a)
library(leaps)
n<- 1000
p<-4
q<-p-1
b <- 0 * 1:q
b[1] <- 1 #beta = (1,1,0,0)^T
x <- matrix(rnorm(n * q), nrow = n, ncol = q)
y <- 1 + x %*% b + rnorm(n)
outols <- lsfit(x,y)
outols$coef #Tn = betahat
#b)
tem<-regboot(x,y)
betas <- tem$betas
betas[1:5,]
#c)
apply(betas,2,mean)
#d)
temp<-fselboot(x,y)
vsbetas <- temp$betas
vsbetas[1:5,]
#e)
apply(vsbetas,2,mean)
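#A minimal sketch, not part of Problem 3.18: the column means above estimate
#the center of each bootstrap distribution; the column standard deviations
#estimate the bootstrap standard errors of the coefficient estimators.
apply(betas,2,sd)
apply(vsbetas,2,sd)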
##HW5 D) Problem 3.19
library(leaps)
#person 1
regbootsim2(n=100,p=4,k=1,nruns=1000,type=1,psi=0)
vsbootsim3(n=100,p=4,k=1,nruns=1000,type=1,psi=0)
#person 2
regbootsim2(n=100,p=4,k=1,nruns=1000,type=2,psi=0)
vsbootsim3(n=100,p=4,k=1,nruns=1000,type=2,psi=0)
#person 3
regbootsim2(n=100,p=4,k=1,nruns=1000,type=3,psi=0)
vsbootsim3(n=100,p=4,k=1,nruns=1000,type=3,psi=0)
#person 4
regbootsim2(n=100,p=4,k=1,nruns=1000,type=4,psi=0)
vsbootsim3(n=100,p=4,k=1,nruns=1000,type=4,psi=0)
#person 5
regbootsim2(n=100,p=4,k=1,nruns=1000,type=5,psi=0)
vsbootsim3(n=100,p=4,k=1,nruns=1000,type=5,psi=0)
#person 6
regbootsim2(n=100,p=4,k=1,nruns=1000,type=1,psi=0.9)
vsbootsim3(n=100,p=4,k=1,nruns=1000,type=1,psi=0.9)
#person 7
regbootsim2(n=100,p=4,k=1,nruns=1000,type=2,psi=0.9)
vsbootsim3(n=100,p=4,k=1,nruns=1000,type=2,psi=0.9)
#person 8
regbootsim2(n=100,p=4,k=1,nruns=1000,type=3,psi=0.9)
vsbootsim3(n=100,p=4,k=1,nruns=1000,type=3,psi=0.9)
#person 9
regbootsim2(n=100,p=4,k=1,nruns=1000,type=4,psi=0.9)
vsbootsim3(n=100,p=4,k=1,nruns=1000,type=4,psi=0.9)
#person 10
regbootsim2(n=100,p=4,k=1,nruns=1000,type=5,psi=0.9)
vsbootsim3(n=100,p=4,k=1,nruns=1000,type=5,psi=0.9)
#person 11
regbootsim2(n=100,p=5,k=1,nruns=1000,type=1,psi=0)
vsbootsim3(n=100,p=5,k=1,nruns=1000,type=1,psi=0)
#person 12
regbootsim2(n=100,p=5,k=1,nruns=1000,type=2,psi=0)
vsbootsim3(n=100,p=5,k=1,nruns=1000,type=2,psi=0)
#person 13
regbootsim2(n=100,p=5,k=1,nruns=1000,type=3,psi=0)
vsbootsim3(n=100,p=5,k=1,nruns=1000,type=3,psi=0)
#person 14
regbootsim2(n=100,p=5,k=1,nruns=1000,type=4,psi=0)
vsbootsim3(n=100,p=5,k=1,nruns=1000,type=4,psi=0)
#person 15
regbootsim2(n=100,p=5,k=1,nruns=1000,type=5,psi=0)
vsbootsim3(n=100,p=5,k=1,nruns=1000,type=5,psi=0)
#person 16
regbootsim2(n=100,p=5,k=1,nruns=1000,type=1,psi=0.9)
vsbootsim3(n=100,p=5,k=1,nruns=1000,type=1,psi=0.9)
#person 17
regbootsim2(n=100,p=5,k=1,nruns=1000,type=2,psi=0.9)
vsbootsim3(n=100,p=5,k=1,nruns=1000,type=2,psi=0.9)
#person 18
regbootsim2(n=100,p=5,k=1,nruns=1000,type=3,psi=0.9)
vsbootsim3(n=100,p=5,k=1,nruns=1000,type=3,psi=0.9)
#person 19
regbootsim2(n=100,p=5,k=1,nruns=1000,type=4,psi=0.9)
vsbootsim3(n=100,p=5,k=1,nruns=1000,type=4,psi=0.9)
#person 20
regbootsim2(n=100,p=5,k=1,nruns=1000,type=5,psi=0.9)
vsbootsim3(n=100,p=5,k=1,nruns=1000,type=5,psi=0.9)
##HW6 B) Problem 3.20 ridge regression versus OLS
library(leaps)
library(glmnet)
#person 1
regbootsim2(n=100,p=4,k=1,nruns=1000,type=1,psi=0)
lassobootsim3(n=100,p=4,k=1,nruns=100,type=1,psi=0,regtype=0,BB=250)
#person 2
regbootsim2(n=100,p=4,k=1,nruns=1000,type=2,psi=0)
lassobootsim3(n=100,p=4,k=1,nruns=100,type=2,psi=0,regtype=0,BB=250)
#person 3
regbootsim2(n=100,p=4,k=1,nruns=1000,type=3,psi=0)
lassobootsim3(n=100,p=4,k=1,nruns=100,type=3,psi=0,regtype=0,BB=250)
#person 4
regbootsim2(n=100,p=4,k=1,nruns=1000,type=4,psi=0)
lassobootsim3(n=100,p=4,k=1,nruns=100,type=4,psi=0,regtype=0,BB=250)
#person 5
regbootsim2(n=100,p=4,k=1,nruns=1000,type=5,psi=0)
lassobootsim3(n=100,p=4,k=1,nruns=100,type=5,psi=0,regtype=0,BB=250)
#person 6
regbootsim2(n=100,p=4,k=1,nruns=1000,type=1,psi=0.9)
lassobootsim3(n=100,p=4,k=1,nruns=100,type=1,psi=0.9,regtype=0,BB=250)
#person 7
regbootsim2(n=100,p=4,k=1,nruns=1000,type=2,psi=0.9)
lassobootsim3(n=100,p=4,k=1,nruns=100,type=2,psi=0.9,regtype=0,BB=250)
#person 8
regbootsim2(n=100,p=4,k=1,nruns=1000,type=3,psi=0.9)
lassobootsim3(n=100,p=4,k=1,nruns=100,type=3,psi=0.9,regtype=0,BB=250)
#person 9
regbootsim2(n=100,p=4,k=1,nruns=1000,type=4,psi=0.9)
lassobootsim3(n=100,p=4,k=1,nruns=100,type=4,psi=0.9,regtype=0,BB=250)
#person 10
regbootsim2(n=100,p=4,k=1,nruns=1000,type=5,psi=0.9)
lassobootsim3(n=100,p=4,k=1,nruns=100,type=5,psi=0.9,regtype=0,BB=250)
#person 11
regbootsim2(n=100,p=5,k=1,nruns=1000,type=1,psi=0)
lassobootsim3(n=100,p=5,k=1,nruns=100,type=1,psi=0,regtype=0,BB=250)
#person 12
regbootsim2(n=100,p=5,k=1,nruns=1000,type=2,psi=0)
lassobootsim3(n=100,p=5,k=1,nruns=100,type=2,psi=0,regtype=0,BB=250)
#person 13
regbootsim2(n=100,p=5,k=1,nruns=1000,type=3,psi=0)
lassobootsim3(n=100,p=5,k=1,nruns=100,type=3,psi=0,regtype=0,BB=250)
#person 14
regbootsim2(n=100,p=5,k=1,nruns=1000,type=4,psi=0)
lassobootsim3(n=100,p=5,k=1,nruns=100,type=4,psi=0,regtype=0,BB=250)
#person 15
regbootsim2(n=100,p=5,k=1,nruns=1000,type=5,psi=0)
lassobootsim3(n=100,p=5,k=1,nruns=100,type=5,psi=0,regtype=0,BB=250)
#person 16
regbootsim2(n=100,p=5,k=1,nruns=1000,type=1,psi=0.9)
lassobootsim3(n=100,p=5,k=1,nruns=100,type=1,psi=0.9,regtype=0,BB=250)
#person 17
regbootsim2(n=100,p=5,k=1,nruns=1000,type=2,psi=0.9)
lassobootsim3(n=100,p=5,k=1,nruns=100,type=2,psi=0.9,regtype=0,BB=250)
#person 18
regbootsim2(n=100,p=5,k=1,nruns=1000,type=3,psi=0.9)
lassobootsim3(n=100,p=5,k=1,nruns=100,type=3,psi=0.9,regtype=0,BB=250)
#person 19
regbootsim2(n=100,p=5,k=1,nruns=1000,type=4,psi=0.9)
lassobootsim3(n=100,p=5,k=1,nruns=100,type=4,psi=0.9,regtype=0,BB=250)
#person 20
regbootsim2(n=100,p=5,k=1,nruns=1000,type=5,psi=0.9)
lassobootsim3(n=100,p=5,k=1,nruns=100,type=5,psi=0.9,regtype=0,BB=250)
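#A minimal sketch, not part of Problem 3.20: this section compares ridge
#regression with OLS. With glmnet itself, alpha=0 gives ridge regression and
#alpha=1 gives lasso; for example, on the Buxton data used earlier:
rout <- cv.glmnet(buxx,buxy,alpha=0) #ridge regression
as.vector(predict(rout,type="coefficients",s=rout$lambda.min))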
##HW6 C) Problem 3.21 lasso versus OLS
#library(leaps)
#library(glmnet)
#person 1
regbootsim2(n=100,p=4,k=1,nruns=1000,type=1,psi=0)
lassobootsim4(n=100,p=4,k=1,nruns=100,type=1,psi=0,BB=250)
#person 2
regbootsim2(n=100,p=4,k=1,nruns=1000,type=2,psi=0)
lassobootsim4(n=100,p=4,k=1,nruns=100,type=2,psi=0,BB=250)
#person 3
regbootsim2(n=100,p=4,k=1,nruns=1000,type=3,psi=0)
lassobootsim4(n=100,p=4,k=1,nruns=100,type=3,psi=0,BB=250)
#person 4
regbootsim2(n=100,p=4,k=1,nruns=1000,type=4,psi=0)
lassobootsim4(n=100,p=4,k=1,nruns=100,type=4,psi=0,BB=250)
#person 5
regbootsim2(n=100,p=4,k=1,nruns=1000,type=5,psi=0)
lassobootsim4(n=100,p=4,k=1,nruns=100,type=5,psi=0,BB=250)
#person 6
regbootsim2(n=100,p=4,k=1,nruns=1000,type=1,psi=0.9)
lassobootsim4(n=100,p=4,k=1,nruns=100,type=1,psi=0.9,BB=250)
#person 7
regbootsim2(n=100,p=4,k=1,nruns=1000,type=2,psi=0.9)
lassobootsim4(n=100,p=4,k=1,nruns=100,type=2,psi=0.9,BB=250)
#person 8
regbootsim2(n=100,p=4,k=1,nruns=1000,type=3,psi=0.9)
lassobootsim4(n=100,p=4,k=1,nruns=100,type=3,psi=0.9,BB=250)
#person 9
regbootsim2(n=100,p=4,k=1,nruns=1000,type=4,psi=0.9)
lassobootsim4(n=100,p=4,k=1,nruns=100,type=4,psi=0.9,BB=250)
#person 10
regbootsim2(n=100,p=4,k=1,nruns=1000,type=5,psi=0.9)
lassobootsim4(n=100,p=4,k=1,nruns=100,type=5,psi=0.9,BB=250)
#person 11
regbootsim2(n=100,p=5,k=1,nruns=1000,type=1,psi=0)
lassobootsim4(n=100,p=5,k=1,nruns=100,type=1,psi=0,BB=250)
#person 12
regbootsim2(n=100,p=5,k=1,nruns=1000,type=2,psi=0)
lassobootsim4(n=100,p=5,k=1,nruns=100,type=2,psi=0,BB=250)
#person 13
regbootsim2(n=100,p=5,k=1,nruns=1000,type=3,psi=0)
lassobootsim4(n=100,p=5,k=1,nruns=100,type=3,psi=0,BB=250)
#person 14
regbootsim2(n=100,p=5,k=1,nruns=1000,type=4,psi=0)
lassobootsim4(n=100,p=5,k=1,nruns=100,type=4,psi=0,BB=250)
#person 15
regbootsim2(n=100,p=5,k=1,nruns=1000,type=5,psi=0)
lassobootsim4(n=100,p=5,k=1,nruns=100,type=5,psi=0,BB=250)
#person 16
regbootsim2(n=100,p=5,k=1,nruns=1000,type=1,psi=0.9)
lassobootsim4(n=100,p=5,k=1,nruns=100,type=1,psi=0.9,BB=250)
#person 17
regbootsim2(n=100,p=5,k=1,nruns=1000,type=2,psi=0.9)
lassobootsim4(n=100,p=5,k=1,nruns=100,type=2,psi=0.9,BB=250)
#person 18
regbootsim2(n=100,p=5,k=1,nruns=1000,type=3,psi=0.9)
lassobootsim4(n=100,p=5,k=1,nruns=100,type=3,psi=0.9,BB=250)
#person 19
regbootsim2(n=100,p=5,k=1,nruns=1000,type=4,psi=0.9)
lassobootsim4(n=100,p=5,k=1,nruns=100,type=4,psi=0.9,BB=250)
#person 20
regbootsim2(n=100,p=5,k=1,nruns=1000,type=5,psi=0.9)
lassobootsim4(n=100,p=5,k=1,nruns=100,type=5,psi=0.9,BB=250)
##HW6 D) Problem 4.13 elastic net with outliers
#a)
library(glmnet)
out <- enet2(buxx,buxy) #elastic net
lam <- out$lam
tem <- out$out
fit <- predict(tem,s=lam,newx=buxx)
res <- buxy - fit
vars <- 1:4
lcoef <- predict(tem,type="coefficients",s=lam)
lcoef<-as.vector(lcoef)[-1]
vin <- vars[lcoef!=0]
pp <- length(vin)+1
AERplot2(yhat=fit,y=buxy,res=res,d=pp)
title("a) elastic net")
#b) resistant elastic net
ind <-getB(buxx)$indx
yb <- buxy[ind]
xb <- buxx[ind,]
xnotinb <- buxx[-ind,]
ynotinb <- buxy[-ind]
out <- enet2(xb,yb) #elastic net
lam <- out$lam
tem <- out$out
fit <- predict(tem,s=lam,newx=xb)
fit2 <- predict(tem,s=lam,newx=xnotinb)
yy <- c(yb,ynotinb)
yhat <- c(fit,fit2)
res <- yy - yhat
vars <- 1:4
lcoef <- predict(tem,type="coefficients",s=lam)
lcoef<-as.vector(lcoef)[-1]
vin <- vars[lcoef!=0]
pp <- length(vin)+1
AERplot2(yhat=yhat,y=yy,res=res,d=pp)
title("b) elastic net using covmb set B")
#Also see Problem 1.9 c).
ddplot5(buxx)
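#A minimal sketch, not part of Problem 4.13: as in Problem 1.14 b), the
#cases left out of covmb2 set B are those with the largest distances.
length(buxy) - length(ind) #number of cases omitted from set B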
##Problem 4.7 HW1 B)
#a)
belx <- 50:73
bely <- c(0.44,0.47,0.47,0.59,0.66,0.73,0.81,0.88,1.06,1.2,1.35,1.49,
1.61,2.12,11.9,12.4,14.2,15.9,18.2,21.2,4.3,2.4,2.7,2.9)
out <- lm(bely~belx)
ESP <- predict(out)
Y <- bely
plot(ESP,Y)
abline(0,1)
#b)
library(mgcv)
outgam <- gam(bely~s(belx))
EAP <- predict(outgam)
plot(EAP,Y)
abline(0,1)
##Problem 4.8 HW7 A)
#ha = number of patients with a heart attack in each ck group
#ok = number of patients without a heart attack
#a)
ck <- c(20,60,100,140,180,220,260,300,340,380,420,460)
ha <- c(2,13,30,30,21,19,18,13,19,15,7,8)
ok <- c(88,26,8,5,0,1,1,1,1,0,0,0)
heart <- as.data.frame(cbind(ck,ha,ok))
outw <- glm(cbind(ha,ok)~ck+I(ck^2)+I(ck^3),family=binomial,data=heart)
out <- glm(cbind(ha,ok)~log(ck),family=binomial,data=heart)
par(mfrow=c(2,2))
library(mgcv)
outgam <- gam(cbind(ha,ok)~s(ck),family=binomial,data=heart)
plot(outgam)
title("a) Shat")
outgam2 <- gam(cbind(ha,ok)~s(log(ck)),family=binomial,data=heart)
plot(outgam2)
title("b) Shat for log(ck) GAM")
EAP <- predict(outgam)
ESP <- predict(out)
plot(EAP,ESP)
abline(0,1)
title("c) EE Plot: log(ck) GLM")
Z <- ha/(ha+ok)
plot(ESP,Z)
fit <- exp(ESP)/(1 + exp(ESP))
indx <- sort.list(ESP)
lines(ESP[indx],fit[indx])
title("d) Response Plot")
par(mfrow=c(1,1))
#b)
AIC(outw)
AIC(out)
##Problem 4.9 HW7 B)
pisimspline(n=100,nruns=5000,type=5,modt=3)
##Problem 4.10 HW7 C) binary logistic regression with lasso
#a)
set.seed(1976) #binary regression
library(glmnet)
n<-100
m<-1 #binary regression
q <- 100 #100 nontrivial predictors, 95 inactive
k <- 5 #k_S = 5 population active predictors
y <- 1:n
mv <- m + 0 * y
vars <- 1:q
beta <- 0 * 1:q
beta[1:k] <- beta[1:k] + 1
beta
alpha <- 0
x <- matrix(rnorm(n * q), nrow = n, ncol = q)
SP <- alpha + x[,1:k] %*% beta[1:k]
pv <- exp(SP)/(1 + exp(SP))
y <- rbinom(n,size=m,prob=pv)
y
out<-cv.glmnet(x,y,family="binomial")
lam <- out$lambda.min
bhat <- as.vector(predict(out,type="coefficients",s=lam))
ahat <- bhat[1] #alphahat
bhat<-bhat[-1]
vin <- vars[bhat!=0] #want 1-5, overfit
# [1] 1 2 3 4 5 6 16 59 61 74 75 76 96
ind <- as.data.frame(cbind(y,x[,vin])) #relaxed lasso GLM
tem <- glm(y~.,family="binomial",data=ind)
tem$coef
lrplot3(tem=tem,x=x[,vin]) #binary response plot
#b) now use MLR lasso
outm<-cv.glmnet(x,y)
lamm <- outm$lambda.min
bm <- as.vector(predict(outm,type="coefficients",s=lamm))
am <- bm[1] #alphahat
bm<-bm[-1]
vm <- vars[bm!=0] #1 more variable than GLM lasso
vm
inm <- as.data.frame(cbind(y,x[,vm])) #relaxed lasso GLM
tm <- glm(y~.,family="binomial",data=inm)
lrplot3(tem=tm,x=x[,vm]) #binary response plot
#c)
library(leaps)
out<-fsel(x,y)
vin<-out$vin
vin #severe underfit
#[1] 4
inm <- as.data.frame(cbind(y,x[,vin]))
tm <- glm(y~.,family="binomial",data=inm)
lrplot3(tem=tm,x=x[,vin]) #binary response plot
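#A minimal sketch, not part of Problem 4.10: the apparent error rate of a
#logistic fit such as tm from c) can be computed by classifying with
#ESP > 0, as is done in Problem 5.14 c).
ESP <- predict(tm)
yhat <- as.integer(ESP > 0)
1 - mean(y==yhat) #AER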
##Problem 4.11 HW7 D) lasso for Poisson regression
#a)
set.seed(1976) #Poisson regression
library(glmnet)
n<-100
q <- 100 #100 nontrivial predictors, 95 inactive
k <- 5 #k_S = 5 population active predictors
y <- 1:n
vars <- 1:q
beta <- 0 * 1:q
beta[1:k] <- beta[1:k] + 1
beta
alpha <- 0
x <- matrix(rnorm(n * q), nrow = n, ncol = q)
SP <- alpha + x[,1:k] %*% beta[1:k]
y <- rpois(n,lambda=exp(SP))
out<-cv.glmnet(x,y,family="poisson")
lam <- out$lambda.min
bhat <- as.vector(predict(out,type="coefficients",s=lam))
ahat <- bhat[1] #alphahat
bhat<-bhat[-1]
vin <- vars[bhat!=0] #want 1-5, overfit
vin
ind <- as.data.frame(cbind(y,x[,vin])) #relaxed lasso GLM
out <- glm(y~.,family="poisson",data=ind)
ESP <- predict(out)
prplot2(ESP,x=x[,vin],y) #response and OD plots
#b) now use MLR lasso
outm<-cv.glmnet(x,y)
lamm <- outm$lambda.min
bm <- as.vector(predict(outm,type="coefficients",s=lamm))
am <- bm[1] #alphahat
bm<-bm[-1]
vm <- vars[bm!=0]
vm #less overfit than GLM lasso
inm <- as.data.frame(cbind(y,x[,vm])) #relaxed lasso GLM
out <- glm(y~.,family="poisson",data=inm)
ESP <- predict(out)
prplot2(ESP,x=x[,vm],y) #response and OD plots
#c) Now use MLR forward selection with EBIC since n < 10p.
library(leaps)
out<-fsel(x,y)
vin<-out$vin
vin #severe underfit causes poor fit and overdispersion
#[1] 5
inm <- as.data.frame(cbind(y,x[,vin]))
out <- glm(y~.,family="poisson",data=inm)
ESP <- predict(out)
prplot2(ESP,x=x[,vin],y) #response and OD plots
##Problem 4.12 HW10 C) Regression Tree
library(rpart)
cars <- car90[,-match(c("Rim","Tires","Model2"),names(car90))]
carfit <- rpart(Price/1000 ~ .,data=cars)
plot(carfit)
text(carfit)
##Problem 5.11 HW8 D) needs sldata
#a)
group <- pottery[pottery[,1]!=5,1]
group <- (as.integer(group!=1)) + 1
x <- pottery[pottery[,1]!=5,-1]
#b)
z <- x[,1:9]
library(MASS)
out <- qda(x=z,group)
1-mean(predict(out,z)$class==group)
#c) delete the first variable
out <- qda(x=z[,-c(1)],group)
1-mean(predict(out,z[,-c(1)])$class==group)
#delete the first two variables
out <- qda(x=z[,-c(1,2)],group)
1-mean(predict(out,z[,-c(1,2)])$class==group)
#deleting the second variable caused a classification error,
#so delete the first and third variables
out <- qda(x=z[,-c(1,3)],group)
1-mean(predict(out,z[,-c(1,3)])$class==group)
#deleting the third variable caused a classification error,
#so delete the first and fourth variables
out <- qda(x=z[,-c(1,4)],group)
1-mean(predict(out,z[,-c(1,4)])$class==group)
#do not need the 4th variable, so continue deleting variables
out <- qda(x=z[,-c(1,4,5)],group)
1-mean(predict(out,z[,-c(1,4,5)])$class==group)
##Problem 5.12 HW8 E)
library(MASS)
group <- pottery[pottery[,1]!=5,1]
group <- (as.integer(group!=1)) + 1
x <- pottery[pottery[,1]!=5,-1]
out<-lda(x[,c(6,11,14,18)],group)
1-mean(predict(out,x[,c(6,11,14,18)])$class==group)
##Problem 5.13 HW8 F)
#a)
group <- pottery[pottery[,1]!=5,1]
group <- (as.integer(group!=1)) + 1
x <- pottery[pottery[,1]!=5,-1]
library(class)
out<-knn(x,x,group,k=1)
mean(group!=out) #AER for 1NN is 0
#b)
indx<-sample(1:28,replace=F)
train <- indx[13:28]
test <- indx[1:12]
ztest <- x[test,] #12 test cases
grptest <- group[test]
ztrain <- x[train,]
grptrain <- group[train]
out<-knn(ztrain,ztest,grptrain,k=1)
mean(grptest!=out) #validation ER
#c)
out<-knn(x,x,group,k=2)
mean(group!=out) #AER for 2NN
#d)
out<-knn(ztrain,ztest,grptrain,k=2)
mean(grptest!=out) #validation ER for 2NN
#e)
out <- knn(x[,c(6,11,14,18)],x[,c(6,11,14,18)],group,k=2)
mean(group!=out) #AER for a good subset of variables
#f)
out<-knn(ztrain[,c(6,11,14,18)],ztest[,c(6,11,14,18)],grptrain,k=2)
mean(grptest!=out) #validation ER for a good subset of variables
##Problem 5.14 HW9 B)
#a) backward elimination for logistic regression
y <- cbrainx[,10]
age <- cbrainx[,1]
lnag <- log(age)
brd <- cbrainx[,3]
fcau <- as.factor(cbrainx[,4])
ceph <- cbrainx[,5]
circ <- cbrainx[,6]
hdht <- cbrainx[,7]
ht <- cbrainx[,8]
len <- cbrainx[,9]
siz <- cbrainx[,11]
lnsz <- log(siz)
outf <- glm(y~age+lnag+brd+fcau+ceph+circ+hdht+ht+len+siz+lnsz,family=binomial)
summary(outf) #summary for the full model
back <- step(outf) #backward elimination
outn <- glm(y~1,family=binomial) #null model without predictors
AIC(outn)
AIC(outf)
summary(back)
#step(outf,direction="both") #did backward steps only;
#stepAIC and step would not do forwards
#forw <- step(outn,scope=list(lower~1,
#upper~age+lnag+brd+fcau+ceph+circ+hdht+ht+len+siz+lnsz),direction="forward")
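#A minimal sketch, not part of Problem 5.14: step() uses AIC by default
#(penalty k=2); setting k=log(n) makes backward elimination use BIC instead.
backBIC <- step(outf,k=log(length(y)))
summary(backBIC)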
#b) w has the variables from the submodel
w <- cbind(cbrainx[,1],log(cbrainx[,1]),cbrainx[,c(5,6,7,8,9)],log(cbrainx[,11]))
lrplot3(tem=back,x=w)
#c)
ESP <- predict(back)
yhat <- as.integer(ESP > 0)
AER <- 1 - mean(y==yhat) #1 - proportion of correct classifications
AER
#ESP2 <- w%*%back$coef[-1] + back$coef[1]
#ESP-ESP2 #0 to 10 digits
#d)
set.seed(1)
indx <- sample(1:267,replace=F)
train <- indx[68:267]
test <- indx[1:67] #67 test cases
ytest <- y[test]
ytrain <- y[train]
wtest <- w[test,]
wtrain <- w[train,]
tem <- as.data.frame(cbind(ytrain,wtrain))
outtrain <- glm(ytrain~.,family=binomial,data=tem)
ESPtest <- wtest%*%outtrain$coef[-1] + outtrain$coef[1]
yhat <- as.integer(ESPtest > 0)
VER <- 1 - mean(ytest==yhat) #1 - proportion of correct classifications
VER
#the following code also works
#z <- as.data.frame(cbind(y,w))
#ztest <- z[test,]
#ztrain<- z[train,]
#outtrain<-glm(y~.,family=binomial,data=ztrain)
#ESPtest2 <- predict(outtrain,newdata=ztest)
#this code causes errors:
#wtest <- as.data.frame(wtest)
#ESPtest <- predict(outtrain,newdata=wtest)
#three unnamed variables get the names V1, V2, and V8 in the wtest
#data.frame, and those names do not match the names used to fit
#outtrain, so predict() fails
#e) lasso with the classification criterion
library(glmnet)
y <- cbrainx[,10]
x <- cbind(age,lnag,brd,ceph,circ,hdht,ht,len,siz,lnsz)
out<-cv.glmnet(x,y,family="binomial",type.measure="class")
lam <- out$lambda.min
kfoldER <- min(out$cvm)
kfoldER
#f)
vars <- 1:dim(x)[2]
bhat <- as.vector(predict(out,type="coefficients",s=lam))
ahat <- bhat[1] #alphahat
bhat<-bhat[-1]
vin <- vars[bhat!=0]
#[1] 1 2 4 5 6 7 9 10
ESPlasso <- predict(out,newx=x,s=lam)
#ESP2 <- x%*%bhat + ahat
#ESPlasso - ESP2 = 0 to 10 digits
ind <- as.data.frame(cbind(y,x[,vin])) #relaxed lasso GLM
tem <- glm(y~.,family="binomial",data=ind)
tem$coef
lrplot3(tem=tem,x=x[,vin]) #binary response plot
#g)
ESPRL <- predict(tem)
plot(ESPRL,ESPlasso)
abline(0,1)
title("EE Plot")
##Problem 5.15 HW10 D) Classification Tree
library(rpart)
fit <- rpart(Kyphosis ~ Age + Number + Start,data=kyphosis)
plot(fit)
text(fit)
##Problem 5.16 HW11 A) pottery data with two groups:
#bagging, random forests, SVM
#a)
y <- pottery[pottery[,1]!=5,1]
y <- (as.integer(y!=1)) + 1
y <- 2*(y-1.5) #levels -1 and 1
x <- pottery[pottery[,1]!=5,-1]
shards <- data.frame(y=as.factor(y),x)
set.seed(1)
library(randomForest)
#bagging: use mtry = number of predictors = 20
bagshards <- randomForest(y~.,data=shards,mtry=20,importance=TRUE)
pred<-predict(bagshards)
table(predict=pred,truth=shards$y)
#b) random forests: the default uses mtry approximately sqrt(p)
rfshards <- randomForest(y~.,data=shards,importance=TRUE)
pred<-predict(rfshards)
table(predict=pred,truth=shards$y)
#c) SVM with a fixed cost
library(e1071)
svmfit <- svm(y~.,data=shards,kernel="linear",cost=10,scale=FALSE)
pred <- predict(svmfit)
table(predict=pred,truth=shards$y)
#d) SVM with cost chosen by 10-fold CV
set.seed(1) #tuneout takes a minute
tuneout <- tune(svm,y~.,data=shards,kernel="linear",
ranges=list(cost=c(0.001,0.01,0.1,1,5,10,100)))
summary(tuneout) #10-fold CV on several models
pred<-predict(tuneout$best.mod)
table(predict=pred,truth=shards$y)
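#A minimal sketch, not part of Problem 5.16: the tune object from d) also
#stores the winning cost and its 10-fold CV error rate directly.
tuneout$best.parameters
tuneout$best.performance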
##Problem 5.17 HW11 B) Gladstone data with gender as the response
#a) bagging AER
brainwt <- cbrainy
y <- cbrainx[,10]
y[y==0] <- -1
y<-as.factor(y) #-1 = F, 1 = M
brainx<-cbind(cbrainx[,-10],brainwt)
brainx<-brainx[,-c(2,4)]
brainx<-brainx[-c(230,254:258),] #remove outliers
y<-y[-c(230,254:258)]
brain<-data.frame(y,brainx)
set.seed(1)
train <- sample(1:nrow(brain),200)
library(randomForest)
#bagging: use mtry = number of predictors = 9
bagbrain <- randomForest(y~.,data=brain,mtry=9,importance=TRUE)
pred<-predict(bagbrain)
table(predict=pred,truth=brain$y) #get the AER
#b) bagging with a validation set error rate
bagbrain <- randomForest(y~.,data=brain,subset=train,mtry=9,importance=TRUE)
pred<-predict(bagbrain,newdata=brain[-train,])
table(predict=pred,truth=brain$y[-train])
#c) random forests: the default uses mtry approximately sqrt(p)
rfbrain <- randomForest(y~.,data=brain,importance=TRUE)
pred<-predict(rfbrain)
table(predict=pred,truth=brain$y)
#d) random forests with a validation set error rate
rfbrain <- randomForest(y~.,data=brain,subset=train,importance=TRUE)
pred<-predict(rfbrain,newdata=brain[-train,])
table(predict=pred,truth=brain$y[-train])
#e) SVM with cost chosen by 10-fold CV
library(e1071)
set.seed(1) #tuneout takes a minute
tuneout <- tune(svm,y~.,data=brain,kernel="linear",
ranges=list(cost=c(0.001,0.01,0.1,1,5,10,100)))
summary(tuneout) #10-fold CV on several models
pred<-predict(tuneout$best.mod)
table(predict=pred,truth=brain$y)
#f) SVM with cost chosen by 10-fold CV: validation set error rate
set.seed(1) #tuneout takes a minute
tuneout <- tune(svm,y~.,data=brain,subset=train,kernel="linear",
ranges=list(cost=c(0.001,0.01,0.1,1,5,10,100)))
summary(tuneout) #10-fold CV on several models
pred<-predict(tuneout$best.mod,newdata=brain[-train,])
table(predict=pred,truth=brain$y[-train])
##Problem 7.1 HW10 A)
x <- cbind(buxx,buxy)
out<-kmeans(x,2,nstart=25)
plot(x,col=out$cluster)
points(out$centers,col=1:2,pch=8,cex=2)
##Problem 7.2
x <- cbind(buxx,buxy)
out <- hclust(dist(x),"complete") #complete linkage is the default
plot(out)
#plot(out,hang=-1)
##Problem 7.3 HW10 B)
x1 <- c(-0.6,0.1,-1.5,-1.4,1.1,-0.9,1.4,0.6,0)
x2 <- c(-1,-0.75,-0.4,-1.6,-0.3,-1.2,0,-0.2,0.7)
x <- cbind(x1,x2)
#out<-hclust(x) #errors: hclust needs a distance matrix, not the raw data
out <- hclust(dist(x))
plot(out)
plot(x[,1],x[,2])
library(cluster)
out<-agnes(x)
plot(out) #right click twice
##Problem 7.4
#a)
mbb1415 <- read.csv(file.choose(),header=T)
#select mbb1415 from the F drive, or wherever you stored the file
#b)
out <- hclust(dist(mbb1415[,c(7,13)]))
plot(out) #uses the index as the label
plot(out,labels=as.character(mbb1415[,1])) #uses the position as the label
#c)
quantile(mbb1415[,3])
mcut <- mbb1415[mbb1415[,3]>182,]
out <- hclust(dist(mcut[,c(7,13)]))
plot(out,labels=as.character(mcut[,1]))
##Problem 7.5
#a)
wbb1415 <- read.csv(file.choose(),header=T)
#select wbb1415 from the F drive, or wherever you stored the file
#b)
out <- hclust(dist(wbb1415[,c(7,13)]))
plot(out) #uses the index as the label
plot(out,labels=as.character(wbb1415[,1])) #uses the position as the label
#c)
quantile(wbb1415[,3])
wcut <- wbb1415[wbb1415[,3]>182,]
out <- hclust(dist(wcut[,c(7,13)]))
plot(out,labels=as.character(wcut[,1]))
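#A minimal sketch, not part of Problem 7.5: cutree() cuts an hclust tree
#into a chosen number of clusters; here are the labels for 3 clusters.
grps <- cutree(out,k=3)
table(grps)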