##ppgamcode.txt: R/Splus code to go with Olive, D.J. (2011), 
##Plots for Generalized Additive Models
##code is for 2011 version of R, 
##some paper plots were made with 2008 version of R


##Splus ozone data  Chambers and Hastie (1993, pp. 251, 516) 

outy <- gam(ozone ~ s(radiation) + s(wind) + s(temperature), 
data = air)
EAP <- outy$additive.predictor
Y <- air[,1]
par(mfrow=c(2,1))
plot(EAP,Y,ylab="Y")
abline(0,1)
title("a) Response Plot")
plot(EAP,outy$resid,ylab="RESID")
title("b) Residual Plot")
#Fig. 1: ozone1.eps

##Splus ozone data  response transformations 

outn12 <- gam(ozone^(-1/2) ~ s(radiation) + s(wind) + s(temperature), data = air)
EAP <- outn12$additive.predictor
Y <- air[,1]^(-1/2)
plot(EAP,Y,ylab="Z**(-1/2)")
abline(0,1)
#plot(EAP,outn12$resid)

outn13 <- gam(ozone^(-1/3) ~ s(radiation) + s(wind) + s(temperature), data = air)
EAP <- outn13$additive.predictor
Y <- air[,1]^(-1/3)
plot(EAP,Y,ylab="Z**(-1/3)")
abline(0,1)
#plot(EAP,outn13$resid)

out12 <- gam(ozone^(1/2) ~ s(radiation) + s(wind) + s(temperature), data = air)
EAP <- out12$additive.predictor
Y <- air[,1]^(1/2)
plot(EAP,Y,ylab="Z**(1/2)")
abline(0,1)
#plot(EAP,out12$resid)

par(mfrow=c(2,2))

outn1 <- gam(ozone^(-1) ~ s(radiation) + s(wind) + s(temperature), data = air)
EAP <- outn1$additive.predictor
Y <- air[,1]^(-1)
plot(EAP,Y,ylab="Z**(-1)")
abline(0,1)
title("a) t(Z) = 1/Z")
#plot(EAP,outn1$resid)
#98 df, nonlinear S_j, radiation may not be significant

outln <- gam(log(ozone) ~ s(radiation) + s(wind) + s(temperature), data = air)
EAP <- outln$additive.predictor
Y <- log(air[,1])
plot(EAP,Y,ylab="log(Z)")
abline(0,1)
title("b) t(Z) = log(Z)")
#plot(EAP,outln$resid)
#98 df, nonlinear S_j

out13 <- gam(ozone^(1/3) ~ s(radiation) + s(wind) + s(temperature), data = air)
EAP <- out13$additive.predictor
Y <- air[,1]^(1/3)
plot(EAP,Y,ylab="Z**(1/3)")
abline(0,1)
title("c) t(Z) = Z^(1/3)")
#plot(EAP,out13$resid)
#98 df, nonlinear S_j, radiation may not be significant

outy <- gam(ozone ~ s(radiation) + s(wind) + s(temperature), 
data = air)
EAP <- outy$additive.predictor
Y <- air[,1]
plot(EAP,Y,ylab="Y")
abline(0,1)
title("d) t(Z) = Z")
#plot(EAP,outy$resid)
#98 df, nonlinear S_j, radiation is not significant

outy <- gam(ozone ~  s(wind) + s(temperature), 
data = air)
EAPS <- outy$additive.predictor
Y <- air[,1]
plot(EAPS,Y,ylab="Y")
abline(0,1)
#plot(EAPS,outy$resid)
plot(EAPS,EAP)
abline(0,1)
#cor(EAPS,EAP) = 0.96
#Figure 4:  ozone.eps

RES <- outln$resid
EAP <- outln$additive.predictor
plot(EAP,RES,main="log(Z)")
RES <- out13$resid
EAP <- out13$additive.predictor
plot(EAP,RES,main="cubrt(Z)")
EAP <- out12$additive.predictor
RES <- out12$resid
plot(EAP,RES,main="sqrt(Z)")
EAP <- outy$additive.predictor
RES <- outy$resid
plot(EAP,RES,main="Z")
#Figure 5:  rozone.eps
#no transformation is better than cube root transformation

##Get Gladstone data or cbrain data from  
#www.math.siu.edu/olive/regdata.txt
#using age and log(age) and size and log(size) helps
#get rid of curvature for age, circ, and ht
#cbrainx <- cbind(age,ageclass,breadth,cause,cephalic,
#circum,headht,height,len,sex,size)

Y <- cbrainx[,10]
age <- cbrainx[,1]
lnag <- log(age)
ht <- cbrainx[,8]
circ <- cbrainx[,6]
len <- cbrainx[,9]
siz <- cbrainx[,11]
lnsz <- log(siz)

oglm <- glm(Y~age+lnag+circ+ht+len+siz+lnsz,family=binomial)
ESP <- predict(oglm)

library(mgcv)
ogam <- gam(Y~s(age) + s(lnag) + s(circ) 
+  s(ht) + s(len) + s(siz) + s(lnsz),family=binomial)
plot(ogam)
#none of the predictors look very curved

EAP <- predict(ogam)

plot(EAP, ESP)
#cor(EAP, ESP) = 0.9, rather low

outg <- gam(Y~s(age) + s(circ) + s(ht) + s(len) + s(siz), family=binomial)
plot(outg)
#age, circ, ht curved

#GAM response plot
 
plot(EAP, Y)
fit <- Y
fit <- exp(EAP)/(1 + exp(EAP))	
indx <- sort.list(EAP)
lines(EAP[indx], fit[indx])
n <- length(Y)
slices <- 8
fit2 <- fit
n <- length(Y)
val <- as.integer(n/slices)
for(i in 1:(slices - 1)) {
  fit2[((i - 1) * val + 1):(i * val)] <- mean(Y[indx[((i - 1) * 
  val + 1):(i * val)]])
	}
  fit2[((slices - 1) * val + 1):n] <- mean(Y[indx[((slices - 1) * val + 1
		):n]])	
lines(EAP[indx], fit2)
title("Response Plot")
#Fig. 2 glad.ps
 
#GLM response plot, not shown in paper
 
plot(ESP, Y)
fit <- Y
fit <- exp(ESP)/(1 + exp(ESP))	
indx <- sort.list(ESP)
lines(ESP[indx], fit[indx])
n <- length(Y)
slices <- 8
fit2 <- fit
n <- length(Y)
val <- as.integer(n/slices)
for(i in 1:(slices - 1)) {
   fit2[((i - 1) * val + 1):(i * val)] <- mean(Y[indx[((i - 1) * 
   val + 1):(i * val)]])
	}
  fit2[((slices - 1) * val + 1):n] <- mean(Y[indx[((slices - 1) * val + 1
		):n]])	
lines(ESP[indx], fit2)
title("Response Plot")

##species data is from Cook and Weisberg (1999, pp. 285-286) 
#Y = number of species, endem = number of endemic species
#area = area of island, elev = elevation of island
#distnear = distance of nearest island, 
#distsc = distance to Santa Cruz
#areanear = area of nearest island
#get data set from www.math.siu.edu/olive/regdata.txt

Y <- c(58, 31,  3, 25, 2, 18, 10, 8, 2, 97, 93, 58, 5, 40, 347, 51, 
2, 104, 108, 12, 70, 280, 237, 444, 62, 285, 44, 16, 21)

endem <- c(23, 21, 3, 9, 1, 11, 7, 4, 2, 26, 35, 17, 4, 19, 89, 
23, 2, 37, 33, 9, 30, 65, 81, 95, 28, 73, 16, 8, 12)

area <- c(25.09, 1.24, 0.21, 0.10, 0.05, 0.34, 2.33, 0.03, 0.18, 
58.27, 634.49, 0.57, 0.78, 17.35, 4669.32, 129.49, 0.01, 59.56, 
17.95, 0.23, 4.89, 551.62, 572.33, 903.82, 24.08, 170.92, 1.84, 
1.24, 2.85)

elev <- c(NA, 109, 114, 46, NA, NA, 168, NA, 112, 198, 1494, 49, 
227, 76, 1707, 343, 25, 777, 458, NA, 367, 716, 906, 864, 259, 640, 
NA, 186, 253)
 
distnear <- c(0.6, 0.6, 2.8, 1.9, 1.9, 8.0, 34.1, 0.4, 2.6, 1.1, 4.3, 
1.1, 4.6, 47.4, 0.7, 29.1, 3.3, 29.1, 10.7, 0.5, 4.4, 45.2, 0.2, 0.6, 
16.5, 2.6, 0.6, 6.8, 34.1)

distsc <- c(0.6, 26.3, 58.7, 47.4, 1.9, 8.0, 290.2, 0.4, 50.2, 88.3, 
95.3, 93.1, 62.2, 92.2, 28.1, 85.9, 45.9, 119.6, 10.7, 0.6, 24.4, 
66.5, 19.8, 0.0, 16.5, 49.2, 9.6, 50.9, 254.7)
 
areanear <- c(1.84, 572.33, 0.78, 0.18, 903.82, 1.84, 2.85, 17.95, 
0.10, 0.57, 4669.32, 58.27, 0.21, 129.49, 634.49, 59.56, 0.10, 129.49, 
0.03, 25.09, 572.33, 0.57, 4.89, 0.52, 0.52, 0.10, 25.09, 17.95, 2.33)
 
library(mgcv)
outg2 <- gam(Y~s(log(endem))+s(log(areanear)),family=poisson)

EAP <- predict(outg2)
par(mfrow = c(2, 2))
plot(EAP, Y)
Ehat <- exp(EAP)
indx <- sort.list(EAP)
lines(EAP[indx], Ehat[indx])
lines(lowess(EAP, Y), type = "s")
title("a) Response Plot")
Vhat <- (Y - Ehat)^2
plot(Ehat, Vhat)
abline(0, 1)
abline(0, 4)	
title("b) OD Plot")
Z <- Y
Z[Y < 1] <- Z[Y < 1] + 0.5
WRES <- sqrt(Z) * (log(Z) - EAP)
WFIT <- sqrt(Z) * EAP
plot(WFIT, sqrt(Z) * log(Z))
abline(0, 1)	
title("c) WFRP")
plot(WFIT, WRES)
title("d) Wtd Residual Plot")
#Fig. 3

##code leading to response and OD plots for negative binomial GAM
out1 <- glm(Y~log(endem)+log(areanear),family=poisson)
plot(out1)
#dev/df approx X^2/df approx 71.4/26 approx 2.75
#residual plot right opening megaphone
library(mgcv)
outg1 <- gam(Y~s(endem)+s(areanear),family=poisson)
outg2 <- gam(Y~s(log(endem))+s(log(areanear)),family=poisson)

library(MASS)

#glm.nb estimates theta = kappa so that disp approx 1

out <- glm.nb(Y~log(endem) + log(areanear))
summary(out)
#estimates theta = kappa = 50.9 with se(theta) = 26.7
#log(areanear) has pval = 0.044
out <- glm.nb(Y~log(endem))
summary(out)
#estimates theta = kappa = 36.7 with se(theta) = 17.1

#negative.binomial(37) assumes theta = kappa is 37 and known
outnb <- glm(Y~log(endem), family=negative.binomial(37))
plot(outnb)
summary(outnb)
#OD plot for NBR GLM
ESP <- predict(outnb)
Ehat <- exp(ESP)
ModVar <- exp(ESP) +  exp(2*ESP)/37
Vhat <- (Y - Ehat)^2
plot(ModVar, Vhat)
abline(0, 1)
abline(0,4)
#has dispersion parameter = .995
#plots are a bit better than for Poisson regression

####2008 version of R uses outnbg <- gam(Y~s(endem), family=negative.binomial(37))
outnbg <- gam(Y~s(endem), family=negbin(37))
EAP <- predict(outnbg)
ESP <- predict(outnb)
plot(EAP,ESP)

outnbg2 <- gam(Y~s(log(endem)), family=negbin(37))
plot(outnbg2)
#linear

#make response plot for negative binomial GAM
EAP <- predict(outnbg2)
plot(EAP,ESP)
plot(EAP,Y)
fit <- exp(EAP)
indx <- sort.list(EAP)
lines(EAP[indx],fit[indx])
lines(lowess(EAP,Y))
#Fig. 6: species.ps

#make OD plot for negative binomial GAM
#37 = 1/tauhat = kappahat
Ehat <- exp(EAP)
ModVar <- exp(EAP) +  exp(2*EAP)/37
Vhat <- (Y - Ehat)^2
plot(ModVar, Vhat)
abline(0, 1)
abline(0,4)
#abline(lsfit(ModVar, Vhat)$coef)
#Fig. 7: od.ps


##Wood (2006, p. 82-86) heart attack data
ck <- c(20,60,100,140,180,220,260,300,340,380,420,460)
#patients with heart attack
ha <- c(2,13,30,30,21,19,18,13,19,15,7,8)
#patients without a heart attack
ok <- c(88,26,8,5,0,1,1,1,1,0,0,0)
heart <- as.data.frame(cbind(ck,ha,ok))
out <- glm(cbind(ha,ok)~ck,family=binomial, data=heart)
#deviance is too high, need more predictors
out2 <- glm(cbind(ha,ok)~ck+I(ck^2)+I(ck^3),family=binomial, data=heart)
out3 <- glm(cbind(ha,ok)~ck+log(ck),family=binomial, data=heart)
out4 <- glm(cbind(ha,ok)~log(ck),family=binomial, data=heart)
library(mgcv)
out5 <- gam(cbind(ha,ok)~s(ck),family=binomial, data=heart)
EAP <- predict(out5)
ESP <- predict(out3)
plot(EAP,ESP)
abline(0,1)
ESPp <- predict(out2)
plot(EAP,ESPp,ylab="ESP")
abline(0,1)
#Figure 8: hrt2.eps
ESPl <- predict(out4)
plot(EAP,ESPl,ylab="ESP")
abline(0,1)
#Figure 9:  hrt1.eps

Z <- ha/(ha+ok)
fit<-Z
plot(ESPl,Z,xlab="ESP")
fit <- exp(ESPl)/(1 + exp(ESPl))
indx <- sort.list(ESPl)
lines(ESPl[indx], fit[indx])
#Figure 10: hrt3.eps

AIC(out2)
# 33.65773
AIC(out3)
#  34.18879
AIC(out4)
# 33.45305
##neither out2 nor out3 is too close to the GAM, but out3 seems better

out6 <- gam(cbind(ha,ok)~s(log(ck)),family=binomial, data=heart)
plot(out6)
#linear
EAP <- predict(out6)
plot(EAP,ESPl)
#cor is 1


##icu data from Statlib or URL 
#www.math.siu.edu/olive/ICU.lsp
#delete header of ICU.lsp and delete last parentheses at the end of
#the file. Save the file on G drive as icu.lsp

icu <- read.table("G:\\icu.txt")

names(icu) <- c("ID", "STA", "AGE", "SEX", "RACE", "SER", "CAN", "CRN",
"INF", "CPR", "SYS", "HRA", "PRE", "TYP", "FRA", "PO2", "PH", "PCO",
"Bic", "CRE", "LOC")

icu[,5] <- as.factor(icu[,5])
icu[,21] <- as.factor(icu[,21])
dim(icu)
icu2<-icu[,-1]
outf <- glm(formula = STA~.,family=binomial,data=icu2)
outr <- glm(formula = STA~AGE+CAN+SYS+TYP+LOC,
family=binomial, data=icu2)
outb <- glm(formula = STA~AGE+CAN+SYS+TYP+LOC+RACE,
family=binomial,data=icu2)
ESP <- predict(outf)


library(mgcv)
outgam <- gam(STA ~ s(AGE)+SEX+RACE+SER+CAN+CRN+INF+CPR+s(SYS)+s(HRA)
+PRE+TYP+FRA+PO2+PH+PCO+Bic+CRE+LOC,family=binomial,data=icu2)
EAP <- predict.gam(outgam)
plot(EAP,ESP)
abline(0,1)
#figure 12: icuee.ps  


Y <- icu2[,1]
plot(EAP,Y)
fit<-Y
fit <- exp(EAP)/(1 + exp(EAP))
indx <- sort.list(EAP)
lines(EAP[indx], fit[indx])
slices <- 18
fit2 <- fit
n <- length(Y)
val <- as.integer(n/slices)
for(i in 1:(slices - 1)) {
fit2[((i - 1) * val + 1):(i * val)] <- 
mean(Y[indx[((i - 1) * 	val + 1):(i * val)]])}
fit2[((slices - 1) * val + 1):n] <- 
mean(Y[indx[((slices - 1) * val + 1):n]])	
lines(EAP[indx], fit2)	
#figure 11: icug.ps


icuplot<-function(slices = 10, ff = 0.99, step = T)
{# Makes the response plot for binary logistic regression if ICU data
# is in file icu2 where ID numbers have been removed.
# If slice = T use step function, else use lowess.
# Workstation need to activate a graphics
# device with command "X11()" or "motif()."
icu2 <- icu
icu2[,5] <- as.factor(icu2[,5])
icu2[,21] <- as.factor(icu2[,21])
#get rid of ID number = 1st column
icu2<-icu[,-1]
x <- as.matrix(icu2[,-1])
outf <- glm(formula = STA~.,family=binomial,data=icu2)
ESP <- predict(outf)
Y <- icu2[,1]
plot(ESP, Y)
fit <- Y
fit <- exp(ESP)/(1 + exp(ESP))	#  lines(sort(ESP),sort(fit))
indx <- sort.list(ESP)
lines(ESP[indx], fit[indx])
if(step == T){
  fit2 <- fit
  n <- length(Y)
  val <- as.integer(n/slices)
  for(i in 1:(slices - 1)) {
    fit2[((i - 1) * val + 1):(i * val)] <- mean(Y[indx[((i - 1) * 
			val + 1):(i * val)]])
	}
    fit2[((slices - 1) * val + 1):n] <- mean(Y[indx[((slices - 1) * val + 1
		):n]])	# fit2 is already sorted in order corresponding to indx
    lines(ESP[indx], fit2)}
else lines(lowess(ESP, Y, f = ff), type = "s")
title("Response Plot")
}

#response plot for GLM not shown
icuplot() # used slices = 10 
icuplot(slices=18)

icuplot2<-function(){
# Makes the EE plot for binary logistic regression if ICU data
# is in file icu.
# If slice = T use step function, else use lowess.
# Workstation need to activate a graphics
# device with command "X11()" or "motif()."
#
#   
icu2 <- icu
icu2[,5] <- as.factor(icu2[,5])
icu2[,21] <- as.factor(icu2[,21])
#get rid of ID number = 1st column
icu2<-icu[,-1]
race2 <- as.integer(icu2[,4]==2)
race3 <- as.integer(icu2[,4]==3)
loc2 <- as.integer(icu2[,20]==2)
loc3 <- as.integer(icu2[,20]==3)
icu2 <- icu2[,-c(4,20)]
icu2 <- cbind(icu2,loc2,loc3,race2,race3)
x <- as.matrix(icu2[,-1])
outf <- glm(formula = STA~.,family=binomial,data=icu2)
ESP <- x %*% outf$coef[-1] + outf$coef[1]
outr <- glm(formula = STA~AGE+CAN+SYS+TYP+loc2+loc3,
family=binomial, data=icu2)
xr <- as.matrix(icu2[,c(2,5,9,12,19,20)])
ESPS <- xr %*% outr$coef[-1] + outr$coef[1]
plot(ESPS,ESP)
abline(0,1)
title("EE PLOT for Model without Race")
outs <- glm(formula = STA~AGE+CAN+SYS+TYP+loc2+loc3+race2+race3,
family=binomial, data=icu2)
xs <- as.matrix(icu2[,c(2,5,9,12,19,20,21,22)])
ESPS <- xs %*% outs$coef[-1] + outs$coef[1]
plot(ESPS,ESP)
abline(0,1)
title("EE PLOT for Model with Race")
}

##figures 13 and 14: icu2.ps and icu3.ps
icuplot2()