##ppgamcode.txt: R/Splus code to go with Olive, D.J. (2011), ##Plots for Generalized Additive Models ##code is for 2011 version of R, ##some paper plots were made with 2008 version of R ##Splus ozone data Chambers and Hastie (1993, pp. 251, 516) outy <- gam(ozone ~ s(radiation) + s(wind) + s(temperature), data = air) EAP <- outy$additive.predictor Y <- air[,1] par(mfrow=c(2,1)) plot(EAP,Y,ylab="Y") abline(0,1) title("a) Response Plot") plot(EAP,outy$resid,ylab="RESID") title("b) Residual Plot") #Fig. 1: ozone1.eps ##Splus ozone data response transformations outn12 <- gam(ozone^(-1/2) ~ s(radiation) + s(wind) + s(temperature), data = air) EAP <- outn12$additive.predictor Y <- air[,1]^(-1/2) plot(EAP,Y,ylab="Z**(-1/2)") abline(0,1) #plot(EAP,outn12$resid) outn13 <- gam(ozone^(-1/3) ~ s(radiation) + s(wind) + s(temperature), data = air) EAP <- outn13$additive.predictor Y <- air[,1]^(-1/3) plot(EAP,Y,ylab="Z**(-1/3)") abline(0,1) #plot(EAP,outn13$resid) out12 <- gam(ozone^(1/2) ~ s(radiation) + s(wind) + s(temperature), data = air) EAP <- out12$additive.predictor Y <- air[,1]^(1/2) plot(EAP,Y,ylab="Z**(1/2)") abline(0,1) #plot(EAP,out12$resid) par(mfrow=c(2,2)) outn1 <- gam(ozone^(-1) ~ s(radiation) + s(wind) + s(temperature), data = air) EAP <- outn1$additive.predictor Y <- air[,1]^(-1) plot(EAP,Y,ylab="Z**(-1)") abline(0,1) title("a) t(Z) = 1/Z") #plot(EAP,outn1$resid) #98 df, nonlinear S_j, radiation may not be significant outln <- gam(log(ozone) ~ s(radiation) + s(wind) + s(temperature), data = air) EAP <- outln$additive.predictor Y <- log(air[,1]) plot(EAP,Y,ylab="log(Z)") abline(0,1) title("b) t(Z) = log(Z)") #plot(EAP,outln$resid) #98 df, nonlinear S_j out13 <- gam(ozone^(1/3) ~ s(radiation) + s(wind) + s(temperature), data = air) EAP <- out13$additive.predictor Y <- air[,1]^(1/3) plot(EAP,Y,ylab="Z**(1/3)") abline(0,1) title("c) t(Z) = Z^(1/3)") #plot(EAP,out13$resid) #98 df, nonlinear S_j, radiation may not be significant outy <- gam(ozone ~ s(radiation) + s(wind) + s(temperature), data = air) EAP <- outy$additive.predictor Y <- air[,1] plot(EAP,Y,ylab="Y") abline(0,1) title("d) t(Z) = Z") #plot(EAP,outy$resid) #98 df, nonlinear S_j, radiation is not significant outy <- gam(ozone ~ s(wind) + s(temperature), data = air) EAPS <- outy$additive.predictor Y <- air[,1] plot(EAPS,Y,ylab="Y") abline(0,1) #plot(EAPS,outy$resid) plot(EAPS,EAP) abline(0,1) #cor(EAPS,EAP) = 0.96 #Figure 4: ozone.eps RES <- outln$resid EAP <- outln$additive.predictor plot(EAP,RES,main="log(Z)") RES <- out13$resid EAP <- out13$additive.predictor plot(EAP,RES,main="cubrt(Z)") EAP <- out12$additive.predictor RES <- out12$resid plot(EAP,RES,main="sqrt(Z)") EAP <- outy$additive.predictor RES <- outy$resid plot(EAP,RES,main="Z") #Figure 5: rozone.eps #no transformation is better than cube root transformation ##Get Gladstone data or cbrain data from #www.math.siu.edu/olive/regdata.txt #using age and log(age) and size and log(size) helps #get rid of curvature for age, circ, and ht #cbrainx <- cbind(age,ageclass,breadth,cause,cephalic, #circum,headht,height,len,sex,size) Y <- cbrainx[,10] age <- cbrainx[,1] lnag <- log(age) ht <- cbrainx[,8] circ <- cbrainx[,6] len <- cbrainx[,9] siz <- cbrainx[,11] lnsz <- log(siz) oglm <- glm(Y~age+lnag+circ+ht+len+siz+lnsz,family=binomial) ESP <- predict(oglm) library(mgcv) ogam <- gam(Y~s(age) + s(lnag) + s(circ) + s(ht) + s(len) + s(siz) + s(lnsz),family=binomial) plot(ogam) #none of the predictors look very curved EAP <- predict(ogam) plot(EAP, ESP) #cor(EAP, ESP) = 0.9, rather low outg <- gam(Y~s(age) + s(circ) + s(ht) + s(len) + s(siz), family=binomial) plot(outg) #age, circ, ht curved #GAM response plot plot(EAP, Y) fit <- Y fit <- exp(EAP)/(1 + exp(EAP)) indx <- sort.list(EAP) lines(EAP[indx], fit[indx]) n <- length(Y) slices <- 8 fit2 <- fit n <- length(Y) val <- as.integer(n/slices) for(i in 1:(slices - 1)) { fit2[((i - 1) * val + 1):(i * val)] <- mean(Y[indx[((i - 1) * val + 1):(i * val)]]) } fit2[((slices - 1) * val + 1):n] <- mean(Y[indx[((slices - 1) * val + 1 ):n]]) lines(EAP[indx], fit2) title("Response Plot") #Fig. 2 glad.ps #GLM response plot, not shown in paper plot(ESP, Y) fit <- Y fit <- exp(ESP)/(1 + exp(ESP)) indx <- sort.list(ESP) lines(ESP[indx], fit[indx]) n <- length(Y) slices <- 8 fit2 <- fit n <- length(Y) val <- as.integer(n/slices) for(i in 1:(slices - 1)) { fit2[((i - 1) * val + 1):(i * val)] <- mean(Y[indx[((i - 1) * val + 1):(i * val)]]) } fit2[((slices - 1) * val + 1):n] <- mean(Y[indx[((slices - 1) * val + 1 ):n]]) lines(ESP[indx], fit2) title("Response Plot") ##species data is from Cook and Weisberg (1999, pp. 285-286) #Y = number of species, endem = number of endemic species #area = area of island, elev = elevation of island #distnear = distance of nearest island, #distsc = distance to Santa Cruz #areanear = area of nearest island #get data set from www.math.siu.edu/olive/regdata.txt Y <- c(58, 31, 3, 25, 2, 18, 10, 8, 2, 97, 93, 58, 5, 40, 347, 51, 2, 104, 108, 12, 70, 280, 237, 444, 62, 285, 44, 16, 21) endem <- c(23, 21, 3, 9, 1, 11, 7, 4, 2, 26, 35, 17, 4, 19, 89, 23, 2, 37, 33, 9, 30, 65, 81, 95, 28, 73, 16, 8, 12) area <- c(25.09, 1.24, 0.21, 0.10, 0.05, 0.34, 2.33, 0.03, 0.18, 58.27, 634.49, 0.57, 0.78, 17.35, 4669.32, 129.49, 0.01, 59.56, 17.95, 0.23, 4.89, 551.62, 572.33, 903.82, 24.08, 170.92, 1.84, 1.24, 2.85) elev <- c(NA, 109, 114, 46, NA, NA, 168, NA, 112, 198, 1494, 49, 227, 76, 1707, 343, 25, 777, 458, NA, 367, 716, 906, 864, 259, 640, NA, 186, 253) distnear <- c(0.6, 0.6, 2.8, 1.9, 1.9, 8.0, 34.1, 0.4, 2.6, 1.1, 4.3, 1.1, 4.6, 47.4, 0.7, 29.1, 3.3, 29.1, 10.7, 0.5, 4.4, 45.2, 0.2, 0.6, 16.5, 2.6, 0.6, 6.8, 34.1) distsc <- c(0.6, 26.3, 58.7, 47.4, 1.9, 8.0, 290.2, 0.4, 50.2, 88.3, 95.3, 93.1, 62.2, 92.2, 28.1, 85.9, 45.9, 119.6, 10.7, 0.6, 24.4, 66.5, 19.8, 0.0, 16.5, 49.2, 9.6, 50.9, 254.7) areanear <- c(1.84, 572.33, 0.78, 0.18, 903.82, 1.84, 2.85, 17.95, 0.10, 0.57, 4669.32, 58.27, 0.21, 129.49, 634.49, 59.56, 0.10, 129.49, 0.03, 25.09, 572.33, 0.57, 4.89, 0.52, 0.52, 0.10, 25.09, 17.95, 2.33) library(mgcv) outg2 <- gam(Y~s(log(endem))+s(log(areanear)),family=poisson) EAP <- predict(outg2) par(mfrow = c(2, 2)) plot(EAP, Y) Ehat <- exp(EAP) indx <- sort.list(EAP) lines(EAP[indx], Ehat[indx]) lines(lowess(EAP, Y), type = "s") title("a) Response Plot") Vhat <- (Y - Ehat)^2 plot(Ehat, Vhat) abline(0, 1) abline(0, 4) title("b) OD Plot") Z <- Y Z[Y < 1] <- Z[Y < 1] + 0.5 WRES <- sqrt(Z) * (log(Z) - EAP) WFIT <- sqrt(Z) * EAP plot(WFIT, sqrt(Z) * log(Z)) abline(0, 1) title("c) WFRP") plot(WFIT, WRES) title("d) Wtd Residual Plot") #Fig. 3 ##code leading to response and OD plots for negative binomial GAM out1 <- glm(Y~log(endem)+log(areanear),family=poisson) plot(out1) #dev/df approx X^2/df approx 71.4/26 approx 2.75 #residual plot right opening megaphone library(mgcv) outg1 <- gam(Y~s(endem)+s(areanear),family=poisson) outg2 <- gam(Y~s(log(endem))+s(log(areanear)),family=poisson) library(MASS) #glm.nb estimates theta = kappa so that disp approx 1 out <- glm.nb(Y~log(endem) + log(areanear)) summary(out) #estimates theta = kappa = 50.9 with se(theta) = 26.7 #log(areanear) has pval = 0.044 out <- glm.nb(Y~log(endem)) summary(out) #estimates theta = kappa = 36.7 with se(theta) = 17.1 #negative.binomial(37) assumes theta = kappa is 37 and known outnb <- glm(Y~log(endem), family=negative.binomial(37)) plot(outnb) summary(outnb) #OD plot for NBR GLM ESP <- predict(outnb) Ehat <- exp(ESP) ModVar <- exp(ESP) + exp(2*ESP)/37 Vhat <- (Y - Ehat)^2 plot(ModVar, Vhat) abline(0, 1) abline(0,4) #has dispersion parameter = .995 #plots are a bit better than for Poisson regression ####2008 version of R uses outnbg <- gam(Y~s(endem), family=negative.binomial(37)) outnbg <- gam(Y~s(endem), family=negbin(37)) EAP <- predict(outnbg) ESP <- predict(outnb) plot(EAP,ESP) outnbg2 <- gam(Y~s(log(endem)), family=negbin(37)) plot(outnbg2) #linear #make response plot for negative binomial GAM EAP <- predict(outnbg2) plot(EAP,ESP) plot(EAP,Y) fit <- exp(EAP) indx <- sort.list(EAP) lines(EAP[indx],fit[indx]) lines(lowess(EAP,Y)) #Fig. 6: species.ps #make OD plot for negative binomial GAM #37 = 1/tauhat = kappahat Ehat <- exp(EAP) ModVar <- exp(EAP) + exp(2*EAP)/37 Vhat <- (Y - Ehat)^2 plot(ModVar, Vhat) abline(0, 1) abline(0,4) #abline(lsfit(ModVar, Vhat)$coef) #Fig. 7: od.ps ##Wood (2006, p. 82-86) heart attack data ck <- c(20,60,100,140,180,220,260,300,340,380,420,460) #patients with heart attack ha <- c(2,13,30,30,21,19,18,13,19,15,7,8) #patients without a heart attack ok <- c(88,26,8,5,0,1,1,1,1,0,0,0) heart <- as.data.frame(cbind(ck,ha,ok)) out <- glm(cbind(ha,ok)~ck,family=binomial, data=heart) #deviance is too high, need more predictors out2 <- glm(cbind(ha,ok)~ck+I(ck^2)+I(ck^3),family=binomial, data=heart) out3 <- glm(cbind(ha,ok)~ck+log(ck),family=binomial, data=heart) out4 <- glm(cbind(ha,ok)~log(ck),family=binomial, data=heart) library(mgcv) out5 <- gam(cbind(ha,ok)~s(ck),family=binomial, data=heart) EAP <- predict(out5) ESP <- predict(out3) plot(EAP,ESP) abline(0,1) ESPp <- predict(out2) plot(EAP,ESPp,ylab="ESP") abline(0,1) #Figure 8: hrt2.eps ESPl <- predict(out4) plot(EAP,ESPl,ylab="ESP") abline(0,1) #Figure 9: hrt1.eps Z <- ha/(ha+ok) fit<-Z plot(ESPl,Z,xlab="ESP") fit <- exp(ESPl)/(1 + exp(ESPl)) indx <- sort.list(ESPl) lines(ESPl[indx], fit[indx]) #Figure 10: hrt3.eps AIC(out2) # 33.65773 AIC(out3) # 34.18879 AIC(out4) # 33.45305 ##neither out2 nor out3 is too close to the GAM, but out3 seems better out6 <- gam(cbind(ha,ok)~s(log(ck)),family=binomial, data=heart) plot(out6) #linear EAP <- predict(out6) plot(EAP,ESPl) #cor is 1 ##icu data from Statlib or URL #www.math.siu.edu/olive/ICU.lsp #delete header of ICU.lsp and delete last parentheses at the end of #the file. Save the file on G drive as icu.lsp icu <- read.table("G:\\icu.txt") names(icu) <- c("ID", "STA", "AGE", "SEX", "RACE", "SER", "CAN", "CRN", "INF", "CPR", "SYS", "HRA", "PRE", "TYP", "FRA", "PO2", "PH", "PCO", "Bic", "CRE", "LOC") icu[,5] <- as.factor(icu[,5]) icu[,21] <- as.factor(icu[,21]) dim(icu) icu2<-icu[,-1] outf <- glm(formula = STA~.,family=binomial,data=icu2) outr <- glm(formula = STA~AGE+CAN+SYS+TYP+LOC, family=binomial, data=icu2) outb <- glm(formula = STA~AGE+CAN+SYS+TYP+LOC+RACE, family=binomial,data=icu2) ESP <- predict(outf) library(mgcv) outgam <- gam(STA ~ s(AGE)+SEX+RACE+SER+CAN+CRN+INF+CPR+s(SYS)+s(HRA) +PRE+TYP+FRA+PO2+PH+PCO+Bic+CRE+LOC,family=binomial,data=icu2) EAP <- predict.gam(outgam) plot(EAP,ESP) abline(0,1) #figure 12: icuee.ps Y <- icu2[,1] plot(EAP,Y) fit<-Y fit <- exp(EAP)/(1 + exp(EAP)) indx <- sort.list(EAP) lines(EAP[indx], fit[indx]) slices <- 18 fit2 <- fit n <- length(Y) val <- as.integer(n/slices) for(i in 1:(slices - 1)) { fit2[((i - 1) * val + 1):(i * val)] <- mean(Y[indx[((i - 1) * val + 1):(i * val)]])} fit2[((slices - 1) * val + 1):n] <- mean(Y[indx[((slices - 1) * val + 1):n]]) lines(EAP[indx], fit2) #figure 11: icug.ps icuplot<-function(slices = 10, ff = 0.99, step = T) {# Makes the response plot for binary logistic regression if ICU data # is in file icu2 where ID numbers have been removed. # If slice = T use step function, else use lowess. # Workstation need to activate a graphics # device with command "X11()" or "motif()." icu2 <- icu icu2[,5] <- as.factor(icu2[,5]) icu2[,21] <- as.factor(icu2[,21]) #get rid of ID number = 1st column icu2<-icu[,-1] x <- as.matrix(icu2[,-1]) outf <- glm(formula = STA~.,family=binomial,data=icu2) ESP <- predict(outf) Y <- icu2[,1] plot(ESP, Y) fit <- Y fit <- exp(ESP)/(1 + exp(ESP)) # lines(sort(ESP),sort(fit)) indx <- sort.list(ESP) lines(ESP[indx], fit[indx]) if(step == T){ fit2 <- fit n <- length(Y) val <- as.integer(n/slices) for(i in 1:(slices - 1)) { fit2[((i - 1) * val + 1):(i * val)] <- mean(Y[indx[((i - 1) * val + 1):(i * val)]]) } fit2[((slices - 1) * val + 1):n] <- mean(Y[indx[((slices - 1) * val + 1 ):n]]) # fit2 is already sorted in order corresponding to indx lines(ESP[indx], fit2)} else lines(lowess(ESP, Y, f = ff), type = "s") title("Response Plot") } #response plot for GLM not shown icuplot() # used slices = 10 icuplot(slices=18) icuplot2<-function(){ # Makes the EE plot for binary logistic regression if ICU data # is in file icu. # If slice = T use step function, else use lowess. # Workstation need to activate a graphics # device with command "X11()" or "motif()." # # icu2 <- icu icu2[,5] <- as.factor(icu2[,5]) icu2[,21] <- as.factor(icu2[,21]) #get rid of ID number = 1st column icu2<-icu[,-1] race2 <- as.integer(icu2[,4]==2) race3 <- as.integer(icu2[,4]==3) loc2 <- as.integer(icu2[,20]==2) loc3 <- as.integer(icu2[,20]==3) icu2 <- icu2[,-c(4,20)] icu2 <- cbind(icu2,loc2,loc3,race2,race3) x <- as.matrix(icu2[,-1]) outf <- glm(formula = STA~.,family=binomial,data=icu2) ESP <- x %*% outf$coef[-1] + outf$coef[1] outr <- glm(formula = STA~AGE+CAN+SYS+TYP+loc2+loc3, family=binomial, data=icu2) xr <- as.matrix(icu2[,c(2,5,9,12,19,20)]) ESPS <- xr %*% outr$coef[-1] + outr$coef[1] plot(ESPS,ESP) abline(0,1) title("EE PLOT for Model without Race") outs <- glm(formula = STA~AGE+CAN+SYS+TYP+loc2+loc3+race2+race3, family=binomial, data=icu2) xs <- as.matrix(icu2[,c(2,5,9,12,19,20,21,22)]) ESPS <- xs %*% outs$coef[-1] + outs$coef[1] plot(ESPS,ESP) abline(0,1) title("EE PLOT for Model with Race") } ##figures 13 and 14: icu2.ps and icu3.ps icuplot2()