##The command
##source("http://parker.ad.siu.edu/Olive/mpack.txt")
##is an easy way to get these functions into R.

##### BETTER robust MLD algorithms for FCH, RFCH, and RMVN
##would use MB if DGK causes an error.

anovasim2<-function(n1=20,n2=20,n3=20,n4=20,m1=0,m2=0,m3=0,m4=0,
sd1=1,sd2=1,sd3=1,sd4=1,type=1,B=400,nruns=100,alpha=0.05){
# Simulates ANOVA F, bootstrap F, modified F, Welch F, modified Welch F,
#rank F statistic, large sample F test for 1 way ANOVA.
# tests Ho: mu1 = mu2 = mu3 = mu4
p <- 4
r <- p-1
n <- n1 + n2 + n3 + n4
up <- min((1 - alpha/2), (1 - alpha + 10*alpha*r/B))
if(alpha > 0.1) up <- min((1 - alpha + 0.05), (1 - alpha + r/B))
qn <- up
if(qn < 1 - alpha + 0.001) up <- 1 - alpha
indx1 <- 1:n1
indx2 <- 1:n2
indx3 <- 1:n3
indx4 <- 1:n4
y <- 0 * 1:n
c1 <- 0 * 1:n
c2 <- c1
c3 <- c1
c4 <- c1
c1[1:n1] <- 1
c2[(n1 + 1):(n1 + n2)] <- 1
c3[(n1 + n2 + 1):(n1 + n2 + n3)] <- 1
c4[(n1 + n2 + n3 + 1):n] <- 1
x <- cbind(c1, c2, c3, c4) #cell means model
zx <- c(rep(1,n1),rep(2,n2),rep(3,n3),rep(4,n4))
y1 <- 1:n1
y2 <- 1:n2
y3 <- 1:n3
y4 <- 1:n4
beta <- c(m1, m2, m3, m4)
bstat <- 1:B
bcut <- 1:nruns
quant <- 1 - alpha
fcut <- qf(quant, p - 1, n - p)
dfmin <- min(n1,n2,n3,n4)
lcut <- qf(quant, p - 1, dfmin)
faov <- 0 * 1:nruns
rf <- faov
lf <- rf
wwf <- rf
mct <- 0
wct <- 0
mwct <- 0
cinv <- diag(x=c(n1,n2,n3)) - (1/n)*as.matrix(c(n1,n2,n3))%*%c(n1,n2,n3)
for(i in 1:nruns) {
#make data
if(type==1){
y1 <- rnorm(n1, mean = m1, sd = sd1)
y2 <- rnorm(n2, mean = m2, sd = sd2)
y3 <- rnorm(n3, mean = m3, sd = sd3)
y4 <- rnorm(n4, mean = m4, sd = sd4) }
if(type==2){
y1 <- rexp(n1, rate = 1/sd1) + m1 - sd1
y2 <- rexp(n2, rate = 1/sd2) + m2 - sd2
y3 <- rexp(n3, rate = 1/sd3) + m3 - sd3
y4 <- rexp(n4, rate = 1/sd4) + m4 - sd4 }
if(type==3){
y1 <- rnorm(n1, mean = m1, sd = sd1)
y2 <- rnorm(n2, mean = m2, sd = sd2)
y3 <- rexp(n3, rate = 1/sd3) + m3 - sd3
y4 <- rexp(n4, rate = 1/sd4) + m4 - sd4 }
y <- c(y1, y2, y3, y4)
mn1 <- mean(y1)
mn2 <- mean(y2)
mn3 <- mean(y3)
mn4 <- mean(y4)
s1sq <- var(y1)
s2sq <- var(y2)
s3sq <- var(y3)
s4sq <- var(y4)
#get ANOVA F statistic
fit <- c(rep(mn1,n1),rep(mn2,n2),rep(mn3,n3),rep(mn4,n4))
res <- y - fit
mse <- t(res) %*% res/(n - p)
sse <- mse * (n - p)
mny <- mean(y)
sstr <- n1 * (mn1 - mny)^2 + n2 * (mn2 - mny)^2 + n3 * (mn3 - mny)^2 +
 n4 * (mn4 - mny)^2
fnum <- sstr/(p - 1)
faov[i] <- fnum/mse
AT <- c(mn1-mn4,mn2-mn4,mn3-mn4)
#faov[i] = AT%*%cinv%*%AT/((p-1)*mse)
#get modified F statistic
num <- (p - 1) * fnum
den <- (1 - n1/n) * s1sq + (1 - n2/n) * s2sq + (1 - n3/n) * s3sq +
 (1 - n4/n) * s4sq
modf <- num/den
c1 <- ((1 - n1/n) * s1sq)/den
c2 <- ((1 - n2/n) * s2sq)/den
c3 <- ((1 - n3/n) * s3sq)/den
c4 <- ((1 - n4/n) * s4sq)/den
finv <- c1^2/(n1 - 1) + c2^2/(n2 - 1) + c3^2/(n3 - 1) + c4^2/(n4 - 1)
md <- ceiling(1/finv)
if(modf > qf(quant, p - 1, md)) mct <- mct + 1
#get Welch F statistic
w1 <- n1/s1sq
w2 <- n2/s2sq
w3 <- n3/s3sq
w4 <- n4/s4sq
u <- w1 + w2 + w3 + w4
xdd <- (w1 * mn1)/u + (w2 * mn2)/u + (w3 * mn3)/u + (w4 * mn4)/u
num <- (w1 * (mn1 - xdd)^2 + w2 * (mn2 - xdd)^2 + w3 * (mn3 - xdd)^2 +
 w4 * (mn4 - xdd)^2)/(p - 1)
tem <- (1 - w1/u)^2/(n1 - 1) + (1 - w2/u)^2/(n2 - 1) +
 (1 - w3/u)^2/(n3 - 1) + (1 - w4/u)^2/(n4 - 1)
den <- 1 + (2 * (p - 2) * tem)/(p^2 - 1)
wf <- num/den
#wwf[i] <- wf
finv <- (3 * tem)/(p^2 - 1)
wd <- ceiling(1/finv)
if(wf > qf(quant, p - 1, wd)) wct <- wct + 1
#get modified Welch F statistic
dfnum <- (1/w1 + 1/w2 + 1/w3 + 1/w4)^2
dfden <- (1/w1)^2/(n1 - 1) + (1/w2)^2/(n2 - 1) + (1/w3)^2/(n3 - 1) +
 (1/w4)^2/(n4 - 1)
wd <- ceiling(dfnum/dfden)
if(wf > qf(quant, p - 1, wd)) mwct <- mwct + 1
#get large sample F test
wt <- c(w1,w2,w3)
dinv <- diag(x=wt) - as.matrix(wt)%*%wt/u
lf[i] <- AT%*%dinv%*%AT/(p-1)
#get rank F statistic
ry <- rank(y)
out <- lsfit(x, ry, intercept = F)
brols <- out$coef
mn1 <- brols[1]
mn2 <- brols[2]
mn3 <- brols[3]
mn4 <- brols[4]
res <- out$resid
mser <- t(res) %*% res/(n - p)
sse <- mser * (n - p)
mny <- mean(ry)
sstr <- n1 * (mn1 - mny)^2 + n2 * (mn2 - mny)^2 + n3 * (mn3 - mny)^2 +
 n4 * (mn4 - mny)^2
fnum <- sstr/(p - 1)
rf[i] <- fnum/mser
#get bootstrap cutoff
bag <- 0
for(j in 1:B){
tem <- sample(indx1,n1,replace=T)
mny1 <- mean(y1[tem])
tem <- sample(indx2,n2,replace=T)
mny2 <- mean(y2[tem])
tem <- sample(indx3,n3,replace=T)
mny3 <- mean(y3[tem])
tem <- sample(indx4,n4,replace=T)
mny4 <- mean(y4[tem])
ATb <- c(mny1-mny4,mny2-mny4,mny3-mny4)
bstat[j] <- (ATb-AT)%*%cinv%*%(ATb-AT)/((p-1)*mse)
}
bcut[i] <- quantile(bstat,up)
}
faovcov <- sum(faov > fcut)/nruns
bootcov <- sum(faov > bcut)/nruns
modfcov <- mct/nruns
mwfcov <- mwct/nruns
wfcov <- wct/nruns
rfcov <- sum(rf > fcut)/nruns
lfcov <- sum(lf > lcut)/nruns
list(faovcov=faovcov,bootcov=bootcov,modfcov=modfcov,wfcov=wfcov,
mwfcov=mwfcov,rfcov=rfcov,lfcov=lfcov,bcut=mean(bcut),fcut=fcut,
lcut=lcut,up=up)
}

bootci<-function(bstat, stat, upc){
#computes the prediction region method and Bickel and Ren CIs for statistic stat.
#bstat is the bootstrap sample T_1^*,...,T_B^* where stat = T_n
cent<-mean(bstat)
adist <- abs(bstat-cent)
phlen <- quantile(adist, upc)
pLn <- cent - phlen
pUn <- cent + phlen
adist <- abs(bstat-stat)
brhlen <- quantile(adist, upc)
brLn <- stat - brhlen
brUn <- stat + brhlen
prcilen<-2*phlen
brcilen<-2*brhlen
list(prci=c(pLn, pUn),prcilen=prcilen,brci=c(brLn, brUn),brcilen=brcilen)
}

cci<-function(x, alpha = 0.05)
{#gets classical 100 (1-alpha)% CI
#defaults are alpha = .05
n <- length(x)
up <- 1 - alpha/2
mn <- mean(x)
v <- var(x)
se <- sqrt(v/n)
val <- qt(up, n - 1) * se
lo <- mn - val
hi <- mn + val
list(int = c(lo, hi), mean = mn, se = se)
}
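## Added usage sketch (not part of the original mpack.txt): cci gives the
## classical t interval, and bootci turns a bootstrap sample of a statistic
## into the prediction region method and Bickel and Ren CIs. The data and
## bootstrap size below are hypothetical.
# z <- rnorm(100)
# cci(z, alpha = 0.05)                       #classical t interval for the mean
# bstat <- replicate(1000, mean(sample(z, replace = TRUE)))
# bootci(bstat, stat = mean(z), upc = 0.95)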
cltv<-function(gam = 0.5)
{# Gets asy var for lts(h) and lta(h) at Cauchy C(0,1)
# where h/n -> gam.
k <- tan((pi * gam)/2)
num <- 2 * k - pi * gam
den <- pi * (gam - (2 * k)/(pi * (1 + k^2)))^2
ltsv <- num/den
num <- gam
den <- 4 * (1/pi - 1/(pi * (1 + k^2)))^2
ltav <- num/den
list(ltsv=ltsv, ltav=ltav)
}

cmba2<-function(x, csteps = 5, ii = 1)
{# gets the covmba estimator using 98, 95, 90, 80, 70, 60 and 50% trimming
#needs p > 1, plots the 7 DD plots
#right click 7 times
n <- dim(x)[1]
p <- dim(x)[2]
mds <- matrix(nrow = n, ncol = 8, 0)
##get the DGK estimator
covs <- var(x)
mns <- apply(x, 2, mean)
cmd <- sqrt(mahalanobis(x, mns, covs))
## concentrate
for(i in 1:csteps) {
md2 <- mahalanobis(x, mns, covs)
medd2 <- median(md2)
mns <- apply(x[md2 <= medd2, ], 2, mean)
covs <- var(x[md2 <= medd2, ])
}
mds[, 8] <- sqrt(mahalanobis(x, mns, covs))
covb <- covs
mnb <- mns
##get the square root of det(covb)
critb <- prod(diag(chol(covb)))
##get the resistant estimator
covv <- diag(p)
med <- apply(x, 2, median)
md2 <- mahalanobis(x, center = med, covv)
smd2 <- sort(md2)
val <- p + 3
tem <- 1:7
tem[1] <- smd2[val + floor(0.02 * n)]
tem[2] <- smd2[val + floor(0.05 * n)]
tem[3] <- smd2[val + floor(0.1 * n)]
tem[4] <- smd2[val + floor(0.2 * n)]
tem[5] <- smd2[val + floor(0.3 * n)]
tem[6] <- smd2[val + floor(0.4 * n)]
tem[7] <- median(md2)
medd2 <- tem[7]
for(j in ii:7) {
## get the start
val2 <- tem[j]
mns <- apply(x[md2 <= val2, ], 2, mean)
covs <- var(x[md2 <= val2, ])
## concentrate
for(i in 1:csteps) {
md2 <- mahalanobis(x, mns, covs)
medd2 <- median(md2)
mns <- apply(x[md2 <= medd2, ], 2, mean)
covs <- var(x[md2 <= medd2, ])
}
mds[, j] <- sqrt(mahalanobis(x, mns, covs))
plot(cmd, mds[, j])
identify(cmd, mds[, j])
crit <- prod(diag(chol(covs)))
if(crit < critb) {
critb <- crit
covb <- covs
mnb <- mns
}
}
pairs(mds)
##scale for better performance at MVN
rd2 <- mahalanobis(x, mnb, covb)
const <- median(rd2)/(qchisq(0.5, p))
covb <- const * covb
list(center = mnb, cov = covb, mds = mds)
}
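## Added usage sketch (not part of the original mpack.txt): the cltv function
## above returns the LTS and LTA asymptotic variances at the Cauchy
## distribution; the gam values below are hypothetical.
# cltv(gam = 0.5)
# cltv(gam = 0.9)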
cmve<-function(x, csteps = 5)
{# gets the cmve, rcmve and mb estimators
zx <- x
x <- as.matrix(x)
p <- dim(x)[2]
##get the DGK estimator
covs <- var(x)
mns <- apply(x, 2, mean)
## concentrate
for(i in 1:csteps) {
md2 <- mahalanobis(x, mns, covs)
medd2 <- median(md2)
if(p > 1) { mns <- apply(x[md2 <= medd2, ], 2, mean) }
if(p == 1) { mns <- mean(zx[md2 <= medd2]) }
covs <- var(x[md2 <= medd2, ])
}
covd <- covs
mnd <- mns
##get the MB estimator
covv <- diag(p)
med <- apply(x, 2, median)
md2 <- mahalanobis(x, center = med, covv)
medd2 <- median(md2)
medd3 <- medd2
## get the start
if(p > 1) { mns <- apply(x[md2 <= medd2, ], 2, mean) }
if(p == 1) { mns <- mean(zx[md2 <= medd2]) }
covs <- var(x[md2 <= medd2, ])
## concentrate
for(i in 1:csteps) {
md2 <- mahalanobis(x, mns, covs)
medd2 <- median(md2)
if(p > 1) { mns <- apply(x[md2 <= medd2, ], 2, mean) }
if(p == 1) { mns <- mean(zx[md2 <= medd2]) }
covs <- var(x[md2 <= medd2, ])
}
covm <- covs
mnm <- mns
##get CMVE attractor
covf <- covm
mnf <- mnm
val <- mahalanobis(t(mnd), med, covv)
if(val < medd3) {
##crit = [med(D)]^p * square root of det(cov)
rd2 <- mahalanobis(x, mnd, covd)
critd <- (sqrt(median(rd2)))^p*prod(diag(chol(covd)))
rd2 <- mahalanobis(x, mnm, covm)
critm <- (sqrt(median(rd2)))^p*prod(diag(chol(covm)))
if(critd < critm) {
covf <- covd
mnf <- mnd
}
}
## get CMVE estimator
chisqm <- qchisq(0.5, p)
rd2 <- mahalanobis(x, mnf, covf)
const <- median(rd2)/chisqm
covf <- const * covf
##reweight the above CMVE estimator (mnf,covf) to get the
##RCMVE estimator (rmnf,rcovf)
rd2 <- mahalanobis(x, mnf, covf)
up <- qchisq(0.975, p)
if(p > 1){ rmnf <- apply(x[rd2 <= up, ], 2, mean) }
if(p == 1){ rmnf <- mean(zx[rd2 <= up]) }
rcovf <- var(x[rd2 <= up, ])
rd2 <- mahalanobis(x, rmnf, rcovf)
const <- median(rd2)/chisqm
rcovf <- const * rcovf
## reweight again
rd2 <- mahalanobis(x, rmnf, rcovf)
if(p > 1){ rmnf <- apply(x[rd2 <= up, ], 2, mean) }
if(p == 1){ rmnf <- mean(zx[rd2 <= up]) }
rcovf <- var(x[rd2 <= up, ])
rd2 <- mahalanobis(x, rmnf, rcovf)
const <- median(rd2)/chisqm
rcovf <- const * rcovf
list(center = mnf, cov = covf, rmnf = rmnf, rcovf = rcovf, mnm = mnm,
covm = covm)
}

concmv<-function(n = 100, csteps = 5, gam = 0.4, outliers = T, start = 3)
{#Shows how concentration works when p = 2.
#right click csteps times
# Use start = 1 for DGK, start = 2 for MBA sphere = FCH sphere = MB estimator,
# start = 3 for MBA coord MED, diag([MAD(X1)]^2,...,[MAD(Xp)]^2) start
p <- 2
#A <- cbind(c(1, 0.9), c(0.9, 1))
x <- matrix(rnorm(n * p), ncol = p, nrow = n)
#A <- diag(sqrt(1:p))
#if(outliers == T) {
#val <- floor(gam * n)
#tem <- 10 + 0 * 1:p
#x[1:val, ] <- x[1:val, ] + tem
#}
#x <- x %*% A
A <- cbind(c(1, 0.4), c(0.4, 1))
B <- cbind(c(0.5, 0), c(0, 0.5))
if(outliers == T) {
val <- floor(gam * n)
x[(val + 1):n, ] <- x[(val + 1):n, ] %*% A
x[1:val, ] <- x[1:val, ] %*% B
x[1:val, 1] <- x[1:val, 1] + 0
x[1:val, 2] <- x[1:val, 2] + 6
}
else { x <- x %*% A }
if(start == 1) {
covs <- var(x)
mns <- apply(x, 2, mean)
}
if(start == 2) {
covv <- diag(p)
med <- apply(x, 2, median)
md2 <- mahalanobis(x, center = med, covv)
medd2 <- median(md2)
## get the start
mns <- apply(x[md2 <= medd2, ], 2, mean)
covs <- var(x[md2 <= medd2, ])
}
if(start > 2) {
tem <- apply(x, 2, mad)^2
covv <- diag(tem)
med <- apply(x, 2, median)
md2 <- mahalanobis(x, center = med, covv)
medd2 <- median(md2)
## get the start
mns <- apply(x[md2 <= medd2, ], 2, mean)
covs <- var(x[md2 <= medd2, ])
}
## concentrate
for(i in 1:csteps) {
md2 <- mahalanobis(x, mns, covs)
medd2 <- median(md2)
mns <- apply(x[md2 <= medd2, ], 2, mean)
covs <- var(x[md2 <= medd2, ])
plot(x[, 1], x[, 2])
points(x[md2 <= medd2, 1], x[md2 <= medd2, 2], pch = 15)
identify(x[, 1], x[, 2])
}
}
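## Added usage sketch (not part of the original mpack.txt): the cmve function
## above on hypothetical clean bivariate normal data; the rmnf and rcovf
## components give the reweighted RCMVE estimator.
# x <- matrix(rnorm(200), ncol = 2)
# out <- cmve(x)
# out$center    #CMVE location
# out$rcovf     #RCMVE scaled dispersion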
concsim<-function(n = 100, p = 2, steps = 5, gam = 0.4, runs = 20)
{# type "library(MASS)" or "library(lqs)."
# Need n > 2p, p > 1. This function is used to determine when the DD
# plot separates outliers from non-outliers for various starts.
A <- sqrt(diag(1:p))
mbact <- 0
fmcdct <- 0
mbct <- 0
madct <- 0
dgkct <- 0
fchct <- 0
for(i in 1:runs) {
x <- matrix(rnorm(n * p), ncol = p, nrow = n)
## outliers have mean (10, 10 sqrt(2), ..., 10 sqrt(p))^T
val <- floor(gam * n)
tem <- 10 + 0 * 1:p
x[1:val, ] <- x[1:val, ] + tem
x <- x %*% A
#MBA
out <- covmba(x, csteps = steps)
center <- out$center
cov <- out$cov
rd2 <- mahalanobis(x, center, cov)
if(min(rd2[1:val]) > max(rd2[(val + 1):n])) mbact <- mbact + 1
#DGK
covs <- var(x)
mns <- apply(x, 2, mean)
## concentrate
for(j in 1:steps) {
md2 <- mahalanobis(x, mns, covs)
medd2 <- median(md2)
mns <- apply(x[md2 <= medd2, ], 2, mean)
covs <- var(x[md2 <= medd2, ])
}
rd2 <- mahalanobis(x, mns, covs)
if(min(rd2[1:val]) > max(rd2[(val + 1):n])) dgkct <- dgkct + 1
#Median Ball start
covv <- diag(p)
med <- apply(x, 2, median)
md2 <- mahalanobis(x, center = med, covv)
medd2 <- median(md2)
## get the start
mns <- apply(x[md2 <= medd2, ], 2, mean)
covs <- var(x[md2 <= medd2, ])
## concentrate
for(j in 1:steps) {
md2 <- mahalanobis(x, mns, covs)
medd2 <- median(md2)
mns <- apply(x[md2 <= medd2, ], 2, mean)
covs <- var(x[md2 <= medd2, ])
}
rd2 <- mahalanobis(x, mns, covs)
if(min(rd2[1:val]) > max(rd2[(val + 1):n])) mbct <- mbct + 1
#MAD start
tem <- apply(x, 2, mad)^2
covv <- diag(tem)
md2 <- mahalanobis(x, center = med, covv)
medd2 <- median(md2)
## get the start
mns <- apply(x[md2 <= medd2, ], 2, mean)
covs <- var(x[md2 <= medd2, ])
## concentrate
for(j in 1:steps) {
md2 <- mahalanobis(x, mns, covs)
medd2 <- median(md2)
mns <- apply(x[md2 <= medd2, ], 2, mean)
covs <- var(x[md2 <= medd2, ])
}
rd2 <- mahalanobis(x, mns, covs)
if(min(rd2[1:val]) > max(rd2[(val + 1):n])) madct <- madct + 1
#FMCD
out <- cov.mcd(x)
center <- out$center
cov <- out$cov
rd2 <- mahalanobis(x, center, cov)
if(min(rd2[1:val]) > max(rd2[(val + 1):n])) fmcdct <- fmcdct + 1
#FCH
out <- covfch(x, csteps = steps)
center <- out$center
cov <- out$cov
rd2 <- mahalanobis(x, center, cov)
if(min(rd2[1:val]) > max(rd2[(val + 1):n])) fchct <- fchct + 1
}
list(mbact = mbact, fmcdct = fmcdct, dgkct = dgkct, mbct = mbct,
madct = madct, fchct = fchct)
}
confreg<-function(x, g = 4, that = 1:4, alpha = 0.05){
# Makes confidence regions for theta from rows of x = Ti* from a bootstrap.
# Use for testing H_0: theta = 0 versus H_1: theta != 0.
# The prediction region method, hybrid, and Bickel and Ren regions are used.
# If g = 1, the shorth interval should work better.
# Also computes the distance for the 0 vector.
# Need g = dim(x)[2] and T = that the g by 1 estimator of theta.
# Often that = A betahat(I_min,0).
# Note that center = Tbar* = bagging estimator and cov = S*_T.
x <- as.matrix(x)
that <- as.vector(that)
n <- dim(x)[1]
zero <- 0*(1:g)
up <- min((1 - alpha/2), (1 - alpha + 10*alpha*g/n))
if(alpha > 0.1) up <- min((1 - alpha + 0.05), (1 - alpha + g/n))
qn <- up
if(qn < 1 - alpha + 0.001) up <- 1 - alpha
center <- apply(x, 2, mean)
cov <- var(x)
md2 <- mahalanobis(x, center, cov)
# MD is the classical distance
MD <- sqrt(md2)
#get prediction region method cutoff
cuplim <- sqrt(quantile(md2, up))
D0 <- sqrt(mahalanobis(zero, center, cov))
#get hybrid region statistic = Bickel and Ren statistic
br0 <- sqrt(mahalanobis(zero, that, cov))
#get the Bickel and Ren cutoff and test statistic
br2 <- mahalanobis(x,that,cov)
brlim <- sqrt(quantile(br2, up))
list(cuplim=cuplim,brlim=brlim,br0=br0,D0=D0,MD=MD,center=center,cov=cov)
}

corboot<-function(x, B = 1000){
#rowwise bootstrap of the correlation matrix
#stacks entries above the diagonal into a vector beta
x <- as.matrix(x)
p <- dim(x)[2]
n <- dim(x)[1]
indx <- 1:n
nc <- p*(p-1)/2
betas <- matrix(0,nrow=B,ncol=nc)
for(i in 1:B){
tem <- sample(indx,n,replace=T)
out <- cor(x[tem,])
temp <- out[1,(2:p)]
for(j in (2:(p-1))) temp <- c(temp,out[j,((j+1):p)])
betas[i,] <- temp
}
list(betas=betas)
}

corbootsim<-function(n = 100, p = 4, BB=1000, nruns = 100, type = 1,
psi = 0.0, dd=1, eps = 0.25, alph = 0.05){
#Simulates bootstrap for correlation matrix.
#Stacks entries above the diagonal into a vector beta.
# Make x where type = 1 for MVN Nq(0,I),
# 2, 3, 4 and 5 (with delta = eps) for (1 - delta) Nq(0,I) + delta Nq(0, 25 I)
# 6, 7, 8 and 9 for multivariate t_d with d = 3, 5, 19 or dd
# 10 for lognormal.
# Multiply x by A: for MVN data this results
# in a covariance matrix with eigenvector c(1, ..., 1)^T
# corresponding to the largest eigenvalue. As psi gets
# close to 1, the data clusters about the line in the
# direction of (1, ..., 1)^T. See Maronna and Zamar (2002).
# cor(X_i,X_j) = [2 psi +(p-2)psi^2]/[1 + (p-1)psi^2], i not = j
# when the correlation exists.
nc <- p*(p-1)/2
ncp1 <- nc + 1
cicov <- 0*(1:ncp1)
avelen <- cicov
rho <- (2*psi + (p-2)*psi^2)/(1 + (p-1)*psi^2)
A <- matrix(psi,nrow=p,ncol=p)
diag(A) <- 1
for(i in 1:nruns) {
#make data
x <- matrix(rnorm(n * p), nrow = n, ncol = p)
if(type == 2) {
zu <- runif(n)
x[zu < 0.4, ] <- x[zu < 0.4, ] * 5 }
if(type == 3) {
zu <- runif(n)
x[zu < 0.6, ] <- x[zu < 0.6, ] * 5 }
if(type == 4) {
zu <- runif(n)
x[zu < 0.1, ] <- x[zu < 0.1, ] * 5 }
if(type == 5) {
zu <- runif(n)
x[zu < eps, ] <- x[zu < eps, ] * 5 }
if(type == 6) {
zu <- sqrt(rchisq(n, 3)/3)
x <- x/zu }
if(type == 7) {
zu <- sqrt(rchisq(n, 5)/5)
x <- x/zu }
if(type == 8) {
zu <- sqrt(rchisq(n, 19)/19)
x <- x/zu }
if(type == 9) {
zu <- sqrt(rchisq(n, dd)/dd)
x <- x/zu }
if(type == 10) x <- exp(x)
x <- x %*% A
out <- corboot(x, B = BB) #rowwise bootstrap
for(j in 1:nc){
tem <- shorth3(out$betas[,j],alpha=alph)
if(rho >= tem$shorth[1] && rho <= tem$shorth[2]) cicov[j] <- cicov[j] + 1
avelen[j] <- avelen[j] + tem$shorth[2] - tem$shorth[1]
}
#test whether the nc values of beta are 0
tem <- predreg(out$betas,alpha=alph)
if(tem$D0 <= tem$cuplim) cicov[ncp1] <- cicov[ncp1] + 1
avelen[ncp1] <- avelen[ncp1] + tem$cuplim
}
cicov <- cicov/nruns
avelen <- avelen/nruns
list(cicov=cicov,avelen=avelen)
}
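## Added usage sketch (not part of the original mpack.txt): bootstrap the
## correlation matrix with corboot, then test Ho: all correlations = 0 with
## confreg. The data is hypothetical, and that is stacked row by row above
## the diagonal to match corboot's ordering.
# x <- matrix(rnorm(400), ncol = 4)
# out <- corboot(x, B = 500)
# R <- cor(x)
# that <- c(R[1,2:4], R[2,3:4], R[3,4])
# confreg(out$betas, g = 6, that = that)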
corrsim<-function(n = 100, p = 3, eps = 0.4, nruns = 100, type = 5)
{#Need n > 2p. For R, type "library(MASS)" before using this
#function. This function generates nruns n by p matrices x.
# The output is the nruns sample correlations between the MDi and RDi.
# RDi uses covmba for type = 1, rmba for type = 2, cov.mcd for
# type = 3, covfch for type = 4, rfch for type = 5, cmve for type = 6,
# rcmve for type = 7, rmvn for type = 8.
# mahalanobis gives squared Maha distances
corrs <- 1:nruns
for(i in 1:nruns) {
wt <- 0 * (1:n)
x <- matrix(rnorm(n * p), ncol = p, nrow = n)
#The following 3 commands make x elliptically contoured.
#zu <- runif(n)
#x[zu < eps,] <- x[zu < eps,]*5
#x <- x^2
# To make marginals of x lognormal, use
#x <- exp(x)
center <- apply(x, 2, mean)
cov <- var(x)
md2 <- mahalanobis(x, center, cov)
if(type == 1) { out <- covmba(x) }
if(type == 2) { out <- rmba(x) }
if(type == 3) { out <- cov.mcd(x) }
if(type == 4) { out <- covfch(x) }
if(type == 5) {
out <- covfch(x)
center <- out$rmnf
cov <- out$rcovf
}
if(type == 6) { out <- cmve(x) }
if(type == 7) {
out <- cmve(x)
center <- out$rmnf
cov <- out$rcovf
}
if(type == 8) { out <- covrmvn(x) }
if(type != 5 && type != 7){
center <- out$center
cov <- out$cov
}
rd2 <- mahalanobis(x, center, cov)
# need square roots for the usual distances
md <- sqrt(md2)
rd <- sqrt(rd2)
const <- sqrt(qchisq(0.5, p))/median(rd)
rd <- const * rd
# wt[rd < sqrt(qchisq(0.975, p))] <- 1
# corrs[i] <- cor(md[wt > 0], rd[wt > 0])
corrs[i] <- cor(md, rd)
}
cmean <- mean(corrs)
cmin <- min(corrs)
clt95 <- sum(corrs < 0.95)
clt80 <- sum(corrs < 0.8)
list(cmean = cmean, cmin = cmin, clt95 = clt95, clt80 = clt80, corrs = corrs)
}
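## Added usage sketch (not part of the original mpack.txt): DD plot
## correlations for the RFCH estimator on clean MVN data; type
## "library(MASS)" first if a cov.mcd type is used. Arguments are
## hypothetical.
# corrsim(n = 100, p = 3, nruns = 20, type = 5)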
corrsim2<-function(n = 100, q = 4, nruns = 100, xtype = 1, type = 1,
dd = 1, eps = 0.25){
# Need n > 2q. R users need to type library(MASS).
# MAY NOT WORK IF q = 1
# Multiply x by A where xtype = 1 for MVN Nq(0,I),
# 2, 3, 4 and 5 (with delta = eps) for (1 - delta) Nq(0,I) + delta Nq(0, 25 I)
# 6, 7, 8 and 9 for multivariate t_d with d = 3, 5, 19 or dd
# 10 for lognormal.
# The output is the nruns sample correlations between the MDi and RDi.
# RDi uses covmba for type = 1, rmba for type = 2, cov.mcd for
# type = 3, covfch for type = 4, rfch for type = 5, cmve for type = 6,
# rcmve for type = 7, rmvn for type = 8.
# mahalanobis gives squared Maha distances
set.seed(974)
corrs <- 1:nruns
k <- q/2
A <- sqrt(diag(1:q))
for(i in 1:nruns) {
#make data
x <- matrix(rnorm(n * q), nrow = n, ncol = q)
if(xtype == 2) {
zu <- runif(n)
x[zu < 0.4, ] <- x[zu < 0.4, ] * 5 }
if(xtype == 3) {
zu <- runif(n)
x[zu < 0.6, ] <- x[zu < 0.6, ] * 5 }
if(xtype == 4) {
zu <- runif(n)
x[zu < 0.1, ] <- x[zu < 0.1, ] * 5 }
if(xtype == 5) {
zu <- runif(n)
x[zu < eps, ] <- x[zu < eps, ] * 5 }
if(xtype == 6) {
zu <- sqrt(rchisq(n, 3)/3)
x <- x/zu }
if(xtype == 7) {
zu <- sqrt(rchisq(n, 5)/5)
x <- x/zu }
if(xtype == 8) {
zu <- sqrt(rchisq(n, 19)/19)
x <- x/zu }
if(xtype == 9) {
zu <- sqrt(rchisq(n, dd)/dd)
x <- x/zu }
if(xtype == 10) x <- exp(x)
x <- x %*% A
center <- apply(x, 2, mean)
cov <- var(x)
md2 <- mahalanobis(x, center, cov)
if(type == 1) { out <- covmba(x) }
if(type == 2) { out <- rmba(x) }
if(type == 3) { out <- cov.mcd(x) }
if(type == 4) { out <- covfch(x) }
if(type < 5) {
center <- out$center
cov <- out$cov
}
if(type == 5) {
out <- covfch(x)
center <- out$rmnf
cov <- out$rcovf
}
if(type == 6) { out <- cmve(x) }
if(type == 7) {
out <- cmve(x)
center <- out$rmnf
cov <- out$rcovf
}
if(type == 8) { out <- covrmvn(x) }
if(type != 5 && type != 7){
center <- out$center
cov <- out$cov
}
rd2 <- mahalanobis(x, center, cov)
# need square roots for the usual distances
md <- sqrt(md2)
rd <- sqrt(rd2)
const <- sqrt(qchisq(0.5, q))/median(rd)
rd <- const * rd
corrs[i] <- cor(md, rd)
}
cmean <- mean(corrs)
cmin <- min(corrs)
clt95 <- sum(corrs < 0.95)
clt80 <- sum(corrs < 0.8)
list(cmean = cmean, cmin = cmin, clt95 = clt95, clt80 = clt80, corrs = corrs)
}

covcheck<-function(n = 100, p = 4, xtype = 1, dd = 3, eps = 0.25, tau = 5)
{# MAY NOT WORK IF p = 1.
# Simulates multivariate data and computes the population
# and sample covariance matrices for several distributions.
# p = number of predictors including intercept
# want n > 10 p
# xtype = 1 for MVN Np(0,I),
# xtype = 2 for (1 - eps) Np(0,I) + eps Np(0, (tau^2) I)
# eps = 0.1, 0.25, 0.4, and 0.6 are interesting
# xtype = 3 for multivariate t_d with d = dd degrees of freedom
# Need dd > 2.
# xtype = 4 for lognormal - E(lognormal).
#make data
x <- matrix(rnorm(n * p), nrow = n, ncol = p)
if(xtype == 1) cov <- diag(p)
if(xtype == 2) {
zu <- runif(n)
x[zu < eps, ] <- x[zu < eps, ] * tau
cov <- (1 - eps + eps*tau^2) * diag(p)
}
if(xtype == 3) {
zu <- sqrt(rchisq(n, dd)/dd)
x <- x/zu
cov <- (dd/(dd-2))*diag(p)
}
if(xtype == 4){
x <- exp(x) - exp(0.5)
cov <- (exp(2) - exp(1)) * diag(p)
}
xbar <- apply(x, 2, mean)
covhat <- var(x)
list(xbar = xbar, covhat = covhat, cov = cov)
}

covdgk<-function(x, csteps = 10)
{#computes the scaled DGK multivariate estimator, need p > 1
p <- dim(x)[2]
covs <- var(x)
mns <- apply(x, 2, mean)
## concentrate
for(i in 1:csteps) {
md2 <- mahalanobis(x, mns, covs)
medd2 <- median(md2)
mns <- apply(x[md2 <= medd2, ], 2, mean)
covs <- var(x[md2 <= medd2, ])
}
##scale for consistency at MVN
rd2 <- mahalanobis(x, mns, covs)
const <- median(rd2)/(qchisq(0.5, p))
covs <- const * covs
list(center = mns, cov = covs)
}
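## Added usage sketch (not part of the original mpack.txt): compare the
## sample covariance matrix to the population covariance matrix for the
## contaminated normal, and get the scaled DGK estimator; arguments are
## hypothetical.
# covcheck(n = 1000, p = 4, xtype = 2, eps = 0.25, tau = 5)
# covdgk(matrix(rnorm(300), ncol = 3))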
covesim<-function(n = 10, p = 2, csteps = 5, gam = 0.4, runs = 20,
outliers = 0, pm = 10){
# This R function compares various ways to robustly estimate the
# covariance matrix. The estimators used are ccov: the classical
# estimator applied to the clean cases, RFCH and RMVN.
# For RMVN want diago approx (1, 2, ...., p)
# For RFCH want diagfch approx (1, 2, ...., p)
# Need p > 1.
# outliers = 0 for no outliers and X~N(0,diag(1,...,p)),
# 1 for outliers a tight cluster at major axis (0,...,0,pm)'
# 2 for outliers a tight cluster at minor axis (pm,0, ...,0)'
# 3 for outliers X~N((pm,...,pm)',diag(1,...,p))
# 4 for outliers X[i,p] = pm
# 5 for outliers X[i,1] = pm
A <- sqrt(diag(1:p))
ccov <- 0 * A
rfche <- ccov
rmvne <- ccov
covv <- diag(p)
val <- floor(gam * n)
up <- qchisq(0.975, p)
qchi <- qchisq(0.5, p)
qtem <- 1:runs
qtem2 <- qtem
diagdiff <- 1:p
for(i in 1:runs) {
x <- matrix(rnorm(n * p), ncol = p, nrow = n)
x <- x %*% A
if(outliers == 1) {
x[1:val, ] <- matrix(rnorm(val * p, sd = 0.01), ncol = p, nrow = val)
x[1:val, p] <- x[1:val, p] + pm
}
if(outliers == 2) {
x[1:val, ] <- matrix(rnorm(val * p, sd = 0.01), ncol = p, nrow = val)
x[1:val, 1] <- x[1:val, 1] + pm
}
if(outliers == 3) {
tem <- pm + 0 * 1:p
x[1:val, ] <- x[1:val, ] + tem
}
if(outliers == 4) { x[1:val, p] <- pm }
if(outliers == 5) { x[1:val, 1] <- pm }
ccov <- ccov + var(x[ - (1:val), ])
##get the DGK estimator
covs <- var(x)
mns <- apply(x, 2, mean)
## concentrate
for(j in 1:csteps) {
md2 <- mahalanobis(x, mns, covs)
medd2 <- median(md2)
mns <- apply(x[md2 <= medd2, ], 2, mean)
covs <- var(x[md2 <= medd2, ])
}
covd <- covs
mnd <- mns
##get the MB estimator
med <- apply(x, 2, median)
md2 <- mahalanobis(x, center = med, covv)
medd2 <- median(md2)
medd3 <- medd2
## get the start
mns <- apply(x[md2 <= medd2, ], 2, mean)
covs <- var(x[md2 <= medd2, ])
## concentrate
for(j in 1:csteps) {
md2 <- mahalanobis(x, mns, covs)
medd2 <- median(md2)
mns <- apply(x[md2 <= medd2, ], 2, mean)
covs <- var(x[md2 <= medd2, ])
}
covm <- covs
mnm <- mns
##get FCH attractor
covf <- covm
mnf <- mnm
val2 <- mahalanobis(t(mnd), med, covv)
if(val2 < medd3) {
##crit = square root of det(cov)
critd <- prod(diag(chol(covd)))
critm <- prod(diag(chol(covm)))
if(critd < critm) {
covf <- covd
mnf <- mnd
}
}
## get FCH estimator
rd2 <- mahalanobis(x, mnf, covf)
const <- median(rd2)/qchi
covf <- const * covf
##reweight the above FCH estimator (mnf,covf) to get the RFCH estimator
## (rmnf,rcovf)
rd2 <- mahalanobis(x, mnf, covf)
rmnf <- apply(x[rd2 <= up, ], 2, mean)
rcovf <- var(x[rd2 <= up, ])
##use the following 3 commands for the covrmvn estimator
rd3 <- rd2
rmnmvn <- rmnf
rcovmvn <- rcovf
##
rd2 <- mahalanobis(x, rmnf, rcovf)
##rd4 is used for covrmvn
rd4 <- rd2
const <- median(rd2)/qchi
rcovf <- const * rcovf
## reweight again
rd2 <- mahalanobis(x, rmnf, rcovf)
rmnf <- apply(x[rd2 <= up, ], 2, mean)
rcovf <- var(x[rd2 <= up, ])
rd2 <- mahalanobis(x, rmnf, rcovf)
const <- median(rd2)/qchi
rcovf <- const * rcovf
rfche <- rfche + rcovf
##reweight the FCH estimator (mnf,covf) to get the covrmvn estimator
## (rmnmvn,rcovmvn) tailored for MVN data
d1 <- sum(rd3 <= up)
qchi2 <- (0.5 * 0.975 * n)/d1
qchi2 <- min(qchi2, 0.995)
qtem[i] <- qchi2
const <- median(rd4)/qchisq(qchi2, p)
rcovmvn <- const * rcovmvn
## reweight again
rd2 <- mahalanobis(x, rmnmvn, rcovmvn)
rmnmvn <- apply(x[rd2 <= up, ], 2, mean)
rcovmvn <- var(x[rd2 <= up, ])
d2 <- sum(rd2 <= up)
rd2 <- mahalanobis(x, rmnmvn, rcovmvn)
qchi2 <- (0.5 * 0.975 * n)/d2
qchi2 <- min(qchi2, 0.995)
qtem2[i] <- qchi2
const <- median(rd2)/qchisq(qchi2, p)
rcovmvn <- const * rcovmvn
rmvne <- rmvne + rcovmvn
}
ccov <- ccov/runs
rfche <- rfche/runs
rmvne <- rmvne/runs
diago <- diag(rmvne)
diagdiff <- diag(ccov) - diag(rmvne)
abssumrmvn <- sum(abs(diagdiff))
diagfch <- diag(rfche)
diagdifch <- diag(ccov) - diag(rfche)
abssumfch <- sum(abs(diagdifch))
list(ccov = ccov, rfche = rfche, rmvne = rmvne, diago = diago,
diagdiff = diagdiff, abssumrmvn = abssumrmvn, diagfch = diagfch,
diagdifch = diagdifch, abssumfch = abssumfch)
}

covfch<-function(x, csteps = 5)
{# gets the FCH and RFCH estimators and the MB attractor
# works for p = 1
zx <- x
x <- as.matrix(x)
p <- dim(x)[2]
##get the DGK estimator
covs <- var(x)
mns <- apply(x, 2, mean)
## concentrate
for(i in 1:csteps) {
md2 <- mahalanobis(x, mns, covs)
medd2 <- median(md2)
if(p > 1) { mns <- apply(x[md2 <= medd2, ], 2, mean) }
if(p == 1) { mns <- mean(zx[md2 <= medd2]) }
covs <- var(x[md2 <= medd2, ])
}
covd <- covs
mnd <- mns
##get the MB estimator
covv <- diag(p)
med <- apply(x, 2, median)
md2 <- mahalanobis(x, center = med, covv)
medd2 <- median(md2)
##get the location criterion
lcut <- medd2
## get the start
if(p > 1) { mns <- apply(x[md2 <= medd2, ], 2, mean) }
if(p == 1) { mns <- mean(zx[md2 <= medd2]) }
covs <- var(x[md2 <= medd2, ])
## concentrate
for(i in 1:csteps) {
md2 <- mahalanobis(x, mns, covs)
medd2 <- median(md2)
if(p > 1) { mns <- apply(x[md2 <= medd2, ], 2, mean) }
if(p == 1) { mns <- mean(zx[md2 <= medd2]) }
covs <- var(x[md2 <= medd2, ])
}
covm <- covs
mnm <- mns
##get FCH attractor
covf <- covm
mnf <- mnm
val <- mahalanobis(t(mnd), med, covv)
if(val < lcut) {
##crit = square root of det(cov)
critd <- prod(diag(chol(covd)))
critm <- prod(diag(chol(covm)))
if(critd < critm) {
covf <- covd
mnf <- mnd
}
}
## get FCH estimator
chisqm <- qchisq(0.5, p)
rd2 <- mahalanobis(x, mnf, covf)
const <- median(rd2)/chisqm
covf <- const * covf
##reweight the above FCH estimator (mnf,covf) to get the RFCH estimator
## (rmnf,rcovf)
rd2 <- mahalanobis(x, mnf, covf)
up <- qchisq(0.975, p)
if(p > 1) { rmnf <- apply(x[rd2 <= up, ], 2, mean) }
if(p == 1){ rmnf <- mean(zx[rd2 <= up]) }
rcovf <- var(x[rd2 <= up, ])
rd2 <- mahalanobis(x, rmnf, rcovf)
const <- median(rd2)/chisqm
rcovf <- const * rcovf
## reweight again
rd2 <- mahalanobis(x, rmnf, rcovf)
if(p > 1){ rmnf <- apply(x[rd2 <= up, ], 2, mean) }
if(p == 1){ rmnf <- mean(zx[rd2 <= up]) }
rcovf <- var(x[rd2 <= up, ])
rd2 <- mahalanobis(x, rmnf, rcovf)
const <- median(rd2)/chisqm
rcovf <- const * rcovf
list(center = mnf, cov = covf, rmnf = rmnf, rcovf = rcovf, mnm=mnm,
covm=covm)
}
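## Added usage sketch (not part of the original mpack.txt): FCH and RFCH
## from covfch above on data with a hypothetical cluster of outliers.
# x <- matrix(rnorm(200), ncol = 2)
# x[1:20, ] <- x[1:20, ] + 10
# out <- covfch(x)
# out$center    #FCH location
# out$rmnf      #RFCH location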
covfch2<-function(x, csteps = 5, locc = 0.5)
{# Gets the FCH and RFCH estimators and the MB attractor.
# Allows the location criterion cutoff to be varied.
zx <- x
x <- as.matrix(x)
p <- dim(x)[2]
##get the DGK estimator
covs <- var(x)
mns <- apply(x, 2, mean)
## concentrate
for(i in 1:csteps) {
md2 <- mahalanobis(x, mns, covs)
medd2 <- median(md2)
if(p > 1) { mns <- apply(x[md2 <= medd2, ], 2, mean) }
if(p == 1) { mns <- mean(zx[md2 <= medd2]) }
covs <- var(x[md2 <= medd2, ])
}
covd <- covs
mnd <- mns
##get the MB estimator
covv <- diag(p)
med <- apply(x, 2, median)
md2 <- mahalanobis(x, center = med, covv)
medd2 <- median(md2)
##get the location criterion cutoff
lcut <- medd2
if(locc != 0.5) lcut <- quantile(md2,locc)
## get the start
if(p > 1) { mns <- apply(x[md2 <= medd2, ], 2, mean) }
if(p == 1) { mns <- mean(zx[md2 <= medd2]) }
covs <- var(x[md2 <= medd2, ])
## concentrate
for(i in 1:csteps) {
md2 <- mahalanobis(x, mns, covs)
medd2 <- median(md2)
if(p > 1) { mns <- apply(x[md2 <= medd2, ], 2, mean) }
if(p == 1) { mns <- mean(zx[md2 <= medd2]) }
covs <- var(x[md2 <= medd2, ])
}
covm <- covs
mnm <- mns
##get FCH attractor
covf <- covm
mnf <- mnm
val <- mahalanobis(t(mnd), med, covv)
if(val < lcut) {
##crit = square root of det(cov)
critd <- prod(diag(chol(covd)))
critm <- prod(diag(chol(covm)))
if(critd < critm) {
covf <- covd
mnf <- mnd
}
}
## get FCH estimator
chisqm <- qchisq(0.5, p)
rd2 <- mahalanobis(x, mnf, covf)
const <- median(rd2)/chisqm
covf <- const * covf
##reweight the above FCH estimator (mnf,covf) to get the RFCH estimator
## (rmnf,rcovf)
rd2 <- mahalanobis(x, mnf, covf)
up <- qchisq(0.975, p)
if(p > 1) { rmnf <- apply(x[rd2 <= up, ], 2, mean) }
if(p == 1){ rmnf <- mean(zx[rd2 <= up]) }
rcovf <- var(x[rd2 <= up, ])
rd2 <- mahalanobis(x, rmnf, rcovf)
const <- median(rd2)/chisqm
rcovf <- const * rcovf
## reweight again
rd2 <- mahalanobis(x, rmnf, rcovf)
if(p > 1){ rmnf <- apply(x[rd2 <= up, ], 2, mean) }
if(p == 1){ rmnf <- mean(zx[rd2 <= up]) }
rcovf <- var(x[rd2 <= up, ])
rd2 <- mahalanobis(x, rmnf, rcovf)
const <- median(rd2)/chisqm
rcovf <- const * rcovf
list(center = mnf, cov = covf, rmnf = rmnf, rcovf = rcovf, mnm=mnm,
covm=covm)
}

covmb<-function(x, steps = 5, scale=F)
{# Computes the median ball estimator.
# Needs p > 1. If scale = T, the plotted points will roughly
# scatter about the identity line if the data is MVN
# and spherical about mu.
p <- dim(x)[2]
#Median Ball start
covv <- diag(p)
med <- apply(x, 2, median)
md2 <- mahalanobis(x, center = med, covv)
medd2 <- median(md2)
## get the half set start
mns <- apply(x[md2 <= medd2, ], 2, mean)
covs <- var(x[md2 <= medd2, ])
## concentrate
for(i in 1:steps) {
md2 <- mahalanobis(x, mns, covs)
medd2 <- median(md2)
mns <- apply(x[md2 <= medd2, ], 2, mean)
covs <- var(x[md2 <= medd2, ])
}
if(scale == T){
rd2 <- mahalanobis(x, mns, covs)
const <- median(rd2)/(qchisq(0.5, p))
covs <- const * covs
}
list(center=mns,cov=covs)
}
covmb2<-function(x, m=0, k=5, msteps=9){
# Computes the covmb2 estimator with concentration type steps. Needs p > 1.
# Use even if p > n. Look at out<-medout(x) to determine how many cases
# m are clean. Use m >= n/2.
# Estimate m if m = 0: use k >= 0, so at least half of the cases are used,
# and do the msteps concentration type steps to get a weighted median.
# The concentration type steps help the most when the outlier proportion
# is high. Try covmb2(x,msteps=0) and covmb2(x,msteps=9).
# Using msteps > 0 does slow down the function some.
p <- dim(x)[2]
#Median Ball start
covv <- diag(p)
med <- apply(x, 2, median)
#Get squared Euclidean distances from coordinatewise median.
md2 <- mahalanobis(x, center = med, covv)
if(m == 0){
if(msteps > 0){#do concentration type steps
for(i in 1:msteps){
medd <- median(md2)
medw <- apply(x[md2<=medd,], 2, median)
md2 <- mahalanobis(x, center = medw, covv)
}
}
md <- sqrt(md2)
mcut <- median(md) + k*mad(md,constant=1)
mns <- apply(x[md <= mcut, ], 2, mean)
covs <- var(x[md <= mcut, ])
}
else{ #Use m cases with the smallest distances.
mcut <- sort(md2)[m]
mns <- apply(x[md2 <= mcut, ], 2, mean)
covs <- var(x[md2 <= mcut, ])
}
list(center=mns,cov=covs)
}

covmba<-function(x, csteps = 5)
{# gets the MBA estimator, works for p = 1
zx <- x
x <- as.matrix(x)
p <- dim(x)[2]
##get the DGK estimator
covs <- var(x)
mns <- apply(x, 2, mean)
## concentrate
for(i in 1:csteps) {
md2 <- mahalanobis(x, mns, covs)
medd2 <- median(md2)
if(p > 1){ mns <- apply(x[md2 <= medd2, ], 2, mean) }
if(p == 1){ mns <- mean(zx[md2 <= medd2]) }
covs <- var(x[md2 <= medd2, ])
}
covb <- covs
mnb <- mns
##get the square root of det(covb)
critb <- prod(diag(chol(covb)))
##get the resistant estimator
covv <- diag(p)
med <- apply(x, 2, median)
md2 <- mahalanobis(x, center = med, covv)
medd2 <- median(md2)
## get the start
if(p > 1){ mns <- apply(x[md2 <= medd2, ], 2, mean) }
if(p == 1){ mns <- mean(zx[md2 <= medd2]) }
covs <- var(x[md2 <= medd2, ])
## concentrate
for(i in 1:csteps) {
md2 <- mahalanobis(x, mns, covs)
medd2 <- median(md2)
if(p > 1){ mns <- apply(x[md2 <= medd2, ], 2, mean) }
if(p == 1){ mns <- mean(zx[md2 <= medd2]) }
covs <- var(x[md2 <= medd2, ])
}
crit <- prod(diag(chol(covs)))
if(crit < critb) {
critb <- crit
covb <- covs
mnb <- mns
}
##scale for better performance at MVN
rd2 <- mahalanobis(x, mnb, covb)
const <- median(rd2)/(qchisq(0.5, p))
covb <- const * covb
list(center = mnb, cov = covb, mbL = mns, mbc = covs)
}

covmba2<-function(x, csteps = 5)
{# gets the MBA estimator, use covmba2 instead of covmba if p > 1
p <- dim(x)[2]
##get the DGK estimator
covs <- var(x)
mns <- apply(x, 2, mean)
## concentrate
for(i in 1:csteps) {
md2 <- mahalanobis(x, mns, covs)
medd2 <- median(md2)
mns <- apply(x[md2 <= medd2, ], 2, mean)
covs <- var(x[md2 <= medd2, ])
}
covb <- covs
mnb <- mns
##get the square root of det(covb)
critb <- prod(diag(chol(covb)))
##get the resistant estimator
covv <- diag(p)
med <- apply(x, 2, median)
md2 <- mahalanobis(x, center = med, covv)
medd2 <- median(md2)
## get the start
mns <- apply(x[md2 <= medd2, ], 2, mean)
covs <- var(x[md2 <= medd2, ])
## concentrate
for(i in 1:csteps) {
md2 <- mahalanobis(x, mns, covs)
medd2 <- median(md2)
mns <- apply(x[md2 <= medd2, ], 2, mean)
covs <- var(x[md2 <= medd2, ])
}
crit <- prod(diag(chol(covs)))
if(crit < critb) {
critb <- crit
covb <- covs
mnb <- mns
}
##scale for better performance at MVN
rd2 <- mahalanobis(x, mnb, covb)
const <- median(rd2)/(qchisq(0.5, p))
covb <- const * covb
list(center = mnb, cov = covb)
}
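## Added usage sketch (not part of the original mpack.txt): covmb2 above can
## be used even when p > n; the dimensions and contamination below are
## hypothetical.
# x <- matrix(rnorm(50 * 100), nrow = 50)   # n = 50 cases, p = 100 variables
# x[1:5, ] <- x[1:5, ] + 10                 # 5 outlying cases
# out <- covmb2(x, msteps = 9)
# out$center[1:5]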
covrmb<-function(x, csteps = 5){
# Need p > 1. Produces the median ball and reweighted median ball (RMB)
# estimators. RMB is tailored to estimate the covariance matrix
# of the bulk of the data if the bulk of the data is MVN
# and the outliers are "not too bad."
# This function is like covrmvn, except only the MB attractor
# is used.
x <- as.matrix(x)
p <- dim(x)[2]
n <- dim(x)[1]
up <- qchisq(0.975, p)
qchi <- qchisq(0.5, p)
##get the MB estimator
covv <- diag(p)
med <- apply(x, 2, median)
md2 <- mahalanobis(x, center = med, covv)
medd2 <- median(md2)
## get the half subset start
mnm <- apply(x[md2 <= medd2, ], 2, mean)
covm <- var(x[md2 <= medd2, ])
## concentrate
for(i in 1:csteps) {
md2 <- mahalanobis(x, mnm, covm)
medd2 <- median(md2)
mnm <- apply(x[md2 <= medd2, ], 2, mean)
covm <- var(x[md2 <= medd2, ])
}
## scale the MB estimator to behave well for MVN data
rd2 <- mahalanobis(x, mnm, covm)
const <- median(rd2)/qchi
covm <- const * covm
## reweight the scaled MB estimator (mnm,covm) twice
## to get the covrmb estimator (mnrmb,covrmb) tailored for MVN data
rd2 <- mahalanobis(x, mnm, covm)
mnrmb <- apply(x[rd2 <= up, ], 2, mean)
covrmb <- var(x[rd2 <= up, ])
d1 <- sum(rd2 <= up)
rd2 <- mahalanobis(x, mnrmb, covrmb)
qchi2 <- (0.5 * 0.975 * n)/d1
qchi2 <- min(qchi2, 0.995)
const <- median(rd2)/qchisq(qchi2, p)
covrmb <- const * covrmb
## reweight again
rd2 <- mahalanobis(x, mnrmb, covrmb)
mnrmb <- apply(x[rd2 <= up, ], 2, mean)
covrmb <- var(x[rd2 <= up, ])
d2 <- sum(rd2 <= up)
rd2 <- mahalanobis(x, mnrmb, covrmb)
qchi2 <- (0.5 * 0.975 * n)/d2
qchi2 <- min(qchi2, 0.995)
const <- median(rd2)/qchisq(qchi2, p)
covrmb <- const * covrmb
list(center = mnrmb, cov = covrmb, mnm = mnm, covm = covm)
}

covrmvn<-function(x, csteps = 5, locc = 0.5)
{# Needs number of predictors p > 1.
# This robust MLD estimator is tailored to estimate the covariance matrix
# of the bulk of the data when the bulk of the data is MVN and the outliers
# are "not too bad." The FCH and MB estimators are also produced.
x <- as.matrix(x)
p <- dim(x)[2]
n <- dim(x)[1]
up <- qchisq(0.975, p)
qchi <- qchisq(0.5, p)
##get the DGK estimator
covs <- var(x)
mns <- apply(x, 2, mean)
## concentrate
for(i in 1:csteps) {
md2 <- mahalanobis(x, mns, covs)
medd2 <- median(md2)
mns <- apply(x[md2 <= medd2, ], 2, mean)
covs <- var(x[md2 <= medd2, ])
}
covd <- covs
mnd <- mns
##get the MB estimator
covv <- diag(p)
med <- apply(x, 2, median)
md2 <- mahalanobis(x, center = med, covv)
medd2 <- median(md2)
##get the location criterion cutoff
lcut <- medd2
if(locc != 0.5) lcut <- quantile(md2,locc)
## get the start
mns <- apply(x[md2 <= medd2, ], 2, mean)
covs <- var(x[md2 <= medd2, ])
## concentrate
for(i in 1:csteps) {
md2 <- mahalanobis(x, mns, covs)
medd2 <- median(md2)
mns <- apply(x[md2 <= medd2, ], 2, mean)
covs <- var(x[md2 <= medd2, ])
}
covm <- covs
mnm <- mns
##get FCH attractor
covf <- covm
mnf <- mnm
val2 <- mahalanobis(t(mnd), med, covv)
if(val2 < lcut) {
##crit = square root of det(cov)
critd <- prod(diag(chol(covd)))
critm <- prod(diag(chol(covm)))
if(critd < critm) {
covf <- covd
mnf <- mnd
}
}
## get the FCH estimator
rd2 <- mahalanobis(x, mnf, covf)
const <- median(rd2)/qchi
covf <- const * covf
##reweight the above FCH estimator (mnf,covf) to get the cov estimator
## (rmnmvn,rcovmvn) tailored for MVN data
rd2 <- mahalanobis(x, mnf, covf)
rmnmvn <- apply(x[rd2 <= up, ], 2, mean)
rcovmvn <- var(x[rd2 <= up, ])
d1 <- sum(rd2 <= up)
rd2 <- mahalanobis(x, rmnmvn, rcovmvn)
qchi2 <- (0.5 * 0.975 * n)/d1
qchi2 <- min(qchi2, 0.995)
const <- median(rd2)/qchisq(qchi2, p)
rcovmvn <- const * rcovmvn
## reweight again
rd2 <- mahalanobis(x, rmnmvn, rcovmvn)
rmnmvn <- apply(x[rd2 <= up, ], 2, mean)
rcovmvn <- var(x[rd2 <= up, ])
d2 <- sum(rd2 <= up)
rd2 <- mahalanobis(x, rmnmvn, rcovmvn)
qchi2 <- (0.5 * 0.975 * n)/d2
qchi2 <- min(qchi2, 0.995)
const <- median(rd2)/qchisq(qchi2, p)
rcovmvn <- const * rcovmvn
list(center = rmnmvn, cov = rcovmvn, mnf = mnf, covf = covf, mnm = mnm,
covm = covm)
}
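## Added usage sketch (not part of the original mpack.txt): for clean MVN
## data with cov = diag(1,...,p), the RMVN dispersion estimate from covrmvn
## should have diagonal near (1,...,p). Data is hypothetical.
# p <- 4
# x <- matrix(rnorm(100 * p), ncol = p) %*% sqrt(diag(1:p))
# diag(covrmvn(x)$cov)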
covrob<-function(x, csteps = 5, locc = 0.5){
# gets the MBA, FCH, RFCH, RMVN and CMVE estimators
# and the MB attractor
zx <- x
x <- as.matrix(x)
p <- dim(x)[2]
n <- dim(x)[1]
##get the DGK estimator
covs <- var(x)
mns <- apply(x, 2, mean)
## concentrate
for(i in 1:csteps) {
md2 <- mahalanobis(x, mns, covs)
medd2 <- median(md2)
if(p > 1) { mns <- apply(x[md2 <= medd2, ], 2, mean) }
if(p == 1) { mns <- mean(zx[md2 <= medd2]) }
covs <- var(x[md2 <= medd2, ])
}
covd <- covs
mnd <- mns
##get the MB estimator
covv <- diag(p)
med <- apply(x, 2, median)
md2 <- mahalanobis(x, center = med, covv)
medd2 <- median(md2)
##get the location criterion cutoff
lcut <- medd2
if(locc != 0.5) lcut <- quantile(md2,locc)
## get the start
if(p > 1) { mns <- apply(x[md2 <= medd2, ], 2, mean) }
if(p == 1) { mns <- mean(zx[md2 <= medd2]) }
covs <- var(x[md2 <= medd2, ])
## concentrate
for(i in 1:csteps) {
md2 <- mahalanobis(x, mns, covs)
medd2 <- median(md2)
if(p > 1) { mns <- apply(x[md2 <= medd2, ], 2, mean) }
if(p == 1) { mns <- mean(zx[md2 <= medd2]) }
covs <- var(x[md2 <= medd2, ])
}
covm <- covs
mnm <- mns
##crit = square root of det(cov)
critd <- prod(diag(chol(covd)))
critm <- prod(diag(chol(covm)))
##get MBA attractor
covb <- covm
mnb <- mnm
if(critd < critm) {
covb <- covd
mnb <- mnd
}
##get FCH and CMVE attractors
covf <- covm
mnf <- mnm
covcmv <- covm
mncmv <- mnm
val <- mahalanobis(t(mnd), med, covv)
if(val < lcut) {
if(critd < critm) {
covf <- covd
mnf <- mnd
}
rd2 <- mahalanobis(x, mnd, covd)
critdm <- (sqrt(median(rd2)))^p * critd
rd2 <- mahalanobis(x, mnm, covm)
critmm <- (sqrt(median(rd2)))^p * critm
if(critdm < critmm) {
covcmv <- covd
mncmv <- mnd
}
}
##scale for better performance at MVN
## get MBA estimator
chisqm <- qchisq(0.5, p)
rd2 <- mahalanobis(x, mnb, covb)
const <- median(rd2)/chisqm
covb <- const * covb
## get FCH estimator
rd2 <- mahalanobis(x, mnf, covf)
const <- median(rd2)/chisqm
covf <- const * covf
## get 1st step of the RFCH estimator
rd2 <- mahalanobis(x, mnf, covf)
up <- qchisq(0.975, p)
if(p > 1) { rmnf <- apply(x[rd2 <= up, ], 2, mean) }
if(p == 1){ rmnf <- mean(zx[rd2 <= up]) }
rcovf <- var(x[rd2 <= up, ])
##use the following 3 commands for the RMVN estimator
rd3 <- rd2
rmnmvn <- rmnf
rcovmvn <- rcovf
## use rd2 for RFCH and rd4 for RMVN
rd2 <- mahalanobis(x, rmnf, rcovf)
rd4 <- rd2
##use the following command for the RFCH estimator
const <- median(rd2)/chisqm
rcovf <- const * rcovf
## reweight again
rd2 <- mahalanobis(x, rmnf, rcovf)
if(p > 1) { rmnf <- apply(x[rd2 <= up, ], 2, mean) }
if(p == 1){ rmnf <- mean(zx[rd2 <= up]) }
rcovf <- var(x[rd2 <= up, ])
rd2 <- mahalanobis(x, rmnf, rcovf)
const <- median(rd2)/chisqm
rcovf <- const * rcovf
## get the RMVN estimator
d1 <- sum(rd3 <= up)
qchi2 <- (0.5 * 0.975 * n)/d1
qchi2 <- min(qchi2, 0.995)
const <- median(rd4)/qchisq(qchi2, p)
rcovmvn <- const * rcovmvn
## reweight again
rd2 <- mahalanobis(x, rmnmvn, rcovmvn)
if(p > 1){ rmnmvn <- apply(x[rd2 <= up, ], 2, mean) }
if(p == 1){ rmnmvn <- mean(zx[rd2 <= up]) }
rcovmvn <- var(x[rd2 <= up, ])
d2 <- sum(rd2 <= up)
rd2 <- mahalanobis(x, rmnmvn, rcovmvn)
qchi2 <- (0.5 * 0.975 * n)/d2
qchi2 <- min(qchi2, 0.995)
const <- median(rd2)/qchisq(qchi2, p)
rcovmvn <- const * rcovmvn
## get CMVE estimator
rd2 <- mahalanobis(x, mncmv, covcmv)
const <- median(rd2)/(qchisq(0.5, p))
covcmv <- const * covcmv
list(center = mnb, cov = covb, mnf = mnf, covf = covf, rmnf = rmnf,
rcovf = rcovf, rmnmvn = rmnmvn, rcovmvn = rcovmvn, mncmv = mncmv,
covcmv = covcmv, mnm = mnm, covm = covm)
}
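## Added usage sketch (not part of the original mpack.txt): covrob above
## returns several of the robust estimators at once; data is hypothetical.
# out <- covrob(matrix(rnorm(300), ncol = 3))
# names(out)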
covsim2<-function(n=100, p = 2, gam = 0.4, runs = 20)
{# Needs p > 1. This function is used to determine when the DD
# plot separates outliers from non-outliers.
# Rather large n and p can be used since covfch is used.
A <- sqrt(diag(1:p))
ct <- 0
for(i in 1:runs) {
x <- matrix(rnorm(n * p), ncol = p, nrow = n)
## outliers have mean (10, 10 sqrt(2), ..., 10 sqrt(p))^T
val <- floor(gam * n)
tem <- 10 + 0 * 1:p
x[1:val, ] <- x[1:val, ] + tem
x <- x %*% A
out <- covfch(x)
#try covmba(x),rmba(x),cov.mcd(x),covOGK(x,sigmamu = scaleTau2)
center <- out$center
cov <- out$cov
rd2 <- mahalanobis(x, center, cov)
if(min(rd2[1:val]) > max(rd2[(val + 1):n])) ct <- ct + 1
}
list(ct = ct)
}

ctrviews<-function(x, Y, ii = 1)
{# Uses classical distances instead of robust distances.
# Trimmed views for 90, 80, ..., 0 percent trimming.
# Allows visualization of m and crude estimation of c beta
# in models of the form y = m(x^T beta) + e.
# Advance the view with the right mouse button,
# and in R, highlight "stop."
# Workstation: activate a graphics
# device with command "X11()" or "motif()."
x <- as.matrix(x)
center <- apply(x, 2, mean)
cov <- var(x)
rd2 <- mahalanobis(x, center, cov)
labs <- c("90%", "80%", "70%", "60%", "50%", "40%", "30%", "20%",
"10%", "0%")
tem <- seq(0.1, 1, 0.1)
for(i in ii:10) {
val <- quantile(rd2, tem[i])
bhat <- lsfit(x[rd2 <= val, ], Y[rd2 <= val])$coef
ESP <- x %*% bhat[-1]
plot(ESP, Y)
title(labs[i])
identify(ESP, Y)
print(bhat)
}
}
ddcomp<-function(x, steps = 5)
{# Need p > 1.
# Makes 4 DD plots using the DGK, FCH, FMCD and MB estimators.
# Click left mouse button to identify points.
# Click right mouse button to end the function.
# Unix systems turn on graphics device eg enter
# command "X11()" or "motif()" before using.
# R users need to type "library(MASS)" or "library(lqs)."
p <- dim(x)[2]
par(mfrow = c(2, 2))
center <- apply(x, 2, mean)
cov <- var(x)
md2 <- mahalanobis(x, center, cov)
# MD is the classical and RD the robust distance
MD <- sqrt(md2)
#DGK start
md2 <- mahalanobis(x, center, cov)
medd2 <- median(md2)
## get the start
mns <- center
covs <- cov
## concentrate
for(i in 1:steps) {
md2 <- mahalanobis(x, mns, covs)
medd2 <- median(md2)
mns <- apply(x[md2 <= medd2, ], 2, mean)
covs <- var(x[md2 <= medd2, ])
}
rd2 <- mahalanobis(x, mns, covs)
rd <- sqrt(rd2)
#Scale the RD so the plot follows the 0-1 line
#if the data is multivariate normal.
const <- sqrt(qchisq(0.5, p))/median(rd)
RDdgk <- const * rd
plot(MD, RDdgk)
abline(0, 1)
identify(MD, RDdgk)
title("DGK DD Plot")
#FCH
out <- covfch(x)
center <- out$center
cov <- out$cov
rd2 <- mahalanobis(x, center, cov)
rd <- sqrt(rd2)
RDfch <- rd
plot(MD, RDfch)
abline(0, 1)
identify(MD, RDfch)
title("FCH DD Plot")
#FMCD
out <- cov.mcd(x)
center <- out$center
cov <- out$cov
rd2 <- mahalanobis(x, center, cov)
rd <- sqrt(rd2)
#Scale the RD so the plot follows the 0-1 line
#if the data is multivariate normal.
const <- sqrt(qchisq(0.5, p))/median(rd)
RDf <- const * rd
plot(MD, RDf)
abline(0, 1)
identify(MD, RDf)
title("FMCD DD Plot")
#Median Ball start
covv <- diag(p)
med <- apply(x, 2, median)
md2 <- mahalanobis(x, center = med, covv)
medd2 <- median(md2)
## get the start
mns <- apply(x[md2 <= medd2, ], 2, mean)
covs <- var(x[md2 <= medd2, ])
## concentrate
for(i in 1:steps) {
md2 <- mahalanobis(x, mns, covs)
medd2 <- median(md2)
mns <- apply(x[md2 <= medd2, ], 2, mean)
covs <- var(x[md2 <= medd2, ])
}
rd2 <- mahalanobis(x, mns, covs)
rd <- sqrt(rd2)
#Scale the RD so the plot follows the 0-1 line
#if the data is multivariate normal.
const <- sqrt(qchisq(0.5, p))/median(rd)
RDmb <- const * rd
plot(MD, RDmb)
abline(0, 1)
identify(MD, RDmb)
title("Med Ball DD Plot")
par(mfrow=c(1,1))
}

ddcomp2<-function(x, steps = 5)
{# Need p > 1.
# Makes 4 DD plots using the MBA, FCH, RFCH and MB estimators.
# Click left mouse button to identify points.
# Click right mouse button to end the function, and in R, highlight "stop."
# Unix systems turn on graphics device eg enter
# command "X11()" or "motif()" before using.
p <- dim(x)[2]
par(mfrow = c(2, 2))
center <- apply(x, 2, mean)
cov <- var(x)
md2 <- mahalanobis(x, center, cov)
# MD is the classical and RD the robust distance
MD <- sqrt(md2)
#MBA
out <- covmba(x)
center <- out$center
cov <- out$cov
rd2 <- mahalanobis(x, center, cov)
RDm <- sqrt(rd2)
plot(MD, RDm)
abline(0, 1)
identify(MD, RDm)
title("MBA DD Plot")
#FCH
out <- covfch(x)
center <- out$center
cov <- out$cov
rd2 <- mahalanobis(x, center, cov)
RDf <- sqrt(rd2)
plot(MD, RDf)
abline(0, 1)
identify(MD, RDf)
title("FCH DD Plot")
#RFCH
center <- out$rmnf
cov <- out$rcovf
rd2 <- mahalanobis(x, center, cov)
RDr <- sqrt(rd2)
plot(MD, RDr)
abline(0, 1)
identify(MD, RDr)
title("RFCH DD Plot")
#MB
center <- out$mnm
cov <- out$covm
rd2 <- mahalanobis(x, center, cov)
RDmb <- sqrt(rd2)
plot(MD, RDmb)
identify(MD, RDmb)
title("MB DD Plot")
par(mfrow=c(1,1))
}
ddmv<-function(n = 100, p = 2, steps = 5, gam = 0.4, outtype = 2, est = 1)
{# Need p > 1.
# This function is used to determine when the DD
# plot separates outliers from non-outliers for various starts.
# Workstation needs to activate a graphics
# device with the command "X11()" or "motif()."
# Advance the view with the right mouse button, and in R, highlight "stop."
## est = 1 for DGK, 2 for median ball, 3 for MAD
A <- sqrt(diag(1:p))
x <- matrix(rnorm(n * p), ncol = p, nrow = n)
val <- floor(gam * n)
tem <- 10 + 0 * 1:p
x[1:val, ] <- x[1:val, ] + tem
#if outtype = 1, outliers are Np(10 1, Ip) nonoutliers Np(0,Ip)
if(outtype == 2) x <- x %*% A
## outliers have mean (10, 10 sqrt(2), ..., 10 sqrt(p))^T
## get the start
if(est == 1) { #DGK classical start
covs <- var(x)
mns <- apply(x, 2, mean)
}
if(est == 2) { #Median Ball high breakdown start
covv <- diag(p)
med <- apply(x, 2, median)
md2 <- mahalanobis(x, center = med, covv)
medd2 <- median(md2)
## get the start
mns <- apply(x[md2 <= medd2, ], 2, mean)
covs <- var(x[md2 <= medd2, ])
}
if(est == 3) { #MAD high breakdown start
tem <- apply(x, 2, mad)^2
covv <- diag(tem)
med <- apply(x, 2, median)
md2 <- mahalanobis(x, center = med, covv)
medd2 <- median(md2)
## get the start
mns <- apply(x[md2 <= medd2, ], 2, mean)
covs <- var(x[md2 <= medd2, ])
}
## concentrate and plot, highlighting outliers
MD <- sqrt(mahalanobis(x, mns, covs))
for(i in 1:steps) {
md <- sqrt(mahalanobis(x, mns, covs))
medd <- median(md)
mns <- apply(x[md <= medd, ], 2, mean)
covs <- var(x[md <= medd, ])
rd <- sqrt(mahalanobis(x, mns, covs))
plot(MD, rd)
points(MD[1:val], rd[1:val], pch = 15)
identify(MD, rd)
}
}

ddplot<-function(x, type = 5){
# Makes a DD plot. Needs p > 1.
# Click left mouse button to identify points.
# Click right mouse button to end the function
# and in R, highlight "stop".
# R users need to type "library(MASS)" or "library(lqs)."
# RDi uses covmba for type = 1, rmba for type = 2, cov.mcd for
# type = 3, covfch for type = 4, rfch for type = 5, cmve for type = 6,
# rcmve for type = 7, rmvn for type = 8.
# Unix systems turn on graphics device eg enter
# command "X11()" or "motif()" before using.
p <- dim(x)[2]
center <- apply(x, 2, mean)
cov <- var(x)
md2 <- mahalanobis(x, center, cov)
if(type == 1){ out <- covmba(x) }
if(type == 2){ out <- rmba(x) }
if(type == 3){ out <- cov.mcd(x) }
if(type == 4) { out <- covfch(x) }
if(type == 5) {
out <- covfch(x)
center <- out$rmnf
cov <- out$rcovf
}
if(type == 6) { out <- cmve(x) }
if(type == 7) {
out <- cmve(x)
center <- out$rmnf
cov <- out$rcovf
}
if(type == 8) { out <- covrmvn(x) }
if(type != 5 && type != 7){
center <- out$center
cov <- out$cov
}
rd2 <- mahalanobis(x, center, cov)
# md is the classical and rd the robust distance
MD <- sqrt(md2)
rd <- sqrt(rd2)
#Scale the RD so the plot follows the 0-1 line
#if the data is multivariate normal.
const <- sqrt(qchisq(0.5, p))/median(rd)
RD <- const * rd
plot(MD, RD)
abline(0, 1)
identify(MD, RD)
# list(MD = MD, RD = RD)
}
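## Added usage sketch (not part of the original mpack.txt): DD plot with
## RFCH distances for hypothetical data; type "library(MASS)" first if a
## cov.mcd type is used.
# x <- matrix(rnorm(300), ncol = 3)
# ddplot(x, type = 5)   #right click "stop" to end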
ddplot2<-function(x, type = 2)
{# Need p > 1. Makes a DD plot with RDi depending on type,
# as well as an MB DD plot.
# Click left mouse button to identify points.
# Click right mouse button to end the function
# and in R, highlight "stop".
# R users need to type "library(MASS)" or "library(lqs)."
# The plot uses covfch for the RDi if type = 1,
# rfch if type = 2 and cov.mcd if type = 3.
# Unix systems turn on graphics device eg enter
# command "X11()" or "motif()" before using.
p <- dim(x)[2]
center <- apply(x, 2, mean)
cov <- var(x)
md2 <- mahalanobis(x, center, cov)
outm <- covfch(x)
if(type == 1){
center <- outm$center
cov <- outm$cov
}
if(type == 2){
center <- outm$rmnf
cov <- outm$rcovf
}
if(type >= 3){
out <- cov.mcd(x)
center <- out$center
cov <- out$cov
}
rd2 <- mahalanobis(x, center, cov)
# md is the classical and rd the robust distance
MD <- sqrt(md2)
rd <- sqrt(rd2)
#Scale the RD so the plot follows the 0-1 line
#if the data is multivariate normal.
const <- sqrt(qchisq(0.5, p))/median(rd)
RD <- const * rd
par(mfrow=c(1,2))
plot(MD, RD)
abline(0, 1)
identify(MD, RD)
#if statement causes a bug
# if(type > 1)
rd2 <- mahalanobis(x, outm$mnm, outm$covm)
rd <- sqrt(rd2)
plot(MD,rd)
title("MB DD Plot")
identify(MD, rd)
# list(MD = MD, RD = RD)
par(mfrow=c(1,1))
}

ddplot4<-function(x, alpha = 0.1){
# Makes a DD plot with covrmvn used for the RDi.
# Need p > 1.
# Semiparametric prediction regions are added.
# Click left mouse button to identify points.
# Click right mouse button to end the function.
# Unix systems turn on graphics device eg enter
# command "X11()" or "motif()" before using.
p <- dim(x)[2]
n <- dim(x)[1]
up <- min((1 - alpha/2), (1 - alpha + 10*alpha*p/n))
if(alpha > 0.1) up <- min((1 - alpha + 0.05), (1 - alpha + p/n))
qn <- up
if(qn < 1 - alpha + 0.001) up <- 1 - alpha
center <- apply(x, 2, mean)
cov <- var(x)
md2 <- mahalanobis(x, center, cov)
out <- covrmvn(x)
center <- out$center
cov <- out$cov
rd2 <- mahalanobis(x, center, cov)
# MD is the classical and RD the robust distance
MD <- sqrt(md2)
RD <- sqrt(rd2)
plot(MD, RD)
abline(0, 1)
#get nonparametric prediction region boundary
cuplim <- sqrt(quantile(md2, up))
a <- min(RD)
b <- max(RD)
lines(c(cuplim, cuplim), c(b, a))
#get semiparametric prediction region boundary
ruplim <- sqrt(quantile(rd2, up))
a <- min(MD)
b <- max(MD)
lines(c(a, b), c(ruplim, ruplim))
#get parametric MVN prediction region boundary
mvnlim <- sqrt(qchisq(up, p))
b <- min(b, mvnlim)
lines(c(a, b), c(mvnlim, mvnlim))
identify(MD, RD)
list(cuplim = cuplim, ruplim = ruplim, mvnlim = mvnlim)
}

ddplot5<-function(x, mm=0, kk=5, steps = 9)
{# Plots Euclidean distances from the coordinatewise median
#vs. those of the covmb2 location estimator with 9 concentration type steps.
#Good plot for outlier detection even if p > n.
#Needs p > 1.
x <- as.matrix(x)
p <- dim(x)[2]
covv <- diag(p)
med <- apply(x, 2, median)
RDMED <- sqrt(mahalanobis(x, center = med, covv))
RDCOVMB2 <- sqrt(mahalanobis(x,
center = covmb2(x, m = mm, k = kk, msteps = steps)$center, covv))
plot(RDMED, RDCOVMB2)
#list(RDMED=RDMED,RDCOVMB2=RDCOVMB2)
}
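## Added usage sketch (not part of the original mpack.txt): ddplot5 above is
## useful for outlier detection even when p > n; the contamination below is
## hypothetical.
# x <- matrix(rnorm(40 * 60), nrow = 40)   # n = 40 < p = 60
# x[1:4, ] <- x[1:4, ] + 10
# ddplot5(x)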
ddiscr<-function(x, w, group, xwflag = F){
#Uses Mahalanobis distances to classify data.
#x contains classified data; group is a vector with group[i] = j
#if the ith row is a case from the jth group, j = 1, ..., k.
#w contains the data to be classified, each row is a case.
#Need each group size nh > p.
#Set xwflag = T if x = w.
#mahalanobis computes squared distances
k <- max(group)
maxd <- 1:k
x <- as.matrix(x)
w <- as.matrix(w)
n <- dim(x)[1]
m <- dim(w)[1]
nh <- 1:k
err <- 1:k
mdx <- matrix(0,nrow=n,ncol=k)
mdw <- matrix(0,nrow=m,ncol=k)
classx <- 1:n
classw <- 1:m
for(i in 1:k){
nh[i] <- sum(group==i)
xi <- as.matrix(x[group==i,])
mns <- apply(xi, 2, mean)
covs <- var(xi)
maxd[i] <- max(sqrt(mahalanobis(xi, mns, covs)))
mdx[,i] <- sqrt(mahalanobis(x, mns, covs))
mdw[,i] <- sqrt(mahalanobis(w, mns, covs))
}
for(i in 1:m){
if(sum(mdw[i,] <= maxd) == 0){
classw[i] <- order(mdw[i,]/maxd)[1]
}
else{
val <- mdw[i,]
val[val > maxd] <- max(val) + 1
classw[i] <- order(val)[1]
}
}
if(xwflag == T){ classx <- classw }
else{
for(i in 1:n){
if(sum(mdx[i,] <= maxd) == 0){
classx[i] <- order(mdx[i,]/maxd)[1]
}
else{
val <- mdx[i,]
val[val > maxd] <- max(val) + 1
classx[i] <- order(val)[1]
}
}
}
for(i in 1:k){
err[i] <- 1 - sum(classx[group==i]==i)/nh[i]
}
toterr <- 1 - sum(classx==group)/n
list(mdx=mdx,mdw=mdw,classw=classw,classx=classx,err=err,toterr=toterr)
}

ddiscr2<-function(x, w, group, xwflag=F){
#Uses Mahalanobis distances. If the covering ellipsoids overlap,
#then a nonparametric density estimator is used to classify data.
#x contains classified data; group is a vector with group[i] = j
#if the ith row is a case from the jth group, j = 1, ..., k.
#w contains the data to be classified, each row is a case.
#Need each group size nh > max(3,p). Want nh > 10p.
#mahalanobis computes squared distances
#Computes fhat for cases where the covering ellipsoids overlap.
#Set xwflag = T if x = w.
#Slow for n = 10000.
k <- max(group)
maxd <- 1:k
x <- as.matrix(x)
w <- as.matrix(w)
n <- dim(x)[1]
p <- dim(x)[2]
m <- dim(w)[1]
covv <- as.matrix(diag(p))
nh <- 1:k
err <- 1:k
radsq <- 1:k
mdx <- matrix(0,nrow=n,ncol=k)
mdw <- matrix(0,nrow=m,ncol=k)
classx <- 1:n
classw <- 1:m
for(i in 1:k){
nh[i] <- sum(group==i)
tem <- ceiling(2*sqrt(nh[i]))
xi <- as.matrix(x[group==i,])
mns <- apply(xi, 2, mean)
covs <- var(xi)
maxd[i] <- max(sqrt(mahalanobis(xi, mns, covs)))
mdx[,i] <- sqrt(mahalanobis(x, mns, covs))
mdw[,i] <- sqrt(mahalanobis(w, mns, covs))
radsq[i] <- sort(mahalanobis(xi,mns,covv))[tem]
}
for(i in 1:m){
if(sum(mdw[i,] <= maxd) == 0){
classw[i] <- order(mdw[i,]/maxd)[1] }
else{ if(sum(mdw[i,] <= maxd) == 1){
classw[i] <- order(rank(mdw[i,]>maxd))[1] }
else{
val <- mdw[i,]
val[val>maxd] <- -1
for(j in 1:k){
if(mdw[i,j] <= maxd[j]) #get estimate of f_j(w[i,])
{xj <- as.matrix(x[group==j,])
val[j] <- sum( mahalanobis(xj,w[i,],covv) <= radsq[j] )
val[j] <- val[j]/( nh[j] * sqrt(radsq[j])^p )
}
}
classw[i] <- order(val)[k]
}
}
}
if(xwflag == T){ classx <- classw }
else{
for(i in 1:n){
if(sum(mdx[i,] <= maxd) == 0){
classx[i] <- order(mdx[i,]/maxd)[1] }
else{ if(sum(mdx[i,] <= maxd) == 1){
classx[i] <- order(rank(mdx[i,]>maxd))[1] }
else{
val <- mdx[i,]
val[val>maxd] <- -1
for(j in 1:k){
if(mdx[i,j] <= maxd[j]) #get estimate of f_j(x[i,])
{xj <- as.matrix(x[group==j,])
val[j] <- sum( mahalanobis(xj,x[i,],covv) <= radsq[j] )
val[j] <- val[j]/( nh[j] * sqrt(radsq[j])^p )
}
}
classx[i] <- order(val)[k]
}
}
}
}
for(i in 1:k){
err[i] <- 1 - sum(classx[group==i]==i)/nh[i]
}
toterr <- 1 - sum(classx==group)/n
list(mdx=mdx,mdw=mdw,classw=classw,classx=classx,err=err,toterr=toterr)
}
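## Added usage sketch (not part of the original mpack.txt): classify two
## hypothetical simulated groups with ddiscr and get the apparent error rate.
# x <- rbind(matrix(rnorm(100), ncol = 2), matrix(rnorm(100, mean = 3), ncol = 2))
# group <- rep(1:2, each = 50)
# out <- ddiscr(x, x, group, xwflag = T)
# out$toterr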
# RDi uses covmba for type = 1, rmba for type = 2, cov.mcd for # type = 3, covfch for type = 4, rfch for type = 5, rmvn type = 6. med <- 1:20 for(i in 1:20) { x <- matrix(rnorm(n * p), ncol = p, nrow = n) ## For elliptically contoured data, use: #zu <- runif(n) #x[zu < eps,] <- x[zu < eps,]*5 #x <- x^2 ##For lognormal marginals, add: #x <- exp(x) center <- apply(x, 2, mean) cov <- var(x) md2 <- mahalanobis(x, center, cov) if(type == 1) { out <- covmba(x) } if(type == 2) { out <- rmba(x) } if(type == 3) { out <- cov.mcd(x) } if(type == 4) { out <- covfch(x) } if(type == 5) { out <- covfch(x) center <- out$rmnf cov <- out$rcovf } if(type == 6) { out <- covrmvn(x) } if(type != 5) { center <- out$center cov <- out$cov } rd2 <- mahalanobis(x, center, cov) md <- sqrt(md2) rd <- sqrt(rd2) #Scale the RDi so the plot follows the 0-1 line #if the data is multivariate normal. const <- sqrt(qchisq(0.5, p))/median(rd) rd <- const * rd plot(md, rd) abline(0, 1) med[i] <- median(md) #The command "identify(md,rd)" can be inserted #to slow down the plots. } list(med = med) } ddsim3<-function(n = 100, p = 4, xtype = 1, dd = 1, eps = 0.25, alpha = 0.1){ # MAY NOT WORK IF p = 1. Calls ddplot4. # Makes DD plots of data used in the function "predsim". Gets coverages # of semiparametric and robust parametric prediction regions. # Multiply x by A where xtype = 1 for MVN Np(0,I), # 2, 3, 4 and 10 (with delta = eps) for (1 - delta) Np(0,I) + delta Np(0, 25 I) # 5 for lognormal, # 6, 7, 8 and 9 for multivariate t_d with d = 3, 5, 19 or dd # mahalanobis gives squared Maha distances up <- min((1 - alpha/2), (1 - alpha + (50 * alpha)/n)) A <- sqrt(diag(1:p)) m <- n + 1 #make data x <- matrix(rnorm(m * p), nrow = m, ncol = p) if(xtype == 2) { zu <- runif(m) x[zu < 0.4, ] <- x[zu < 0.4, ] * 5 } if(xtype == 3) { zu <- runif(m) x[zu < 0.6, ] <- x[zu < 0.6, ] * 5 } if(xtype == 4) { zu <- runif(m) x[zu < 0.1, ] <- x[zu < 0.1, ] * 5 } if(xtype == 5) x <- exp(x) if(xtype == 6) { zu <- sqrt(rchisq(m, 3)/3 ) x <- x/zu } if(xtype == 7) { zu <- sqrt(rchisq(m, 5)/5 ) x <- x/zu } if(xtype == 8) { zu <- sqrt(rchisq(m, 19)/ 19) x <- x/zu } if(xtype == 9) { zu <- sqrt(rchisq(m, dd)/ dd) x <- x/zu } if(xtype == 10) { zu <- runif(m) x[zu < eps, ] <- x[zu < eps, ] * 5 } x <- x %*% A xx <- x[ - m, ] ddplot4(xx) } deltv<-function(gam = 0.5) {# Gets the asymptotic variances for lts(h) and lta(h) at the standard # double exponential distribution, where h/n -> gam. k <- -1 * log(1 - gam) num <- 2 - (2 + 2 * k + k^2) * exp( - k) den <- (gam - k * exp( - k))^2 ltsv <- num/den ltav <- 1/gam list(ltsv=ltsv, ltav=ltav) } drsim5<-function(n=100,q=4,nruns=100,h=2,tr=10,xtype=1,mtype=1,eps=0.4,dd=5) {# MAY NOT WORK IF q = 1 # Needs the dr library so only works in R. Uses FCH. # Finds ave(ESP), scaled SD, ave scaled OLS SE, and ave scaled Chen and Li SE # for the tvreg(M) estimator. # CHANGE A IF q IS NOT EVEN # Change beta, q, and A to change Ho. # Use tr = 1 for 90%, tr = 2 for 80%, ..., tr = 10 for 0% trimming # xtype = 1 for MVN Nq(0,I), # 2, 3, 4 and 10 for (1 - eps) Nq(0,I) + eps Nq(0, 25 I) # 5 for lognormal, # 6, 7, 8 and 9 for multivariate t_d with d = 3, 5, 19 or dd. set.seed(974) k <- q/2 # test Ho beta(q/2 + 1) = ...
= beta(q) = 0 beta <- 0 * 1:q beta[1:q/2] <- 1 bols <- matrix(0, nrow = nruns, ncol = (q + 1)) bsir <- matrix(0, nrow = nruns, ncol = q) bwsir <- bsir sseols <- matrix(0, nrow = nruns, ncol = q) ssecl <- sseols zols <- 0 * 1:nruns fols <- zols zsir <- zols zwsir <- zsir labs <- c("90%", "80%", "70%", "60%", "50%", "40%", "30%", "20%", "10%", "0%") A2 <- diag(q/2) A1 <- 0 * A2 A <- cbind(A1, A2) tem <- seq(0.1, 1, 0.1) for(i in 1:nruns) { #make data x <- matrix(rnorm(n * q), nrow = n, ncol = q) if(xtype == 2){ zu <- runif(n) x[zu < 0.4, ] <- x[zu < 0.4, ] * 5 } if(xtype == 3) { zu <- runif(n) x[zu < 0.6, ] <- x[zu < 0.6, ] * 5 } if(xtype == 4) { zu <- runif(n) x[zu < 0.1, ] <- x[zu < 0.1, ] * 5 } if(xtype == 5) x <- exp(x) if(xtype == 6) { zu <- sqrt(rchisq(n, 3)/3) x <- x/zu } if(xtype == 7) { zu <- sqrt(rchisq(n, 5)/5) x <- x/zu } if(xtype == 8){ zu <- sqrt(rchisq(n, 19)/19) x <- x/zu } if(xtype == 9) { zu <- sqrt(rchisq(n, dd)/dd) x <- x/zu } if(xtype == 10) { zu <- runif(n) x[zu < eps, ] <- x[zu < eps, ] * 5 } sp <- 1 + x %*% beta if(mtype == 1) y <- sp + rnorm(n) if(mtype == 2) y <- sp^2 + rnorm(n) if(mtype == 3) y <- exp(sp) + rnorm(n) if(mtype == 4) y <- (sp)^3 + rnorm(n) if(mtype == 5) y <- sin(sp)/sp + 0.01*rnorm(n) if(mtype == 6) y <- sp + sin(sp) + 0.1*rnorm(n) if(mtype == 7) y <- sqrt(abs(sp)) + 0.1*rnorm(n) #do trimming out <- covfch(x) center <- out$center cov <- out$cov rd2 <- mahalanobis(x, center, cov) val <- quantile(rd2, tem[tr]) #get untrimmed data xm <- x[rd2 <= val, ] ym <- y[rd2 <= val] #get ESP from untrimmed data out <- lsfit(xm, ym) bols[i, ] <- out$coef resm <- out$resid nm <- length(ym) msem <- t(resm) %*% resm/(nm - q - 1) cxm <- var(xm) #get OLS SE which is usually incorrect cxminv <- solve(cxm) sseols[i, ] <- sqrt(n/nm) * sqrt(msem * diag(cxminv)) #get OLS chisquare test statistic bhat <- bols[i, -1] covAb <- A %*% cxminv %*% t(A) covinv <- solve(covAb) cent <- t(A) %*% covinv %*% A num <- t(bhat) %*% cent %*% bhat zols[i] <- (nm * num)/msem #get OLS partial F statistic fols[i] <- ((nm - 1) * zols[i])/(k * nm) #get Chen and Li scaled SE mnxm <- apply(xm, 2, mean) mid <- matrix(0, nrow = q, ncol = q) for(j in 1:nm) mid <- mid + resm[j]^2 * (xm[j, ] - mnxm) %*% t(xm[j, ] - mnxm) mid <- mid/nm cmcl <- cxminv %*% mid %*% cxminv ssecl[i, ] <- sqrt(n/nm) * sqrt(diag(cmcl)) #get Statlib SIR scaled SE and test statistics out <- sir(xm, ym, h) bsir[i, ] <- out$edr[, 1] bhat <- bsir[i, ] lam <- out$evalues[1] val <- (1 - lam)/lam num <- t(bhat) %*% cent %*% bhat zsir[i] <- (nm * num)/val #get Weisberg SIR scaled SE and test statistics out <- dr(ym~xm[,1]+xm[,2]+xm[,3]+xm[,4],nslices=h) bwsir[i,] <- out$evectors[,1] bhat <- bwsir[i, ] lam <- out$evalues[1] val <- (1 - lam)/lam num <- t(bhat) %*% cent %*% bhat zwsir[i] <- (nm * num)/val } print(labs[tr]) print(beta) mnbsir <- apply(bsir, 2, mean) mnwbsir <- apply(bwsir,2,mean) mnbols <- apply(bols, 2, mean) ssdbols <- sqrt(n) * sqrt(apply(bols, 2, var)) mnsseols <- apply(sseols, 2, mean) mnssecl <- apply(ssecl, 2, mean) zolslev <- sum(zols > qchisq(0.95, k))/nruns folslev <- sum(fols > qf(0.95, k, nm - q - 1))/nruns zsirlev <- sum(zsir > qchisq(0.95, k))/nruns zwsirlev <- sum(zwsir > qchisq(0.95, k))/nruns list(mnbsir = mnbsir, mnwbsir= mnwbsir, mnbols = mnbols, ssdbols = ssdbols, mnsseols = mnsseols, mnssecl = mnssecl, zolslev = zolslev, folslev = folslev, zsirlev = zsirlev, zwsirlev=zwsirlev) } drsim6<-function(n = 100,q = 4,nruns = 100,xtype = 1,mtype = 1,eps = 0.4,dd = 5) {# MAY NOT WORK IF q = 1 # Needs dr library so 
only works in R. Uses FCH. # Finds ave(ESP) and rejection proportions for the 0% and adaptive # trimming OLS estimators. # CHANGE A IF q IS NOT EVEN # xtype = 1 for MVN Nq(0,I), # 2, 3, 4 and 10 for (1 - eps) Nq(0,I) + eps Nq(0, 25 I) # 5 for lognormal, # 6, 7, 8 and 9 for multivariate t_d, with d = 3, 5, 19 or dd set.seed(974) k <- q/2 beta <- 0 * 1:q beta[1:(q/2)] <- 1 esp <- matrix(0, nrow = n, ncol = 10) bols <- matrix(0, nrow = nruns, ncol = (q + 1)) baols <- bols fols <- 0 * 1:nruns best <- fols atrp <- best faols <- fols faolslev <- 0 labs <- c("90%", "80%", "70%", "60%", "50%", "40%", "30%", "20%", "10%", "0%") A2 <- diag(q/2) A1 <- 0 * A2 A <- cbind(A1, A2) tem <- seq(0.1, 1, 0.1) for(i in 1:nruns) { #make data x <- matrix(rnorm(n * q), nrow = n, ncol = q) if(xtype == 2) { zu <- runif(n) x[zu < 0.4, ] <- x[zu < 0.4, ] * 5 } if(xtype == 3) { zu <- runif(n) x[zu < 0.6, ] <- x[zu < 0.6, ] * 5 } if(xtype == 4) { zu <- runif(n) x[zu < 0.1, ] <- x[zu < 0.1, ] * 5 } if(xtype == 5) x <- exp(x) if(xtype == 6) { zu <- sqrt(rchisq(n, 3)/3) x <- x/zu } if(xtype == 7) { zu <- sqrt(rchisq(n, 5)/5) x <- x/zu } if(xtype == 8) { zu <- sqrt(rchisq(n, 19)/19) x <- x/zu } if(xtype == 9) { zu <- sqrt(rchisq(n, dd)/dd) x <- x/zu } if(xtype == 10) { zu <- runif(n) x[zu < eps, ] <- x[zu < eps, ] * 5 } sp <- 1 + x %*% beta if(mtype == 1) y <- sp + rnorm(n) if(mtype == 2) y <- sp^2 + rnorm(n) if(mtype == 3) y <- exp(sp) + rnorm(n) if(mtype == 4) y <- (sp)^3 + rnorm(n) if(mtype == 5) y <- sin(sp)/sp + 0.01 * rnorm(n) if(mtype == 6) y <- sp + sin(sp) + 0.1 * rnorm(n) if(mtype == 7) y <- sqrt(abs(sp)) + 0.1 * rnorm(n) #get OLS for 0% trimming xm <- x ym <- y #get ESP out <- lsfit(xm, ym) bols[i, ] <- out$coef resm <- out$resid nm <- length(ym) msem <- t(resm) %*% resm/(nm - q - 1) cxm <- var(xm) cxminv <- solve(cxm) #get OLS chisquare test statistic bhat <- bols[i, -1] covAb <- A %*% cxminv %*% t(A) covinv <- solve(covAb) cent <- t(A) %*% covinv %*% A num <- t(bhat) %*% cent %*% bhat zols <- (nm * num)/msem #get OLS partial F statistic fols[i] <- ((nm - 1) * zols)/(k * nm) #get covfch estimator out <- covfch(x) center <- out$center cov <- out$cov rd2 <- mahalanobis(x, center, cov) #get best trimming proportion btr <- 10 esp[, 10] <- x %*% bhat bcorr <- abs(cor(esp[, 10], sp)) for(j in 1:9) { val <- quantile(rd2, tem[j]) #get (xm,ym) xm <- x[rd2 <= val, ] ym <- y[rd2 <= val] #get ESP from (xm,ym) out <- lsfit(xm, ym)$coef[-1] esp[, j] <- x %*% out tcorr <- abs(cor(esp[, j], sp)) if(tcorr > bcorr) btr <- j } best[i] <- btr #get adaptive trimming proportion atr <- btr if(btr < 10) { for(j in (btr + 1):10) { tval <- abs(cor(esp[, j], esp[, btr])) if(tval > 0.95) atr <- j } } atrp[i] <- atr val <- quantile(rd2, tem[atr]) #get data from adaptive trimming xm <- x[rd2 <= val, ] ym <- y[rd2 <= val] #get ESP from adaptive trimming out <- lsfit(xm, ym) baols[i, ] <- out$coef resm <- out$resid nm <- length(ym) msem <- t(resm) %*% resm/(nm - q - 1) cxm <- var(xm) cxminv <- solve(cxm) #get OLS chisquare test statistic bhat <- baols[i, -1] covAb <- A %*% cxminv %*% t(A) covinv <- solve(covAb) cent <- t(A) %*% covinv %*% A num <- t(bhat) %*% cent %*% bhat zols <- (nm * num)/msem #get OLS partial F statistic faols[i] <- ((nm - 1) * zols)/(k * nm) #check whether adaptive trimming rejects Ho if(faols[i] > qf(0.95, q/2, nm - q - 1)) faolslev <- faolslev + 1 } print(labs[btr]) print(beta) mnbols <- apply(bols, 2, mean) ssdbols <- sqrt(n) * sqrt(apply(bols, 2, var)) folslev <- sum(fols > qf(0.95, q/2, n - q - 1))/nruns mnbaols <-
apply(baols, 2, mean) ssdbaols <- sqrt(n) * sqrt(apply(baols, 2, var)) faolslev <- faolslev/nruns list(mnbols = mnbols, mnbaols = mnbaols, ssdbols = ssdbols, ssdbaols = ssdbaols, folslev = folslev, faolslev = faolslev, best = best, atrp = atrp) } drsim7<-function(n=100,p = 4,h=4,steps = 5,gam=0.49,nruns=100,drtype=1,outliers=F) {# Need to modify dr formulas if p is not equal to 4. # This R function generates outlier data. Uses FCH. # Compares SIR, SAVE, PHD and OLS for 0 and 50% trimming # Use drtype = 1 for sir, 2 for save, 3 for phd, 4 for ols # Needs library dr. set.seed(974) beta1 <- 0 * 1:p beta1[1] <- 1 bwdr1 <- matrix(0, nrow = nruns, ncol = p) b50 <- bwdr1 mbaloc <- 0 * (1:p) mbasig <- 0 * diag(1:p) for(i in 1:nruns) { x <- matrix(rnorm(n * p), nrow = n, ncol = p) sp1 <- 1 + x %*% beta1 y <- (sp1)^3 + rnorm(n) ## code below: x outliers have mean +/- (10, 10, ..., 10)^T ## and y outliers have mean 0 if(outliers == T) { val <- floor(gam * n) val2 <- as.integer(val/2) tem <- 10 + 0 * 1:p x[1:val2, ] <- x[1:val2, ] + tem x[(val2+1):val,] <- x[(val2+1):val,] - tem y[1:val] <- rnorm(val) } out <- covfch(x, csteps = steps) mbaloc <- mbaloc + out$center mbasig <- mbasig + out$cov #do trimming center <- out$center cov <- out$cov rd2 <- mahalanobis(x, center, cov) val <- quantile(rd2, 0.5) #get untrimmed data xm <- x[rd2 <= val, ] ym <- y[rd2 <= val] #get Weisberg SIR, SAVE or PHD if(drtype==1){ out <- dr(y~x[,1]+x[,2]+x[,3]+x[,4],nslices=h,method="sir") outm <- dr(ym~xm[,1]+xm[,2]+xm[,3]+xm[,4],nslices=h,method="sir") } if(drtype==2){ out <- dr(y~x[,1]+x[,2]+x[,3]+x[,4],nslices=h,method="save") outm <- dr(ym~xm[,1]+xm[,2]+xm[,3]+xm[,4],nslices=h,method="save") } if(drtype==3){ out <- dr(y~x[,1]+x[,2]+x[,3]+x[,4],nslices=h,method="phd") outm <- dr(ym~xm[,1]+xm[,2]+xm[,3]+xm[,4],nslices=h,method="phd") } if(drtype==4){ out <- lsfit(x,y) outm <- lsfit(xm,ym) } if(drtype == 4){ bwdr1[i,] <- out$coef[-1] b50[i,] <- outm$coef[-1] } else{ bwdr1[i,] <- out$evectors[,1] b50[i,] <- outm$evectors[,1] } } mbaloc <- mbaloc/nruns mbasig <- mbasig/nruns mnwbdr1 <- apply(bwdr1,2,mean) mnb50 <- apply(b50,2,mean) list(mnwbdr1=mnwbdr1, mnb50 = mnb50, mbaloc = mbaloc, mbasig = mbasig) } ellipse<-function(x, center = apply(x, 2, mean), cov = var(x), alph = 0.95) {# Makes a covering ellipse. The x should have 2 columns. mu1 <- center[1] mu2 <- center[2] w <- solve(cov) w11 <- w[1, 1] w12 <- w[1, 2] w22 <- w[2, 2] tem <- x[, 2] - mu2 y2 <- seq(min(tem), max(tem), length = 100) xc <- qchisq(alph, 2) el <- matrix(0, 2, 2) ind <- 0 for(i in 1:100) { j1 <- (y2[i] * w12)^2 j2 <- w11 * ((y2[i])^2 * w22 - xc) # print(i) # print(j1 - j2) if((j1 - j2) >= 0) { ind <- ind + 2 tem <- (y2[i] * w12)^2 tem <- tem - w11 * ((y2[i])^2 * w22 - xc) tem <- sqrt(tem) term <- ( - y2[i] * w12 + tem)/ w11 el <- rbind(el, c((term + mu1), ( y2[i] + mu2))) term <- ( - y2[i] * w12 - tem)/ w11 el <- rbind(el, c((term + mu1), ( y2[i] + mu2))) } } el <- el[3:ind, ] nn <- dim(x)[1] if((ind - 2) > nn) { tem <- sample((ind - 2), nn) el <- el[tem, ] } xt <- cbind(x[, 1], el[, 1]) yt <- cbind(x[, 2], el[, 2]) matplot(xt, yt) } ellipse2<- function(x, center = apply(x, 2, mean), cov = var(x), cut = 4, alph = 0.95, cutoff = F) {# Makes a covering ellipse. The x should have 2 columns.
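## Example sketch for ellipse2 (a hedged illustration with bivariate data):
## x <- matrix(rnorm(200), ncol = 2)
## ellipse2(x, alph = 0.9) #classical center and covariance by default
## ellipse2(x, cut = qchisq(0.5, 2), cutoff = T) #user supplied cutoff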
mu1 <- center[1] mu2 <- center[2] w <- solve(cov) w11 <- w[1, 1] w12 <- w[1, 2] w22 <- w[2, 2] tem <- x[, 2] - mu2 y2 <- seq(min(tem), max(tem), length = 100) if(cutoff ==F) xc <- qchisq(alph, 2) else xc <- cut el <- matrix(0, 2, 2) ind <- 0 for(i in 1:100) { j1 <- (y2[i] * w12)^2 j2 <- w11 * ((y2[i])^2 * w22 - xc) # print(i) # print(j1 - j2) if((j1 - j2) >= 0) { ind <- ind + 2 tem <- (y2[i] * w12)^2 tem <- tem - w11 * ((y2[i])^2 * w22 - xc) tem <- sqrt(tem) term <- ( - y2[i] * w12 + tem)/ w11 el <- rbind(el, c((term + mu1), ( y2[i] + mu2))) term <- ( - y2[i] * w12 - tem)/ w11 el <- rbind(el, c((term + mu1), ( y2[i] + mu2))) } } el <- el[3:ind, ] nn <- dim(x)[1] if((ind - 2) > nn) { tem <- sample((ind - 2), nn) el <- el[tem, ] } xt <- cbind(x[, 1], el[, 1]) yt <- cbind(x[, 2], el[, 2]) matplot(xt, yt) } essp<- function(x, Y, M = 50, type = 3) {# Trimmed view or ESSP for M percent # trimming. Allows visualization of g # and crude estimation of c beta in models # of the form y = g(x^T beta,e). # Workstation need to activate a graphics # device with command "X11()" or "motif()." # R needs command "library(MASS)" or "library(lqs)." # Click on the right mouse button to finish, and # in R, highlight "stop." x <- as.matrix(x) tval <- M/100 if(type == 1) out <- cov.mcd(x) if(type == 2) out <- covmba(x) if(type == 3) out <- covfch(x) if(type == 4) out <- covrmvn(x) if (type <= 4){ center <- out$center cov <- out$cov} #type = 5 for rfch estimator if (type == 5){ out <- covfch(x) center <- out$rmnf cov <- out$rcovf} rd2 <- mahalanobis(x, center, cov) val <- quantile(rd2, (1 - tval)) bhat <- lsfit(x[rd2 <= val, ], Y[rd2 <= val])$coef ESP <- x %*% bhat[-1] + bhat[1] plot(ESP, Y) identify(ESP, Y) return(bhat[-1]) } FDAboot<-function(x, group, B = 1000){ #Bootstraps FDA betahat = first eigenvector. #Nominal 95% CIs #group labels are 1,2,...,g #needs library(MASS) g <- max(group) x <- as.matrix(x) p <- dim(x)[2] betas <- matrix(0,nrow=B,ncol=p) nis <- 1:g tindx <- 1:dim(x)[1] out<-lda(x,group) for(i in 1:g){ nis[i] <- sum(group==i) } for(i in 1:B){ indx <- 1:2 for(j in 1:g){ tind <- tindx[group==j] tdx<- sample(tind,nis[j],replace=T) indx <- c(indx,tdx) } indx <- indx[-c(1,2)] betas[i,]<-lda(x[indx,],group[indx])$scaling[,1] } shorci <- apply(betas,2,shorth3) list(betas=betas,shorci=shorci,out=out) } ffplot2<-function(x, y, nsamps = 7){ # For Unix, use X11() to turn on the graphics device before # using this function. For R, first type library(MASS). # Makes an FF plot with several resistant estimators. # Needs hbreg, mbareg, mbalata, and rmreg3. 
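## Example sketch for ffplot2 (assumes library(MASS) and that mbareg, hbreg,
## mbalata, and rmreg3 from this file have been sourced):
## x <- matrix(rnorm(300), ncol = 3)
## y <- as.vector(1 + x %*% c(1,1,1) + rnorm(100))
## ffplot2(x, y) #scatterplot matrix of Y and the seven fits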
n <- length(y) x <- as.matrix(x) rmat <- matrix(nrow = n, ncol = 8) olsfit <- y - lsfit(x, y)$residuals print("got OLS") almsfit <- y - lmsreg(x, y)$resid print("got ALMS") altsfit <- y - ltsreg(x, y)$residuals print("got ALTS") mbacoef <- mbareg(x, y, nsamp = nsamps)$coef MBAfit <- mbacoef[1] + x %*% mbacoef[-1] print("got MBA") bbcoef <- hbreg(x, y)$bbcoef BBfit <- bbcoef[1] + x %*% bbcoef[-1] print("got BB") mbalcoef <- mbalata(x, y, nsamp = nsamps)$coef MBALfit <- mbalcoef[1] + x %*% mbalcoef[-1] print("got MBALATA") RMREG2 <- y - rmreg3(x,y)$res rmat[, 1] <- y rmat[, 2] <- olsfit rmat[, 3] <- almsfit rmat[, 4] <- altsfit rmat[, 5] <- MBAfit rmat[, 6] <- BBfit rmat[, 7] <- MBALfit rmat[, 8] <- RMREG2 pairs(rmat, labels = c("Y", "OLS Fit", "ALMS Fit", "ALTS Fit", "MBA Fit", "BB Fit", "MBALATA", "RMREG2")) } frey <- function(alph = 0.05){ #compares the Frey correction with the Olive correction nn <- seq(20,500,by = 1) m <- length(nn) fr <- 1:m ol <- fr for(i in 1:m){ fr[i] <- min(1,1 - alph + 1.12*sqrt(alph/nn[i])) ol[i] <- 1 - alph + 10*alph/nn[i] } plot(ol,fr) abline(0,1) } fselboot<-function(x,y,B = 1000){ #needs library(leaps), n > 5p, p > 2 #bootstraps the minimum Cp model from forward selection regression #It does not make sense to do variable selection if there #is only one nontrivial predictor. x <- as.matrix(x) n <- length(y) p <- 1 + dim(x)[2] vmax <- min(p,as.integer(n/5)) vars <- as.vector(1:(p-1)) #get the full model full <- lsfit(x,y) res <- full$resid fit <- y - res #get the minimum Cp submodel tem<-regsubsets(x,y,nvmax=vmax,method="forward") out<-summary(tem) mincp <- out$which[out$cp==min(out$cp)] #do not need the constant in vin vin <- vars[mincp[-1]] sub <- lsfit(x[,vin],y) betas <- matrix(0,nrow=B,ncol=p) #bootstrap the minimum Cp submodel for(i in 1:B){ yb <- fit + sample(res,n,replace=T) tem<-regsubsets(x,y=yb,method="forward") out<-summary(tem) mincp <- out$which[out$cp==min(out$cp)] vin <- vars[mincp[-1]] indx <- c(1,1+vin) betas[i,indx] <- lsfit(x[,vin],yb)$coef } list(full=full,sub=sub,betas=betas) } gamper<-function(h, k=500) {#estimates the amount of contamination FLTS can tolerate n <- 10000 c <- 5000 gam0 <- min((n - c)/n, (1 - (1 - 0.2^(1/k))^(1/ h))) * 100 print(gam0) } gamper2<-function(p, k = 500) {#estimates the amount of contamination fmcd can tolerate n <- 10000 c <- 5000 h <- p + 1 gam0 <- min((n - c)/n, (1 - (1 - 0.2^(1/k))^(1/h))) * 100 print(gam0) } getB<-function(x, m=0, k= 5, msteps = 0){ # Gets the covmb2 subset B and the index of cases indx. # Best if p > n. # You can estimate the number of clean cases m > n/2 with the plot from: out<-medout(x) x <- as.matrix(x) p <- dim(x)[2] n <- dim(x)[1] index <- 1:n covv <- diag(p) med <- apply(x, 2, median) #Get squared Euclidean distances from the coordinatewise median. md2 <- mahalanobis(x, center = med, covv) if(m==0){ if(msteps > 0){#do concentration type steps for(i in 1:msteps){ medd <- median(md2) medw <- apply(x[md2<=medd,], 2, median) md2 <- mahalanobis(x, center = medw, covv) } } md <- sqrt(md2) mcut <- median(md) + k*mad(md,constant=1) } else{ #Use the m cases with the smallest distances. md <- sqrt(md2) mcut <- sort(md)[m] } B <- x[md <= mcut,] indx <- index[md <= mcut] list(B = B, indx=indx) } getBbig<-function(x, group){ #Gets Bbig for discriminant analysis, binary regression, or one way MANOVA. #Let x contain the data, and let group be a vector with group[i] = j #if the ith row is a case from the jth group, j = 1, ..., g. #Can be used if p > n.
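## Example sketch for getBbig with two simulated groups (covmb2 allows p > nh):
## x <- rbind(matrix(rnorm(1000),ncol=50), matrix(rnorm(1250,mean=1),ncol=50))
## group <- c(rep(1,20),rep(2,25))
## out <- getBbig(x, group) #out$Bbig stacks the covmb2 set B from each group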
g <- max(group) x <- as.matrix(x) indx <- 1:2 tindx <- 1:dim(x)[1] #get the cases used in the covmb2 set B from each group for(i in 1:g){ xi <- as.matrix(x[group==i,]) tind <- tindx[group==i] tem <- getB(xi) tind <- tind[tem$indx] indx <- c(indx,tind) } indx <- indx[-c(1,2)] Bbig <- x[indx,] grp <- group[indx] list(Bbig=Bbig,indx=indx,grp=grp) } getfubig<-function(x, group, type = 1){ # needs library(robustbase) #Gets fUbig for discriminant analysis, binary regression, or one way MANOVA. #Let x contain the data, and let group be a vector with group[i] = j #if the ith row is a case from the jth group, j = 1, ..., g. #Need each group size nh > 2(p+1). Would like nh > 20p. #type = 1: fastMCD, type = 2: DetMCD, type = 3: OGK g <- max(group) x <- as.matrix(x) indx <- 1:2 tindx <- 1:dim(x)[1] #get the cases used in the RMVN set from each group for(i in 1:g){ xi <- as.matrix(x[group==i,]) tind <- tindx[group==i] if(type == 1){ tem <- covMcd(xi) tind <- tind[tem$mcd.wt>0.5] } if(type == 2){ tem <- covMcd(xi,nsamp="deterministic") tind <- tind[tem$mcd.wt>0.5] } if(type == 3){ tem <- covOGK(xi,sigmamu=scaleTau2) #tem <- covOGK(xi,sigmamu=s_IQR) #tem <- covOGK(xi,sigmamu=s_Sn) #tem <- covOGK(xi,sigmamu=s_Qn) tind <- tind[tem$weights>0.5] } indx <- c(indx,tind) } indx <- indx[-c(1,2)] fUbig <- x[indx,] grp <- group[indx] list(fUbig=fUbig,indx=indx,grp=grp) } getu<-function(x, csteps = 5, locc = 0.5){ # Gets the RMVN subset U and the index of cases indx. # Needs the number of predictors > 1. x <- as.matrix(x) p <- dim(x)[2] n <- dim(x)[1] index <- 1:n up <- qchisq(0.975, p) qchi <- qchisq(0.5, p) ##get the DGK estimator covs <- var(x) mns <- apply(x, 2, mean) ## concentrate for(i in 1:csteps) { md2 <- mahalanobis(x, mns, covs) medd2 <- median(md2) mns <- apply(x[md2 <= medd2, ], 2, mean) covs <- var(x[md2 <= medd2, ]) } covd <- covs mnd <- mns ##get the MB estimator covv <- diag(p) med <- apply(x, 2, median) md2 <- mahalanobis(x, center = med, covv) medd2 <- median(md2) ##get the location criterion cutoff lcut <- medd2 if(locc != 0.5) lcut <- quantile(md2,locc) ## get the start mns <- apply(x[md2 <= medd2, ], 2, mean) covs <- var(x[md2 <= medd2, ]) ## concentrate for(i in 1:csteps) { md2 <- mahalanobis(x, mns, covs) medd2 <- median(md2) mns <- apply(x[md2 <= medd2, ], 2, mean) covs <- var(x[md2 <= medd2, ]) } covm <- covs mnm <- mns ##get FCH attractor covf <- covm mnf <- mnm val2 <- mahalanobis(t(mnd), med, covv) if(val2 < lcut) { ##crit = square root of det(cov) critd <- prod(diag(chol(covd))) critm <- prod(diag(chol(covm))) if(critd < critm) { covf <- covd mnf <- mnd } } ## get the FCH estimator rd2 <- mahalanobis(x, mnf, covf) const <- median(rd2)/qchi covf <- const * covf ##reweight the above FCH estimator (mnf,covf) to get the cov estimator ## (rmnmvn,rcovmvn) tailored for MVN data rd2 <- mahalanobis(x, mnf, covf) rmnmvn <- apply(x[rd2 <= up, ], 2, mean) rcovmvn <- var(x[rd2 <= up, ]) d1 <- sum(rd2 <= up) rd2 <- mahalanobis(x, rmnmvn, rcovmvn) qchi2 <- (0.5 * 0.975 * n)/d1 qchi2 <- min(qchi2, 0.995) const <- median(rd2)/qchisq(qchi2, p) rcovmvn <- const * rcovmvn ## get U rd2 <- mahalanobis(x, rmnmvn, rcovmvn) U <- x[rd2 <= up, ] indx <- index[rd2 <= up] list(U = U, indx=indx) } getubig<-function(x, group){ # Gets Ubig for discriminant analysis, binary regression, or one way MANOVA. #Let x contain the data, and let group be a vector with group[i] = j #if the ith row is a case from the jth group, j = 1, ..., g. #Need each group size nh > 2(p+1). Would like nh > 20p.
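## Example sketch for getubig (each group needs nh > 2(p+1)):
## x <- rbind(matrix(rnorm(400),ncol=4), matrix(rnorm(400,mean=2),ncol=4))
## group <- c(rep(1,100),rep(2,100))
## out <- getubig(x, group) #out$Ubig stacks the RMVN set U from each group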
g <- max(group) x <- as.matrix(x) indx <- 1:2 tindx <- 1:dim(x)[1] #get the cases used in the RMVN set from each group for(i in 1:g){ xi <- as.matrix(x[group==i,]) tind <- tindx[group==i] tem <- getu(xi) tind <- tind[tem$indx] indx <- c(indx,tind) } indx <- indx[-c(1,2)] Ubig <- x[indx,] grp <- group[indx] list(Ubig=Ubig,indx=indx,grp=grp) } getuc<-function(x, group){ # Gets Uc for discriminant analysis or one way MANOVA # assuming the g groups or populations only differ in mean. #So Yij - muj are iid where j = 1, ..., g. BIG ASSUMPTION! #Subtracts the coordinatewise median from each group. #Then gets the RMVN set from the 0 median data. #Then gets the corresponding x data. #Let x contain the data, and let group be a vector with group[i] = j #if the ith row is a case from the jth group, j = 1, ..., g. #Need each group size nj > 2(p+1). Would like nj > 20p. g <- max(group) x <- as.matrix(x) Uz <- x[1:2,] #Subtract the group coordinatewise median from the data in each group. #Combine the groups into one data set Uz of nearly iid data #if the medians approximate the means. for(i in 1:g){ xi <- as.matrix(x[group==i,]) med <- apply(xi, 2, median) #one <- 1 + 0*1:dim(xi)[1];xc <- xi - one%*%t(med) xc <- sweep(xi,2,med) Uz <- rbind(Uz,xc) } Uz <- Uz[-c(1,2),] out <- getu(Uz) indx <- out$indx Uc <- x[indx,] grp <- group[indx] list(Uc=Uc,indx=indx,grp=grp) } gsqboot<-function(x, B = 1000){ #bootstraps Fan's squared cor coef of the correlation matrix x <- as.matrix(x) p <- dim(x)[2] n <- dim(x)[1] Ip <- diag(p) indx <- 1:n gsqs <- 1:B for(i in 1:B){ tem <- sample(indx,n,replace=T) out <- cor(x[tem,]) ##get det(R*) detR <- prod(diag(chol(out)))^2 temp <- (out - Ip)%*%(out - Ip) ##get squared Frobenius norm of (R* - Ip)^2 fr <- sum(diag(temp)) gsqs[i] <- fr/(2*detR + fr) } list(gsqs=gsqs) } gsqbootsim<-function(n = 100, p = 4, BB=1000, nruns = 100, type = 1, psi = 0.0, dd=7, eps = 0.25, alph = 0.05){ #Simulates the bootstrap for Fan's squared cor coef of the correlation matrix. # Use 0 <= psi < 1. # Make x where type = 1 for MVN Np(0,I), # 2, 3, 4 and 5 (with delta = eps) for (1 - delta) Np(0,I) + delta Np(0, 25 I) # 6, 7, 8 and 9 for multivariate t_d with d = 3, 5, 19 or dd # 10 for lognormal. # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. See Maronna and Zamar (2002). # cor(X_i,X_j) = [2 psi +(p-2)psi^2]/[1 + (p-1)psi^2], i not = j # when the correlation exists.
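## Example sketch: a short run of gsqbootsim (small BB and nruns keep the run
## time modest; assumes gsqboot and shorthLU from this file have been sourced):
## gsqbootsim(n=100, p=4, BB=200, nruns=50, psi=0.5)
## The three CIs whose coverages and lengths are returned are described below.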
#Makes the shorth CI [Ln,Un], the lower CI [0,U] and the upper CI [L,1] cicov <- 0 avelen <- 0 ucicov <- 0 uavelen <- 0 lcicov <- 0 lavelen <- 0 rho <- (2*psi + (p-2)*psi^2)/(1 + (p-1)*psi^2) gsq <- p*(p-1)*rho^2/( 2*(1-rho)^(p-1) * (1+(p-1)*rho) + p*(p-1)*rho^2 ) A <- matrix(psi,nrow=p,ncol=p) diag(A) <- 1 for(i in 1:nruns) { #make data x <- matrix(rnorm(n * p), nrow = n, ncol = p) if(type == 2) { zu <- runif(n) x[zu < 0.4, ] <- x[zu < 0.4, ] * 5 } if(type == 3) { zu <- runif(n) x[zu < 0.6, ] <- x[zu < 0.6, ] * 5 } if(type == 4) { zu <- runif(n) x[zu < 0.1, ] <- x[zu < 0.1, ] * 5 } if(type == 5) { zu <- runif(n) x[zu < eps, ] <- x[zu < eps, ] * 5 } if(type == 6) { zu <- sqrt(rchisq(n, 3)/3) x <- x/zu } if(type == 7) { zu <- sqrt(rchisq(n, 5)/5) x <- x/zu } if(type == 8) { zu <- sqrt(rchisq(n, 19)/19) x <- x/zu } if(type == 9) { zu <- sqrt(rchisq(n, dd)/dd) x <- x/zu } if(type == 10) x <- exp(x) x <- x %*% A gsqrs <-gsqboot(x, B= BB)$gsqs #nonparametric bootstrap tem <- shorthLU(gsqrs,alpha=alph) if(gsq >= tem$shorth[1] && gsq <= tem$shorth[2]) cicov <- cicov + 1 avelen <- avelen + tem$shorth[2] - tem$shorth[1] if(gsq <= tem$right) lcicov <- lcicov + 1 lavelen <- lavelen + tem$right if(gsq >= tem$left) ucicov <- ucicov + 1 uavelen <- uavelen + 1 - tem$left } cicov <- cicov/nruns avelen <- avelen/nruns lcicov <- lcicov/nruns lavelen <- lavelen/nruns ucicov <- ucicov/nruns uavelen <- uavelen/nruns list(cicov=cicov,avelen=avelen,lcicov=lcicov,lavelen=lavelen, ucicov=ucicov,uavelen=uavelen)} hbplot<-function(x, Y, aa = 1.4, rr = T, typ = 1) {# Makes the response plot for hbreg estimator and attractors. #Click on right mouse button to advance plot, and in R, highlight #"stop." # For R, type "library(MASS)" before using this function. # The hbreg estimator uses MBA if typ = 1, ltsreg if typ = 2, # mbalata if typ = 3. x <- as.matrix(x) out <- hbreg(x, Y, a = aa, rreg = rr, type = typ) bhat <- out$coef HBFIT <- bhat[1] + x %*% bhat[-1] bhat <- out$olscoef OLSFIT <- bhat[1] + x %*% bhat[-1] bhat <- out$arobcoef AROBFIT <- bhat[1] + x %*% bhat[-1] bhat <- out$bbcoef BBFIT <- bhat[1] + x %*% bhat[-1] par(mfrow = c(2, 2)) plot(HBFIT, Y) abline(0, 1) identify(HBFIT, Y) plot(OLSFIT, Y) abline(0, 1) identify(OLSFIT, Y) plot(AROBFIT, Y) abline(0, 1) identify(AROBFIT, Y) plot(BBFIT, Y) abline(0, 1) identify(BBFIT, Y) par(mfrow = c(1, 1)) } #used in rmltreg hbreg<-function(x, y, csteps = 10, a = 1.4, rreg = T, type = 1) {# Gets the hbreg estimator with 10 concentration steps applied to # the HB attractor BB. Uses LTA criterion to screen the 3 attractors. # For R, type "library(MASS)" before using this function. # If rreg = F, arobcoef = olscoef, if T, arobcoef uses # mbareg if type = 1, ltsreg if type = 2, mbalata if type = 3 # rmreg2 if type =4. # Calls mbareg, mbalata, rmreg3 which is rmreg2 without plots. 
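## Example sketch for hbreg on simulated data with gross outliers
## (assumes library(MASS) and mbareg, mbalata from this file):
## x <- matrix(rnorm(300), ncol = 3)
## y <- as.vector(1 + x %*% c(1,1,1) + rnorm(100))
## y[1:10] <- y[1:10] + 100 #gross outliers in the response
## hbreg(x, y)$coef #compare with lsfit(x, y)$coef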
med <- median(y) madd <- mad(y, constant = 1) x <- as.matrix(x) n <- dim(x)[1] nc <- dim(x)[2] + 1 #nc is the number of predictors including the intercept temp <- lsfit(x, y) hbf <- temp$coef absres <- abs(temp$residuals) indx <- (absres <= median(absres)) critf <- sum(absres[indx]) #mbareg or lmsreg may work better than ltsreg if(rreg == T){ if(type == 1){ tem <- mbareg(x, y) temres <- y - tem$coef[1] - x %*% tem$coef[-1] absres <- abs(temres)} if(type == 2){ tem <- ltsreg(x, y) absres <- abs(tem$residuals)} if(type == 3){ tem <- mbalata(x,y) temres <- y - tem$coef[1] - x %*% tem$coef[-1] absres <- abs(temres)} if(type == 4){ tem <- rmreg3(x,y) temres <- y - tem$Bhat[1] - x %*% tem$Bhat[-1] absres <- abs(temres)} indx <- (absres <= median(absres)) crit <- a*sum(absres[indx]) if(crit < critf) { critf <- crit hbf <- tem$coef} } #get y's closest to median y indx <- (y >= (med - madd) & y <= (med + madd)) #get the HB attractor bk = BB bk <- lsfit(x[indx, ], y[indx])$coef res <- y - (x %*% bk[2:nc] + bk[1]) ressq <- res^2 m <- median(ressq) for(i in 1:csteps) { indx <- (ressq <= m) bk <- lsfit(x[indx, ], y[indx])$coef res <- y - (x %*% bk[2:nc] + bk[1]) ressq <- res^2 m <- median(ressq) } bb <- bk res <- y - (x %*% bb[2:nc] + bb[1]) absres <- abs(res) indx <- (absres <= median(absres)) crit <- a*sum(absres[indx]) if(crit < critf) hbf <- bb if(rreg == T && type != 4) arobcoef <- tem$coef else if (rreg == T && type == 4) arobcoef <- as.vector(tem$Bhat) else arobcoef <- temp$coef list(coef = hbf, olscoef = temp$coef, arobcoef = arobcoef, bbcoef = bb) } hbregsim<-function(n = 100, p = 4, csteps = 10, nruns = 10, aa = 1.4, rr = T, typ = 1, sige = 1, sigx = 1, etype = 1){ # Simulates MLR model and gets fits from hbreg. # For R, type "library(MASS)" before using this function. # For MLR, want mnbhat = (1, ..., 1), sd(i) = 1/sqrt(n). # If rr = F, arobcoef = olscoef, if T, arobcoef uses # mbareg if typ = 1, ltsreg if typ = 2, mbalata if typ = 3, rmreg3 if typ = 4. # errors are normal if etype = 1, shifted exponential for etype=2, shifted lognormal if etype=3 # q <- p - 1 beta <- 1 + 0 * 1:q # beta <- 1:q hbhat <- matrix(0, nrow = nruns, ncol = p) olshat <- hbhat arobhat <- hbhat bbhat <- hbhat for(i in 1:nruns) { x <- matrix(rnorm(n * q, sd = sigx), nrow = n, ncol = q) #sigx scales x, sige the errors if (etype == 1) err <- rnorm(n, sd = sige) if (etype == 2) err <- sige*(rexp(n) - 1) if (etype == 3) err <- sige*(exp(rnorm(n)) - exp(0.5)) y <- 1 + x %*% beta + err out <- hbreg(x, y, csteps = csteps, a = aa, rreg = rr, type = typ) hbhat[i, ] <- out$coef olshat[i, ] <- out$olscoef arobhat[i, ] <- out$arobcoef bbhat[i, ] <- out$bbcoef } mnhbhat <- apply(hbhat, 2, mean) sdhbhat <- sqrt(apply(hbhat, 2, var)) mnolshat <- apply(olshat, 2, mean) sdolshat <- sqrt(apply(olshat, 2, var)) mnarobhat <- apply(arobhat, 2, mean) sdarobhat <- sqrt(apply(arobhat, 2, var)) mnbbhat <- apply(bbhat, 2, mean) sdbbhat <- sqrt(apply(bbhat, 2, var)) list(mnhbhat = mnhbhat, sdhbhat = sdhbhat, mnolshat = mnolshat, sdolshat = sdolshat, mnarobhat = mnarobhat, sdarobhat = sdarobhat, mnbbhat = mnbbhat, sdbbhat = sdbbhat) } hdhot1wsim<-function(n=100,p=10,B=100,nruns=100,xtype=1,eps=0.4,dd=4,delta=0, covtyp=1,psi=0.1,alpha=0.1){ # This R function simulates the one sample Hotelling's T^2 test based on the # bootstrap where n/p is small and the identity matrix is the dispersion # matrix. Need p > 1. Here p is the dimension of the mean mu. # In the literature we sometimes use m instead of p. # Calls identrgn.
# xtype = 1 for MVN Np(0,I), # 2 for (1 - eps) Np(0,I) + eps Np(0, 25 I) # 3 for multivariate t_d with d = dd # 4 for lognormal. # Power can be estimated by increasing delta so mu = delta(1,...,1) # and mu0 = (0, ..., 0). # For MVN data, Cov(x) = I for covtyp=1. # Cov(x) = diag(1,...,p) for covtyp=2. # For covtyp = 3, cor(x_i,x_j) = rho for i not= j and # for MVN data this results in a covariance matrix with eigenvector # c(1, ..., 1)^T corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. # cor(X_i,X_j) = [2 psi +(p-2)psi^2]/[1 + (p-1)psi^2], i not = j # when the correlation exists. A <- sqrt(diag(1:p)) if(covtyp==3){ A <- matrix(psi,nrow=p,ncol=p) diag(A) <- 1} munot <- 0 * (1:p) mu <- delta * (1 + munot) indx <- 1:n mnmus <- matrix(0,nrow=B,ncol=p) medmus <- mnmus prcv <- 0 brcv <- 0 prmedcv <- 0 brmedcv <- 0 chisqcut <- qchisq(p=(1-alpha),df=p) cutoffs <- 1:nruns cutbr <- cutoffs for(i in 1:nruns) { #make data x <- matrix(rnorm(n*p), ncol = p, nrow = n) if(xtype == 2) { zu <- runif(n) x[zu < eps, ] <- x[zu < eps, ] * 5 } if(xtype == 3) { zu <- sqrt(rchisq(n, dd)/dd) x <- x/zu } if(xtype == 4){ x <- exp(x) x <- x-exp(0.5) } if(covtyp>1){ x <- x %*% A} x <- mu + x cent <- apply(x,2,mean) centmed <- apply(x,2,median) #x has mean mu =delta(1,...,1)^T #get bootstrapped xbar1 for(j in 1:B){ tem <- sample(indx,n,replace=T) mnx <- apply(x[tem,],2,mean) mnmus[j,] <- mnx medx <- apply(x[tem,],2,median) medmus[j,] <- medx } outmn <- identrgn(mnmus,stat=cent,alpha=alpha) prcv <- prcv + outmn$inr brcv <- brcv + outmn$inr2 cutoffs[i]<- (outmn$cuplim)^2 cutbr[i] <- (outmn$cuplim2)^2 outmed <- identrgn(medmus,stat=centmed,alpha=alpha) prmedcv <- prmedcv + outmed$inr brmedcv <- brmedcv + outmed$inr2 } #prop of times Ho is rejected prcv <- 1 - prcv/nruns brcv <- 1 - brcv/nruns prmedcv <- 1 - prmedcv/nruns brmedcv <- 1 - brmedcv/nruns mncut <- mean(cutoffs) mncutbr <- mean(cutbr) list(chisqcut=chisqcut,mncut=mncut,mncutbr=mncutbr,prcv=prcv,brcv=brcv, prmedcv=prmedcv,brmedcv=brmedcv) } hdhot2wsim<-function(n1=100,n2=100,p=10,B=100,nruns=100,xtype=1,sig=1, eps=0.4,dd=4,delta=0,covtyp=1,alpha=0.05){ # This R function simulates the two sample Hotelling's T^2 test based on the # bootstrap where n/p is small and the identity matrix is the dispersion #matrix. Need p > 1. #Calls identrgn. # xtype = 1 for MVN Np(0,I), # 2 for (1 - eps) Np(0,I) + eps Np(0, 25 I) # 3 for multivariate t_d with d = dd # 4 for lognormal. # Power can be estimated by increasing delta so mu = delta(1,...,1) # and mu2 = (0, ..., 0). # Cov(x) = I, Cov(x2) = sig^2 Cov(x) for covtyp=1. # Cov(x) = diag(1,2,...,p), Cov(x2) = sig^2 Cov(x) for covtyp=2. 
# Cov(x) = I, Cov(x2) = sig^2 diag(1,2,...,p) for covtyp=3 A <- sqrt(diag(1:p)) munot <- 0 * (1:p) mu <- delta * (1 + munot) indx <- 1:n1 indx2 <- 1:n2 mnmus <- matrix(0,nrow=B,ncol=p) medmus <- mnmus prcv <- 0 brcv <- 0 prmedcv <- 0 brmedcv <- 0 chisqcut <- qchisq(p=(1-alpha),df=p) cutoffs <- 1:nruns cutbr <- cutoffs for(i in 1:nruns) { #make data x <- matrix(rnorm(n1 * p), ncol = p, nrow = n1) x2 <- matrix(rnorm(n2 * p), ncol = p, nrow = n2) if(xtype == 2) { zu <- runif(n1) x[zu < eps, ] <- x[zu < eps, ] * 5 zu <- runif(n2) x2[zu < eps, ] <- x2[zu < eps, ] * 5 } if(xtype == 3) { zu <- sqrt(rchisq(n1, dd)/dd) x <- x/zu zu <- sqrt(rchisq(n2, dd)/dd) x2 <- x2/zu } if(xtype == 4){ x <- exp(x) x <- x-exp(0.5) x2 <- exp(x2) x2 <- x2 - exp(0.5) } if(covtyp==2){ x <- x %*% A x2 <- x2 %*% A} if(covtyp==3){ x2 <- x2 %*% A} x2 <- sig * x2 x <- mu + x cent <- apply(x,2,mean) - apply(x2,2,mean) centmed <- apply(x,2,median) - apply(x2,2,median) #x has mean mu =delta(1,...,1)^T, x2 has mean (0,...,0)^T #get bootstrapped xbar1 - xbar2 for(j in 1:B){ tem <- sample(indx,n1,replace=T) mnx <- apply(x[tem,],2,mean) medx <- apply(x[tem,],2,median) tem <- sample(indx2,n2,replace=T) mnx2 <- apply(x2[tem,],2,mean) medx2 <- apply(x2[tem,],2,median) mnmus[j,] <- mnx-mnx2 medmus[j,] <- medx-medx2 } outmn <- identrgn(mnmus,stat=cent,alpha=alpha) prcv <- prcv + outmn$inr brcv <- brcv + outmn$inr2 cutoffs[i]<-(outmn$cuplim)^2 cutbr[i] <- (outmn$cuplim2)^2 outmed <- identrgn(medmus,stat=centmed,alpha=alpha) prmedcv <- prmedcv + outmed$inr brmedcv <- brmedcv + outmed$inr2 } #prop of times Ho is rejected prcv <- 1 - prcv/nruns brcv <- 1 - brcv/nruns prmedcv <- 1 - prmedcv/nruns brmedcv <- 1 - brmedcv/nruns mncut <- mean(cutoffs) mncutbr <- mean(cutbr) list(chisqcut=chisqcut,mncut=mncut,mncutbr=mncutbr,prcv=prcv,brcv=brcv, prmedcv=prmedcv,brmedcv=brmedcv) } hdmansim<-function(n1=100,n2=100,n3=100,m=2,B=100,nruns=100,ytype=1, sig2=1,sig3=1,eps=0.4,dd=4,delta1=0,delta2=0,delta3=0,cov3I=F,alpha=0.1){ # This function simulates high dimensional one way Manova type tests for # Ho: mu_1 = mu_2 = mu_3 where p = g = 3 = number of groups. # Has the test using C_n = I. Test with standardized data is also given. # yi is m by 1. # Need m > 1. Want m >= ni. # Multiply y by A where ytype = 1 for MVN Nm(0,I), # 2 for (1 - eps) Nm(0,I) + eps Nm(0, 25 I), # 3 for multivariate t_d with d = dd, # 4 for lognormal. # Power can be estimated by using unequal deltai so mu1 = delta1(1,...,1) # and mu2 = delta2(1, ..., 1), mu3 = delta3(1,...,1). # Cov(y1) = diag(1,2,...,m), Cov(y2) = sig^2 Cov(y1) for clean data. # Cov(y3) = sig^3 Cov(y1) for clean data if cov3I = F, # or Cov(y3) = cI_3 if cov3I = T. 
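## Example sketch: a short run under Ho (all deltai = 0), so the rejection
## proportions prcv, brcv, sprcv, sbrcv should be near alpha:
## hdmansim(n1=50, n2=50, n3=50, m=100, B=100, nruns=50)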
A <- sqrt(diag(1:m)) munot <- 0 * (1:m) mu1 <- delta1 * (1 + munot) mu2 <- delta2 * (1 + munot) mu3 <- delta3 * (1 + munot) indx1 <- 1:n1 indx2 <- 1:n2 indx3 <- 1:n3 mnmus <- matrix(0,nrow=B,ncol=2*m) smnmus <- matrix(0,nrow=B,ncol=2*m) cutoffs <- 1:nruns cutbr <- cutoffs scutoffs <- cutoffs scutbr <- cutoffs prcv <- 0 brcv <- 0 sprcv <- 0 sbrcv <- 0 for(i in 1:nruns) { #make data y1 <- matrix(rnorm(n1 * m), ncol = m, nrow = n1) y2 <- matrix(rnorm(n2 * m), ncol = m, nrow = n2) y3 <- matrix(rnorm(n3 * m), ncol = m, nrow = n3) if(ytype == 2) { zu <- runif(n1) y1[zu < eps, ] <- y1[zu < eps, ] * 5 zu <- runif(n2) y2[zu < eps, ] <- y2[zu < eps, ] * 5 zu <- runif(n3) y3[zu < eps, ] <- y3[zu < eps, ] * 5 } if(ytype == 3) { zu <- sqrt(rchisq(n1, dd)/dd) y1 <- y1/zu zu <- sqrt(rchisq(n2, dd)/dd) y2 <- y2/zu zu <- sqrt(rchisq(n3, dd)/dd) y3 <- y3/zu } if(ytype == 4){ y1 <- exp(y1) y1 <- y1 - exp(0.5) y2 <- exp(y2) y2 <- y2 - exp(0.5) y3 <- exp(y3) y3 <- y3 - exp(0.5) } y1 <- y1 %*% A y2 <- y2 %*% A y2 <- sig2 * y2 if( cov3I != T){ y3 <- y3 %*% A y3 <- sig3 * y3} y1 <- mu1 + y1 y2 <- mu2 + y2 y3 <- mu3 + y3 #yi has mean mui = deltai (1,...,1)^T diag1 <- 1/apply(y1,2,sd) D <- diag(diag1) z1 <- y1%*%D diag1 <- 1/apply(y2,2,sd) D <- diag(diag1) z2 <- y2%*%D diag1 <- 1/apply(y3,2,sd) D <- diag(diag1) z3 <- y3%*%D #Get the classical test w T1 <- apply(y1,2,mean) T2 <- apply(y2,2,mean) T3 <- apply(y3,2,mean) w <- c(T1-T3,T2-T3) T1s <- apply(z1,2,mean) T2s <- apply(z2,2,mean) T3s <- apply(z3,2,mean) ws <- c(T1s-T3s,T2s-T3s) #get bootstrapped Ty - Ty3, Ty2 - Ty3 for various statistics T for(j in 1:B){ tem <- sample(indx1,n1,replace=T) mny1 <- apply(y1[tem,],2,mean) mnz1 <- apply(z1[tem,],2,mean) tem <- sample(indx2,n2,replace=T) mny2 <- apply(y2[tem,],2,mean) mnz2 <- apply(z2[tem,],2,mean) tem <- sample(indx3,n3,replace=T) mny3 <- apply(y3[tem,],2,mean) mnz3 <- apply(z3[tem,],2,mean) mnmus[j,] <- c(mny1-mny3,mny2-mny3) smnmus[j,] <- c(mnz1-mnz3,mnz2-mnz3) } outmn <- identrgn(mnmus,stat=w,alpha=alpha) prcv <- prcv + outmn$inr brcv <- brcv + outmn$inr2 cutoffs[i]<-(outmn$cuplim)^2 cutbr[i] <- (outmn$cuplim2)^2 outmn <- identrgn(smnmus,stat=ws,alpha=alpha) sprcv <- sprcv + outmn$inr sbrcv <- sbrcv + outmn$inr2 scutoffs[i]<-(outmn$cuplim)^2 scutbr[i] <- (outmn$cuplim2)^2 } #prop of times Ho is rejected prcv <- 1 - prcv/nruns brcv <- 1 - brcv/nruns sprcv <- 1 - sprcv/nruns sbrcv <- 1 - sbrcv/nruns # list(prcv=prcv,brcv=brcv,sprcv=sprcv,sbrcv=sbrcv) } hdr2<-function(x,xf,alph=0.05){ #see if xf is in the estimated highest density region #scaled nonparametric density estimator, poor for p > 2 #kernel is uniform on a hypersphere x <- as.matrix(x) n <- dim(x)[1] tem <- ceiling(2*sqrt(n)) p <- dim(x)[2] covv <- as.matrix(diag(p)) #identity matrix mns <- apply(x, 2, mean) #columnwise means, not the grand mean radsq <- sort(mahalanobis(x,mns,covv))[tem] fh<-1:n inr<-0 for(i in 1:n){ fh[i] <- sum( mahalanobis(x,x[i,],covv) <= radsq ) fh[i] <- fh[i]/( n * sqrt(radsq)^p ) } falph <- quantile(fh,alph) fhatxf <- sum( mahalanobis(x,xf,covv) <= radsq ) fhatxf <- fhatxf/( n * sqrt(radsq)^p ) #inr <- 1 if xf is in the prediction region if(fhatxf >= falph){ inr <- 1} list(inr=inr) } hot2sampsim<-function(n1=100,n2=100,p=2,B=100,nruns=100,xtype=1,sig=1, eps=0.4,dd=4,delta=0,covtyp=1,alpha=0.05){ # This R function simulates the two sample Hotelling's T^2 test, # a bootstrap test to get a cutoff, and a large sample test. # Two bootstrap tests using C_n = I are also used. # Need p > 1. Want n1 > 20p, n2 > 20p. #Calls identrgn.
# Multiply x by A where xtype = 1 for MVN Np(0,I), # 2, (with delta = eps) for (1 - delta) Np(0,I) + delta Np(0, 25 I) # 3 for multivariate t_d with d = dd # 4 for lognormal. # Power can be estimated by increasing delta so mu = delta(1,...,1) # and mu2 = (0, ..., 0). # Cov(x) = I, Cov(x2) = sig^2 Cov(x) for covtyp=1. # Cov(x) = diag(1,2,...,p), Cov(x2) = sig^2 Cov(x) for covtyp=2. # Cov(x) = I, Cov(x2) = sig^2 diag(1,2,...,p) for covtyp=3 A <- sqrt(diag(1:p)) munot <- 0 * (1:p) mu <- delta * (1 + munot) indx <- 1:n1 indx2 <- 1:n2 n<-n1+n2 mnmus <- matrix(0,nrow=B,ncol=p) prcv <- 0 brcv <- 0 LScov <- 0 Ccov <- 0 bstat <- 1:B cstat <- 0 * 1:nruns bcut <- 1:nruns dn=min(n1-p,n2-p) LScut<- p*qf(p=(1-alpha),df1=p,df2=dn) Ccut <- p*qf(p=(1-alpha),df1=p,df2=(n-2)) up <- min((1 - alpha/2), (1 - alpha + 10*alpha*p/B)) if(alpha > 0.1) up <- min((1 - alpha + 0.05), (1 - alpha + p/B)) qn <- up if(qn < 1 - alpha + 0.001) up <- 1 - alpha for(i in 1:nruns) { #make data x <- matrix(rnorm(n1 * p), ncol = p, nrow = n1) x2 <- matrix(rnorm(n2 * p), ncol = p, nrow = n2) if(xtype == 2) { zu <- runif(n1) x[zu < eps, ] <- x[zu < eps, ] * 5 zu <- runif(n2) x2[zu < eps, ] <- x2[zu < eps, ] * 5 } if(xtype == 3) { zu <- sqrt(rchisq(n1, dd)/dd) x <- x/zu zu <- sqrt(rchisq(n2, dd)/dd) x2 <- x2/zu } if(xtype == 4){ x <- exp(x) x <- x-exp(0.5) x2 <- exp(x2) x2 <- x2 - exp(0.5) } if(covtyp==2){ x <- x %*% A x2 <- x2 %*% A} if(covtyp==3){ x2 <- x2 %*% A} x2 <- sig * x2 x <- mu + x #x has mean mu =delta(1,...,1)^T, x2 has mean (0,...,0)^T cent <- apply(x,2,mean) - apply(x2,2,mean) S1<-cov(x) S2<-cov(x2) Sigma<-(S1/n1 + S2/n2) Sinv <- solve(Sigma) TLsq <- t(cent) %*% Sinv %*% cent if(TLsq <= LScut) LScov <- LScov+1 Sp<-(1/n1 + 1/n2)*( (n1-1)*S1 + (n2-1)*S2 )/(n-2) Spinv <- solve(Sp) #inverse scaled pooled cov matrix TCsq <- t(cent) %*% Spinv %*% cent cstat[i] <- TCsq if(TCsq <= Ccut) Ccov <- Ccov+1 for(j in 1:B){ tem <- sample(indx,n1,replace=T) mnx <- apply(x[tem,],2,mean) tem <- sample(indx2,n2,replace=T) mnx2 <- apply(x2[tem,],2,mean) mnmus[j,] <- mnx-mnx2 diff <- mnx-mnx2 - cent bstat[j]<- t(diff) %*% Spinv %*% diff } bcut[i] <- quantile(bstat,up) outmn <- identrgn(mnmus,stat=cent,alpha=alpha) prcv <- prcv + outmn$inr brcv <- brcv + outmn$inr2 } Ccov<-Ccov/nruns LScov <- LScov/nruns bootcov <- sum(cstat <= bcut)/nruns prcv <- prcv/nruns brcv <- brcv/nruns # list(Ccov=Ccov,LScov=LScov,bootcov=bootcov,prcv=prcv,brcv=brcv) } hot2sim<-function(n = 100, n2 = 100, p = 2, csteps = 5, B= 100, gam = 0.4, nruns = 100, xtype = 1, outliers = 0, pm = 10, sig = 1, eps = 0.4, dd=4, delta = 0){ # Needs library(Hotelling). # Partially written by Hasthika S. Rupasinghe Arachchige Don. # This R function simulates the two sample Hotelling's T^2 test based on the # bootstrap. Here x2 is clean, x = x1 may have outliers. # Uses the coordinatewise mean, median, 25% trimmed mean, # and RMVN location estimators as well as the classical test. # The hotsim2 function does not do the classical test. # Need p > 1. Want n > 20p, n2 > 20p, B > 20p. # Multiply x by A where xtype = 1 for MVN Np(0,I), # 2, (with delta = eps) for (1 - delta) Np(0,I) + delta Np(0, 25 I) # 3 for multivariate t_d with d = dd # 4 for lognormal. 
# outliers = 0 for no outliers and X~N(0,diag(1,...,p)), # 1 for outliers a tight cluster at major axis (0,...,0,pm)' # 2 for outliers a tight cluster at minor axis (pm,0, ...,0)' # 3 for outliers X~N((pm,...,pm)',diag(1,...,p)) # 4 for outliers X[i,p] = pm # 5 for outliers X[i,1] = pm # For the point mass outlier types 4 and 5, need gam much smaller than 0.4. # Power can be estimated by increasing delta so mu = delta(1,...,1) # and mu2 = (0, ..., 0). # Cov(x) = diag(1,2,...,p), Cov(x2) = sig^2 Cov(x) for clean data. # For outliers=0, want hquant and rquant approx 1. A <- sqrt(diag(1:p)) munot <- 0 * (1:p) mu <- delta * (1 + munot) val <- floor(gam * n) indx <- 1:n indx2 <- 1:n2 medmus <- matrix(0,nrow=B,ncol=p) mnmus <- medmus tmnmus <- medmus rmvnmus <- medmus medcv <- 0 mncv <- 0 tmncv <- 0 rmvncv <- 0 HTcv <- 0 HTpval <- 0 HT <- matrix(0, ncol = 1, nrow = nruns) chisqcut <- qchisq(p=0.95,df=p) cutoffs <- matrix(0,nrow=nruns,ncol=4) for(i in 1:nruns) { #make data x <- matrix(rnorm(n * p), ncol = p, nrow = n) x2 <- matrix(rnorm(n2 * p), ncol = p, nrow = n2) if(xtype == 2) { zu <- runif(n) x[zu < eps, ] <- x[zu < eps, ] * 5 zu <- runif(n2) x2[zu < eps, ] <- x2[zu < eps, ] * 5 } if(xtype == 3) { zu <- sqrt(rchisq(n, dd)/dd) x <- x/zu zu <- sqrt(rchisq(n2, dd)/dd) x2 <- x2/zu } if(xtype == 4){ #Want pop coord med(x) = 0. x <- exp(x) x <- x - 1 x2 <- exp(x2) x2 <- x2 - 1 } x <- x %*% A x2 <- x2 %*% A x2 <- sig * x2 if(outliers == 1) { x[1:val, ] <- matrix(rnorm(val * p, sd = 0.01), ncol = p, nrow = val) x[1:val, p] <- x[1:val, p] + pm } if(outliers == 2) { x[1:val, ] <- matrix(rnorm(val * p, sd = 0.01), ncol = p, nrow = val) x[1:val, 1] <- x[1:val, 1] + pm } if(outliers == 3) { tem <- pm + 0 * 1:p x[1:val, ] <- x[1:val, ] + tem } if(outliers == 4) { x[1:val, p] <- pm } if(outliers == 5) { x[1:val, 1] <- pm } x <- mu + x #clean x has mean mu =delta(1,...,1)^T, x2 has mean (0,...,0)^T #get bootstrapped Tx - Tx2 for various statistics T for(j in 1:B){ tem <- sample(indx,n,replace=T) medx <- apply(x[tem,],2,median) mnx <- apply(x[tem,],2,mean) tmnx <- apply(x[tem,],2,tmn) rmvnx <- covrmvn(x[tem,])$center tem <- sample(indx2,n2,replace=T) medx2 <- apply(x2[tem,],2,median) mnx2 <- apply(x2[tem,],2,mean) tmnx2 <- apply(x2[tem,],2,tmn) rmvnx2 <- covrmvn(x2[tem,])$center medmus[j,] <- medx-medx2 mnmus[j,] <- mnx-mnx2 tmnmus[j,] <- tmnx-tmnx2 rmvnmus[j,] <- rmvnx-rmvnx2 } outmed<-predreg(medmus) medcv <- medcv + outmed$inr outmn <- predreg(mnmus) mncv <- mncv + outmn$inr outtmn <- predreg(tmnmus) tmncv <- tmncv + outtmn$inr outrmvn <- predreg(rmvnmus) rmvncv <- rmvncv + outrmvn$inr cutoffs[i,]<-c(outmed$cuplim,outmn$cuplim,outtmn$cuplim,outrmvn$cuplim)^2 HTpval <- hotelling.test(x, x2) if(HTpval$pval < 0.05){ HT[i,1] <- 1} } medcv <- 1 - medcv/nruns #prop of times Ho is rejected mncv <- 1 - mncv/nruns tmncv <- 1 - tmncv/nruns rmvncv <- 1 - rmvncv/nruns HTcv <- sum(HT)/nruns mncut <- apply(cutoffs,2,mean) list(chisqcut = chisqcut, mncut=mncut, medcv = medcv,mncv = mncv,tmncv=tmncv,rmvncv=rmvncv, HTcv = HTcv) } hot2sim2<-function(n = 100, n2 = 100, p = 2, csteps = 5, B= 100, gam = 0.4, nruns = 100, xtype = 1, outliers = 0, pm = 10, sig = 1, eps = 0.4, dd=4, delta = 0){ # This R function simulates the two sample Hotelling's T^2 test based on the #bootstrap. Here x2 is clean, x = x1 may have outliers. # Uses the coordinatewise mean, median, 25% trimmed mean, and RMVN location estimators. #hotsim also simulates the classical estimator # Need p > 1. Want n > 20p, n2 > 20p, B > 20p. 
# Multiply x by A where xtype = 1 for MVN Np(0,I), # 2, (with delta = eps) for (1 - delta) Np(0,I) + delta Np(0, 25 I) # 3 for multivariate t_d with d = dd # 4 for lognormal. # outliers = 0 for no outliers and X~N(0,diag(1,...,p)), # 1 for outliers a tight cluster at major axis (0,...,0,pm)' # 2 for outliers a tight cluster at minor axis (pm,0, ...,0)' # 3 for outliers X~N((pm,...,pm)',diag(1,...,p)) # 4 for outliers X[i,p] = pm # 5 for outliers X[i,1] = pm # For the point mass outlier types 4 and 5, need gam much smaller than 0.4. # Power can be estimated by increasing delta so mu = delta(1,...,1) # and mu2 = (0, ..., 0). # Cov(x) = diag(1,2,...,p), Cov(x2) = sig^2 Cov(x) for clean data. # For outliers=0, want hquant and rquant approx 1. A <- sqrt(diag(1:p)) munot <- 0 * (1:p) mu <- delta * (1 + munot) val <- floor(gam * n) indx <- 1:n indx2 <- 1:n2 medmus <- matrix(0,nrow=B,ncol=p) mnmus <- medmus tmnmus <- medmus rmvnmus <- medmus medcv <- 0 mncv <- 0 tmncv <- 0 rmvncv <- 0 chisqcut <- qchisq(p=0.95,df=p) cutoffs <- matrix(0,nrow=nruns,ncol=4) for(i in 1:nruns) { #make data x <- matrix(rnorm(n * p), ncol = p, nrow = n) x2 <- matrix(rnorm(n2 * p), ncol = p, nrow = n2) if(xtype == 2) { zu <- runif(n) x[zu < eps, ] <- x[zu < eps, ] * 5 zu <- runif(n2) x2[zu < eps, ] <- x2[zu < eps, ] * 5 } if(xtype == 3) { zu <- sqrt(rchisq(n, dd)/dd) x <- x/zu zu <- sqrt(rchisq(n2, dd)/dd) x2 <- x2/zu } if(xtype == 4){ #Want pop coord med(x) = 0. x <- exp(x) x <- x - 1 x2 <- exp(x2) x2 <- x2 - 1 } x <- x %*% A x2 <- x2 %*% A x2 <- sig * x2 if(outliers == 1) { x[1:val, ] <- matrix(rnorm(val * p, sd = 0.01), ncol = p, nrow = val) x[1:val, p] <- x[1:val, p] + pm } if(outliers == 2) { x[1:val, ] <- matrix(rnorm(val * p, sd = 0.01), ncol = p, nrow = val) x[1:val, 1] <- x[1:val, 1] + pm } if(outliers == 3) { tem <- pm + 0 * 1:p x[1:val, ] <- x[1:val, ] + tem } if(outliers == 4) { x[1:val, p] <- pm } if(outliers == 5) { x[1:val, 1] <- pm } x <- mu + x #clean x has mean mu =delta(1,...,1)^T, x2 has mean (0,...,0)^T #get bootstrapped Tx - Tx2 for various statistics T for(j in 1:B){ tem <- sample(indx,n,replace=T) medx <- apply(x[tem,],2,median) mnx <- apply(x[tem,],2,mean) tmnx <- apply(x[tem,],2,tmn) rmvnx <- covrmvn(x[tem,])$center tem <- sample(indx2,n2,replace=T) medx2 <- apply(x2[tem,],2,median) mnx2 <- apply(x2[tem,],2,mean) tmnx2 <- apply(x2[tem,],2,tmn) rmvnx2 <- covrmvn(x2[tem,])$center medmus[j,] <- medx-medx2 mnmus[j,] <- mnx-mnx2 tmnmus[j,] <- tmnx-tmnx2 rmvnmus[j,] <- rmvnx-rmvnx2 } outmed<-predreg(medmus) medcv <- medcv + outmed$inr outmn <- predreg(mnmus) mncv <- mncv + outmn$inr outtmn <- predreg(tmnmus) tmncv <- tmncv + outtmn$inr outrmvn <- predreg(rmvnmus) rmvncv <- rmvncv + outrmvn$inr cutoffs[i,]<-c(outmed$cuplim,outmn$cuplim,outtmn$cuplim,outrmvn$cuplim)^2 } medcv <- 1 - medcv/nruns #prop of times Ho is rejected mncv <- 1 - mncv/nruns tmncv <- 1 - tmncv/nruns rmvncv <- 1 - rmvncv/nruns mncut <- apply(cutoffs,2,mean) list(chisqcut = chisqcut, mncut=mncut, medcv = medcv, mncv = mncv,tmncv=tmncv,rmvncv=rmvncv) } identrgn<-function(x, stat, alpha = 0.05){ # Makes two confidence regions for the rows of x # using I as the dispersion matrix. #Also computes the distance for the 0 vector #to check if the 0 vector is in the confidence region. 
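## Example sketch for identrgn (the rows of x play the role of the B
## bootstrapped statistics):
## x <- matrix(rnorm(600, mean=0.1), ncol = 3)
## identrgn(x, stat = apply(x,2,mean))$inr #1 if the 0 vector is in the region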
x <- as.matrix(x) p <- dim(x)[2] n <- dim(x)[1] #n is actually B C <- diag(p) zero <- 0*1:p inr<-0 inr2<-0 up <- 1 - alpha #quantile for confidence region #prediction region method tbar <- apply(x,2,mean) #md2 <- mahalanobis(x, center=tbar, cov=C) will invert C md2 <- mahalanobis(x, center=tbar, cov=C,inverted=TRUE) # MD is the classical Euclidean distance MD <- sqrt(md2) #get PR confidence region cutoff cuplim <- sqrt(quantile(md2,up)) D0 <- sqrt(mahalanobis(zero,center=tbar, cov=C)) if(D0 <= cuplim) inr <- 1 #Bickel and Ren type method md2 <- mahalanobis(x, center=stat, cov=C,inverted=TRUE) MD <- sqrt(md2) #get modified Bickel and Ren confidence region cutoff cuplim2 <- sqrt(quantile(md2,up)) D0br <- sqrt(mahalanobis(zero, center=stat, cov=C)) if(D0br <= cuplim2) inr2 <- 1 list(MD=MD,cuplim=cuplim,cuplim2=cuplim2,D0=D0,D0br=D0br,inr=inr,inr2=inr2) } lmsviews<-function(x, Y, ii = 1) {# Trimmed views using lmsreg for 90, 80, ..., 0 percent # trimming. Allows visualization of m # and crude estimation of c beta in models # of the form y = m(x^T beta) + e. # Workstation: activate a graphics # device with commands "X11()" or "motif()." # R needs command "library(MASS)" or "library(lqs)." # Advance the view with the right mouse button and # in R, highlight "stop." x <- as.matrix(x) out <- cov.mcd(x) center <- out$center cov <- out$cov rd2 <- mahalanobis(x, center, cov) labs <- c("90%", "80%", "70%", "60%", "50%", "40%", "30%", "20%", "10%", "0%") tem <- seq(0.1, 1, 0.1) for(i in ii:10) { val <- quantile(rd2, tem[i]) b <- lmsreg(x[rd2 <= val, ], Y[rd2 <= val])$coef ESP <- x %*% b[-1] + b[1] plot(ESP, Y) title(labs[i]) identify(ESP, Y) print(b) } } lrdata <- function(n = 200, type = 3) {# Generates data for logistic regression. # If X|y=1 ~ N(mu_1,I) and X|Y=0 ~ N(0,I) then beta = mu_1 and alpha = -0.5 ||mu_1||^2. # # If q is changed, change the formula in the glm statement. q <- 5 y <- 0 * 1:n y[(n/2 + 1):n] <- y[(n/2 + 1):n] + 1 beta <- 0 * 1:q if(type == 1) { beta[1] <- 1 alpha <- -0.5 } if(type == 2) { beta <- beta + 1 alpha <- -q/2 } if(type == 3) { beta[1:3] <- 1 alpha <- -1.5 } x <- matrix(rnorm(n * q), nrow = n, ncol = q) if(type == 1) { x[(n/2 + 1):n, 1] <- x[(n/2 + 1 ):n, 1] + 1 } if(type == 2) { x[(n/2 + 1):n, ] <- x[(n/2 + 1 ):n, ] + 1 } if(type == 3) { x[(n/2 + 1):n, 1:3 ] <- x[(n/2 + 1 ):n, 1:3 ] + 1 } #X|y=0 ~ N(0, I) and X|y=1 ~ N(beta,I) # change formula to x[,1]+ ... + x[,q] with q out <- glm(y ~ x[, 1] + x[, 2] + x[, 3] + x[, 4] + x[,5], family = binomial) list(alpha = alpha, beta = beta, lrcoef = out$coef,x=x,y=y) } lrdata2<-function(n = 200, p = 5, m = 10, type = 1) {# Generates data for binomial logistic regression. q <- p -1 y <- 1:n z <- y mv <- m + 0 * y beta <- 0 * 1:q if(type == 1) { beta <- beta + 2 alpha <- 0 } if(type == 2) { beta[1] <- 2 alpha <- 0 } if(type == 3) { beta[1:3] <- 2 alpha <- 0 } x <- matrix(rnorm(n * q), nrow = n, ncol = q) SP <- alpha + x %*% beta pv <- exp(SP)/(1 + exp(SP)) for(i in 1:n) y[i] <- rbinom(1, size = m, prob = pv[i]) Z <- y/m out<-glm(Z~., family=binomial,weights=mv,data=data.frame(cbind(x,Z))) list(x = x, y = y, mv = mv, alpha = alpha, beta = beta, lrcoef = out$coef) } lrplot2<-function(x, y, slices = 10, ff = 0.99, step = T) {# Makes the ESSP for binary logistic regression. # If step = T use a step function, else use lowess. # Workstation: need to activate a graphics # device with command "X11()" or "motif()." # #ESP <- x %*% out$coef[-1] + out$coef[1] #If q is changed, change the formula in the glm statement.
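## Example sketch for lrplot2 using data from lrdata (sourced from this file):
## out <- lrdata(n = 200)
## lrplot2(out$x, out$y) #response plot with the step function estimate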
#q <- 5 # change formula to x[,1]+ ... + x[,q] with q #out <- glm(y ~ x[, 1] + x[, 2] + x[, 3] + x[, 4] + x[, 5], family = binomial) out <- glm(y~., family=binomial, data=data.frame(cbind(x,y))) ESP <- predict(out) Y <- y plot(ESP, Y) fit <- y fit <- exp(ESP)/(1 + exp(ESP)) # lines(sort(ESP),sort(fit)) indx <- sort.list(ESP) lines(ESP[indx], fit[indx]) if(step == T){ fit2 <- fit n <- length(y) val <- as.integer(n/slices) for(i in 1:(slices - 1)) { fit2[((i - 1) * val + 1):(i * val)] <- mean(y[indx[((i - 1) * val + 1):(i * val)]]) } fit2[((slices - 1) * val + 1):n] <- mean(y[indx[((slices - 1) * val + 1 ):n]]) # fit2 is already sorted in order corresponding to indx lines(ESP[indx], fit2)} else lines(lowess(ESP, Y, f = ff), type = "s") title("Response Plot") } lrplot3<-function(ESP, y, slices = 10, ff = 0.99, step = T) {# Makes the response plot for binary logistic regression. # If out <- glm(y ~ x[, 1] + x[, 2], family = binomial) # use ESP <- predict(out). # If outgam <- gam(STA ~ s(AGE)+SEX+RACE+SER+CAN+CRN+INF+CPR+s(SYS)+s(HRA) # +PRE+TYP+FRA+PO2+PH+PCO+Bic+CRE+LOC,family=binomial,data=icu2) # use ESP <- predict.gam(outgam). # If step = T use a step function, else use lowess. # Y <- y plot(ESP, Y) fit <- y fit <- exp(ESP)/(1 + exp(ESP)) # lines(sort(ESP),sort(fit)) indx <- sort.list(ESP) lines(ESP[indx], fit[indx]) if(step == T){ fit2 <- fit n <- length(y) val <- as.integer(n/slices) for(i in 1:(slices - 1)) { fit2[((i - 1) * val + 1):(i * val)] <- mean(y[indx[((i - 1) * val + 1):(i * val)]]) } fit2[((slices - 1) * val + 1):n] <- mean(y[indx[((slices - 1) * val + 1 ):n]]) # fit2 is already sorted in order corresponding to indx lines(ESP[indx], fit2)} else lines(lowess(ESP, Y, f = ff), type = "s") title("Response Plot") } lrplot4<-function(x, y, mv, slices = 10, ff = 0.97, step = T) {# Makes the response plot and OD plot for binomial logistic regression. # mv = (m1, ..., mn) where yi is bin(mi,p(SP)) # If step = T use a step function, else use lowess. # Workstation: need to activate a graphics # device with command "X11()" or "motif()." # #ESP <- x %*% out$coef[-1] + out$coef[1] # n <- length(y) tdata <- as.data.frame(cbind(x,y)) ny <- mv-y #ny[i] = mv[i]-y[i] = no. of "failures" out <- glm(cbind(y,ny)~., family=binomial, data=tdata) ESP <- predict(out) par(mfrow = c(1, 2),pty="s") Z <- y/mv plot(ESP, Z) abline(weighted.mean(Z, mv), 0) fit <- y fit <- exp(ESP)/(1 + exp(ESP)) indx <- sort.list(ESP) lines(ESP[indx], fit[indx]) if(step == T) { fit2 <- fit val <- as.integer(n/slices) for(i in 1:(slices - 1)) { fit2[((i - 1) * val + 1):(i * val)] <- weighted.mean(Z[ indx[((i - 1) * val + 1):(i * val)]], mv[indx[(( i - 1) * val + 1):(i * val)]]) } fit2[((slices - 1) * val + 1):n] <- weighted.mean(Z[indx[(( slices - 1) * val + 1):n]], mv[indx[((slices - 1) * val + 1):n]]) # fit2 is already sorted in order corresponding to indx lines(ESP[indx], fit2) } else lines(lowess(ESP, Z, f = ff), type = "s") title("a) Response Plot") #get OD plot val <- exp(ESP)/(1 + exp(ESP)) Ehat <- mv * val Vmodhat <- Ehat * (1 - val) Vhat <- (y - Ehat)^2 plot(Vmodhat, Vhat) abline(0, 1) abline(0, 4) abline(lsfit(Vmodhat, Vhat)$coef) title("b) OD Plot") } lsviews<-function(x, Y, ii = 10, type = 1) {# This function is the same as tvreg except that the untrimmed # cases are highlighted. It compares the LS fits for 0, 10, # ..., 90 percent trimming. # Use ii = 10 for 0, ii = 9 for 10 then 0, ..., # ii = 1 for 90 then 80 ..., then 0 percent trimming. # Used to visualize g if y = g(beta^T x,e).
# Workstation: activate a graphics # device with command "X11()" or "motif()." # R needs command "library(MASS)." # Advance the view with the right mouse button. # In R, highlight ``stop." x <- as.matrix(x) if(type == 1) out <- cov.mcd(x) else out <- covmba(x) center <- out$center cov <- out$cov rd2 <- mahalanobis(x, center, cov) labs <- c("90%", "80%", "70%", "60%", "50%", "40%", "30%", "20%", "10%", "0%") tem <- seq(0.1, 1, 0.1) for(i in ii:10) { val <- quantile(rd2, tem[i]) bhat <- lsfit(x[rd2 <= val, ], Y[rd2 <= val])$coef ESP <- bhat[1] + x %*% bhat[-1] plot(ESP, Y) points(ESP[rd2 <= val], Y[rd2 <= val], pch = 15, cex = 1.4) abline(0, 1) title(labs[i]) identify(ESP, Y) print(bhat) } } maha<-function(x) {# Generates the classical mahalanobis distances. Need p > 1 or x a matrix. center <- apply(x, 2, mean) cov <- var(x) return(sqrt(mahalanobis(x, center, cov))) } manova1w<-function(y,p,group){ # Does one way MANOVA model. # Advance the plot by highlighting Stop with the right mouse button. # p is the number of groups group <- as.factor(group) y <- as.matrix(y) n <- dim(y)[1] m <- dim(y)[2] res <- matrix(nrow = n, ncol = m, 0) fit <- res Bhat <- matrix(nrow = p, ncol = m, 0) cmar <- par("mar") par(mfrow = c(2, 1)) par(mar=c(4.0,4.0,2.0,0.5)) for(i in 1:m){ out <- aov(y[,i]~group) res[,i] <- out$res fit[,i] <- out$fit Bhat[,i] <- out$coef } for(i in 1:m){ plot(fit[,i],y[,i]) abline(0, 1) title("Response Plot") identify(fit[,i],y[,i]) plot(fit[,i], res[,i]) title("Residual Plot") identify(fit[,i], res[,i]) } par(mfrow = c(1, 1)) par(mar=cmar) Covhat <- (n-1)/(n-p) * var(res) out <- manova(y~group) list(Bhat = Bhat, fit = fit, res = res, Covhat = Covhat, out=out) } manovabt<-function(y, group, g=2, B = 50, type = 1){ # Bootstraps a one way Manova type test for Ho: mu_1 = ... = mu_g. # There are g = p groups. # Here y is an n by m data matrix, group is an n by 1 vector with entries # 1 to g indicating the group of the ith case. So g>1 is the number of groups. # Type = 1 for sample mean, 2 for sample median, 3 for 25% trimmed mean # Using replace = T results in an error. y <- as.matrix(y) ny <- dim(y)[1] m <- dim(y)[2] indx <- 1:g #get n1, ..., ng for(i in 1:g) indx[i]<-sum(group==i) mus <- matrix(0,nrow=B,ncol=m*(g-1)) zg<-y[group==g,] #get bootstrap sample of the robust statistic T1 - Tg,...,T(g-1)-Tg # take a sample of size ni with replacement from each group if(type == 1){ for(j in 1:B){ temg <- sample(1:indx[g],indx[g],replace=TRUE) Tg <- apply(zg[temg,],2,mean) for(i in 1:(g-1)){ tem <- sample(1:indx[i],indx[i],replace=TRUE) z <- y[group==i,] Ti <- apply(z[tem,],2,mean) mus[j,(m*(i-1)+1):(i*m)] <- Ti-Tg } } } if(type == 2){ for(j in 1:B){ temg <- sample(1:indx[g],indx[g],replace=TRUE) Tg <- apply(zg[temg,],2,median) for(i in 1:(g-1)){ tem <- sample(1:indx[i],indx[i],replace=TRUE) z <- y[group==i,] Ti <- apply(z[tem,],2,median) mus[j,(m*(i-1)+1):(i*m)] <- Ti-Tg } } } if(type == 3){ for(j in 1:B){ temg <- sample(1:indx[g],indx[g],replace=TRUE) Tg <- apply(zg[temg,],2,tmn) for(i in 1:(g-1)){ tem <- sample(1:indx[i],indx[i],replace=TRUE) z <- y[group==i,] Ti <- apply(z[tem,],2,tmn) mus[j,(m*(i-1)+1):(i*m)] <- Ti-Tg } } } list(mus=mus) } manbtsim<-function(n = 100, n2 = 100, n3 = 100, p = 2, csteps = 5, B= 100, gam = 0.4, nruns = 100, xtype = 1, outliers = 0, pm = 10, sig2 = 1, sig3 = 1, eps = 0.4, dd=4, delta = 0){ # This R function simulates one way Manova type bootstrap test for # Ho: mu_1 = ... = mu_g where g = 3. # Here x2 is clean, x = x1 may have outliers. 
# Uses the coordinatewise mean, median, 25% trimmed mean,
# and RMVN location estimators. SO VERY SLOW.
# Need p > 1. Want n > 20p, n2 > 20p, n3 > 20p, B > 20p.
# Multiply x by A where xtype = 1 for MVN Np(0,I),
# 2 (with delta = eps) for (1 - delta) Np(0,I) + delta Np(0, 25 I)
# 3 for multivariate t_d with d = dd
# 4 for lognormal.
# outliers = 0 for no outliers and X~N(0,diag(1,...,p)),
# 1 for outliers a tight cluster at major axis (0,...,0,pm)'
# 2 for outliers a tight cluster at minor axis (pm,0, ...,0)'
# 3 for outliers X~N((pm,...,pm)',diag(1,...,p))
# 4 for outliers X[i,p] = pm
# 5 for outliers X[i,1] = pm
# For the point mass outlier types 4 and 5, need gam much smaller than 0.4.
# Power can be estimated by increasing delta so mu = delta(1,...,1)
# and mu2 = (0, ..., 0).
# Cov(x) = diag(1,2,...,p), Cov(x2) = sig2^2 Cov(x), and
# Cov(x3) = sig3^2 Cov(x) for clean data.
# For outliers=0, want hquant and rquant approx 1.
A <- sqrt(diag(1:p))
munot <- 0 * (1:p)
mu <- delta * (1 + munot)
val <- floor(gam * n)
indx <- 1:n; indx2 <- 1:n2; indx3 <- 1:n3
medmus <- matrix(0,nrow=B,ncol=2*p)
mnmus <- medmus; tmnmus <- medmus; rmvnmus <- medmus
medcv <- 0; mncv <- 0; tmncv <- 0; rmvncv <- 0
chisqcut <- qchisq(p=0.95,df=2*p)
cutoffs <- matrix(0,nrow=nruns,ncol=4)
for(i in 1:nruns) {
#make data
x <- matrix(rnorm(n * p), ncol = p, nrow = n)
x2 <- matrix(rnorm(n2 * p), ncol = p, nrow = n2)
x3 <- matrix(rnorm(n3 * p), ncol = p, nrow = n3)
if(xtype == 2) {
zu <- runif(n)
x[zu < eps, ] <- x[zu < eps, ] * 5
zu <- runif(n2)
x2[zu < eps, ] <- x2[zu < eps, ] * 5
zu <- runif(n3)
x3[zu < eps, ] <- x3[zu < eps, ] * 5
}
if(xtype == 3) {
zu <- sqrt(rchisq(n, dd)/dd)
x <- x/zu
zu <- sqrt(rchisq(n2, dd)/dd)
x2 <- x2/zu
zu <- sqrt(rchisq(n3, dd)/dd)
x3 <- x3/zu
}
if(xtype == 4){ #Want pop coord med(x) = 0.
x <- exp(x)
x <- x - 1
x2 <- exp(x2)
x2 <- x2 - 1
x3 <- exp(x3)
x3 <- x3 - 1
}
x <- x %*% A
x2 <- x2 %*% A
x2 <- sig2 * x2
x3 <- x3 %*% A
x3 <- sig3 * x3
if(outliers == 1) {
x[1:val, ] <- matrix(rnorm(val * p, sd = 0.01), ncol = p, nrow = val)
x[1:val, p] <- x[1:val, p] + pm
}
if(outliers == 2) {
x[1:val, ] <- matrix(rnorm(val * p, sd = 0.01), ncol = p, nrow = val)
x[1:val, 1] <- x[1:val, 1] + pm
}
if(outliers == 3) {
tem <- pm + 0 * 1:p
x[1:val, ] <- x[1:val, ] + tem
}
if(outliers == 4) {
x[1:val, p] <- pm
}
if(outliers == 5) {
x[1:val, 1] <- pm
}
x <- mu + x
#clean x has mean mu = delta(1,...,1)^T, x2 and x3 have mean (0,...,0)^T
#get bootstrapped Tx - Tx3, Tx2 - Tx3 for various statistics T
for(j in 1:B){
tem <- sample(indx,n,replace=T)
medx <- apply(x[tem,],2,median)
mnx <- apply(x[tem,],2,mean)
tmnx <- apply(x[tem,],2,tmn)
rmvnx <- covrmvn(x[tem,])$center
tem <- sample(indx2,n2,replace=T)
medx2 <- apply(x2[tem,],2,median)
mnx2 <- apply(x2[tem,],2,mean)
tmnx2 <- apply(x2[tem,],2,tmn)
rmvnx2 <- covrmvn(x2[tem,])$center
tem <- sample(indx3,n3,replace=T)
medx3 <- apply(x3[tem,],2,median)
mnx3 <- apply(x3[tem,],2,mean)
tmnx3 <- apply(x3[tem,],2,tmn)
rmvnx3 <- covrmvn(x3[tem,])$center
medmus[j,] <- c(medx-medx3,medx2-medx3)
mnmus[j,] <- c(mnx-mnx3,mnx2-mnx3)
tmnmus[j,] <- c(tmnx-tmnx3,tmnx2-tmnx3)
rmvnmus[j,] <- c(rmvnx-rmvnx3,rmvnx2-rmvnx3)
}
outmed <- predreg(medmus)
medcv <- medcv + outmed$inr
outmn <- predreg(mnmus)
mncv <- mncv + outmn$inr
outtmn <- predreg(tmnmus)
tmncv <- tmncv + outtmn$inr
outrmvn <- predreg(rmvnmus)
rmvncv <- rmvncv + outrmvn$inr
cutoffs[i,] <- c(outmed$cuplim,outmn$cuplim,outtmn$cuplim,outrmvn$cuplim)^2
}
medcv <- 1 - medcv/nruns #prop of times Ho is rejected
mncv <- 1 - mncv/nruns
tmncv <- 1 - tmncv/nruns
rmvncv <- 1 - rmvncv/nruns
mncut <- apply(cutoffs,2,mean)
list(chisqcut = chisqcut, mncut=mncut, medcv = medcv, mncv = mncv,
tmncv=tmncv, rmvncv=rmvncv)
}

manbtsim2<-function(n = 100, n2 = 100, n3 = 100, m = 2, csteps = 5, B = 100,
gam = 0.4, nruns = 100, ytype = 1, outliers = 0, pm = 10, sig2 = 1,
sig3 = 1, eps = 0.4, dd=4, delta = 0, delta3 = 0, cov3I = F){
#needs library(MASS)
# This R function simulates a one way Manova type bootstrap test for
# Ho: mu_1 = mu_2 = mu_3 where p = g = 3 = number of groups.
#Here y2 and y3 are clean, y = y1 may have outliers, and y is m by 1.
# Uses the coordinatewise mean, median, and 25% trimmed mean.
#Also does the classical Hotelling Lawley one way MANOVA test.
#Partially written by Hasthika S. Rupasinghe Arachchige Don.
## Does not use the slow RMVN location estimators.
# Need m > 1. Want n > 20m, n2 > 20m, n3 > 20m, B > 20m.
# Multiply y by A where ytype = 1 for MVN Nm(0,I),
# 2 for (1 - eps) Nm(0,I) + eps Nm(0, 25 I),
# 3 for multivariate t_d with d = dd,
# 4 for lognormal.
# outliers = 0 for no outliers and Y~N(0,diag(1,...,m)),
# 1 for outliers a tight cluster at major axis (0,...,0,pm)'
# 2 for outliers a tight cluster at minor axis (pm,0, ...,0)'
# 3 for outliers Y~N((pm,...,pm)',diag(1,...,m))
# 4 for outliers Y[i,m] = pm
# 5 for outliers Y[i,1] = pm
# For the point mass outlier types 4 and 5, need gam much smaller than 0.4.
# Power can be estimated by increasing delta so mu1 = delta(1,...,1)
# and mu2 = (0, ..., 0), mu3 = delta3(1,...,1).
# Cov(y) = diag(1,2,...,m), Cov(y2) = sig2^2 Cov(y) for clean data.
# Cov(y3) = sig3^2 Cov(y) for clean data if cov3I = F,
# or Cov(y3) = c I_m if cov3I = T.
# For outliers=0, want hquant and rquant approx 1.
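## Illustrative call, a sketch (assumes the functions tmn and predreg from
## this file have been sourced and that library(MASS) is attached):
# out <- manbtsim2(n=100, n2=100, n3=100, m=2, B=200, nruns=100, delta=0)
# out$mncv #proportion of runs where Ho is rejected using the coordinatewise mean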
A <- sqrt(diag(1:m)) munot <- 0 * (1:m) mu <- delta * (1 + munot) mu3 <- delta3 * (1 + munot) val <- floor(gam * n) indx <- 1:n indx2 <- 1:n2 indx3 <- 1:n3 medmus <- matrix(0,nrow=B,ncol=2*m) mnmus <- medmus tmnmus <- medmus medcv <- 0 mncv <- 0 tmncv <- 0 crej <- 0 gp <- 0 grp <- 0 pval <- 0 out <- 0 chisqcut <- qchisq(p=0.95,df=2*m) cutoffs <- matrix(0,nrow=nruns,ncol=3) for(i in 1:nruns) { #make data y <- matrix(rnorm(n * m), ncol = m, nrow = n) y2 <- matrix(rnorm(n2 * m), ncol = m, nrow = n2) y3 <- matrix(rnorm(n3 * m), ncol = m, nrow =n3) if(ytype == 2) { zu <- runif(n) y[zu < eps, ] <- y[zu < eps, ] * 5 zu <- runif(n2) y2[zu < eps, ] <- y2[zu < eps, ] * 5 zu <- runif(n3) y3[zu < eps, ] <- y3[zu < eps, ] * 5 } if(ytype == 3) { zu <- sqrt(rchisq(n, dd)/dd) y <- y/zu zu <- sqrt(rchisq(n2, dd)/dd) y2 <- y2/zu zu <- sqrt(rchisq(n3, dd)/dd) y3 <- y3/zu } if(ytype == 4){ #Want pop coord med(y) = 0. y <- exp(y) y <- y - 1 y2 <- exp(y2) y2 <- y2 - 1 y3 <- exp(y3) y3 <- y3 - 1 } y <- y %*% A y2 <- y2 %*% A y2 <- sig2 * y2 if( cov3I != T){ y3 <- y3 %*% A y3 <- sig3 * y3} if(outliers == 1) { y[1:val, ] <- matrix(rnorm(val * m, sd = 0.01), ncol = m, nrow = val) y[1:val, m] <- y[1:val, m] + pm } if(outliers == 2) { y[1:val, ] <- matrix(rnorm(val * m, sd = 0.01), ncol = m, nrow = val) y[1:val, 1] <- y[1:val, 1] + pm } if(outliers == 3) { tem <- pm + 0 * 1:m y[1:val, ] <- y[1:val, ] + tem } if(outliers == 4) { y[1:val, m] <- pm } if(outliers == 5) { y[1:val, 1] <- pm } y <- mu + y y3 <- mu3 + y3 #clean y has mean mu =delta(1,...,1)^T, #y2 has mean (0,..,0)^T and y3 has mean delta3 (1,...,1)^T #get bootstrapped Ty - Ty3, Ty2 - Ty3 for various statistics T for(j in 1:B){ tem <- sample(indx,n,replace=T) medy <- apply(y[tem,],2,median) mny <- apply(y[tem,],2,mean) tmny <- apply(y[tem,],2,tmn) tem <- sample(indx2,n2,replace=T) medy2 <- apply(y2[tem,],2,median) mny2 <- apply(y2[tem,],2,mean) tmny2 <- apply(y2[tem,],2,tmn) tem <- sample(indx3,n3,replace=T) medy3 <- apply(y3[tem,],2,median) mny3 <- apply(y3[tem,],2,mean) tmny3 <- apply(y3[tem,],2,tmn) medmus[j,] <- c(medy-medy3,medy2-medy3) mnmus[j,] <- c(mny-mny3,mny2-mny3) tmnmus[j,] <- c(tmny-tmny3,tmny2-tmny3) } outmed<-predreg(medmus) medcv <- medcv + outmed$inr outmn <- predreg(mnmus) mncv <- mncv + outmn$inr outtmn <- predreg(tmnmus) tmncv <- tmncv + outtmn$inr cutoffs[i,]<-c(outmed$cuplim,outmn$cuplim,outtmn$cuplim)^2 #Get the classical test coverage yall<-rbind(y,y2,y3) yall<-as.matrix(yall) gp <- c(rep(1, n),rep(2, n2),rep(3, n3)) grp<-factor(gp) out<-manova(yall~grp) pval <- summary(out,test="Hotelling-Lawley")$stats[1,6] #pvalue for Hotelling-Lawley's test if(pval < 0.05){ crej <- crej +1 } } medcv <- 1 - medcv/nruns #prop of times Ho is rejected mncv <- 1 - mncv/nruns tmncv <- 1 - tmncv/nruns mncut <- apply(cutoffs,2,mean) ccv <- crej/nruns list(chisqcut = chisqcut, mncut=mncut, medcv = medcv, mncv = mncv,tmncv=tmncv,ccv=ccv) } manbtsim3<-function(n1 = 100, n2 = 100, n3 = 100, m = 2, csteps = 5, B= 100, gam = 0.4, nruns = 100, ytype = 1, outliers = 0, pm = 10, sig2 = 1, sig3 = 1, eps = 0.4, dd=4, delta1 = 0, delta2 = 0, delta3 = 0, cov3I = F){ #needs library(MASS) # This R function simulates a one way Manova type bootstrap test for # Ho: mu_1 = mu_2 = mu_3 where p = g = 3 = number of groups. #Can vary the mean vectors mui better than in manbtsim2. #Here y2 and y3 are clean, y1 may have outliers, and y1 is m by 1. # Uses the coordinatewise mean, median, and 25% trimmed mean. #Also does the classical Hotelling Lawley one way MANOVA test. 
#Partially written by Hasthika S. Rupasinghe Arachchige Don.
## Does not use the slow RMVN location estimators.
# Need m > 1. Want n1 > 20m, n2 > 20m, n3 > 20m, B > 20m.
# Multiply y by A where ytype = 1 for MVN Nm(0,I),
# 2 for (1 - eps) Nm(0,I) + eps Nm(0, 25 I),
# 3 for multivariate t_d with d = dd,
# 4 for lognormal.
# outliers = 0 for no outliers and Y~N(0,diag(1,...,m)),
# 1 for outliers a tight cluster at major axis (0,...,0,pm)'
# 2 for outliers a tight cluster at minor axis (pm,0, ...,0)'
# 3 for outliers Y~N((pm,...,pm)',diag(1,...,m))
# 4 for outliers Y[i,m] = pm
# 5 for outliers Y[i,1] = pm
# For the point mass outlier types 4 and 5, need gam much smaller than 0.4.
# Power can be estimated by using unequal deltai so mu1 = delta1(1,...,1)
# and mu2 = delta2(1, ..., 1), mu3 = delta3(1,...,1).
# Cov(y1) = diag(1,2,...,m), Cov(y2) = sig2^2 Cov(y1) for clean data.
# Cov(y3) = sig3^2 Cov(y1) for clean data if cov3I = F,
# or Cov(y3) = c I_m if cov3I = T.
# For outliers=0, want hquant and rquant approx 1.
A <- sqrt(diag(1:m))
munot <- 0 * (1:m)
mu1 <- delta1 * (1 + munot)
mu2 <- delta2 * (1 + munot)
mu3 <- delta3 * (1 + munot)
val <- floor(gam * n1)
indx1 <- 1:n1; indx2 <- 1:n2; indx3 <- 1:n3
medmus <- matrix(0,nrow=B,ncol=2*m)
mnmus <- medmus; tmnmus <- medmus
medcv <- 0; mncv <- 0; tmncv <- 0; crej <- 0
gp <- 0; grp <- 0; pval <- 0; out <- 0
chisqcut <- qchisq(p=0.95,df=2*m)
cutoffs <- matrix(0,nrow=nruns,ncol=3)
for(i in 1:nruns) {
#make data
y1 <- matrix(rnorm(n1 * m), ncol = m, nrow = n1)
y2 <- matrix(rnorm(n2 * m), ncol = m, nrow = n2)
y3 <- matrix(rnorm(n3 * m), ncol = m, nrow = n3)
if(ytype == 2) {
zu <- runif(n1)
y1[zu < eps, ] <- y1[zu < eps, ] * 5
zu <- runif(n2)
y2[zu < eps, ] <- y2[zu < eps, ] * 5
zu <- runif(n3)
y3[zu < eps, ] <- y3[zu < eps, ] * 5
}
if(ytype == 3) {
zu <- sqrt(rchisq(n1, dd)/dd)
y1 <- y1/zu
zu <- sqrt(rchisq(n2, dd)/dd)
y2 <- y2/zu
zu <- sqrt(rchisq(n3, dd)/dd)
y3 <- y3/zu
}
if(ytype == 4){ #Want pop coord med(y) = 0.
y1 <- exp(y1) y1 <- y1 - 1 y2 <- exp(y2) y2 <- y2 - 1 y3 <- exp(y3) y3 <- y3 - 1 } y1 <- y1 %*% A y2 <- y2 %*% A y2 <- sig2 * y2 if( cov3I != T){ y3 <- y3 %*% A y3 <- sig3 * y3} if(outliers == 1) { y1[1:val, ] <- matrix(rnorm(val * m, sd = 0.01), ncol = m, nrow = val) y1[1:val, m] <- y1[1:val, m] + pm } if(outliers == 2) { y1[1:val, ] <- matrix(rnorm(val * m, sd = 0.01), ncol = m, nrow = val) y1[1:val, 1] <- y1[1:val, 1] + pm } if(outliers == 3) { tem <- pm + 0 * 1:m y1[1:val, ] <- y1[1:val, ] + tem } if(outliers == 4) { y1[1:val, m] <- pm } if(outliers == 5) { y1[1:val, 1] <- pm } y1 <- mu1 + y1 y2 <- mu2 + y2 y3 <- mu3 + y3 #clean y1 has mean mu1 =delta1(1,...,1)^T, #y2 has mean delta2(1,..,1)^T and y3 has mean delta3 (1,...,1)^T #get bootstrapped Ty - Ty3, Ty2 - Ty3 for various statistics T for(j in 1:B){ tem <- sample(indx1,n1,replace=T) medy <- apply(y1[tem,],2,median) mny <- apply(y1[tem,],2,mean) tmny <- apply(y1[tem,],2,tmn) tem <- sample(indx2,n2,replace=T) medy2 <- apply(y2[tem,],2,median) mny2 <- apply(y2[tem,],2,mean) tmny2 <- apply(y2[tem,],2,tmn) tem <- sample(indx3,n3,replace=T) medy3 <- apply(y3[tem,],2,median) mny3 <- apply(y3[tem,],2,mean) tmny3 <- apply(y3[tem,],2,tmn) medmus[j,] <- c(medy-medy3,medy2-medy3) mnmus[j,] <- c(mny-mny3,mny2-mny3) tmnmus[j,] <- c(tmny-tmny3,tmny2-tmny3) } outmed<-predreg(medmus) medcv <- medcv + outmed$inr outmn <- predreg(mnmus) mncv <- mncv + outmn$inr outtmn <- predreg(tmnmus) tmncv <- tmncv + outtmn$inr cutoffs[i,]<-c(outmed$cuplim,outmn$cuplim,outtmn$cuplim)^2 #Get the classical test coverage yall<-rbind(y1,y2,y3) yall<-as.matrix(yall) gp <- c(rep(1, n1),rep(2, n2),rep(3, n3)) grp<-factor(gp) out<-manova(yall~grp) pval <- summary(out,test="Hotelling-Lawley")$stats[1,6] #pvalue for Hotelling-Lawley's test if(pval < 0.05){ crej <- crej +1 } } medcv <- 1 - medcv/nruns #prop of times Ho is rejected mncv <- 1 - mncv/nruns tmncv <- 1 - tmncv/nruns mncut <- apply(cutoffs,2,mean) ccv <- crej/nruns list(chisqcut = chisqcut, mncut=mncut, medcv = medcv, mncv = mncv,tmncv=tmncv,ccv=ccv) } manbtsim4<-function(n1 = 100, n2 = 100, n3 = 100, m = 2, csteps = 5, B= 100, gam = 0.4, nruns = 100, ytype = 1, outliers = 0, pm = 10, sig2 = 1, sig3 = 1, eps = 0.4, dd=4, delta1 = 0, delta2 = 0, delta3 = 0, cov3I = F){ #needs library(MASS) and library(Matrix) # This R function simulates one way Manova type tests for # Ho: mu_1 = mu_2 = mu_3 where p = g = 3 = number of groups. #Can vary the mean vectors mui better than in manbtsim2. #Has one more test than manbtsim3: #the large sample Manova type test based on the sample mean. #Here y2 and y3 are clean, y1 may have outliers, and y1 is m by 1. # Uses the coordinatewise mean, median, and 25% trimmed mean. #Also does the classical Hotelling Lawley one way MANOVA test. #Partially written by Hasthika S. Rupasinghe Arachchige Don. ## Does not use the slow RMVN location estimators. # Need m > 1. Want n > 20m, n2 > 20m, n3 > 20m, B > 20m. # Multiply y by A where ytype = 1 for MVN Nm(0,I), # 2 for (1 - eps) Nm(0,I) + eps Nm(0, 25 I), # 3 for multivariate t_d with d = dd, # 4 for lognormal. # outliers = 0 for no outliers and Y~N(0,diag(1,...,m)), # 1 for outliers a tight cluster at major axis (0,...,0,pm)' # 2 for outliers a tight cluster at minor axis (pm,0, ...,0)' # 3 for outliers Y~N((pm,...,pm)',diag(1,...,m)) # 4 for outliers Y[i,m] = pm # 5 for outliers Y[i,1] = pm # For the point mass outlier types 4 and 5, need gam much smaller than 0.4. 
# Power can be estimated by using unequal deltai so mu1 = delta1(1,...,1)
# and mu2 = delta2(1, ..., 1), mu3 = delta3(1,...,1).
# Cov(y1) = diag(1,2,...,m), Cov(y2) = sig2^2 Cov(y1) for clean data.
# Cov(y3) = sig3^2 Cov(y1) for clean data if cov3I = F,
# or Cov(y3) = c I_m if cov3I = T.
# For outliers=0, want hquant and rquant approx 1.
A <- sqrt(diag(1:m))
munot <- 0 * (1:m)
mu1 <- delta1 * (1 + munot)
mu2 <- delta2 * (1 + munot)
mu3 <- delta3 * (1 + munot)
val <- floor(gam * n1)
indx1 <- 1:n1; indx2 <- 1:n2; indx3 <- 1:n3
medmus <- matrix(0,nrow=B,ncol=2*m)
mnmus <- medmus; tmnmus <- medmus
medcv <- 0; mncv <- 0; tmncv <- 0; mantcov <- 0; crej <- 0
gp <- 0; grp <- 0; pval <- 0; out <- 0
chisqcut <- qchisq(p=0.95,df=2*m)
cutoffs <- matrix(0,nrow=nruns,ncol=3)
for(i in 1:nruns) {
#make data
y1 <- matrix(rnorm(n1 * m), ncol = m, nrow = n1)
y2 <- matrix(rnorm(n2 * m), ncol = m, nrow = n2)
y3 <- matrix(rnorm(n3 * m), ncol = m, nrow = n3)
if(ytype == 2) {
zu <- runif(n1)
y1[zu < eps, ] <- y1[zu < eps, ] * 5
zu <- runif(n2)
y2[zu < eps, ] <- y2[zu < eps, ] * 5
zu <- runif(n3)
y3[zu < eps, ] <- y3[zu < eps, ] * 5
}
if(ytype == 3) {
zu <- sqrt(rchisq(n1, dd)/dd)
y1 <- y1/zu
zu <- sqrt(rchisq(n2, dd)/dd)
y2 <- y2/zu
zu <- sqrt(rchisq(n3, dd)/dd)
y3 <- y3/zu
}
if(ytype == 4){ #Want pop coord med(y) = 0.
y1 <- exp(y1)
y1 <- y1 - 1
y2 <- exp(y2)
y2 <- y2 - 1
y3 <- exp(y3)
y3 <- y3 - 1
}
y1 <- y1 %*% A
y2 <- y2 %*% A
y2 <- sig2 * y2
if( cov3I != T){
y3 <- y3 %*% A
y3 <- sig3 * y3}
if(outliers == 1) {
y1[1:val, ] <- matrix(rnorm(val * m, sd = 0.01), ncol = m, nrow = val)
y1[1:val, m] <- y1[1:val, m] + pm
}
if(outliers == 2) {
y1[1:val, ] <- matrix(rnorm(val * m, sd = 0.01), ncol = m, nrow = val)
y1[1:val, 1] <- y1[1:val, 1] + pm
}
if(outliers == 3) {
tem <- pm + 0 * 1:m
y1[1:val, ] <- y1[1:val, ] + tem
}
if(outliers == 4) {
y1[1:val, m] <- pm
}
if(outliers == 5) {
y1[1:val, 1] <- pm
}
y1 <- mu1 + y1
y2 <- mu2 + y2
y3 <- mu3 + y3
#clean y1 has mean mu1 = delta1(1,...,1)^T,
#y2 has mean delta2(1,...,1)^T and y3 has mean delta3(1,...,1)^T
#get bootstrapped Ty - Ty3, Ty2 - Ty3 for various statistics T
for(j in 1:B){
tem <- sample(indx1,n1,replace=T)
medy <- apply(y1[tem,],2,median)
mny <- apply(y1[tem,],2,mean)
tmny <- apply(y1[tem,],2,tmn)
tem <- sample(indx2,n2,replace=T)
medy2 <- apply(y2[tem,],2,median)
mny2 <- apply(y2[tem,],2,mean)
tmny2 <- apply(y2[tem,],2,tmn)
tem <- sample(indx3,n3,replace=T)
medy3 <- apply(y3[tem,],2,median)
mny3 <- apply(y3[tem,],2,mean)
tmny3 <- apply(y3[tem,],2,tmn)
medmus[j,] <- c(medy-medy3,medy2-medy3)
mnmus[j,] <- c(mny-mny3,mny2-mny3)
tmnmus[j,] <- c(tmny-tmny3,tmny2-tmny3)
}
outmed <- predreg(medmus)
medcv <- medcv + outmed$inr
outmn <- predreg(mnmus)
mncv <- mncv + outmn$inr
outtmn <- predreg(tmnmus)
tmncv <- tmncv + outtmn$inr
cutoffs[i,] <- c(outmed$cuplim,outmn$cuplim,outtmn$cuplim)^2
#Get the classical test coverage
yall <- rbind(y1,y2,y3)
yall <- as.matrix(yall)
gp <- c(rep(1, n1),rep(2, n2),rep(3, n3))
grp <- factor(gp)
out <- manova(yall~grp)
pval <- summary(out,test="Hotelling-Lawley")$stats[1,6]
#pvalue for Hotelling-Lawley's test
if(pval < 0.05){ crej <- crej + 1 }
#large sample Manova type test based on the sample mean
pval <- mantyp(y=yall,p=3,group=grp)$pval
if(pval < 0.05) mantcov <- mantcov + 1
}
medcv <- 1 - medcv/nruns #prop of times Ho is rejected
mncv <- 1 - mncv/nruns
tmncv <- 1 - tmncv/nruns
mncut <- apply(cutoffs,2,mean)
ccv <- crej/nruns
mantcov <- mantcov/nruns
list(chisqcut = chisqcut, mncut=mncut, medcv = medcv, mncv = mncv,
tmncv=tmncv, ccv=ccv, mantcov=mantcov)
}
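## Illustrative call for manbtsim4, a sketch (assumes tmn, predreg, and
## mantyp from this file are sourced, with library(MASS) and
## library(Matrix) attached); with delta1 > 0 the rejection proportions
## estimate power:
# out <- manbtsim4(n1=100, n2=100, n3=100, m=2, B=200, nruns=100, delta1=0.5)
# out$mncv; out$ccv; out$mantcov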
manovasim<-function(n1=100,n2=100,n3=100,m=2,B=100,nruns=100,ytype=1, sig2=1,sig3=1,eps=0.4,dd=4,delta1=0,delta2=0,delta3=0,cov3I=F,alpha=0.05){ #needs library(Matrix) # This function simulates one way Manova type tests for # Ho: mu_1 = mu_2 = mu_3 where p = g = 3 = number of groups. # Has the classical test, bootstrap cutoff test, # large sample test, and the test using C_n = I. # yi is m by 1. # Need m > 1. Want n1 > 20m, n2 > 20m, n3 > 20m, B > 20m. # Multiply y by A where ytype = 1 for MVN Nm(0,I), # 2 for (1 - eps) Nm(0,I) + eps Nm(0, 25 I), # 3 for multivariate t_d with d = dd, # 4 for lognormal. # Power can be estimated by using unequal deltai so mu1 = delta1(1,...,1) # and mu2 = delta2(1, ..., 1), mu3 = delta3(1,...,1). # Cov(y1) = diag(1,2,...,m), Cov(y2) = sig^2 Cov(y1) for clean data. # Cov(y3) = sig^3 Cov(y1) for clean data if cov3I = F, # or Cov(y3) = cI_3 if cov3I = T. A <- sqrt(diag(1:m)) munot <- 0 * (1:m) mu1 <- delta1 * (1 + munot) mu2 <- delta2 * (1 + munot) mu3 <- delta3 * (1 + munot) indx1 <- 1:n1 indx2 <- 1:n2 indx3 <- 1:n3 mnmus <- matrix(0,nrow=B,ncol=2*m) mancov <- 0 mantcov <- 0 prcv <- 0 brcv <- 0 p <- 3 # number of groups q <- p-1 gp <- c(rep(1, n1),rep(2, n2),rep(3, n3)) grp<-factor(gp) num <- m*q n <- n1+n2+n3 dn <- min(n1,n2,n3) C <-matrix(1,nrow=q,ncol=q) cut <- num*qf(1-alpha,df1=num,df2=dn) bstat <- 1:B cstat <- 0 * 1:nruns bcut <- 1:nruns up <- min((1 - alpha/2), (1 - alpha + 20*alpha*m/B)) if(alpha > 0.1) up <- min((1 - alpha + 0.05), (1 - alpha + 2*m/B)) qn <- up if(qn < 1 - alpha + 0.001) up <- 1 - alpha for(i in 1:nruns) { #make data y1 <- matrix(rnorm(n1 * m), ncol = m, nrow = n1) y2 <- matrix(rnorm(n2 * m), ncol = m, nrow = n2) y3 <- matrix(rnorm(n3 * m), ncol = m, nrow = n3) if(ytype == 2) { zu <- runif(n1) y1[zu < eps, ] <- y1[zu < eps, ] * 5 zu <- runif(n2) y2[zu < eps, ] <- y2[zu < eps, ] * 5 zu <- runif(n3) y3[zu < eps, ] <- y3[zu < eps, ] * 5 } if(ytype == 3) { zu <- sqrt(rchisq(n1, dd)/dd) y1 <- y1/zu zu <- sqrt(rchisq(n2, dd)/dd) y2 <- y2/zu zu <- sqrt(rchisq(n3, dd)/dd) y3 <- y3/zu } if(ytype == 4){ y1 <- exp(y1) y1 <- y1 - exp(0.5) y2 <- exp(y2) y2 <- y2 - exp(0.5) y3 <- exp(y3) y3 <- y3 - exp(0.5) } y1 <- y1 %*% A y2 <- y2 %*% A y2 <- sig2 * y2 if( cov3I != T){ y3 <- y3 %*% A y3 <- sig3 * y3} y1 <- mu1 + y1 y2 <- mu2 + y2 y3 <- mu3 + y3 #yi has mean mui = deltai (1,...,1)^T #Get the classical test coverage Sp <- var(y3) Tp <- apply(y3,2,mean) S1 = var(y1) T1 <- apply(y1,2,mean) S2 = var(y2) T2 <- apply(y2,2,mean) w <- c(T1-Tp,T2-Tp) Spool <- ((n1-1)*S1 + (n2-1)*S2 + (n3-1)*Sp)/(n-p) D <- Spool/n3 ssigw <- kronecker(C,D) #still need to get the block diagonal of Sigw/n BD <- bdiag(Spool/n1,Spool/n2) ssigw <- ssigw + BD #ssigw =sigw/n, n = sum(ni) ssigw <- as.matrix(ssigw) sigw <- n*ssigw sigwinv <- solve(sigw) t0 <- n* t(w) %*% sigwinv %*% w if(t0 < cut) mancov <- mancov + 1 cstat[i] <- t0 #get bootstrapped Ty - Ty3, Ty2 - Ty3 for various statistics T for(j in 1:B){ tem <- sample(indx1,n1,replace=T) mny1 <- apply(y1[tem,],2,mean) tem <- sample(indx2,n2,replace=T) mny2 <- apply(y2[tem,],2,mean) tem <- sample(indx3,n3,replace=T) mny3 <- apply(y3[tem,],2,mean) mnmus[j,] <- c(mny1-mny3,mny2-mny3) bw <- mnmus[j,] - w bstat[j] <- n*t(bw)%*%sigwinv%*%bw } bcut[i] <- quantile(bstat,up) outmn <- identrgn(mnmus,stat=w,alpha=alpha) prcv <- prcv + outmn$inr brcv <- brcv + outmn$inr2 #large sample Manova type test based on the sample mean D <- Sp/n3 ssigw <- kronecker(C,D) #still need to get the block diagonal of Sigw/n BD <- bdiag(S1/n1,S2/n2) ssigw <- 
ssigw + BD #ssigw = sigw/n, n = sum(ni)
ssigw <- as.matrix(ssigw)
sigw <- n*ssigw
t0 <- n* t(w) %*% solve(sigw) %*% w
if(t0 < cut) mantcov <- mantcov + 1
}
mancov <- mancov/nruns
mantcov <- mantcov/nruns
bootcov <- sum(cstat < bcut)/nruns
prcv <- prcv/nruns
brcv <- brcv/nruns
list(cut=cut, mancov=mancov, bootcov=bootcov, manLScov=mantcov, prcv=prcv,
brcv=brcv)
}

mantyp<-function(y,p,group,alph=0.05){
#Does a one way MANOVA type test based on large sample theory.
#needs library(Matrix)
#p is the number of groups and 1 < p < 10
#group = i if y is from the ith group
#n = sum(ni)
group <- as.factor(group)
y <- as.matrix(y)
n <- dim(y)[1]
m <- dim(y)[2]
Sp <- var(y[group==p,])
Tp <- apply(y[group==p,],2,mean)
ni <- 1:p
q <- p-1
num <- m*q
ni[p] <- sum(group==p)
for(i in 1:q){
ni[i] <- sum(group==i)
if(i==1) {S1 = var(y[group==1,])
T1 <- apply(y[group==1,],2,mean)}
if(i==2) {S2 = var(y[group==2,])
T2 <- apply(y[group==2,],2,mean)}
if(i==3) {S3 = var(y[group==3,])
T3 <- apply(y[group==3,],2,mean)}
if(i==4) {S4 = var(y[group==4,])
T4 <- apply(y[group==4,],2,mean)}
if(i==5) {S5 = var(y[group==5,])
T5 <- apply(y[group==5,],2,mean)}
if(i==6) {S6 = var(y[group==6,])
T6 <- apply(y[group==6,],2,mean)}
if(i==7) {S7 = var(y[group==7,])
T7 <- apply(y[group==7,],2,mean)}
if(i==8) {S8 = var(y[group==8,])
T8 <- apply(y[group==8,],2,mean)}
}
dn <- min(ni)
A <- matrix(1,nrow=q,ncol=q)
B <- Sp/ni[p]
ssigw <- kronecker(A,B)
#still need to get the block diagonal of Sigw/n
if(p==2) BD <- S1/ni[1]
if(p==3) BD <- bdiag(S1/ni[1],S2/ni[2])
if(p==4) BD <- bdiag(S1/ni[1],S2/ni[2],S3/ni[3])
if(p==5) BD <- bdiag(S1/ni[1],S2/ni[2],S3/ni[3],S4/ni[4])
if(p==6) BD <- bdiag(S1/ni[1],S2/ni[2],S3/ni[3],S4/ni[4],S5/ni[5])
if(p==7) BD <- bdiag(S1/ni[1],S2/ni[2],S3/ni[3],S4/ni[4],S5/ni[5],S6/ni[6])
if(p==8) BD <- bdiag(S1/ni[1],S2/ni[2],S3/ni[3],S4/ni[4],S5/ni[5],S6/ni[6],
S7/ni[7])
if(p==9) BD <- bdiag(S1/ni[1],S2/ni[2],S3/ni[3],S4/ni[4],S5/ni[5],S6/ni[6],
S7/ni[7],S8/ni[8])
ssigw <- ssigw + BD #ssigw = sigw/n, n = sum(ni)
n <- sum(ni)
ssigw <- as.matrix(ssigw)
sigw <- n*ssigw
if(p==2) w <- T1 - Tp
if(p==3) w <- c(T1-Tp,T2-Tp)
if(p==4) w <- c(T1-Tp,T2-Tp,T3-Tp)
if(p==5) w <- c(T1-Tp,T2-Tp,T3-Tp,T4-Tp)
if(p==6) w <- c(T1-Tp,T2-Tp,T3-Tp,T4-Tp,T5-Tp)
if(p==7) w <- c(T1-Tp,T2-Tp,T3-Tp,T4-Tp,T5-Tp,T6-Tp)
if(p==8) w <- c(T1-Tp,T2-Tp,T3-Tp,T4-Tp,T5-Tp,T6-Tp,T7-Tp)
if(p==9) w <- c(T1-Tp,T2-Tp,T3-Tp,T4-Tp,T5-Tp,T6-Tp,T7-Tp,T8-Tp)
t0 <- n* t(w) %*% solve(sigw) %*% w
pval <- 1 - pf((t0/num),df1=num,df2=dn) #approx 1 - pchisq(t0,num)
cut <- num*qf(1-alph,df1=num,df2=dn) #reject Ho if t0 > cut
list(t0=t0, cut=cut, pval=pval)
}

#used in rmltreg
mbalata<-function(x, y, k=6, nsamp = 7)
{#gets the median ball fit with 7 centers, med resid crit, 7 ball sizes
x <- as.matrix(x)
n <- dim(x)[1]
q <- dim(x)[2]
# q + 1 is number of predictors including intercept
vals <- c(q + 3 + floor(n/100), q + 3 + floor(n/40), q + 3 + floor(n/20),
q + 3 + floor(n/10), q + 3 + floor(n/5), q + 3 + floor(n/3),
q + 3 + floor(n/2))
covv <- diag(q)
centers <- sample(n, nsamp)
temp <- lsfit(x, y)
mbaf <- temp$coef
## get LATA criterion
res <- temp$residuals
crit <- k^2*median(res^2)
cn <- sum(res^2 <= crit)
absres <- sort(abs(res))
critf <- sum(absres[1:cn])
##
for(i in 1:nsamp) {
md2 <- mahalanobis(x, center = x[centers[i], ], covv)
smd2 <- sort(md2)
for(j in 1:7) {
temp <- lsfit(x[md2 <= smd2[vals[j]], ], y[md2 <= smd2[vals[j]]])
#Use OLS on rows with md2 <= cutoff = smd2[vals[j]]
res <- y - temp$coef[1] - x %*% temp$coef[-1]
## get LATA criterion
crit <- k^2*median(res^2)
cn <- sum(res^2 <=
crit) absres <- sort(abs(res)) crit <- sum(absres[1:cn]) ## if(crit < critf) { critf <- crit mbaf <- temp$coef } } } list(coef = mbaf, critf = critf) } mbamv<-function(x, y, nsamp = 7) {# This function is for simple linear regression. The # highlighted boxes get weight 1. Click on right # mouse button to advance plot, and in R, highlight "stop." Only uses 50% trimming. x <- as.matrix(x) n <- dim(x)[1] q <- dim(x)[2] covv <- diag(q) centers <- sample(n, nsamp) for(i in 1:nsamp) { md2 <- mahalanobis(x, center = x[centers[i], ], covv) med <- median(md2) plot(x, y) points(x[md2 < med], y[md2 < med], pch = 15) abline(lsfit(x[md2 < med],y[md2 < med])) identify(x, y) } } mbamv2<-function(x, Y, nsamp = 7) { # This function is for multiple linear regression. The # highlighted boxes get weight 1. Click on right # mouse button to advance plot, and in R, highlight "stop." Only uses 50% trimming. x <- as.matrix(x) n <- dim(x)[1] q <- dim(x)[2] covv <- diag(q) centers <- sample(n, nsamp) par(mfrow=c(2,1)) for(i in 1:nsamp) { md2 <- mahalanobis(x, center = x[centers[i], ], covv) med <- median(md2) if(q ==1){out <- lsfit(x[md2 < med],Y[md2 < med])} else{out <- lsfit(x[md2 < med,],Y[md2 < med])} FIT <- out$coef[1] + x%*%out$coef[-1] RES <- Y - FIT plot(FIT,Y) points(FIT[md2 < med], Y[md2 < med], pch = 15) abline(0,1) identify(FIT, Y) plot(FIT,RES) points(FIT[md2 < med], RES[md2 < med], pch = 15) abline(0,0) identify(FIT, RES) } par(mfrow=c(1,1)) } ##used in rmltreg, rrplot2 mbareg<-function(x, y, nsamp = 7) {#gets the mbareg fit with 7 centers, med resid crit, 7 ball sizes x <- as.matrix(x) n <- dim(x)[1] q <- dim(x)[2] # q + 1 is number of predictors including intercept vals <- c(q + 3 + floor(n/100), q + 3 + floor(n/40), q + 3 + floor(n/20 ), q + 3 + floor(n/10), q + 3 + floor(n/5), q + 3 + floor(n/3), q + 3 + floor(n/2)) covv <- diag(q) centers <- sample(n, nsamp) temp <- lsfit(x, y) mbaf <- temp$coef critf <- median(temp$residuals^2) for(i in 1:nsamp) { md2 <- mahalanobis(x, center = x[centers[i], ], covv) smd2 <- sort(md2) for(j in 1:7) { temp <- lsfit(x[md2 <= smd2[vals[j]], ], y[md2 <= smd2[ vals[j]]]) #Use OLS on rows with md2 <= cutoff = smd2[vals[j]] res <- y - temp$coef[1] - x %*% temp$coef[-1] crit <- median(res^2) if(crit < critf) { critf <- crit mbaf <- temp$coef } } } list(coef = mbaf, critf = critf) } #used in ddplot2 mbsim <-function(n = 100, p = 2, csteps = 5, gam = 0.4, runs = 20, outliers = 0, pm = 10, cscale=F){ # Need p > 1. # This R function simulates the biased MB estimator. 
# outliers = 0 for no outliers and X~N(0,diag(1,...,p)), # 1 for outliers a point mass on major axis (0,...,0,pm)' # 2 for outliers a point mass on minor axis (pm,0, ...,0)' # 3 for outliers X~N((pm,...,pm)',diag(1,...,p)) # 4 for outliers X[i,p] = pm # 5 for outliers X[i,1] = pm # 6 for outliers a tight cluster at major axis (0,...,0,pm)' #set.seed(974) A <- sqrt(diag(1:p)) cloc <- 0 * (1:p) csig <- 0 * A mbloc <- cloc mbsig <- csig cct <- 0 mbct <- 0 val <- floor(gam * n) for(i in 1:runs) { x <- matrix(rnorm(n * p), ncol = p, nrow = n) x <- x %*% A if(outliers == 1) { x[1:val, ] <- 0 x[1:val,p] <- pm } if(outliers == 2) { x[1:val, ] <- 0 x[1:val,1] <- pm } if(outliers == 3) { tem <- pm + 0 * 1:p x[1:val, ] <- x[1:val, ] + tem } if(outliers == 4) { x[1:val, p ] <- pm } if(outliers == 5) { x[1:val, 1 ] <- pm } if(outliers == 6) { x[1:val, ] <- matrix(rnorm(val * p, sd = 0.01), ncol = p, nrow = val) x[1:val, p] <- x[1:val, p] + pm } out <- covmb(x, steps = csteps, scale = cscale) mbloc <- mbloc + out$center mbsig <- mbsig + out$cov rd2 <- mahalanobis(x, out$center, out$cov) if(min(rd2[1:val]) > max(rd2[(val + 1):n])) mbct <- mbct + 1 cmn <- apply(x,2,mean) cloc <- cloc + cmn ccov <- var(x) csig <- csig + ccov rd2 <- mahalanobis(x, cmn, ccov) if(min(rd2[1:val]) > max(rd2[(val + 1):n])) cct <- cct + 1 } cloc <- cloc/runs csig <- csig/runs mbloc <- mbloc/runs mbsig <- mbsig/runs list(cloc = cloc, mbloc = mbloc, csig = csig, mbsig=mbsig, cct = cct, mbct=mbct) } medci<-function(x, alpha = 0.05) {#Gets Bloch and Gastwirth SE for sample median and the #corresponding resistant 100(1-alpha)% CI. The default is alpha = .05. n <- length(x) up <- 1 - alpha/2 med <- median(x) ln <- floor(n/2) - ceiling(sqrt(n/4)) un <- n - ln d <- sort(x) rdf <- un - ln - 1 cut <- qt(up, rdf) sebg <- 0.5 * (d[un] - d[ln + 1]) rval <- cut * sebg rlo <- med - rval rhi <- med + rval list(int = c(rlo, rhi), med = med, sebg = sebg) } medhotsim<-function(n = 100, p = 2, csteps = 5, gam = 0.4, nruns = 100, xtype=1, outliers = 0, pm = 10, eps=0.25, dd= 1, delta = 0, BB=1000){ # Simulates coord. median bootstrap Hotelling's T^2 type test. # Need p > 1. Want n > 20p and number of bootstrap samples BB > 20p. # Multiply x by A where xtype = 1 for MVN Np(0,I), # 2, 3, 4 and 5 (with delta = eps) for (1 - delta) Np(0,I) + delta Np(0, 25 I) # 6, 7, 8 and 9 for multivariate t_d with d = 3, 5, 19 or dd # 10 for lognormal. # outliers = 0 for no outliers and X~N(0,diag(1,...,p)), # 1 for outliers a tight cluster at major axis (0,...,0,pm)' # 2 for outliers a tight cluster at minor axis (pm,0, ...,0)' # 3 for outliers X~N((pm,...,pm)',diag(1,...,p)) # 4 for outliers X[i,p] = pm # 5 for outliers X[i,1] = pm # Power can be estimated by increasing delta so mu = delta(1,...,1) # and mu_o = 0*mu. A <- sqrt(diag(1:p)) hotcv <- 0 munot <- 0 * (1:p) mu <- delta * (1 + munot) val <- floor(gam * n) for(i in 1:nruns) { #make data x <- matrix(rnorm(n * p), ncol = p, nrow = n) if(xtype == 2) { zu <- runif(n) x[zu < 0.4, ] <- x[zu < 0.4, ] * 5 } if(xtype == 3) { zu <- runif(n) x[zu < 0.6, ] <- x[zu < 0.6, ] * 5 } if(xtype == 4) { zu <- runif(n) x[zu < 0.1, ] <- x[zu < 0.1, ] * 5 } if(xtype == 5) { zu <- runif(n) x[zu < eps, ] <- x[zu < eps, ] * 5 } if(xtype == 6) { zu <- sqrt(rchisq(n, 3)/3) x <- x/zu } if(xtype == 7) { zu <- sqrt(rchisq(n, 5)/5) x <- x/zu } if(xtype == 8) { zu <- sqrt(rchisq(n, 19)/19) x <- x/zu } if(xtype == 9) { zu <- sqrt(rchisq(n, dd)/dd) x <- x/zu } if(xtype == 10){ #Want pop coord med(x) = 0. 
x <- exp(x)
x <- x - 1
}
x <- x %*% A
if(outliers == 1) {
x[1:val, ] <- matrix(rnorm(val * p, sd = 0.01), ncol = p, nrow = val)
x[1:val, p] <- x[1:val, p] + pm
}
if(outliers == 2) {
x[1:val, ] <- matrix(rnorm(val * p, sd = 0.01), ncol = p, nrow = val)
x[1:val, 1] <- x[1:val, 1] + pm
}
if(outliers == 3) {
tem <- pm + 0 * 1:p
x[1:val, ] <- x[1:val, ] + tem
}
if(outliers == 4) {
x[1:val, p] <- pm
}
if(outliers == 5) {
x[1:val, 1] <- pm
}
x <- mu + x
#get bootstrapped coord. median
out <- rhotboot(x,B=BB)
hotcv <- hotcv + predreg(out$mus)$inr
}
hotcv <- 1 - hotcv/nruns #prop of times Ho is rejected
list(hotcv = hotcv)
}

medout<-function(x)
{# finds squared Euclidean distances from the coordinatewise median
x <- as.matrix(x)
p <- dim(x)[2]
covv <- diag(p)
med <- apply(x, 2, median)
d2 <- mahalanobis(x, center = med, covv)
plot(d2)
list(d2 = d2)
}

mldsim<-function(n = 100, p = 2, steps = 5, gam = 0.4, runs = 20,
outliers = 0, pm = 10, loc = 0.5){
# Need p > 1.
# This R function compares the MBA, FCH, RFCH, RMVN, scaled DGK, classical,
# OGK, FASTMCD, CMVE and biased MB estimators.
# R needs library(robustbase), calls covrob, covdgk
# outliers = 0 for no outliers and X~N(0,diag(1,...,p)),
# 1 for outliers a point mass on major axis (0,...,0,pm)'
# 2 for outliers a point mass on minor axis (pm,0, ...,0)'
# 3 for outliers X~N((pm,...,pm)',diag(1,...,p))
# 4 for outliers X[i,p] = pm
# 5 for outliers X[i,1] = pm
# 6 for outliers a tight cluster at major axis (0,...,0,pm)'
# Also finds n*var(T[p]), which is approx p for the classical estimator,
# and n var(C(p,p)) for the MLD estimator (T,C).
A <- sqrt(diag(1:p))
qchi <- qchisq(0.5, p)
cs <- 1:runs; cm <- cs
cloc <- 0 * (1:p); csig <- 0 * A
mbas <- 1:runs; mbam <- cs; mbaloc <- cloc; mbasig <- csig
fchs <- mbas; fchm <- cs; fchloc <- cloc; fchsig <- csig
rfchs <- mbas; rfchm <- cs; rfchloc <- cloc; rfchsig <- csig
rmvns <- mbas; rmvnm <- cs; rmvnloc <- cloc; rmvnsig <- csig
dgks <- mbas; dgkm <- cs; dgkloc <- cloc; dgksig <- csig
ogks <- mbas; ogkm <- cs; ogkloc <- cloc; ogksig <- csig
fmcds <- mbas; fmcdm <- cs; fmcdloc <- cloc; fmcdsig <- csig
cmveloc <- cloc; cmvesig <- csig
mbs <- mbas; mbm <- cs; mbsig <- csig; mbloc <- cloc
mbact <- 0; fchct <- 0; rfchct <- 0; rmvnct <- 0; dgkct <- 0
cct <- 0; ogkct <- 0; fmcdct <- 0; cmvect <- 0; mbct <- 0
val <- floor(gam * n)
for(i in 1:runs) {
x <- matrix(rnorm(n * p), ncol = p, nrow = n)
x <- x %*% A
if(outliers == 1) {
x[1:val, ] <- 0
x[1:val,p] <- pm
}
if(outliers == 2) {
x[1:val, ] <- 0
x[1:val,1] <- pm
}
if(outliers == 3) {
tem <- pm + 0 * 1:p
x[1:val, ] <- x[1:val, ] + tem
}
if(outliers == 4) {
x[1:val, p ] <- pm
}
if(outliers == 5) {
x[1:val, 1 ] <- pm
}
if(outliers == 6) {
x[1:val, ] <- matrix(rnorm(val * p, sd = 0.01), ncol = p, nrow = val)
x[1:val, p] <- x[1:val, p] + pm
}
out <- covrob(x, csteps = steps, locc = loc)
mbaloc <- mbaloc + out$center
mbasig <- mbasig + out$cov
mbam[i] <- out$center[p]
mbas[i] <- out$cov[p,p]
rd2 <- mahalanobis(x, out$center, out$cov)
if(min(rd2[1:val]) > max(rd2[(val + 1):n])) mbact <- mbact + 1
rd2 <- mahalanobis(x, out$mnm, out$covm)
const <- median(rd2)/qchi
covm <- const * out$covm
mbloc <- mbloc + out$mnm
mbsig <- mbsig + covm
mbm[i] <- out$mnm[p]
mbs[i] <- covm[p,p]
#scaling does not affect the outlier count
if(min(rd2[1:val]) > max(rd2[(val + 1):n])) mbct <- mbct + 1
fchloc <- fchloc + out$mnf
fchsig <- fchsig + out$covf
fchm[i] <- out$mnf[p]
fchs[i] <- out$covf[p,p]
rd2 <- mahalanobis(x, out$mnf, out$covf)
if(min(rd2[1:val]) > max(rd2[(val + 1):n])) fchct <- fchct + 1
rfchloc <- rfchloc + out$rmnf
rfchsig <-
rfchsig + out$rcovf rfchm[i] <- out$rmnf[p] rfchs[i] <- out$rcovf[p,p] rd2 <- mahalanobis(x, out$rmnf, out$rcovf) if(min(rd2[1:val]) > max(rd2[(val + 1):n])) rfchct <- rfchct + 1 rmvnloc <- rmvnloc + out$rmnmvn rmvnsig <- rmvnsig + out$rcovmvn rmvnm[i] <- out$rmnmvn[p] rmvns[i] <- out$rcovmvn[p,p] rd2 <- mahalanobis(x, out$rmnmvn, out$rcovmvn) if(min(rd2[1:val]) > max(rd2[(val + 1):n])) rmvnct <- rmvnct + 1 cmveloc <- cmveloc + out$mncmv cmvesig <- cmvesig + out$covcmv rd2 <- mahalanobis(x, out$mncmv, out$covcmv) if(min(rd2[1:val]) > max(rd2[(val + 1):n])) cmvect <- cmvect + 1 out <- covdgk(x, csteps = 10) dgkloc <- dgkloc + out$center dgksig <- dgksig + out$cov dgkm[i] <- out$center[p] dgks[i] <- out$cov[p,p] rd2 <- mahalanobis(x, out$center, out$cov) if(min(rd2[1:val]) > max(rd2[(val + 1):n])) dgkct <- dgkct + 1 center <- apply(x,2,mean) cloc <- cloc + center ccov <- var(x) csig <- csig + ccov cm[i] <- center[p] cs[i] <- ccov[p,p] rd2 <- mahalanobis(x, center, ccov) if(min(rd2[1:val]) > max(rd2[(val + 1):n])) cct <- cct + 1 out <- covOGK(x,sigmamu = scaleTau2) ogkloc <- ogkloc + out$center ogksig <- ogksig + out$cov ogkm[i] <- out$center[p] ogks[i] <- out$cov[p,p] rd2 <- mahalanobis(x, out$center, out$cov) if(min(rd2[1:val]) > max(rd2[(val + 1):n])) ogkct <- ogkct + 1 out <- covMcd(x) #for Det-MCD, use #out <- covMcd(x,nsamp="deterministic") #Use out <- covmb2(x) for the covmb2 estimator. fmcdloc <- fmcdloc + out$center fmcdsig <- fmcdsig + out$cov fmcdm[i] <- out$center[p] fmcds[i] <- out$cov[p,p] rd2 <- mahalanobis(x, out$center, out$cov) if(min(rd2[1:val]) > max(rd2[(val + 1):n])) fmcdct <- fmcdct + 1 } mbasv <- n*var(mbas) mbam <- n*var(mbam) mbaloc <- mbaloc/runs mbasig <- mbasig/runs fchsv <- n*var(fchs) fchm <- n*var(fchm) fchloc <- fchloc/runs fchsig <- fchsig/runs rfchsv <- n*var(rfchs) rfchm <- n*var(rfchm) rfchloc <- rfchloc/runs rfchsig <- rfchsig/runs rmvnsv <- n*var(rmvns) rmvnm <- n*var(rmvnm) rmvnloc <- rmvnloc/runs rmvnsig <- rmvnsig/runs dgksv <- n*var(dgks) dgkm <- n*var(dgkm) dgkloc <- dgkloc/runs dgksig <- dgksig/runs ogksv <- n*var(ogks) ogkm <- n*var(ogkm) ogkloc <- ogkloc/runs ogksig <- ogksig/runs csv <- n*var(cs) cm <- n*var(cm) cloc <- cloc/runs csig <- csig/runs fmcdsv <- n*var(fmcds) fmcdm <- n*var(fmcdm) fmcdloc <- fmcdloc/runs fmcdsig <- fmcdsig/runs cmveloc <- cmveloc/runs cmvesig <- cmvesig/runs mbsv <- n*var(mbs) mbm <- n*var(mbm) mbloc <- mbloc/runs mbsig <- mbsig/runs list(mbaloc = mbaloc, fchloc = fchloc, rfchloc = rfchloc, rmvnloc = rmvnloc,dgkloc = dgkloc, cloc = cloc, ogkloc = ogkloc, fmcdloc = fmcdloc, cmveloc = cmveloc, mbloc = mbloc, mbasig = mbasig, fchsig = fchsig, rfchsig = rfchsig, rmvnsig = rmvnsig, dgksig = dgksig, csig = csig, ogksig = ogksig, fmcdsig = fmcdsig, cmvesig = cmvesig, mbsig=mbsig, mbasv = mbasv, fchsv=fchsv, rfchsv = rfchsv, rmvnsv=rmvnsv, dgksv=dgksv, ogksv=ogksv, csv=csv, fmcdsv=fmcdsv, mbsv=mbsv, mbam=mbam, fchm=fchm, rfchm=rfchm, rmvnm=rmvnm, dgkm=dgkm, ogkm=ogkm, cm=cm, fmcdm=fmcdm, mbm=mbm, cct = cct, dgkct=dgkct, mbact=mbact, fchct = fchct, rfchct = rfchct, rmvnct=rmvnct, ogkct=ogkct, fmcdct=fmcdct,cmvect=cmvect,mbct=mbct) } mldsim6<-function(n = 100, p = 2, steps = 5, gam = 0.4, runs = 100, outliers = 0, pm = 10, kk=5, osteps = 0){ # This R function compares the # FCH, RFCH, CMVE, RCMVE, RMVN, COVMB2, and MB estimators. 
# outliers = 0 for no outliers and X~N(0,diag(1,...,p)),
# 1 for outliers a tight cluster at major axis (0,...,0,pm)'
# 2 for outliers a tight cluster at minor axis (pm,0, ...,0)'
# 3 for outliers X~N((pm,...,pm)',diag(1,...,p))
# 4 for outliers X[i,p] = pm
# 5 for outliers X[i,1] = pm
# Calls cmve, covfch, covrmvn, covmb2
A <- sqrt(diag(1:p))
fchct <- 0; rfchct <- 0; cmvect <- 0; rcmvect <- 0
rmvnct <- 0; mbct <- 0; covmb2ct <- 0
val <- floor(gam * n)
for(i in 1:runs) {
x <- matrix(rnorm(n * p), ncol = p, nrow = n)
x <- x %*% A
if(outliers == 1) {
x[1:val, ] <- matrix(rnorm(val * p, sd = 0.01), ncol = p, nrow = val)
x[1:val, p] <- x[1:val, p] + pm
}
if(outliers == 2) {
x[1:val, ] <- matrix(rnorm(val * p, sd = 0.01), ncol = p, nrow = val)
x[1:val, 1] <- x[1:val, 1] + pm
}
if(outliers == 3) {
tem <- pm + 0 * 1:p
x[1:val, ] <- x[1:val, ] + tem
}
if(outliers == 4) {
x[1:val, p] <- pm
}
if(outliers == 5) {
x[1:val, 1] <- pm
}
out <- covfch(x, csteps = steps)
rd2 <- mahalanobis(x, out$center, out$cov)
if(min(rd2[1:val]) > max(rd2[(val + 1):n])) fchct <- fchct + 1
rd2 <- mahalanobis(x, out$rmnf, out$rcovf)
if(min(rd2[1:val]) > max(rd2[(val + 1):n])) rfchct <- rfchct + 1
out <- cmve(x, csteps = steps)
rd2 <- mahalanobis(x, out$center, out$cov)
if(min(rd2[1:val]) > max(rd2[(val + 1):n])) cmvect <- cmvect + 1
rd2 <- mahalanobis(x, out$rmnf, out$rcovf)
if(min(rd2[1:val]) > max(rd2[(val + 1):n])) rcmvect <- rcmvect + 1
rd2 <- mahalanobis(x, out$mnm, out$covm)
if(min(rd2[1:val]) > max(rd2[(val + 1):n])) mbct <- mbct + 1
out <- covrmvn(x, csteps = steps)
rd2 <- mahalanobis(x, out$center, out$cov)
if(min(rd2[1:val]) > max(rd2[(val + 1):n])) rmvnct <- rmvnct + 1
out <- covmb2(x, k=kk, msteps=osteps)
rd2 <- mahalanobis(x, out$center, out$cov)
if(min(rd2[1:val]) > max(rd2[(val + 1):n])) covmb2ct <- covmb2ct + 1
}
list(fchct = fchct, rfchct = rfchct, cmvect = cmvect, rcmvect = rcmvect,
rmvnct = rmvnct, covmb2ct = covmb2ct, mbct = mbct)
}

mldsim7<-function(n = 100, p = 2, gam = 0.4, runs = 100, outliers = 0,
pm = 10, kk=5, osteps = 0){
# This R function examines the COVMB2 estimator.
#The function mldsim6 uses the weighted covariance matrix out$cov.
#The function mldsim7 uses squared Euclidean distances based on Ip,
#or the squared Mahalanobis distances using diag(out$cov).
# Counts the number of times all outlier distances > clean distances.
# outliers = 0 for no outliers and X~N(0,diag(1,...,p)),
# 1 for outliers a tight cluster at major axis (0,...,0,pm)'
# 2 for outliers a tight cluster at minor axis (pm,0, ...,0)'
# 3 for outliers X~N((pm,...,pm)',diag(1,...,p))
# 4 for outliers X[i,p] = pm
# 5 for outliers X[i,1] = pm
# Calls covmb2
A <- sqrt(diag(1:p))
covv <- diag(p) #identity matrix Ip
covmb2ct <- 0
diagct <- 0
val <- floor(gam * n)
for(i in 1:runs) {
x <- matrix(rnorm(n * p), ncol = p, nrow = n)
x <- x %*% A
if(outliers == 1) {
x[1:val, ] <- matrix(rnorm(val * p, sd = 0.01), ncol = p, nrow = val)
x[1:val, p] <- x[1:val, p] + pm
}
if(outliers == 2) {
x[1:val, ] <- matrix(rnorm(val * p, sd = 0.01), ncol = p, nrow = val)
x[1:val, 1] <- x[1:val, 1] + pm
}
if(outliers == 3) {
tem <- pm + 0 * 1:p
x[1:val, ] <- x[1:val, ] + tem
}
if(outliers == 4) {
x[1:val, p] <- pm
}
if(outliers == 5) {
x[1:val, 1] <- pm
}
out <- covmb2(x, k=kk, msteps=osteps)
rd2 <- mahalanobis(x, out$center, covv)
if(min(rd2[1:val]) > max(rd2[(val + 1):n])) covmb2ct <- covmb2ct + 1
rd2 <- mahalanobis(x, out$center, diag(diag(out$cov)))
if(min(rd2[1:val]) > max(rd2[(val + 1):n])) diagct <- diagct + 1
}
list( covmb2ct = covmb2ct, diagct = diagct)
}

MLRplot<-function(x, Y){
# Response plot and residual plot.
# Workstation: need to activate a graphics
# device with command "X11()" or "motif()."
# R needs command "library(lqs)" if a robust estimator replaces lsfit.
# Advance the view with the right mouse button.
x <- as.matrix(x)
out <- lsfit(x, Y)
cook <- ls.diag(out)$cooks
n <- dim(x)[1]
p <- dim(x)[2] + 1
tem <- cook > min(0.5, (2 * p)/n)
bhat <- out$coef
FIT <- Y - out$res
cmar <- par("mar")
par(mfrow = c(2, 1))
par(mar=c(4.0,4.0,2.0,0.5))
plot(FIT, Y)
abline(0, 1)
points(FIT[tem], Y[tem], pch = 15)
title("Response Plot")
identify(FIT, Y)
RES <- Y - FIT
plot(FIT, RES)
points(FIT[tem], RES[tem], pch = 15)
title("Residual Plot")
identify(FIT, RES)
par(mfrow = c(1, 1))
par(mar=cmar)
}

mlrplot2 <- function(x, Y)
{# Makes the response plot and residual plot for the MBA and MBALATA estimators.
# Workstation: need to activate a graphics
# device with command "X11()" or "motif()."
# R needs command "library(MASS)" if a robust estimator replaces lsfit.
# Advance the view with the right mouse button, and in R, highlight "stop."
x <- as.matrix(x)
out <- mbareg(x, Y)
bhat <- out$coef
FIT <- bhat[1] + x %*% bhat[-1]
par(mfrow = c(2, 2))
plot(FIT, Y)
abline(0, 1)
identify(FIT, Y)
title("MBA Response Plot")
RES <- Y - FIT
plot(FIT, RES)
identify(FIT, RES)
title("MBA Residual Plot")
#
out <- mbalata(x, Y)
bhat <- out$coef
FIT <- bhat[1] + x %*% bhat[-1]
plot(FIT, Y)
abline(0, 1)
identify(FIT, Y)
title("MBALATA Response Plot")
RES <- Y - FIT
plot(FIT, RES)
identify(FIT, RES)
title("MBALATA Residual Plot")
par(mfrow=c(1,1))
}

mlrplot4<-function(x, Y){
# This function is for R. Use mlrplot2 for R and Splus.
# Makes response plot and residual plot with large
# Cook's distances and Pena's distances highlighted.
# Squares have large Cook's distances
# and crosses have large Pena's distances.
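## Illustrative call for mlrplot4, a sketch with simulated data; right
## click Stop to advance past each identify() call:
# x <- matrix(rnorm(300), ncol=3)
# y <- x %*% c(1,1,1) + rnorm(100)
# mlrplot4(x, y)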
x <- as.matrix(x) out <- lsfit(x, Y) cook <- ls.diag(out)$cooks n <- dim(x)[1] p <- dim(x)[2] + 1 #get Pena's distances s one <- 1 + 0 * 1:n w <- cbind(one, x) cp <- t(w) %*% w h <- w %*% solve(cp) %*% t(w) s <- 0 * 1:n for(i in 1:n) { for(j in 1:n) { s[i] <- s[i] + (cook[j] * h[i, j]^2)/(h[i, i] * h[j, j]) } } tem <- cook > min(0.5, (2 * p)/n) medd <- median(s) madd <- mad(s, constant = 1) tem2 <- abs(s - medd) > 4.5 * madd bhat <- out$coef FIT <- bhat[1] + x %*% bhat[-1] cmar <- par("mar") par(mfrow = c(2, 1)) par(mar=c(4.0,4.0,2.0,0.5)) plot(FIT, Y) abline(0, 1) points(FIT[tem], Y[tem], pch = 0) points(FIT[tem2], Y[tem2], pch = 3) identify(FIT, Y) title("Response Plot") RES <- Y - FIT plot(FIT, RES) points(FIT[tem], RES[tem], pch = 0) points(FIT[tem2], RES[tem2], pch = 3) identify(FIT, RES) title("Residual Plot") par(mfrow = c(1, 1)) par(mar=cmar) } MLRsim<-function(n = 100, q = 7, nruns = 4, eps = 0.1, shift = 9, type = 1){ #Right click Stop for each plot. #Generates response and residual plots for MLR #for a few iid error distributions: # if type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) errors. # constant = 1 so there are p = q+1 coefficients b <- 0 * 1:q + 1 for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) if(type == 1) { y <- 1 + x %*% b + rnorm(n) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err } #make an MLR data set MLRplot(x,y) #get the response and residual plots }} mltreg<-function(x, y, indices = c(1,2)){ ##Need p > 1, m > 1. # Does multivariate linear regression. # Advance the plot by highlighting Stop with the right mouse button. # Want n > mp and n > 10p for Hotelling Lawley pvalues. # The indices are the variables to be left out of the # reduced model for the MANOVA partial F test. x <- as.matrix(x) y <- as.matrix(y) n <- dim(x)[1] q <- dim(x)[2] p <- q+1 m <- dim(y)[2] r <- length(indices) #Get L for the MANOVA partial F test L <- matrix(0,nrow=r,ncol=p) for(i in 1:r) L[i,indices[i]] <- 1 res <- matrix(nrow = n, ncol = m, 0) fit <- res Bhat <- matrix(nrow = p, ncol = m, 0) # q + 1 = p is number of predictors including intercept for(i in 1:m){ out <- lsfit(x,y[,i]) res[,i] <- out$res fit[,i] <- y[,i] - res[,i] Bhat[,i] <- out$coef } for(i in 1:m){ MLRplot(x,y[,i]) } Covhat <- (n-1)/(n-p) * var(res) #Get pvalues for testing whether jth predictor is #needed in the model given the other predictors are in the model. one <- 1 + 0*1:n w <- cbind(one,x) pvals <- 0*1:p Fj <- pvals J <- t(w)%*%w J <- solve(J) Covinv <- solve(Covhat) dendf <- n - m*p if( n <= m*p) dendf <- 1 for(j in 1:p){ T <- t(Bhat[j,])%*%Covinv%*%Bhat[j,]/J[j,j] pvals[j] <- 1 - pf((T/m),m,dendf) Fj[j] <- T/m } #Get pval for MANOVA F test for whether the #nontrivial predictors are needed in the model. D <- as.matrix(Bhat[-1,]) if(p == 2) D <- t(D) tem <- as.matrix(J[-1,-1]) teminv <- solve(tem) Weinv <- Covinv/(n-p) H <- t(D)%*%teminv%*%D C <- Weinv%*%H eig <- eigen(C,symmetric=FALSE,only.values=TRUE)$values eig <- as.double(eig) #as.real(eig) u <- sum(eig) MANOVAF <- (n-p)*u/((p-1)*m) pval <- 1 - pf(MANOVAF,(p-1)*m,dendf) MANOVA <- cbind(MANOVAF, pval) #got MANOVA F test summaries #Get pval for MANOVA partial F test for whether the #predictors given by indices are needed in the model. 
tem <- L%*%J%*%t(L)
teminv <- solve(tem)
H <- t(L%*%Bhat)%*%teminv%*%(L%*%Bhat)
C <- Weinv%*%H
eig <- eigen(C,symmetric=FALSE,only.values=TRUE)$values
eig <- as.double(eig) # as.real(eig)
u <- sum(eig)
partialF <- (n-p)*u/(r*m)
Pval <- 1 - pf(partialF,r*m,dendf)
partial <- cbind(partialF, Pval)
##got partial F test summaries
Ftable <- cbind(Fj,pvals)
list(fit = fit, res = res, Covhat = Covhat, Bhat = Bhat, partial=partial,
Ftable=Ftable, MANOVA=MANOVA)
}

modIboot<-function(x,y,B = 1000){
#needs library(leaps), n > 5p, p > 2
#bootstrap the I_I model for forward selection regression
#Does not make sense to do variable selection if there
#is only one nontrivial predictor.
x <- as.matrix(x)
n <- length(y)
p <- 1 + dim(x)[2]
vmax <- min(p,as.integer(n/5))
vars <- as.vector(1:(p-1))
#get the full model
full <- lsfit(x,y)
res <- full$resid
fit <- y - res
#get the I_I submodel
tem <- regsubsets(x,y,nvmax=vmax,method="forward")
out <- summary(tem)
num <- 1:length(out$cp)
tnum <- num[out$cp <= min(out$cp)+1]
Icp <- out$cp[min(tnum)]
modI <- out$which[out$cp==Icp,]
#do not need the constant in vin
vin <- vars[modI[-1]]
sub <- lsfit(x[,vin],y)
betas <- matrix(0,nrow=B,ncol=p)
#bootstrap the I_I submodel
for(i in 1:B){
yb <- fit + sample(res,n,replace=T)
tem <- regsubsets(x,y=yb,method="forward")
out <- summary(tem)
num <- 1:length(out$cp)
tnum <- num[out$cp <= min(out$cp)+1]
Icp <- out$cp[min(tnum)]
modI <- out$which[out$cp==Icp,]
vin <- vars[modI[-1]]
indx <- c(1,1+vin)
betas[i,indx] <- lsfit(x[,vin],yb)$coef
}
list(full=full,sub=sub,betas=betas)
}

modIpisim<-function(n = 100, p = 4, k = 1, nruns = 100, eps = 0.1, shift = 9,
psi = 0.0, type = 1, alpha = 0.05){
#Needs library(leaps).
#Simulates PIs for forward selection variable selection for model I_I.
# 1 <= k <= p-1, k is the number of nonnoise variables
#Uses five iid error distributions:
# type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors
# 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2)
#errors.
# constant = 1 so there are p = q+1 coefficients
#need p > 1, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones, p-k-1 zeroes
# Multiply x by A: for MVN data this results
# in a covariance matrix with eigenvector c(1, ..., 1)^T
# corresponding to the largest eigenvalue. As psi gets
# close to 1, the data clusters about the line in the
# direction of (1, ..., 1)^T. See Maronna and Zamar (2002).
# cor(X_i,X_j) = [2 psi + (q-2)psi^2]/[1 + (q-1)psi^2], i not = j
# when the correlation exists.
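## Illustrative call for modIpisim, a sketch (needs library(leaps)):
# out <- modIpisim(n=100, p=4, k=1, nruns=100, type=1, alpha=0.05)
# out$opicov #PI coverage, should be near 0.95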
set.seed(974)
corfac <- (1 + 15/n) * sqrt( (n+2*p)/(n - p) )
if (alpha > 0.1) {qn <- min(1 - alpha + 0.05, 1 - alpha + p/n)}
if (alpha <= 0.1) {qn <- min(1 - alpha/2, 1 - alpha + 10*alpha*p/n)}
pn <- qn
if(pn < 1 - alpha + 0.001) qn <- 1 - alpha
alphan <- 1 - qn
pilen <- 1:nruns
ps <- pilen
opicov <- 0
q <- p-1
vmax <- min(p,as.integer(n/5))
rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2)
A <- matrix(psi,nrow=q,ncol=q)
diag(A) <- 1
b <- 0 * 1:q
b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1]
vars <- as.vector(1:(p-1))
for(i in 1:nruns) {
x <- matrix(rnorm(n * q), nrow = n, ncol = q)
x <- x %*% A
xf <- rnorm(q) %*% A
if(type == 1) {
y <- 1 + x %*% b + rnorm(n)
yf <- 1 + xf %*% b + rnorm(1)
}
if(type == 2) {
y <- 1 + x %*% b + rt(n, df = 3)
yf <- 1 + xf %*% b + rt(1, df = 3)
}
if(type == 3) {
y <- 1 + x %*% b + rexp(n) - 1
yf <- 1 + xf %*% b + rexp(1) - 1
}
if(type == 4) {
y <- 1 + x %*% b + runif(n, min = -1, max = 1)
yf <- 1 + xf %*% b + runif(1, min = -1, max = 1)
}
if(type == 5) {
err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift)
y <- 1 + x %*% b + err
ef <- rnorm(1, sd = 1 + rbinom(1, 1, eps) * shift)
yf <- 1 + xf %*% b + ef
}
#make an MLR data set
#find the forward sel model I_I model
tem <- regsubsets(x,y,nvmax=vmax,method="forward")
out <- summary(tem)
num <- 1:length(out$cp)
tnum <- num[out$cp <= min(out$cp)+1]
Icp <- out$cp[min(tnum)]
modI <- out$which[out$cp==Icp,]
#do not need the constant in vin
vin <- vars[modI[-1]]
sub <- lsfit(x[,vin],y)
ps[i] <- length(sub$coef)
yfhat <- sub$coef[1] + xf[vin] %*% sub$coef[-1]
fres <- sub$resid
#get asymptotically optimal PI
sres <- sort(fres)
cc <- ceiling(n * (1 - alphan))
rup <- sres[cc]
rlow <- sres[1]
olen <- rup - rlow
if(cc < n) {
for(j in (cc + 1):n) {
zlen <- sres[j] - sres[j - cc + 1]
if(zlen < olen) {
olen <- zlen
rup <- sres[j]
rlow <- sres[j - cc + 1]
}
}
}
up <- yfhat + corfac*rup
low <- yfhat + corfac*rlow
pilen[i] <- up - low
if(low < yf && up > yf) opicov <- opicov + 1
}
psmn <- mean(ps) - k #near 1 if forward selection picks the optimal submodel
pimnlen <- mean(pilen)
opicov <- opicov/nruns
list(psmn=psmn, opicov=opicov, pimnlen = pimnlen)}

mplot<-function(x)
{# Need p > 1.
# Makes a DD plot only using the MDi, the RDi are not used.
p <- dim(x)[2]
center <- apply(x, 2, mean)
cov <- var(x)
md2 <- mahalanobis(x, center, cov)
md <- sqrt(md2)
rd <- md
const <- sqrt(qchisq(0.5, p))/median(rd)
rd <- const * rd
plot(md, rd)
abline(0, 1)
identify(md, rd)
}

mpredsim<-function(n = 100, m = 2, p = 4, nruns = 10, etype = 1, eps = 0.25,
psi = 0.1, dd = 7, mnull = F, alpha = 0.1){
# Simulates prediction regions for multivariate linear regression
# p = number of predictors including intercept
# m > 1 is the number of response variables
# want n > mp and n > 10 p
# multiply E by A where etype = 1 for MVN Nm(0,I),
# etype = 2 for (1 - eps) Nm(0,I) + eps Nm(0, 25 I)
# eps = 0.1, 0.25, 0.4, and 0.6 are interesting
# etype = 3 for multivariate t_d with d = dd degrees of freedom
# dd = 1, 2, 3, 5, 7 are interesting
# etype = 4 for lognormal - E(lognormal).
# For MVN data multiplying E by A results
# in a covariance matrix with eigenvector c(1, ..., 1)^T
# corresponding to the largest eigenvalue. The diagonal elements
# of the covariance matrix are 1 + (m-1) psi^2, while the off
# diagonal elements are 2 psi + (m-2) psi^2. Hence the correlations
# are (2 psi + (m-2) psi^2)/(1 + (m-1) psi^2). As psi gets
# close to 1, the data clusters about the line in the
# direction of (1, ..., 1)^T.
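## Illustrative call for mpredsim, a sketch (uses covrmvn from this file):
# out <- mpredsim(n=100, m=2, p=4, nruns=10, etype=1, alpha=0.1)
# out$ncvr; out$scvr; out$mcvr #coverages should be near 0.9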
# set.seed(974)
ccvr <- 0; scvr <- 0; rcvr <- 0
volc <- 1:nruns
vols <- volc
volr <- volc
#up <- 1 - alpha
up <- min((1 - alpha/2), (1 - alpha + 10*alpha*m/n))
if(alpha > 0.1) up <- min((1 - alpha + 0.05), (1 - alpha + m/n))
qn <- up
if(qn < 1 - alpha + 0.001) up <- 1 - alpha
np1 <- n + 1
q <- p - 1
A <- matrix(psi,nrow=m,ncol=m)
diag(A) <- 1
res <- matrix(nrow = n, ncol = m, 0)
fit <- res
B <- matrix(nrow = p, ncol = m, 1)
if(p == m){
for(j in 1:(m-1)){
B[p-j+1,j:m] <- 0
}
}
if(p > m){
for(j in 1:m){
B[p-j+1,j:m] <- 0
}
}
if(p < m){
for(j in 1:(p-1)){
B[p-j+1,j:m] <- 0
}
}
if (mnull == T) B[-1,] <- 0
Bhat <- B
one <- 1 + 0*1:np1
onen <- one[-1]
for(i in 1:nruns) {
x <- matrix(rnorm(np1 * q), nrow = np1, ncol = q)
w <- cbind(one,x)
y <- w %*% B
#make error matrix E
E <- matrix(rnorm(np1 * m), nrow = np1, ncol = m)
if(etype == 2) {
zu <- runif(np1)
E[zu < eps, ] <- E[zu < eps, ] * 5
}
if(etype == 3) {
zu <- sqrt(rchisq(np1, dd)/dd)
E <- E/zu
}
if(etype == 4) E <- exp(E) - exp(0.5) #want mean 0 error vectors
E <- E %*% A
# got error matrix E
y <- y + E
yf <- y[np1,]
#want xf to be a column vector with a 1
xf <- rbind(1,as.matrix(x[np1,]))
y <- y[-np1,]
x <- x[-np1,]
for(j in 1:m){
out <- lsfit(x,y[,j])
res[,j] <- out$res
fit[,j] <- y[,j] - res[,j]
Bhat[,j] <- out$coef
}
##Note that ith row of xx is t(hat(E(yf)) + hat(eps)_i)
##for i = 1,...,n.
##Get prediction regions based on xx which has m columns,
##and see if yf is in the prediction regions.
zz <- t(t(Bhat)%*%xf)
xx <- res + onen %*% zz
center <- apply(xx, 2, mean)
cov <- var(xx)
md2 <- mahalanobis(xx, center, cov)
hsq <- quantile(md2, up)
if(mahalanobis(t(yf), center, cov) <= hsq) ccvr <- ccvr + 1
volc[i] <- sqrt(hsq)^m * prod(diag(chol(cov)))
out <- covrmvn(xx)
center <- out$center
cov <- out$cov
md2 <- mahalanobis(xx, center, cov)
hsq <- quantile(md2, up)
dsq <- mahalanobis(t(yf), center, cov)
if(dsq <= hsq) scvr <- scvr + 1
sqrtdet <- prod(diag(chol(cov)))
vols[i] <- sqrt(hsq)^m * sqrtdet
hsq <- qchisq(up, m)
if(dsq <= hsq) rcvr <- rcvr + 1
volr[i] <- sqrt(hsq)^m * sqrtdet
}
ccvr <- ccvr/nruns
scvr <- scvr/nruns
rcvr <- rcvr/nruns
#get a measure of efficiency wrt vols, so eff(vols) = 1
vols <- mean(vols)
volc <- mean(volc)/vols
volr <- mean(volr)/vols
vols <- 1
list(ncvr = ccvr, scvr = scvr, mcvr = rcvr, voln = volc, vols = vols,
volm = volr, up = up)
}

mregddsim<-function(n = 100, m = 2, p = 4, nruns = 10, etype = 1, eps = 0.25,
psi = 0.1, dd = 7, mnull = F, alph=0.1){
## Need p > 1, m > 1.
# Simulates DD plots of residuals for the
# multivariate linear regression model.
# The identity line and lines corresponding to the
# 100(1-alph)% prediction regions are added.
# p = number of predictors including intercept
# m > 1 is the number of response variables
# want n > mp + 10, n > 10m and n > 10 p
# multiply E by A where etype = 1 for MVN Nm(0,I),
# etype = 2 for (1 - eps) Nm(0,I) + eps Nm(0, 25 I)
# eps = 0.1, 0.25, 0.4, and 0.6 are interesting
# etype = 3 for multivariate t_d with d = dd degrees of freedom
# dd = 1, 2, 3, 5, 7 are interesting
# etype = 4 for lognormal - E(lognormal).
# For MVN data multiplying E by A results
# in a covariance matrix with eigenvector c(1, ..., 1)^T
# corresponding to the largest eigenvalue. The diagonal elements
# of the covariance matrix are 1 + (m-1) psi^2, while the off
# diagonal elements are 2 psi + (m-2) psi^2. Hence the correlations
# are (2 psi + (m-2) psi^2)/(1 + (m-1) psi^2). As psi gets
# close to 1, the data clusters about the line in the
# direction of (1, ..., 1)^T.
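## Illustrative call for mregddsim, a sketch (uses ddplot4 from this file;
## advance each DD plot by highlighting Stop with the right mouse button):
# out <- mregddsim(n=100, m=2, p=4, nruns=5, etype=1, alph=0.1)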
q <- p - 1 A <- matrix(psi,nrow=m,ncol=m) diag(A) <- 1 res <- matrix(nrow = n, ncol = m, 0) fit <- res B <- matrix(nrow = p, ncol = m, 1) if(p == m){ for(j in 1:(m-1)){ B[p-j+1,j:m] <- 0 } } if(p > m){ for(j in 1:m){ B[p-j+1,j:m] <- 0 } } if(p < m){ for(j in 1:(p-1)){ B[p-j+1,j:m] <- 0 } } if (mnull == T) B[-1,] <- 0 Bhat <- B one <- 1 + 0*1:n for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) w <- cbind(one,x) y <- w %*% B #make error matrix E E <- matrix(rnorm(n * m), nrow = n, ncol = m) if(etype == 2) { zu <- runif(n) E[zu < eps, ] <- E[zu < eps, ] * 5 } if(etype == 3) { zu <- sqrt(rchisq(n, dd)/dd) E <- E/zu } if(etype == 4) E <- exp(E) - exp(0.5) #want mean 0 error vectors E <- E %*% A # got error matrix E y <- y + E for(j in 1:m){ out <- lsfit(x,y[,j]) res[,j] <- out$res fit[,j] <- y[,j] + res[,j] Bhat[,j] <- out$coef } ##make DD plot of the errors ddplot4(res, alpha=alph) } Covhat <- (n-1)/(n-p) * var(res) list(Bhat = Bhat, B=B, Covhat = Covhat) } mregsim<-function(n = 100, m = 2, p = 4, nruns = 10, etype = 1, eps = 0.25, psi = 0.1, dd = 7, mnull = T){ ## Need p > 1, m > 1. # Simulates multivariate linear regression model and gets # pvalues for tests for whether the p-1 nontrivial # predictors are needed in the model and # pvalues for tests for whether Xi is needed in the model. # p = number of predictors including intercept # m > 1 is the number of response variables # want n > mp and n > 10 p # multiply E by A where etype = 1 for MVN Nm(0,I), # etype = 2 for (1 - eps) Nm(0,I) + eps Nm(0, 25 I) # eps = 0.1, 0.25, 0.4, and 0.6 are interesting # etype = 3 for multivariate t_d with d = dd degrees of freedom # dd = 1, 2, 3, 5, 7 are interesting # etype = 4 for lognormal - E(lognormal). # For MVN data multiplying E by A results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. The diagonal elements # of the covariance matrix are 1 + (m-1) psi^2, while the off # diagonal elements are 2 psi + (m-2) psi^2. Hence the correlations # are (2 rho + (m-2) psi^2)/(1 + (m-1) psi^2). As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. # Illustrates that Hotelling Lawley statistic = last statistic/(n-p). 
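##Hedged example for mregddsim above (needs ddplot4 loaded; values illustrative):
# mregddsim(n = 100, m = 3, p = 4, nruns = 2, etype = 3, dd = 3)
##each run draws a DD plot of the residual vectors; heavy tailed errors bend
##the plotted points away from the identity line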
q <- p - 1 A <- matrix(psi,nrow=m,ncol=m) diag(A) <- 1 res <- matrix(nrow = n, ncol = m, 0) fit <- res B <- matrix(nrow = p, ncol = m, 1) if(p == m){ for(j in 1:(m-1)){ B[p-j+1,j:m] <- 0 } } if(p > m){ for(j in 1:m){ B[p-j+1,j:m] <- 0 } } if(p < m){ for(j in 1:(p-1)){ B[p-j+1,j:m] <- 0 } } if (mnull == T) B[-1,] <- 0 Bhat <- B one <- 1 + 0*1:n fcov <- 0*1:p wilkcov <- fcov pilcov <- fcov hotlawcov <- fcov roycov <- fcov rden <- n - p - m + 1 rcut <- qf(0.95,m,rden) dendf <- n - m*p wcv <- 0 pcv <- 0 hlcv<-0 rcv<-0 fcv<-0 ndf <- (p-1)*m mancv <- cbind(wcv,pcv,hlcv,rcv,fcv) #L <- cbind(fcov[-1],diag(p-1)) #want n > mp fcut <- qf(0.95, m, dendf) mcut <- qf(0.95,m*(p-1),dendf) h <- max(p-1,m) mrcut <- qf(0.95,h,(n-h-1)) for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) w <- cbind(one,x) y <- w %*% B #make error matrix E E <- matrix(rnorm(n * m), nrow = n, ncol = m) if(etype == 2) { zu <- runif(n) E[zu < eps, ] <- E[zu < eps, ] * 5 } if(etype == 3) { zu <- sqrt(rchisq(n, dd)/dd) E <- E/zu } if(etype == 4) E <- exp(E) - exp(0.5) #want mean 0 error vectors E <- E %*% A # got error matrix E y <- y + E for(j in 1:m){ out <- lsfit(x,y[,j]) res[,j] <- out$res fit[,j] <- y[,j] - res[,j] Bhat[,j] <- out$coef } Covhat <- (n-1)/(n-p) * var(res) # w is the data matrix with a column of ones J <- t(w)%*%w J <- solve(J) We <- Covhat*(n-p) Covinv <- solve(Covhat) Weinv <- Covinv/(n-p) #get the test statistics for whether Xj is needed in the model for(j in 1:p){ T <- t(Bhat[j,])%*%Covinv%*%Bhat[j,]/J[j,j] if(T/m > fcut) fcov[j] <- fcov[j] + 1 H <- Bhat[j,]%*%t(Bhat[j,])/J[j,j] C <- Weinv%*%H eig <- eigen(C,symmetric=FALSE,only.values=TRUE)$values eig <- as.double(eig) # as.real(eig) lmax <- eig[1] u <- sum(eig) v <- sum(eig/(eig+1)) wilk <- prod(1/(eig+1)) if(-(n-p-1-0.5*m)*log(wilk)/m > fcut) wilkcov[j] <- wilkcov[j] + 1 if((n-p)*v/m > fcut) pilcov[j] <- pilcov[j] + 1 if((n-p)*u/m > fcut) hotlawcov[j] <- hotlawcov[j] + 1 if(rden*lmax/m > rcut) roycov[j] <- roycov[j] + 1 } #get the MANOVA F test statistics for whether nontrivial #predictors are needed in the model D <- as.matrix(Bhat[-1,]) if(p == 2) D <- t(D) vecD <- as.matrix(D[,1]) for(j in 2:m){ vecD <- rbind(vecD,as.matrix(D[,j]))} tem <- as.matrix(J[-1,-1]) teminv <- solve(tem) T <- t(vecD)%*%kronecker(Covinv,teminv)%*%vecD if(T/ndf > mcut) mancv[5] <- mancv[5] + 1 H <- t(D)%*%teminv%*%D C <- Weinv%*%H eig <- eigen(C,symmetric=FALSE,only.values=TRUE)$values eig <- as.double(eig) lmax <- eig[1] u <- sum(eig) v <- sum(eig/(eig+1)) wilk <- prod(1/(eig+1)) if(-(n-0.5*p-0.5*m-2)*log(wilk)/ndf > mcut) mancv[1] <- mancv[1] + 1 if((n-p)*v/ndf > mcut) mancv[2] <- mancv[2] + 1 if((n-p)*u/ndf > mcut) mancv[3] <- mancv[3] + 1 if((n-h-1)*lmax/h > mrcut) mancv[4] <- mancv[4] + 1 } wilkcov <- wilkcov/nruns pilcov <- pilcov/nruns hotlawcov <- hotlawcov/nruns roycov <- roycov/nruns fcov <- fcov/nruns mancv <- mancv/nruns list(Bhat = Bhat, B=B, Covhat = Covhat, wilkcov=wilkcov, pilcov=pilcov, hotlawcov=hotlawcov, roycov=roycov, fcov=fcov, mancv=mancv) } nltv<-function(gam = 0.5) {# Gets asy var for lts(h) and lta(h) at standard normal # where h/n -> gam. k <- qnorm(0.5 + gam/2) den <- gam - 2 * k * dnorm(k) ltsv <- 1/den tem <- (1 - exp( - (k^2)/2))^2 ltav <- (2 * pi * gam)/(4 * tem) list(ltsv=ltsv, ltav=ltav) } pcaboot<-function(x, corr=T, rob=F, B = 1000){ #Bootstraps PCA. Likely only accurate for positive eigenvalues. #If rob = T, bootstrap the robust PCA. 
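##Hedged example for nltv above, giving asymptotic variances at the normal:
# nltv(gam = 0.5) #ltsv about 14.02, so LTS(50%) is very inefficient at N(0,1)
# nltv(gam = 0.9)$ltsv #about 1.78; efficiency improves as gam increases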
#Nominal 95% CIs x <- as.matrix(x) n <- dim(x)[1] p <- dim(x)[2] lsciL <- 1:p lsciU <- 1:p indx <- 1:n lams <- matrix(0,nrow=B,ncol=p) if(rob==F){ for(i in 1:B){ tem <- sample(indx,n,replace=T) lams[i,] <- prcomp(x[tem,], scale = corr)$sd^2 } } else{ for(i in 1:B){ tem <- sample(indx,n,replace=T) lams[i,] <- rprcomp(x[tem,], corr = corr)$out$sd^2 } } shorci <- apply(lams,2,shorth3) covhat <- var(lams) for(i in 1:p){ lsciL[i] <- mean(lams[,i]) - 2*sqrt(covhat[i,i]) lsciU[i] <- mean(lams[,i]) + 2*sqrt(covhat[i,i]) } lscis <- cbind(lsciL,lsciU) list(lams=lams,covhat=covhat,shorci=shorci,lscis=lscis) } pcabootsim<-function(n = 100, p = 4, nruns = 1, xtype = 1, dd = 1, eps = 0.25, corr=F,rob=F,B=1000){ # Generates a simulated data set for the PCA bootstrap CIs. # Need n > 2q. # MAY NOT WORK IF p = 1 # If corr = F, then eigenvalues = c(1,2,...,p). # If corr = T, then eigenvalues = c(1,1,...,1). # Multiply x by A where xtype = 1 for MVN Np(0,I), # 2, 3, 4 and 5 for (1 - eps) Np(0,I) + eps Np(0, 25 I) # 6, 7, 8 and 9 for multivariate t_d with d = 3, 5, 19 or dd # 10 for lognormal. # Can't get accurate counts if xtype = 9. set.seed(974) A <- sqrt(diag(1:p)) lscict <- 0*(1:p) shcict <- lscict sigsq <- 1 for(i in 1:nruns) { #make data x <- matrix(rnorm(n * p), nrow = n, ncol = p) if(xtype == 2) { zu <- runif(n) x[zu < 0.4, ] <- x[zu < 0.4, ] * 5 sigsq <- 10.6 } if(xtype == 3) { zu <- runif(n) x[zu < 0.6, ] <- x[zu < 0.6, ] * 5 sigsq <- 15.4 } if(xtype == 4) { zu <- runif(n) x[zu < 0.1, ] <- x[zu < 0.1, ] * 5 sigsq <- 3.4 } if(xtype == 5) { zu <- runif(n) x[zu < eps, ] <- x[zu < eps, ] * 5 sigsq <- 1 +eps*24 } if(xtype == 6) { zu <- sqrt(rchisq(n, 3)/3) x <- x/zu sigsq <- 3 } if(xtype == 7) { zu <- sqrt(rchisq(n, 5)/5) x <- x/zu sigsq <- 5/3 } if(xtype == 8) { zu <- sqrt(rchisq(n, 19)/19) x <- x/zu sigsq <- 19/17 } if(xtype == 9) { zu <- sqrt(rchisq(n, dd)/dd) x <- x/zu #sigsq <- dd/(dd-2) } if(xtype == 10){ x <- exp(x) sigsq <- exp(1)*(exp(1)-1) } x <- x %*% A out <- pcaboot(x,corr=corr,rob=rob,B=B) lam <- sigsq*(1:p) for(j in 1:p){ if(out$lscis[j,1] <= lam[p+1-j] & lam[p+1-j] <= out$lscis[j,2]) lscict[j] <- lscict[j] + 1 if(out$shorci[[j]]$shorth[1] <= lam[p+1-j] & lam[p+1-j] <= out$shorci[[j]]$shorth[2]) shcict[j] <- shcict[j]+1 } } covh <- cov(x) lscict <- lscict/nruns shcict <- shcict/nruns list(out=out,covh=covh,lscicv=lscict,shcicv=shcict) } pcasim<-function(n = 100, q = 4, nruns = 100, xtype = 1, corr = T, dd = 1, eps = 0.25, rot = T, rho = 0.99) {# Need n > 2q, q > 2. R users need to type library(MASS). # Need q > 2 (want at least 3 principal components). # If rot = F, multiply x by A where xtype = 1 for MVN Nq(0,I), # 2, 3, 4 and 5 (with delta = eps) for (1 - delta) Nq(0,I) + delta Nq(0, 25 I) # 6, 7, 8 and 9 for multivariate t_d with d = 3, 5, 19 or dd # 10 for lognormal. # If rot = T, multiply x by RR: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As rho gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. See Maronna and Zamar (2002). # Note: prcomp(x)$sd gives the vector of the square roots of # the eigenvalues of the dispersion or correlation matrix. # Note: prcomp$rot gives the matrix of eigenvectors. # Robust and classical eigenvectors sometimes have # very low absolute correlation. 
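##Hedged example for pcaboot above (uses the built-in USArrests data;
##needs shorth3, and rprcomp if rob = T):
# out <- pcaboot(USArrests, corr = T, rob = F, B = 200)
# out$lscis #rough 95% CIs for the eigenvalues of the correlation matrix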
#set.seed(974) RR <- matrix(rho,nrow=q,ncol=q) diag(RR) <- 1 corrs <- 1:nruns eig1 <- corrs eig2 <- eig1 eig3 <- eig1 pc1 <- eig1 pc2 <- pc1 pc3 <- pc1 ve1 <- eig1 ve2 <- ve1 ve3 <- ve1 rve1 <- ve1 rve2 <- ve1 rve3 <- ve1 vexpl <- 1:3 rvexpl <- 1:3 A <- sqrt(diag(1:q)) for(i in 1:nruns) { #make data x <- matrix(rnorm(n * q), nrow = n, ncol = q) if(xtype == 2) { zu <- runif(n) x[zu < 0.4, ] <- x[zu < 0.4, ] * 5 } if(xtype == 3) { zu <- runif(n) x[zu < 0.6, ] <- x[zu < 0.6, ] * 5 } if(xtype == 4) { zu <- runif(n) x[zu < 0.1, ] <- x[zu < 0.1, ] * 5 } if(xtype == 5) { zu <- runif(n) x[zu < eps, ] <- x[zu < eps, ] * 5 } if(xtype == 6) { zu <- sqrt(rchisq(n, 3)/3) x <- x/zu } if(xtype == 7) { zu <- sqrt(rchisq(n, 5)/5) x <- x/zu } if(xtype == 8) { zu <- sqrt(rchisq(n, 19)/19) x <- x/zu } if(xtype == 9) { zu <- sqrt(rchisq(n, dd)/dd) x <- x/zu } if(xtype == 10) x <- exp(x) if(rot == F) x <- x %*% A else x <- x %*% RR z <- prcomp(x,scale=corr) zz <- rprcomp(x,corr=corr) tem <- z$sd^2 temr <- zz$out$sd^2 corrs[i] <- cor(tem,temr) den <- sum(tem) ve1[i] <- tem[1]/den ve2[i] <- tem[2]/den ve3[i] <- tem[3]/den den <- sum(temr) rve1[i] <- temr[1]/den rve2[i] <- temr[2]/den rve3[i] <- temr[3]/den eig1[i] <- abs(cor(z$rot[,1],zz$out$rot[,1])) eig2[i] <- abs(cor(z$rot[,2],zz$out$rot[,2])) eig3[i] <- abs(cor(z$rot[,3],zz$out$rot[,3])) if(corr==F){ pc1[i] <- abs(cor(x%*%z$rot[,1],x%*%zz$out$rot[,1])) pc2[i] <- abs(cor(x%*%z$rot[,2],x%*%zz$out$rot[,2])) pc3[i] <- abs(cor(x%*%z$rot[,3],x%*%zz$out$rot[,3])) } else { tem <- 1/sqrt(diag(cov(x))) dinv <- diag(tem) w <- x%*%dinv pc1[i] <- abs(cor(w%*%z$rot[,1],w%*%zz$out$rot[,1])) pc2[i] <- abs(cor(w%*%z$rot[,2],w%*%zz$out$rot[,2])) pc3[i] <- abs(cor(w%*%z$rot[,3],w%*%zz$out$rot[,3])) } } mncor <- mean(corrs) vexpl[1] <- mean(ve1) vexpl[2] <- mean(ve2) vexpl[3] <- mean(ve3) rvexpl[1] <- mean(rve1) rvexpl[2] <- mean(rve2) rvexpl[3] <- mean(rve3) m1 <- mean(eig1) m2 <- mean(eig2) m3 <- mean(eig3) v1 <- sqrt(var(ve1)) v2 <- sqrt(var(ve2)) v3 <- sqrt(var(ve3)) sdc <- c(v1,v2,v3) rv1 <- sqrt(var(rve1)) rv2 <- sqrt(var(rve2)) rv3 <- sqrt(var(rve3)) sdr <- c(rv1,rv2,rv3) p1 <- mean(pc1) p2 <- mean(pc2) p3 <- mean(pc3) abscorpc <- c(p1,p2,p3) list(mncor = mncor, vexpl=vexpl, sdc=sdc, rvexpl=rvexpl, sdr=sdr, abscoreigv1=m1, abscoreigv2=m2, abscoreigv3=m3, abscorpc=abscorpc) } pcisim2<-function(nruns=100,n1=10,n2=10,mu1=0,mu2=0,var1=1,var2=1,dist=1,BB=100, alph=0.05){ #gets the pooled, modified pooled, and Welch 100 (1-alpha)% CIs for mu1 - mu2 #defaults are alpha = .05, lengths make the most sense if n1 = n2 #Also gets the bootstrap pooled CI and #three bootstrap CIs: shorth, prediction region method, Bickel and Ren. 
#dist = 1 for x ~ N(mu1, var1), y ~ N(mu2, var2) #dist = 2 for x ~ EXP with mean mu1 variance var1, # y ~ EXP with mean mu2 variance var2 #dist = 3 for x ~ N(mu1, var1), y EXP with mean mu2 variance var2 #calls shorth3 and bootci bup <- min((1 - alph/2), (1 - alph + 10*alph/BB)) if(alph > 0.1) bup <- min((1 - alph + 0.05), (1 - alph + 1/BB)) qB <- bup if(qB < 1 - alph + 0.001) bup <- 1 - alph indx1 <- 1:n1 indx2 <- 1:n2 diff <- mu1 - mu2 up <- 1 - alph/2 pdf <- n1 + n2 - 2 rhohat <- n1/(n1 + n2) pcov <- 0 mpcov <- 0 wcov <- 0 shcov <- 0 shlen <- 0 prcov <- 0 prlen <- 0 brcov <- 0 brlen <- 0 bpcov <- 0 bplen <- 0 plow <- 1:nruns pup <- plow mplow <- plow mpup <- plow wlow <- plow wup <- plow tpcut <- qt(up, pdf) mtpcut <- qt(up, n1 + n2 - 4) bdiff <- 1:BB for(i in 1:nruns) { if(dist == 1) { x <- mu1 + sqrt(var1) * rnorm(n1) y <- mu2 + sqrt(var2) * rnorm(n2) } if(dist == 2) { x <- mu1 - sqrt(var1) + sqrt(var1) * rexp(n1) y <- mu2 - sqrt(var2) + sqrt(var2) * rexp(n2) } if(dist == 3) { x <- mu1 + sqrt(var1) * rnorm(n1) y <- mu2 - sqrt(var2) + sqrt(var2) * rexp(n2) } xbar <- mean(x) s1sq <- var(x) ybar <- mean(y) s2sq <- var(y) xmybar <- xbar - ybar sp <- (n1 - 1) * s1sq + (n2 - 1) * s2sq sp <- sqrt(sp/pdf) val <- sp * sqrt(1/n1 + 1/n2) #get pooled CI den <- val temp <- tpcut * val plow[i] <- xmybar - temp pup[i] <- xmybar + temp if(plow[i] < diff && pup[i] > diff) pcov <- pcov + 1 #get modified pooled CI thetahat <- s2sq/s1sq tauhatsq <- (1 - rhohat + rhohat * thetahat)/(rhohat + (1 - rhohat) * thetahat) tauhat <- sqrt(tauhatsq) tem <- mtpcut * val * tauhat mplow[i] <- xmybar - tem mpup[i] <- xmybar + tem if(mplow[i] < diff && mpup[i] > diff) mpcov <- mpcov + 1 #get Welch CI t1 <- s1sq/n1 t2 <- s2sq/n2 do <- (t1 + t2)^2/(t1^2/(n1 - 1) + t2^2/(n2 - 1)) d <- max(1, floor(do)) wtcut <- qt(up, d) val <- sqrt(s1sq/n1 + s2sq/n2) tem <- wtcut * val wlow[i] <- xmybar - tem wup[i] <- xmybar + tem if(wlow[i] < diff && wup[i] > diff) wcov <- wcov + 1 for(j in 1:BB){ tem <- sample(indx1,n1,replace=T) mnx <- mean(x[tem]) tem <- sample(indx2,n2,replace=T) mny <- mean(y[tem]) bdiff[j] <- mnx - mny } #get shorth CI tem <- shorth3(bdiff,alpha=alph) if(diff >= tem$shorth[1] && diff <= tem$shorth[2]) shcov <- shcov + 1 shlen <- shlen + tem$shorth[2] - tem$shorth[1] #get prediction region CI tem<-bootci(bstat=bdiff,stat=xmybar,upc=bup) if(diff >= tem$prci[1] && diff <= tem$prci[2]) prcov <- prcov + 1 prlen <- prlen + tem$prcilen #get Bickel and Ren CI if(diff >= tem$brci[1] && diff <= tem$brci[2]) brcov <- brcov + 1 brlen <- brlen + tem$brcilen #get bootstrap pooled CI dsq <- ((bdiff - xmybar)/den)^2 Dhat <- sqrt(quantile(dsq,bup)) bplow <- xmybar - Dhat*den bpup <- xmybar + Dhat*den if(bplow < diff && bpup > diff) bpcov <- bpcov + 1 bplen <- bplen + bpup - bplow } pcov <- pcov/nruns plen <- sqrt(n1) * mean(pup - plow) mpcov <- mpcov/nruns mplen <- sqrt(n1) * mean(mpup - mplow) wcov <- wcov/nruns wlen <- sqrt(n1) * mean(wup - wlow) shcov <- shcov/nruns shlen <- sqrt(n1)*shlen/nruns prcov <- prcov/nruns prlen <- sqrt(n1)*prlen/nruns brcov <- brcov/nruns brlen <- sqrt(n1)*brlen/nruns bpcov <- bpcov/nruns bplen <- sqrt(n1)*bplen/nruns #print(mean(x)) #print(var(x)) #print(mean(y)) #print(var(y)) list(pcov=pcov,plen=plen,bpcov=bpcov,bplen=bplen,mpcov=mpcov,mplen=mplen, wcov=wcov,wlen=wlen,shcov=shcov,shlen=shlen,prcov=prcov,prlen=prlen, brcov=brcov,brlen=brlen) } pcrpisim<-function(n = 100, p = 4, k = 1, nruns = 100, eps = 0.1, shift = 9, psi=0.0, type = 1, alpha = 0.05){ #Needs library(pls). 
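##Hedged example for pcisim2 above (needs shorth3 and bootci loaded):
# out <- pcisim2(nruns = 100, n1 = 10, n2 = 10, var1 = 1, var2 = 4, dist = 1,
# BB = 100)
# out$pcov; out$mpcov; out$wcov #coverages should be near 0.95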
#Simulates PIs for principle components regression with 10 fold CV. # 1 <= k <= p-1, k is the number of nonnoise variables #Uses five iid error distributions: # type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) #errors. # constant = 1 so there are p = q+1 coefficients #need p > 1, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones, p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. See Maronna and Zamar (2002). # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. set.seed(974) corfac <- (1 + 15/n) * sqrt( (n+2*p)/(n - p) ) if (alpha > 0.1) {qn <- min(1 - alpha + 0.05, 1 - alpha + p/n)} if (alpha <= 0.1) {qn <- min(1 - alpha/2, 1 - alpha + 10*alpha*p/n)} pn <- qn if(pn < 1 - alpha + 0.001) qn <- 1 - alpha alphan <- 1 - qn ncvec<-1:nruns pilen <- 1:nruns opicov <- 0 q <- p-1 rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 b <- 0 * 1:q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] vars <- as.vector(1:(p-1)) for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- x %*% A xf <- rnorm(q) %*% A if(type == 1) { y <- 1 + x %*% b + rnorm(n) yf <- 1 + xf %*% b + rnorm(1) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) yf <- 1 + xf %*% b + rt(1, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 yf <- 1 + xf %*% b + rexp(1) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) yf <- 1 + xf %*% b + runif(1, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err ef <- rnorm(1, sd = 1 + rbinom(1, 1, eps) * shift) yf <- 1 + xf %*% b + ef } #make an MLR data set #find the PCR 10 fold CV estimator z <- as.data.frame(cbind(y,x)) zz <- rbind(z,c(0,xf)) out<-pcr(V1~.,data=z,scale=T,validation="CV") #If y is used instead of V1, predict does not work, #and nc tends to equal p, which should be impossible. tem<-MSEP(out) cvmse<-tem$val[,,1:out$ncomp][1,] nc <-which.min(cvmse) #using predict is rather difficult #if xf is used, predict does not work yfhat<-predict(out,zz[(n+1),-1],ncomp=nc)[1,1,1] ncvec[i] <- nc #get the number of components to use #if nc=p-1=q, same as OLS res <- out$residuals[,,nc] #get asymptotically optimal PI sres <- sort(res) cc <- ceiling(n * (1 - alphan)) rup <- sres[cc] rlow <- sres[1] olen <- rup - rlow if(cc < n) { for(j in (cc + 1):n) { zlen <- sres[j] - sres[j - cc + 1] if(zlen < olen) { olen <- zlen rup <- sres[j] rlow <- sres[j - cc + 1] } } } up <- yfhat + corfac*rup low <- yfhat + corfac*rlow pilen[i] <- up - low if(low < yf && up > yf) opicov <- opicov + 1 } pimnlen <- mean(pilen) opicov <- opicov/nruns qminncmn <- q - mean(ncvec) #If qminncmn is 0 then PCR is equivalent to OLS. list(qminncmn, opicov=opicov, pimenlen = pimnlen)} pifclean<-function(k, gam) { p <- floor(log(3/k)/log(1 - gam)) list(p = p) } plspisim<-function(n = 100, p = 4, k = 1, nruns = 100, eps = 0.1, shift = 9, psi=0.0, type = 1, alpha = 0.05){ #Needs library(pls). #Simulates PIs for partial least squares with 10 fold CV. 
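##Hedged reading of pifclean above: if K elemental starts are drawn from data
##with contamination fraction gam, p is roughly the largest subset size for
##which at least one clean subset is likely; the K here plays the role of the
##function's k argument.
# pifclean(k = 1000, gam = 0.5) #p = 8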
# 1 <= k <= p-1, k is the number of nonnoise variables #Uses five iid error distributions: # type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) #errors. # constant = 1 so there are p = q+1 coefficients #need p > 1, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones, p-k-1 zeroes # Multiply x by A: for MVN data this results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. See Maronna and Zamar (2002). # cor(X_i,X_j) = [2 psi +(q-2)psi^2]/[1 + (q-1)psi^2], i not = j # when the correlation exists. set.seed(974) corfac <- (1 + 15/n) * sqrt( (n+2*p)/(n - p) ) if (alpha > 0.1) {qn <- min(1 - alpha + 0.05, 1 - alpha + p/n)} if (alpha <= 0.1) {qn <- min(1 - alpha/2, 1 - alpha + 10*alpha*p/n)} pn <- qn if(pn < 1 - alpha + 0.001) qn <- 1 - alpha alphan <- 1 - qn ncvec<-1:nruns pilen <- 1:nruns opicov <- 0 q <- p-1 rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2) A <- matrix(psi,nrow=q,ncol=q) diag(A) <- 1 b <- 0 * 1:q b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1] vars <- as.vector(1:(p-1)) for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- x %*% A xf <- rnorm(q) %*% A if(type == 1) { y <- 1 + x %*% b + rnorm(n) yf <- 1 + xf %*% b + rnorm(1) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) yf <- 1 + xf %*% b + rt(1, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 yf <- 1 + xf %*% b + rexp(1) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) yf <- 1 + xf %*% b + runif(1, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err ef <- rnorm(1, sd = 1 + rbinom(1, 1, eps) * shift) yf <- 1 + xf %*% b + ef } #make an MLR data set #find the PLS 10 fold CV estimator z <- as.data.frame(cbind(y,x)) zz <- rbind(z,c(0,xf)) out<-plsr(V1~.,data=z,scale=T,validation="CV") #If y is used instead of V1, predict does not work, #and nc tends to equal p, which should be impossible. tem<-MSEP(out) cvmse<-tem$val[,,1:out$ncomp][1,] nc <-which.min(cvmse) #using predict is rather difficult #if xf is used, predict does not work yfhat<-predict(out,zz[(n+1),-1],ncomp=nc)[1,1,1] ncvec[i] <- nc #get the number of components to use #if nc=p-1=q, same as OLS res <- out$residuals[,,nc] #get asymptotically optimal PI sres <- sort(res) cc <- ceiling(n * (1 - alphan)) rup <- sres[cc] rlow <- sres[1] olen <- rup - rlow if(cc < n) { for(j in (cc + 1):n) { zlen <- sres[j] - sres[j - cc + 1] if(zlen < olen) { olen <- zlen rup <- sres[j] rlow <- sres[j - cc + 1] } } } up <- yfhat + corfac*rup low <- yfhat + corfac*rlow pilen[i] <- up - low if(low < yf && up > yf) opicov <- opicov + 1 } pimnlen <- mean(pilen) opicov <- opicov/nruns qminncmn <- q - mean(ncvec) #If qminncmn is 0 then PLS is equivalent to OLS. list(qminncmn, opicov=opicov, pimenlen = pimnlen)} predreg<-function(x, alpha = 0.05){ # Makes a prediction region for the rows of x. # If p = 1, the shorth interval should work better. #Also computes the distance for the 0 vector #to check if the 0 vector is in the prediction region. 
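##Hedged example for plspisim above (needs library(pls); values illustrative):
# library(pls)
# plspisim(n = 100, p = 10, k = 3, nruns = 100, type = 1, alpha = 0.05)
##opicov should be near 0.95; qminncmn near 0 means PLS used about q
##components, making the fit close to OLS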
x <- as.matrix(x)
p <- dim(x)[2]
n <- dim(x)[1]
inr<-0
zero <- 0*(1:p)
up <- min((1 - alpha/2), (1 - alpha + 10*alpha*p/n))
if(alpha > 0.1) up <- min((1 - alpha + 0.05), (1 - alpha + p/n))
qn <- up
if(qn < 1 - alpha + 0.001) up <- 1 - alpha
center <- apply(x, 2, mean)
cov <- var(x)
md2 <- mahalanobis(x, center, cov)
# MD is the classical distance
MD <- sqrt(md2)
#get nonparametric prediction region boundary
cuplim <- sqrt(quantile(md2, up))
D0 <- sqrt(mahalanobis(zero, center, cov))
if(D0 <= cuplim) inr <- 1
list(MD=MD, center=center,cov=cov,cuplim = cuplim, D0=D0, inr=inr)
}
predrgn <-function(x, xf, alpha = 0.05){
# Makes a prediction region for xf when cases are rows of x.
# If p = 1, the shorth interval should work better.
x <- as.matrix(x)
p <- dim(x)[2]
n <- dim(x)[1]
inr <- 0
up <- min((1 - alpha/2), (1 - alpha + 10*alpha*p/n))
if(alpha > 0.1) up <- min((1 - alpha + 0.05), (1 - alpha + p/n))
qn <- up
if(qn < 1 - alpha + 0.001) up <- 1 - alpha
center <- apply(x, 2, mean)
cov <- var(x)
md2 <- mahalanobis(x, center, cov)
# md2 is the classical squared Mahalanobis distance
#get nonparametric prediction region boundary
cuplim <- quantile(md2, up)
#inr <- 1 if xf is in the prediction region
xfd <- mahalanobis(xf, center, cov)
if(xfd <= cuplim) inr <- 1
list(md2=md2, center=center,cov=cov, cuplim = cuplim, xfd=xfd, inr=inr)
}
predsim<-function(n = 100, p = 4, nruns = 100, xtype = 1, dd = 1, eps = 0.25,
alpha = 0.1)
{# MAY NOT WORK IF p = 1.
# Gets coverages of nonparametric, semiparametric and
# parametric MVN prediction regions.
# Multiply x by A where xtype = 1 for MVN Np(0,I),
# 2, 3, 4 and 10 (with delta = eps) for (1-delta) Np(0,I) + delta Np(0, 25 I)
# 5 for lognormal,
# 6, 7, 8 and 9 for multivariate t_d with d = 3, 5, 19 or dd
# mahalanobis gives squared Maha distances
# set.seed(974)
ccvr <- 0
scvr <- 0
rcvr <- 0
volc <- 1:nruns
vols <- volc
volr <- volc
#up <- 1 - alpha
up <- min((1 - alpha/2), (1 - alpha + 10*alpha*p/n))
if(alpha > 0.1) up <- min((1 - alpha + 0.05), (1 - alpha + p/n))
qn <- up
if(qn < 1 - alpha + 0.001) up <- 1 - alpha
A <- sqrt(diag(1:p))
m <- n + 1
for(i in 1:nruns) {
#make data
x <- matrix(rnorm(m * p), nrow = m, ncol = p)
if(xtype == 2) {
zu <- runif(m)
x[zu < 0.4, ] <- x[zu < 0.4, ] * 5 }
if(xtype == 3) {
zu <- runif(m)
x[zu < 0.6, ] <- x[zu < 0.6, ] * 5 }
if(xtype == 4) {
zu <- runif(m)
x[zu < 0.1, ] <- x[zu < 0.1, ] * 5 }
if(xtype == 5) x <- exp(x)
if(xtype == 6) {
zu <- sqrt(rchisq(m, 3)/3)
x <- x/zu }
if(xtype == 7) {
zu <- sqrt(rchisq(m, 5)/5)
x <- x/zu }
if(xtype == 8) {
zu <- sqrt(rchisq(m, 19)/19)
x <- x/zu }
if(xtype == 9) {
zu <- sqrt(rchisq(m, dd)/dd)
x <- x/zu }
if(xtype == 10) {
zu <- runif(m)
x[zu < eps, ] <- x[zu < eps, ] * 5 }
x <- x %*% A
xx <- x[ - m, ]
center <- apply(xx, 2, mean)
cov <- var(xx)
md2 <- mahalanobis(xx, center, cov)
hsq <- quantile(md2, up)
if(mahalanobis(t(x[m, ]), center, cov) <= hsq) ccvr <- ccvr + 1
volc[i] <- sqrt(hsq)^p * prod(diag(chol(cov)))
out <- covrmvn(xx)
center <- out$center
cov <- out$cov
md2 <- mahalanobis(xx, center, cov)
hsq <- quantile(md2, up)
dsq <- mahalanobis(t(x[m, ]), center, cov)
if(dsq <= hsq) scvr <- scvr + 1
sqrtdet <- prod(diag(chol(cov)))
vols[i] <- sqrt(hsq)^p * sqrtdet
hsq <- qchisq(up, p)
if(dsq <= hsq) rcvr <- rcvr + 1
volr[i] <- sqrt(hsq)^p * sqrtdet
}
ccvr <- ccvr/nruns
scvr <- scvr/nruns
rcvr <- rcvr/nruns
#get a measure of efficiency wrt vols, so eff(vols) = 1
vols <- mean(vols)
volc <- mean(volc)/vols
volr <- mean(volr)/vols
vols <- 1
list(ncvr = ccvr, scvr = scvr, mcvr = rcvr, voln =
volc, vols = vols, volm = volr, up = up) } rcancor<-function(x, y, csteps = 5, locc = 0.5, cent = T) {# This does robust canonical correlations analysis. x <- as.matrix(x) y <- as.matrix(y) px <- dim(x)[2] w <- cbind(x,y) p <- dim(w)[2] n <- dim(w)[1] up <- qchisq(0.975, p) qchi <- qchisq(0.5, p) ##get the DGK estimator covs <- var(w) mns <- apply(w, 2, mean) ## concentrate for(i in 1:csteps) { md2 <- mahalanobis(w, mns, covs) medd2 <- median(md2) mns <- apply(w[md2 <= medd2, ], 2, mean) covs <- var(w[md2 <= medd2, ]) } covd <- covs mnd <- mns ##get the MB estimator covv <- diag(p) med <- apply(w, 2, median) md2 <- mahalanobis(w, center = med, covv) medd2 <- median(md2)##get the location criterion cutoff lcut <- medd2 if(locc != 0.5) lcut <- quantile(md2,locc) ## get the start mns <- apply(w[md2 <= medd2, ], 2, mean) covs <- var(w[md2 <= medd2, ]) ## concentrate for(i in 1:csteps) { md2 <- mahalanobis(w, mns, covs) medd2 <- median(md2) mns <- apply(w[md2 <= medd2, ], 2, mean) covs <- var(w[md2 <= medd2, ]) } covm <- covs mnm <- mns ##get FCH attractor covf <- covm mnf <- mnm val2 <- mahalanobis(t(mnd), med, covv) if(val2 < lcut) { ##crit = square root of det(cov) critd <- prod(diag(chol(covd))) critm <- prod(diag(chol(covm))) if(critd < critm) { covf <- covd mnf <- mnd } } ## get the FCH estimator rd2 <- mahalanobis(w, mnf, covf) const <- median(rd2)/qchi covf <- const * covf ##reweight the above FCH estimator (mnf,covf) rd2 <- mahalanobis(w, mnf, covf) rmnmvn <- apply(w[rd2 <= up, ], 2, mean) rcovmvn <- var(w[rd2 <= up, ]) d1 <- sum(rd2 <= up) rd2 <- mahalanobis(w, rmnmvn, rcovmvn) qchi2 <- (0.5 * 0.975 * n)/d1 qchi2 <- min(qchi2, 0.995) const <- median(rd2)/qchisq(qchi2, p) rcovmvn <- const * rcovmvn ##now get the subset of data z used to construct the RMVN estimator rd2 <- mahalanobis(w, rmnmvn, rcovmvn) z <- w[rd2 <= up, ] zx <- z[,1:px] zy <- z[,(px+1):p] out <- cancor(zx, zy, xcenter = cent, ycenter = cent) list(out = out) } rcovsim<-function(n = 100, p = 2, steps = 5, gam = 0.4, runs = 20, outliers = T) {# Need p > 1. # This function demonstrates that covmba estimates mu # but is slightly biased for sigma if n is small. The function rmba # is better for small n. Similar results hold for covfch and rfch. # R users need to type "library(MASS)." 
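##Hedged example for rcancor above (data illustrative):
# x <- matrix(rnorm(300), ncol = 3)
# y <- x + matrix(rnorm(300), ncol = 3)
# rcancor(x, y)$out$cor #canonical correlations from the RMVN-cleaned cases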
A <- sqrt(diag(1:p)) cloc <- 0 * (1:p) csig <- 0 * A mbaloc <- cloc mbasig <- csig rmbaloc <- cloc rmbasig <- csig fmcdloc <- cloc fmcdsig <- csig fchloc <- cloc fchsig <- csig rfchloc <- cloc rfchsig <- csig for(i in 1:runs) { x <- matrix(rnorm(n * p), ncol = p, nrow = n) # code below would give mean = (10, ..., 10)^T to the outliers # if(outliers == T) { # val <- floor(gam * n) # tem <- 10 + 0 * 1:p # x <- x %*% A # x[1:val, ] <- x[1:val, ] + # tem # } # else { # x <- x %*% A # } ## code below: outliers have mean (10, 10 sqrt(2), ..., 10 sqrt(p))^T if(outliers == T) { val <- floor(gam * n) tem <- 10 + 0 * 1:p x[1:val, ] <- x[1:val, ] + tem } x <- x %*% A out <- covmba(x, csteps = steps) mbaloc <- mbaloc + out$center mbasig <- mbasig + out$cov out <- rmba(x) rmbaloc <- rmbaloc + out$center rmbasig <- rmbasig + out$cov cloc <- cloc + apply(x, 2, mean) csig <- csig + var(x) out <- cov.mcd(x) fmcdloc <- fmcdloc + out$center fmcdsig <- fmcdsig + out$cov out <- covfch(x, csteps = steps) fchloc <- fchloc + out$center fchsig <- fchsig + out$cov rfchloc <- rfchloc + out$rmnf rfchsig <- rfchsig + out$rcovf } mbaloc <- mbaloc/runs mbasig <- mbasig/runs rmbaloc <- rmbaloc/runs rmbasig <- rmbasig/runs cloc <- cloc/runs csig <- csig/runs fmcdloc <- fmcdloc/runs fmcdsig <- fmcdsig/runs fchloc <- fchloc/runs fchsig <- fchsig/runs rfchloc <- rfchloc/runs rfchsig <- rfchsig/runs list(mbaloc = mbaloc, rmbaloc = rmbaloc, fchloc = fchloc, rfchloc = rfchloc, cloc = cloc, fmcdloc = fmcdloc, mbasig = mbasig, rmbasig = rmbasig, fchsig = fchsig, rfchsig = rfchsig, csig = csig, fmcdsig = fmcdsig) } regboot<-function(x,y, B = 1000){ #bootstrap with residuals for MLR #asymptotically the same as MSE (X'X)^(-1), so use to compare vselboot #out <- regboot(belx,bely) #ddplot4(out$betas) #this does not work for vselboot: too many 0s #mplot(out$betas) #cov(out$betas) full <- lsfit(x,y) res <- full$resid fit <- y - res n <- length(y) x <- as.matrix(x) p <- dim(x)[2]+1 betas <- matrix(0,nrow=B,ncol=p) for(i in 1:B){ yb <- fit + sample(res,n,replace=T) betas[i,] <- lsfit(x,yb)$coef } list(betas=betas) } regbootsim<-function(n = 100, p = 4, nruns = 100, eps = 0.1, shift = 9, type = 1, alph = 0.05){ #Simulates residual bootstrap for regression. #Uses five iid error distributions: # type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) errors. 
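##Hedged example for regboot above (data illustrative):
# x <- matrix(rnorm(300), ncol = 3)
# y <- 1 + x %*% c(1, 1, 0) + rnorm(100)
# out <- regboot(x, y, B = 200)
# apply(out$betas, 2, sd) #bootstrap SEs, roughly ls.diag(lsfit(x,y))$std.err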
# constant = 1 so there are p = q+1 coefficients #need p > 1, beta = (1, 1, ..., 1, 0, ..., 0) with k ones, p-k zeroes q <- p-1 k <- floor(p/2) b <- 0 * 1:q b[1:(k-1)] <- 1 #b[1:0] acts like b[1:1] = b[1] beta <- c(1,b) pp1 <- p + 1 cicov <- 0*(1:pp1) avelen <- 0*(1:pp1) for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) if(type == 1) { y <- 1 + x %*% b + rnorm(n) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err } #make an MLR data set out <-regboot(x,y) #residual bootstrap for (j in 1:p){ tem <- shorth3(out$betas[,j],alpha=alph) if(beta[j] >= tem$shorth[1] && beta[j] <= tem$shorth[2]) #tem <- locpi2(out$betas[,j],alpha=alph) #if(beta[j] >= tem$LOCPI[1] && beta[j] <= tem$LOCPI[2]) cicov[j] <- cicov[j] + 1 avelen[j] <- avelen[j] + tem$shorth[2] - tem$shorth[1] #avelen[j] <- avelen[j] + tem$LOCPI[2] - tem$LOCPI[1] } #test whether the last p - k values of beta are 0 tem <- predreg(out$betas[,(k+1):p],alpha=alph) if(tem$D0 <= tem$cuplim) cicov[pp1] <- cicov[pp1] + 1 avelen[pp1] <- avelen[pp1] + tem$cuplim } cicov <- cicov/nruns avelen <- avelen/nruns list(cicov=cicov,avelen=avelen,beta=beta,k=k)} rhotboot<-function(x, B=1000, med=T){ #Bootstraps RMVN center (med=F) or coordinatewise median. # Need p > 1. x <- as.matrix(x) n <- dim(x)[1] p <- dim(x)[2] indx <- 1:n munot <- 0 * (1:p) mus <- matrix(0,nrow=B,ncol=p) #get Hotelling's T^2 statistic for munot = 0 #covs <- var(x) #mns <- apply(x, 2, mean) #loc <- mns - munot #hot <- n * t(loc) %*% solve(covs) %*% loc #get robust statistic for munot = 0 #out <- covrmvn(x) #covs <- out$cov #mns <- out$center #loc <- mns - munot #rhot <- n * t(loc) %*% solve(covs) %*% loc #get bootstrap sample of the robust estimator of center if(med==T){ for(j in 1:B){ tem <- sample(indx,n,replace=T) mus[j,] <- apply(x[tem,],2,median) } } else{ for(j in 1:B){ tem <- sample(indx,n,replace=T) mus[j,] <- covrmvn(x[tem,])$center } } list(mus=mus) } rhot2boot<-function(x, y, B=1000, med=T){ #Bootstraps RMVN or coord median 2 sample Hotelling Type T^2 test. # Need p > 1. x <- as.matrix(x) nx <- dim(x)[1] p <- dim(x)[2] y <- as.matrix(y) ny <- dim(y)[1] indx <- 1:nx indy <- 1:ny mus <- matrix(0,nrow=B,ncol=p) #get bootstrap sample of the robust statistic Tx - Ty if(med == T){ for(j in 1:B){ tem <- sample(indx,nx,replace=T) Tx <- apply(x[tem,],2,median) tem <- sample(indy,ny,replace=T) Ty <- apply(y[tem,],2,median) mus[j,] <- Tx-Ty } } else{ for(j in 1:B){ tem <- sample(indx,nx,replace=T) Tx <- covrmvn(x[tem,])$center tem <- sample(indy,ny,replace=T) Ty <- covrmvn(y[tem,])$center mus[j,] <- Tx-Ty } } list(mus=mus) } rhotsim<-function(n = 100,p = 2,csteps = 5,gam = 0.4,runs = 100,outliers = 0,pm = 10,delta = 0) {# Need p > 1. # This R function simulates an ad hod robust Hotelling's T^2 test. # outliers = 0 for no outliers and X~N(0,diag(1,...,p)), # 1 for outliers a tight cluster at major axis (0,...,0,pm)' # 2 for outliers a tight cluster at minor axis (pm,0, ...,0)' # 3 for outliers X~N((pm,...,pm)',diag(1,...,p)) # 4 for outliers X[i,p] = pm # 5 for outliers X[i,1] = pm # Power can be estimated by increasing delta so mu = delta(1,...,1) # and mu_o = 0*mu. # For outliers=0, want hquant and rquant approx 1. 
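##Hedged example for rhot2boot above: test Ho: mu_x = mu_y by checking whether
##0 is in a prediction region for the bootstrapped differences (needs predreg,
##and covrmvn if med = F; data illustrative):
# x <- matrix(rnorm(200), ncol = 2)
# y <- matrix(rnorm(200), ncol = 2) + 1
# out <- rhot2boot(x, y, B = 200, med = T)
# predreg(out$mus, alpha = 0.05)$inr #0 suggests rejecting Ho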
A <- sqrt(diag(1:p)) up <- ((n - 1) * p * qf(0.95, df1 = p, df2 = (n - p)))/(n - p) rup <- (1.04 + 0.12/p + (40+p)/n) * up hot <- 1:runs rhot <- hot munot <- 0 * (1:p) mu <- delta * (1 + munot) val <- floor(gam * n) for(i in 1:runs) { x <- matrix(rnorm(n * p), ncol = p, nrow = n) x <- x %*% A if(outliers == 1) { x[1:val, ] <- matrix(rnorm(val * p, sd = 0.01), ncol = p, nrow = val) x[1:val, p] <- x[1:val, p] + pm } if(outliers == 2) { x[1:val, ] <- matrix(rnorm(val * p, sd = 0.01), ncol = p, nrow = val) x[1:val, 1] <- x[1:val, 1] + pm } if(outliers == 3) { tem <- pm + 0 * 1:p x[1:val, ] <- x[1:val, ] + tem } if(outliers == 4) { x[1:val, p] <- pm } if(outliers == 5) { x[1:val, 1] <- pm } x <- mu + x #get Hotelling's T^2 statistic covs <- var(x) mns <- apply(x, 2, mean) loc <- mns - munot hot[i] <- n * t(loc) %*% solve(covs) %*% loc #get robust statistic out <- covrmvn(x) covs <- out$cov mns <- out$center loc <- mns - munot rhot[i] <- n * t(loc) %*% solve(covs) %*% loc } hcvr <- sum(hot > up)/runs rhcvr <- sum(rhot > rup)/runs hquant <- quantile(hot, probs = 0.95)/up rquant <- quantile(rhot, probs = 0.95)/rup #plot of scaled rhot vs hot should scatter about the identity line #rhot <- rhot/(1.04 + 0.12/p + (40+p)/n) list(hot = hot, rhot = rhot, hcv = hcvr, rhcv = rhcvr, hquant = hquant, rquant = rquant) } rhotsim2<-function(n = 100, p = 2, csteps = 5, B= 1000, gam = 0.4, runs = 100, outliers = 0, pm = 10, delta = 0){ # This R function simulates robust Hotelling's T^2 test based on the #bootstrap. ########### Pretty slow. # Need p > 1. # outliers = 0 for no outliers and X~N(0,diag(1,...,p)), # 1 for outliers a tight cluster at major axis (0,...,0,pm)' # 2 for outliers a tight cluster at minor axis (pm,0, ...,0)' # 3 for outliers X~N((pm,...,pm)',diag(1,...,p)) # 4 for outliers X[i,p] = pm # 5 for outliers X[i,1] = pm # Power can be estimated by increasing delta so mu = delta(1,...,1) # and mu_o = 0*mu. # For outliers=0, want hquant and rquant approx 1. 
A <- sqrt(diag(1:p)) up <- ((n - 1) * p * qf(0.95, df1 = p, df2 = (n - p)))/(n - p) rup <- (1.04 + 0.12/p + (40+p)/n) * up hot <- 1:runs rhot <- hot munot <- 0 * (1:p) mu <- delta * (1 + munot) val <- floor(gam * n) indx <- 1:n mus <- matrix(0,nrow=B,ncol=p) rcv <- 0 avelen <- 0 for(i in 1:runs) { x <- matrix(rnorm(n * p), ncol = p, nrow = n) x <- x %*% A if(outliers == 1) { x[1:val, ] <- matrix(rnorm(val * p, sd = 0.01), ncol = p, nrow = val) x[1:val, p] <- x[1:val, p] + pm } if(outliers == 2) { x[1:val, ] <- matrix(rnorm(val * p, sd = 0.01), ncol = p, nrow = val) x[1:val, 1] <- x[1:val, 1] + pm } if(outliers == 3) { tem <- pm + 0 * 1:p x[1:val, ] <- x[1:val, ] + tem } if(outliers == 4) { x[1:val, p] <- pm } if(outliers == 5) { x[1:val, 1] <- pm } x <- mu + x #get Hotelling's T^2 statistic covs <- var(x) mns <- apply(x, 2, mean) loc <- mns - munot hot[i] <- n * t(loc) %*% solve(covs) %*% loc #get robust statistic out <- covrmvn(x) covs <- out$cov mns <- out$center loc <- mns - munot rhot[i] <- n * t(loc) %*% solve(covs) %*% loc #do the prediction region bootstrap test for(j in 1:B){ tem <- sample(indx,n,replace=T) mus[j,] <- covrmvn(x[tem,])$center } temp <- predreg(mus,alpha=0.05) if(temp$D0 > temp$cuplim) rcv <- rcv + 1 avelen <- avelen + temp$cuplim } hcvr <- sum(hot > up)/runs rhcvr <- sum(rhot > rup)/runs rcv <- rcv/runs avelen <- avelen/runs hquant <- quantile(hot, probs = 0.95)/up rquant <- quantile(rhot, probs = 0.95)/rup #plot of scaled rhot vs hot should scatter about the identity line #rhot <- rhot/(1.04 + 0.12/p + (40+p)/n) list(hot = hot, rhot = rhot, hcv = hcvr, rhcv = rhcvr, hquant = hquant, rquant = rquant, bootcv = rcv, bootlen = avelen) } rmaha<-function(x) {# Need p > 1 or x a matrix. # Produces robust Mahalanobis distances (scaled for normal data). 
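##Hedged example (needs library(MASS) for cov.mcd; data illustrative):
# library(MASS)
# x <- matrix(rnorm(200), ncol = 2)
# rd <- rmaha(x)
# plot(rd); abline(h = sqrt(qchisq(0.975, 2))) #flag cases with large distances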
p <- dim(x)[2]
out <- cov.mcd(x)
center <- out$center
cov <- out$cov
rd <- sqrt(mahalanobis(x, center, cov))
#scale so the median distance matches sqrt(qchisq(0.5, p)) for MVN data,
#as in mplot
const <- sqrt(qchisq(0.5, p))/median(rd)
return(const * rd)
}
rmba<-function(x, csteps = 5)
{# gets the reweighted MBA estimator, works for p = 1
zx <- x
x <- as.matrix(x)
p <- dim(x)[2]
##get the DGK estimator
covs <- var(x)
mns <- apply(x, 2, mean)
## concentrate
for(i in 1:csteps) {
md2 <- mahalanobis(x, mns, covs)
medd2 <- median(md2)
if(p > 1) {
mns <- apply(x[md2 <= medd2, ], 2, mean) }
if(p == 1) {
mns <- mean(zx[md2 <= medd2]) }
covs <- var(x[md2 <= medd2, ])
}
covb <- covs
mnb <- mns
##get the square root of det(covb)
critb <- prod(diag(chol(covb)))
##get the MB estimator
covv <- diag(p)
med <- apply(x, 2, median)
md2 <- mahalanobis(x, center = med, covv)
medd2 <- median(md2)
## get the start
if(p > 1) {
mns <- apply(x[md2 <= medd2, ], 2, mean) }
if(p == 1) {
mns <- mean(zx[md2 <= medd2]) }
covs <- var(x[md2 <= medd2, ])
## concentrate
for(i in 1:csteps) {
md2 <- mahalanobis(x, mns, covs)
medd2 <- median(md2)
if(p > 1) {
mns <- apply(x[md2 <= medd2, ], 2, mean) }
if(p == 1) {
mns <- mean(zx[md2 <= medd2]) }
covs <- var(x[md2 <= medd2, ])
}
crit <- prod(diag(chol(covs)))
if(crit < critb) {
critb <- crit
covb <- covs
mnb <- mns
}
##scale for better performance at MVN
rd2 <- mahalanobis(x, mnb, covb)
const <- median(rd2)/(qchisq(0.5, p))
covb <- const * covb
##reweight the above MBA estimator (mnb,covb) for efficiency
rd2 <- mahalanobis(x, mnb, covb)
up <- qchisq(0.975, p)
if(p > 1) {
rmnb <- apply(x[rd2 <= up, ], 2, mean) }
if(p == 1){
rmnb = mean(zx[rd2 <= up]) }
rcovb <- var(x[rd2 <= up, ])
rd2 <- mahalanobis(x, rmnb, rcovb)
const <- median(rd2)/(qchisq(0.5, p))
rcovb <- const * rcovb
## reweight again
rd2 <- mahalanobis(x, rmnb, rcovb)
up <- qchisq(0.975, p)
if(p > 1){
rmnb <- apply(x[rd2 <= up, ], 2, mean) }
if(p == 1){
rmnb = mean(zx[rd2 <= up]) }
rcovb <- var(x[rd2 <= up, ])
rd2 <- mahalanobis(x, rmnb, rcovb)
const <- median(rd2)/(qchisq(0.5, p))
rcovb <- const * rcovb
list(center = rmnb, cov = rcovb)
}
rmbsim <-function(n = 100, p = 2, csteps = 5, gam = 0.4, runs = 20,
outliers = 0, pm = 10){
# This R function simulates the MB and RMB estimators.
# outliers = 0 for no outliers and X~N(0,diag(1,...,p)), # 1 for outliers a point mass on major axis (0,...,0,pm)' # 2 for outliers a point mass on minor axis (pm,0, ...,0)' # 3 for outliers X~N((pm,...,pm)',diag(1,...,p)) # 4 for outliers X[i,p] = pm # 5 for outliers X[i,1] = pm # 6 for outliers a tight cluster at major axis (0,...,0,pm)' #set.seed(974) A <- sqrt(diag(1:p)) cloc <- 0 * (1:p) csig <- 0 * A mbloc <- cloc mbsig <- csig rmbloc <- cloc rmbsig <- csig cct <- 0 mbct <- 0 rmbct <- 0 val <- floor(gam * n) for(i in 1:runs) { x <- matrix(rnorm(n * p), ncol = p, nrow = n) x <- x %*% A if(outliers == 1) { x[1:val, ] <- 0 x[1:val,p] <- pm } if(outliers == 2) { x[1:val, ] <- 0 x[1:val,1] <- pm } if(outliers == 3) { tem <- pm + 0 * 1:p x[1:val, ] <- x[1:val, ] + tem } if(outliers == 4) { x[1:val, p ] <- pm } if(outliers == 5) { x[1:val, 1 ] <- pm } if(outliers == 6) { x[1:val, ] <- matrix(rnorm(val * p, sd = 0.01), ncol = p, nrow = val) x[1:val, p] <- x[1:val, p] + pm } out <- covrmb(x, csteps = csteps) rmbloc <- rmbloc + out$center rmbsig <- rmbsig + out$cov rd2 <- mahalanobis(x, out$center, out$cov) if(min(rd2[1:val]) > max(rd2[(val + 1):n])) rmbct <- rmbct + 1 mbloc <- mbloc + out$mnm mbsig <- mbsig + out$covm rd2 <- mahalanobis(x, out$mnm, out$covm) if(min(rd2[1:val]) > max(rd2[(val + 1):n])) mbct <- mbct + 1 cmn <- apply(x,2,mean) cloc <- cloc + cmn ccov <- var(x) csig <- csig + ccov rd2 <- mahalanobis(x, cmn, ccov) if(min(rd2[1:val]) > max(rd2[(val + 1):n])) cct <- cct + 1 } cloc <- cloc/runs csig <- csig/runs mbloc <- mbloc/runs mbsig <- mbsig/runs rmbloc <- rmbloc/runs rmbsig <- rmbsig/runs list(cloc = cloc, mbloc = mbloc, rmbloc = rmbloc, csig = csig, mbsig=mbsig, rmbsig=rmbsig, cct = cct, mbct=mbct, rmbct=rmbct) } rmltreg<-function(x, y,indices=c(1,2)){ ##Need p > 1, m > 1. # Does robust multivariate linear regression. # Advance the plot by highlighting Stop with the right mouse button. # Want n > mp and n > 10p for Hotelling Lawley pvalues. # The indices are the variables to be left out of the # reduced model for the MANOVA partial F test. x <- as.matrix(x) y <- as.matrix(y) n <- dim(x)[1] q <- dim(x)[2] p <- q+1 m <- dim(y)[2] r <- length(indices) #Get L for the MANOVA partial F test L <- matrix(0,nrow=r,ncol=p) for(i in 1:r) L[i,indices[i]] <- 1 res <- matrix(nrow = n, ncol = m, 0) fit <- res Bhat <- matrix(nrow = p, ncol = m, 0) # q + 1 = p is number of predictors including intercept cmar <- par("mar") par(mfrow = c(2, 1)) par(mar=c(4.0,4.0,2.0,0.5)) for(i in 1:m){ out <- hbreg(x,y[,i]) res[,i] <- y[,i] - out$coef[1] - x %*% out$coef[-1] fit[,i] <- y[,i] - res[,i] Bhat[,i] <- out$coef } for(i in 1:m){ plot(fit[,i],y[,i]) abline(0, 1) title("Response Plot") identify(fit[,i],y[,i]) plot(fit[,i], res[,i]) title("Residual Plot") identify(fit[,i], res[,i]) } par(mfrow = c(1, 1)) par(mar=cmar) Covhat <- (n-1)/(n-p) * var(res) #Get pvalues for testing whether jth predictor is #needed in the model given the other predictors are in the model. 
one <- 1 + 0*1:n
w <- cbind(one,x)
pvals <- 0*1:p
Fj <- pvals
J <- t(w)%*%w
J <- solve(J)
Covinv <- solve(Covhat)
dendf <- n - m*p
if( n <= m*p) dendf <- 1
for(j in 1:p){
T <- t(Bhat[j,])%*%Covinv%*%Bhat[j,]/J[j,j]
pvals[j] <- 1 - pf((T/m),m,dendf)
Fj[j] <- T/m
}
#get the MANOVA F test statistics for whether nontrivial
#predictors are needed in the model
D <- as.matrix(Bhat[-1,])
if(p == 2) D <- t(D)
tem <- as.matrix(J[-1,-1])
teminv <- solve(tem)
Weinv <- Covinv/(n-p)
H <- t(D)%*%teminv%*%D
C <- Weinv%*%H
eig <- eigen(C,symmetric=FALSE,only.values=TRUE)$values
eig <- as.double(eig)
u <- sum(eig)
MANOVAF <- (n-p)*u/((p-1)*m)
pval <- 1 - pf(MANOVAF,(p-1)*m,dendf)
MANOVA <- cbind(MANOVAF, pval)
#got MANOVA F test summaries
#Get pval for MANOVA partial F test for whether the
#predictors given by indices are needed in the model.
tem <- L%*%J%*%t(L)
teminv <- solve(tem)
H <- t(L%*%Bhat)%*%teminv%*%(L%*%Bhat)
C <- Weinv%*%H
eig <- eigen(C,symmetric=FALSE,only.values=TRUE)$values
eig <- as.double(eig)
u <- sum(eig)
partialF <- (n-p)*u/(r*m)
Pval <- 1 - pf(partialF,r*m,dendf)
partial <- cbind(partialF, Pval)
##got partial F test summaries
Ftable <- cbind(Fj,pvals)
list(fit = fit, res = res, Covhat = Covhat, Bhat = Bhat, partial=partial,
Ftable=Ftable, MANOVA=MANOVA)
}
rmpredsim<-function(n = 100, m = 2, p = 4, nruns = 10, etype = 1, eps = 0.25,
psi = 0.1, dd = 7, mnull = F, alpha=0.1){
# Need p > 1, m > 1.
# Simulates prediction regions for
# robust multivariate linear regression model.
# p = number of predictors including intercept
# m > 1 is the number of response variables
# want n > mp, n > 10m and n > 10 p
# multiply E by A where etype = 1 for MVN Nm(0,I),
# etype = 2 for (1 - eps) Nm(0,I) + eps Nm(0, 25 I)
# eps = 0.1, 0.25, 0.4, and 0.6 are interesting
# etype = 3 for multivariate t_d with d = dd degrees of freedom
# dd = 1, 2, 3, 5, 7 are interesting
# etype = 4 for lognormal - E(lognormal).
# For MVN data multiplying E by A results
# in a covariance matrix with eigenvector c(1, ..., 1)^T
# corresponding to the largest eigenvalue. The diagonal elements
# of the covariance matrix are 1 + (m-1) psi^2, while the off
# diagonal elements are 2 psi + (m-2) psi^2. Hence the correlations
# are (2 psi + (m-2) psi^2)/(1 + (m-1) psi^2). As psi gets
# close to 1, the data clusters about the line in the
# direction of (1, ..., 1)^T.
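##Hedged example for rmltreg above (needs hbreg loaded; data illustrative,
##plots are advanced interactively):
# x <- matrix(rnorm(300), ncol = 3)
# y <- cbind(1 + x %*% c(1, 1, 0), x %*% c(0, 1, 1)) + matrix(rnorm(200), ncol = 2)
# out <- rmltreg(x, y, indices = c(2, 3))
# out$MANOVA; out$partial; out$Ftable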
ccvr <- 0 scvr <- 0 rcvr <- 0 volc <- 1:nruns vols <- volc volr <- volc #up <- 1 - alpha up <- min((1 - alpha/2), (1 - alpha + 10*alpha*m/n)) if(alpha > 0.1) up <- min((1 - alpha + 0.05), (1 - alpha + m/n)) qn <- up if(qn < 1 - alpha + 0.001) up <- 1 - alpha np1 <- n + 1 q <- p - 1 A <- matrix(psi,nrow=m,ncol=m) diag(A) <- 1 res <- matrix(nrow = n, ncol = m, 0) fit <- res B <- matrix(nrow = p, ncol = m, 1) if(p == m){ for(j in 1:(m-1)){ B[p-j+1,j:m] <- 0 } } if(p > m){ for(j in 1:m){ B[p-j+1,j:m] <- 0 } } if(p < m){ for(j in 1:(p-1)){ B[p-j+1,j:m] <- 0 } } if (mnull == T) B[-1,] <- 0 Bhat <- B one <- 1 + 0*1:np1 onen <- one[-1] for(i in 1:nruns){ x <- matrix(rnorm(np1 * q), nrow = np1, ncol = q) w <- cbind(one,x) y <- w %*% B #make error matrix E E <- matrix(rnorm(np1 * m), nrow = np1, ncol = m) if(etype == 2) { zu <- runif(np1) E[zu < eps, ] <- E[zu < eps, ] * 5 } if(etype == 3) { zu <- sqrt(rchisq(np1, dd)/dd) E <- E/zu } if(etype == 4) E <- exp(E) - exp(0.5) #want mean 0 error vectors E <- E %*% A # got error matrix E y <- y + E yf <- y[np1,] #want xf to be a column vector with a 1 xf <- rbind(1,as.matrix(x[np1,])) y <- y[-np1,] x <- x[-np1,] w <- w[-np1,] for(j in 1:m){ out <- hbreg(x,y[,j]) res[,j] <- y[,j] - w %*% out$coef fit[,j] <- y[,j] + res[,j] Bhat[,j] <- out$coef } ##Note that ith row of xx is t(hat(E(yf)) + hat(eps)_i) ##for i = 1,...,n. ##Get prediction regions based on xx which has m columns, ##and see if yf is in the prediction regions. zz <- t(t(Bhat)%*%xf) xx<- res + onen %*% zz center <- apply(xx, 2, mean) cov <- var(xx) md2 <- mahalanobis(xx, center, cov) hsq <- quantile(md2, up) if(mahalanobis(t(yf), center, cov) <= hsq) ccvr <- ccvr + 1 volc[i] <- sqrt(hsq)^m * prod(diag(chol(cov))) out <- covrmvn(xx) center <- out$center cov <- out$cov md2 <- mahalanobis(xx, center, cov) hsq <- quantile(md2, up) dsq <- mahalanobis(t(yf), center, cov) if(dsq <= hsq) scvr <- scvr + 1 sqrtdet <- prod(diag(chol(cov))) vols[i] <- sqrt(hsq)^m * sqrtdet hsq <- qchisq(up, m) if(dsq <= hsq) rcvr <- rcvr + 1 volr[i] <- sqrt(hsq)^m * sqrtdet } ccvr <- ccvr/nruns scvr <- scvr/nruns rcvr <- rcvr/nruns #get a measure of efficiency wrt vols, so eff(vols) = 1 vols <- mean(vols) volc <- mean(volc)/vols volr <- mean(volr)/vols vols <- 1 list(ncvr = ccvr, scvr = scvr, mcvr = rcvr, voln = volc, vols = vols, volm = volr, up = up) } rmregboot<-function(x,y, B = 1000){ #bootstrap with residuals for MLR using rmreg3 #out <- rmregboot(belx,bely) #ddplot4(out$betas); ddplot5(out$betas) #mplot(out$betas) #cov(out$betas) full <- lsfit(x,y) res <- full$resid fit <- y - res n <- length(y) x <- as.matrix(x) p <- dim(x)[2]+1 betas <- matrix(0,nrow=B,ncol=p) for(i in 1:B){ yb <- fit + sample(res,n,replace=T) betas[i,] <- rmreg3(x,yb)$Bhat } list(betas=betas) } rmregbootsim<-function(n = 100, p = 4, nruns = 100, eps = 0.1, shift = 9, type = 1, alph = 0.05){ #Very Slow. #Simulates residual bootstrap for rmreg3 with m = 1. #Uses five iid error distributions: # type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) errors. 
# constant = 1 so there are p = q+1 coefficients #need p > 1, beta = (1, 1, ..., 1, 0, ..., 0) with k ones, p - k zeroes q <- p-1 k <- floor(p/2) b <- 0 * 1:q b[1:(k-1)] <- 1 #b[1:0] acts like b[1:1] = b[1] beta <- c(1,b) pp1 <- p + 1 cicov <- 0*(1:pp1) avelen <- 0*(1:pp1) for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) if(type == 1) { y <- 1 + x %*% b + rnorm(n) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err } #make an MLR data set out <-rmregboot(x,y) #residual bootstrap using rmreg3 for (j in 1:p){ tem <- shorth2(out$betas[,j],alpha=alph) if(beta[j] >= tem$shorth[1] && beta[j] <= tem$shorth[2]) #tem <- locpi2(out$betas[,j],alpha=alph) #if(beta[j] >= tem$LOCPI[1] && beta[j] <= tem$LOCPI[2]) cicov[j] <- cicov[j] + 1 avelen[j] <- avelen[j] + tem$shorth[2] - tem$shorth[1] #avelen[j] <- avelen[j] + tem$LOCPI[2] - tem$LOCPI[1] } #test whether the last p - k values of beta are 0 tem <- predreg(out$betas[,(k+1):p],alpha=alph) if(tem$D0 <= tem$cuplim) cicov[pp1] <- cicov[pp1] + 1 avelen[pp1] <- avelen[pp1] + tem$cuplim } cicov <- cicov/nruns avelen <- avelen/nruns list(cicov=cicov,avelen=avelen,beta=beta,k=k)} rmreg2<-function(x, y, csteps = 5, locc = 0.5){ # Does robust multivariate linear regression based on RMVN. # Here x contains the nontrivial predictors. # Advance the plot by highlighting Stop with the right mouse button. x <- as.matrix(x) y <- as.matrix(y) n <- dim(x)[1] q <- dim(x)[2] p <- q+1 m <- dim(y)[2] d <- m + q u <- cbind(x,y) res <- matrix(nrow = n, ncol = m, 0) fit <- res Bhat <- matrix(nrow = p, ncol = m, 0) # q + 1 = p is number of predictors including intercept cmar <- par("mar") par(mfrow = c(2, 1)) par(mar=c(4.0,4.0,2.0,0.5)) #get the RMVN subset of the predictors and response u up <- qchisq(0.975, d) qchi <- qchisq(0.5, d) ##get the DGK estimator covs <- var(u) mns <- apply(u, 2, mean) ## concentrate for(i in 1:csteps) { md2 <- mahalanobis(u, mns, covs) medd2 <- median(md2) mns <- apply(u[md2 <= medd2, ], 2, mean) covs <- var(u[md2 <= medd2, ]) } covd <- covs mnd <- mns ##get the MB estimator covv <- diag(d) med <- apply(u, 2, median) md2 <- mahalanobis(u, center = med, covv) medd2 <- median(md2)##get the location criterion cutoff lcut <- medd2 if(locc != 0.5) lcut <- quantile(md2,locc) ## get the start mns <- apply(u[md2 <= medd2, ], 2, mean) covs <- var(u[md2 <= medd2, ]) ## concentrate for(i in 1:csteps) { md2 <- mahalanobis(u, mns, covs) medd2 <- median(md2) mns <- apply(u[md2 <= medd2, ], 2, mean) covs <- var(u[md2 <= medd2, ]) } covm <- covs mnm <- mns ##get FCH attractor covf <- covm mnf <- mnm val2 <- mahalanobis(t(mnd), med, covv) if(val2 < lcut) { ##crit = square root of det(cov) critd <- prod(diag(chol(covd))) critm <- prod(diag(chol(covm))) if(critd < critm) { covf <- covd mnf <- mnd } } ## get the FCH estimator rd2 <- mahalanobis(u, mnf, covf) const <- median(rd2)/qchi covf <- const * covf ##reweight the above FCH estimator (mnf,covf) rd2 <- mahalanobis(u, mnf, covf) rmnmvn <- apply(u[rd2 <= up, ], 2, mean) rcovmvn <- var(u[rd2 <= up, ]) d1 <- sum(rd2 <= up) rd2 <- mahalanobis(u, rmnmvn, rcovmvn) qchi2 <- (0.5 * 0.975 * n)/d1 qchi2 <- min(qchi2, 0.995) const <- median(rd2)/qchisq(qchi2, d) rcovmvn <- const * rcovmvn ##now get the subset of data w used to construct the RMVN estimator rd2 <- mahalanobis(u, rmnmvn, rcovmvn) w <- u[rd2 <= 
up, ] xw <- as.matrix(w[,1:q]) yw <- as.matrix(w[,(q+1):d]) #get the multivariate regression after ellipsoidal trimming for(i in 1:m){ out <- lsfit(xw,yw[,i]) res[,i] <- y[,i] - out$coef[1] - x %*% out$coef[-1] fit[,i] <- y[,i] - res[,i] Bhat[,i] <- out$coef } for(i in 1:m){ plot(fit[,i],y[,i]) abline(0, 1) title("Response Plot") identify(fit[,i],y[,i]) plot(fit[,i], res[,i]) title("Residual Plot") identify(fit[,i], res[,i]) } par(mfrow = c(1, 1)) par(mar=cmar) list(fit = fit, res = res, Bhat = Bhat) } rmreg3<-function(x, y, csteps = 5, locc = 0.5){ # Does robust multiple linear regression based on RMVN. # Here x contains the nontrivial predictors. # This is rmreg2 except plots are not made. # Used by hbreg. x <- as.matrix(x) y <- as.matrix(y) n <- dim(x)[1] q <- dim(x)[2] p <- q+1 m <- dim(y)[2] d <- m + q u <- cbind(x,y) res <- matrix(nrow = n, ncol = m, 0) fit <- res Bhat <- matrix(nrow = p, ncol = m, 0) # q + 1 = p is number of predictors including intercept #get the RMVN subset of the predictors and response u up <- qchisq(0.975, d) qchi <- qchisq(0.5, d) ##get the DGK estimator covs <- var(u) mns <- apply(u, 2, mean) ## concentrate for(i in 1:csteps) { md2 <- mahalanobis(u, mns, covs) medd2 <- median(md2) mns <- apply(u[md2 <= medd2, ], 2, mean) covs <- var(u[md2 <= medd2, ]) } covd <- covs mnd <- mns ##get the MB estimator covv <- diag(d) med <- apply(u, 2, median) md2 <- mahalanobis(u, center = med, covv) medd2 <- median(md2)##get the location criterion cutoff lcut <- medd2 if(locc != 0.5) lcut <- quantile(md2,locc) ## get the start mns <- apply(u[md2 <= medd2, ], 2, mean) covs <- var(u[md2 <= medd2, ]) ## concentrate for(i in 1:csteps) { md2 <- mahalanobis(u, mns, covs) medd2 <- median(md2) mns <- apply(u[md2 <= medd2, ], 2, mean) covs <- var(u[md2 <= medd2, ]) } covm <- covs mnm <- mns ##get FCH attractor covf <- covm mnf <- mnm val2 <- mahalanobis(t(mnd), med, covv) if(val2 < lcut) { ##crit = square root of det(cov) critd <- prod(diag(chol(covd))) critm <- prod(diag(chol(covm))) if(critd < critm) { covf <- covd mnf <- mnd } } ## get the FCH estimator rd2 <- mahalanobis(u, mnf, covf) const <- median(rd2)/qchi covf <- const * covf ##reweight the above FCH estimator (mnf,covf) rd2 <- mahalanobis(u, mnf, covf) rmnmvn <- apply(u[rd2 <= up, ], 2, mean) rcovmvn <- var(u[rd2 <= up, ]) d1 <- sum(rd2 <= up) rd2 <- mahalanobis(u, rmnmvn, rcovmvn) qchi2 <- (0.5 * 0.975 * n)/d1 qchi2 <- min(qchi2, 0.995) const <- median(rd2)/qchisq(qchi2, d) rcovmvn <- const * rcovmvn ##now get the subset of data w used to construct the RMVN estimator rd2 <- mahalanobis(u, rmnmvn, rcovmvn) w <- u[rd2 <= up, ] xw <- as.matrix(w[,1:q]) yw <- as.matrix(w[,(q+1):d]) #get the multivariate regression after ellipsoidal trimming for(i in 1:m){ out <- lsfit(xw,yw[,i]) res[,i] <- y[,i] - out$coef[1] - x %*% out$coef[-1] fit[,i] <- y[,i] - res[,i] Bhat[,i] <- out$coef } list(fit = fit, res = res, Bhat = Bhat) } rmregddsim<-function(n = 100, m = 2, p = 4, nruns = 10, etype = 1, eps = 0.25, psi = 0.1, dd = 7, mnull = F, alph=0.1){ # Need p > 1, m > 1. # Simulates DD plots robust multivariate linear regression model. # The identity line and lines corresponding to the # 100(1-alph)% prediction regions are added. 
# p = number of predictors including intercept # m > 1 is the number of response variables # want n > mp, n > 10m and n > 10 p # multiply E by A where etype = 1 for MVN Nm(0,I), # etype = 2 for (1 - eps) Nm(0,I) + eps Nm(0, 25 I) # eps = 0.1, 0.25, 0.4, and 0.6 are interesting # etype = 3 for multivariate t_d with d = dd degrees of freedom # dd = 1, 2, 3, 5, 7 are interesting # etype = 4 for lognormal - E(lognormal). # For MVN data multiplying E by A results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. The diagonal elements # of the covariance matrix are 1 + (m-1) psi^2, while the off # diagonal elements are 2 psi + (m-2) psi^2. Hence the correlations # are (2 rho + (m-2) psi^2)/(1 + (m-1) psi^2). As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. q <- p - 1 A <- matrix(psi,nrow=m,ncol=m) diag(A) <- 1 res <- matrix(nrow = n, ncol = m, 0) fit <- res B <- matrix(nrow = p, ncol = m, 1) if(p == m){ for(j in 1:(m-1)){ B[p-j+1,j:m] <- 0 } } if(p > m){ for(j in 1:m){ B[p-j+1,j:m] <- 0 } } if(p < m){ for(j in 1:(p-1)){ B[p-j+1,j:m] <- 0 } } if (mnull == T) B[-1,] <- 0 Bhat <- B one <- 1 + 0*1:n for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) w <- cbind(one,x) y <- w %*% B #make error matrix E E <- matrix(rnorm(n * m), nrow = n, ncol = m) if(etype == 2) { zu <- runif(n) E[zu < eps, ] <- E[zu < eps, ] * 5 } if(etype == 3) { zu <- sqrt(rchisq(n, dd)/dd) E <- E/zu } if(etype == 4) E <- exp(E) - exp(0.5) #want mean 0 error vectors E <- E %*% A # got error matrix E y <- y + E for(j in 1:m){ out <- hbreg(x,y[,j]) res[,j] <- y[,j] - out$coef[1] - x %*% out$coef[-1] fit[,j] <- y[,j] + res[,j] Bhat[,j] <- out$coef } ddplot4(res, alpha = alph) } Covhat <- (n-1)/(n-p) * var(res) list(Bhat = Bhat, B=B, Covhat = Covhat) } rmregsim<-function(n = 100, m = 2, p = 4, nruns = 10, etype = 1, eps = 0.25, psi = 0.1, dd = 7, mnull = T){ # Need p > 1, m > 1. # Simulates robust multivariate linear regression model and gets # pvalues for tests for whether the p-1 nontrivial # predictors are needed in the model and # pvalues for tests for whether Xi is needed in the model. # p = number of predictors including intercept # m > 1 is the number of response variables # want n > mp and n > 10 p # multiply E by A where etype = 1 for MVN Nm(0,I), # etype = 2 for (1 - eps) Nm(0,I) + eps Nm(0, 25 I) # eps = 0.1, 0.25, 0.4, and 0.6 are interesting # etype = 3 for multivariate t_d with d = dd degrees of freedom # dd = 1, 2, 3, 5, 7 are interesting # etype = 4 for lognormal - E(lognormal). # For MVN data multiplying E by A results # in a covariance matrix with eigenvector c(1, ..., 1)^T # corresponding to the largest eigenvalue. The diagonal elements # of the covariance matrix are 1 + (m-1) psi^2, while the off # diagonal elements are 2 psi + (m-2) psi^2. Hence the correlations # are (2 rho + (m-2) psi^2)/(1 + (m-1) psi^2). As psi gets # close to 1, the data clusters about the line in the # direction of (1, ..., 1)^T. # Illustrates that Hotelling Lawley statistic = last statistic/(n-p). 
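##Hedged example (values illustrative; slow since hbreg is refit for each
##response in each run):
# out <- rmregsim(n = 100, m = 2, p = 4, nruns = 5, etype = 1, mnull = T)
# out$fcov; out$mancv #rejection proportions, near 0.05 under Ho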
rmregsim<-function(n = 100, m = 2, p = 4, nruns = 10, etype = 1,
eps = 0.25, psi = 0.1, dd = 7, mnull = T){
# Need p > 1, m > 1.
# Simulates the robust multivariate linear regression model and gets
# pvalues for tests for whether the p-1 nontrivial
# predictors are needed in the model and
# pvalues for tests for whether Xi is needed in the model.
# p = number of predictors including intercept
# m > 1 is the number of response variables
# want n > mp and n > 10 p
# multiply E by A where etype = 1 for MVN Nm(0,I),
# etype = 2 for (1 - eps) Nm(0,I) + eps Nm(0, 25 I)
# eps = 0.1, 0.25, 0.4, and 0.6 are interesting
# etype = 3 for multivariate t_d with d = dd degrees of freedom
# dd = 1, 2, 3, 5, 7 are interesting
# etype = 4 for lognormal - E(lognormal).
# For MVN data multiplying E by A results
# in a covariance matrix with eigenvector c(1, ..., 1)^T
# corresponding to the largest eigenvalue. The diagonal elements
# of the covariance matrix are 1 + (m-1) psi^2, while the off
# diagonal elements are 2 psi + (m-2) psi^2. Hence the correlations
# are (2 psi + (m-2) psi^2)/(1 + (m-1) psi^2). As psi gets
# close to 1, the data clusters about the line in the
# direction of (1, ..., 1)^T.
# Illustrates that the Hotelling Lawley statistic = last statistic/(n-p).
q <- p - 1
A <- matrix(psi,nrow=m,ncol=m)
diag(A) <- 1
res <- matrix(nrow = n, ncol = m, 0)
fit <- res
B <- matrix(nrow = p, ncol = m, 1)
if(p == m){
  for(j in 1:(m-1)){
    B[p-j+1,j:m] <- 0
  }
}
if(p > m){
  for(j in 1:m){
    B[p-j+1,j:m] <- 0
  }
}
if(p < m){
  for(j in 1:(p-1)){
    B[p-j+1,j:m] <- 0
  }
}
if(mnull == T) B[-1,] <- 0
Bhat <- B
one <- 1 + 0*1:n
fcov <- 0*1:p
wilkcov <- fcov
pilcov <- fcov
hotlawcov <- fcov
roycov <- fcov
rden <- n - p - m + 1
rcut <- qf(0.95,m,rden)
dendf <- n - m*p
wcv <- 0
pcv <- 0
hlcv <- 0
rcv <- 0
fcv <- 0
ndf <- (p-1)*m
mancv <- cbind(wcv,pcv,hlcv,rcv,fcv)
#L <- cbind(fcov[-1],diag(p-1))
#want n > mp
fcut <- qf(0.95, m, dendf)
mcut <- qf(0.95,m*(p-1),dendf)
h <- max(p-1,m)
mrcut <- qf(0.95,h,(n-h-1))
for(i in 1:nruns) {
  x <- matrix(rnorm(n * q), nrow = n, ncol = q)
  w <- cbind(one,x)
  y <- w %*% B
  #make error matrix E
  E <- matrix(rnorm(n * m), nrow = n, ncol = m)
  if(etype == 2) {
    zu <- runif(n)
    E[zu < eps, ] <- E[zu < eps, ] * 5
  }
  if(etype == 3) {
    zu <- sqrt(rchisq(n, dd)/dd)
    E <- E/zu
  }
  if(etype == 4) E <- exp(E) - exp(0.5) #want mean 0 error vectors
  E <- E %*% A # got error matrix E
  y <- y + E
  for(j in 1:m){
    out <- hbreg(x,y[,j])
    res[,j] <- y[,j] - out$coef[1] - x %*% out$coef[-1]
    fit[,j] <- y[,j] - res[,j]
    Bhat[,j] <- out$coef
  }
  Covhat <- (n-1)/(n-p) * var(res)
  # w is the data matrix with a column of ones
  J <- t(w)%*%w
  J <- solve(J)
  We <- Covhat*(n-p)
  Covinv <- solve(Covhat)
  Weinv <- Covinv/(n-p)
  #get the test statistics for whether Xj is needed in the model
  for(j in 1:p){
    T <- t(Bhat[j,])%*%Covinv%*%Bhat[j,]/J[j,j]
    if(T/m > fcut) fcov[j] <- fcov[j] + 1
    H <- Bhat[j,]%*%t(Bhat[j,])/J[j,j]
    C <- Weinv%*%H
    eig <- eigen(C,symmetric=FALSE,only.values=TRUE)$values
    eig <- as.double(eig)
    lmax <- eig[1]
    u <- sum(eig)
    v <- sum(eig/(eig+1))
    wilk <- prod(1/(eig+1))
    if(-(n-p-1-0.5*m)*log(wilk)/m > fcut) wilkcov[j] <- wilkcov[j] + 1
    if((n-p)*v/m > fcut) pilcov[j] <- pilcov[j] + 1
    if((n-p)*u/m > fcut) hotlawcov[j] <- hotlawcov[j] + 1
    if(rden*lmax/m > rcut) roycov[j] <- roycov[j] + 1
  }
  #get the MANOVA F test statistics for whether nontrivial
  #predictors are needed in the model
  D <- as.matrix(Bhat[-1,])
  if(p == 2) D <- t(D)
  vecD <- as.matrix(D[,1])
  for(j in 2:m){
    vecD <- rbind(vecD,as.matrix(D[,j]))
  }
  tem <- as.matrix(J[-1,-1])
  teminv <- solve(tem)
  T <- t(vecD)%*%kronecker(Covinv,teminv)%*%vecD
  if(T/ndf > mcut) mancv[5] <- mancv[5] + 1
  H <- t(D)%*%teminv%*%D
  C <- Weinv%*%H
  eig <- eigen(C,symmetric=FALSE,only.values=TRUE)$values
  eig <- as.double(eig)
  lmax <- eig[1]
  u <- sum(eig)
  v <- sum(eig/(eig+1))
  wilk <- prod(1/(eig+1))
  if(-(n-0.5*p-0.5*m-2)*log(wilk)/ndf > mcut) mancv[1] <- mancv[1] + 1
  if((n-p)*v/ndf > mcut) mancv[2] <- mancv[2] + 1
  if((n-p)*u/ndf > mcut) mancv[3] <- mancv[3] + 1
  if((n-h-1)*lmax/h > mrcut) mancv[4] <- mancv[4] + 1
}
wilkcov <- wilkcov/nruns
pilcov <- pilcov/nruns
hotlawcov <- hotlawcov/nruns
roycov <- roycov/nruns
fcov <- fcov/nruns
mancv <- mancv/nruns
list(Bhat = Bhat, B=B, Covhat = Covhat, wilkcov=wilkcov, pilcov=pilcov,
hotlawcov=hotlawcov, roycov=roycov, fcov=fcov, mancv=mancv)
}
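##Example sketch (needs hbreg): with mnull = T the nontrivial predictors
##are not needed, so the entries of mancv estimate the nominal 0.05 level
##of the MANOVA type tests.
#out <- rmregsim(n = 100, m = 2, p = 4, nruns = 20)
#out$mancv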
rowboot<-function(x, y, B = 1000){
#rowwise nonparametric bootstrap for MLR
#use if the cases (y_i, x_i^T) are iid from some distribution
#out <- rowboot(belx,bely)
#ddplot4(out$betas)
#mplot(out$betas)
#cov(out$betas)
full <- lsfit(x,y)
n <- length(y)
x <- as.matrix(x)
p <- dim(x)[2]+1
indx <- 1:n
betas <- matrix(0,nrow=B,ncol=p)
for(i in 1:B){
  tem <- sample(indx,n,replace=T)
  betas[i,] <- lsfit(x[tem,],y[tem])$coef
}
list(betas=betas,full=full)
}

rprcomp<-function(x, csteps = 5, locc = 0.5, corr = T)
{# Needs number of predictors > 1.
# This does robust principal components analysis.
x <- as.matrix(x)
p <- dim(x)[2]
n <- dim(x)[1]
up <- qchisq(0.975, p)
qchi <- qchisq(0.5, p)
##get the DGK estimator
covs <- var(x)
mns <- apply(x, 2, mean)
## concentrate
for(i in 1:csteps) {
  md2 <- mahalanobis(x, mns, covs)
  medd2 <- median(md2)
  mns <- apply(x[md2 <= medd2, ], 2, mean)
  covs <- var(x[md2 <= medd2, ])
}
covd <- covs
mnd <- mns
##get the MB estimator
covv <- diag(p)
med <- apply(x, 2, median)
md2 <- mahalanobis(x, center = med, covv)
medd2 <- median(md2)
##get the location criterion cutoff
lcut <- medd2
if(locc != 0.5) lcut <- quantile(md2,locc)
## get the start
mns <- apply(x[md2 <= medd2, ], 2, mean)
covs <- var(x[md2 <= medd2, ])
## concentrate
for(i in 1:csteps) {
  md2 <- mahalanobis(x, mns, covs)
  medd2 <- median(md2)
  mns <- apply(x[md2 <= medd2, ], 2, mean)
  covs <- var(x[md2 <= medd2, ])
}
covm <- covs
mnm <- mns
##get FCH attractor
covf <- covm
mnf <- mnm
val2 <- mahalanobis(t(mnd), med, covv)
if(val2 < lcut) { ##crit = square root of det(cov)
  critd <- prod(diag(chol(covd)))
  critm <- prod(diag(chol(covm)))
  if(critd < critm) {
    covf <- covd
    mnf <- mnd
  }
}
## get the FCH estimator
rd2 <- mahalanobis(x, mnf, covf)
const <- median(rd2)/qchi
covf <- const * covf
##reweight the above FCH estimator (mnf,covf)
rd2 <- mahalanobis(x, mnf, covf)
rmnmvn <- apply(x[rd2 <= up, ], 2, mean)
rcovmvn <- var(x[rd2 <= up, ])
d1 <- sum(rd2 <= up)
rd2 <- mahalanobis(x, rmnmvn, rcovmvn)
qchi2 <- (0.5 * 0.975 * n)/d1
qchi2 <- min(qchi2, 0.995)
const <- median(rd2)/qchisq(qchi2, p)
rcovmvn <- const * rcovmvn
##now get the subset of data w used to construct the RMVN estimator
rd2 <- mahalanobis(x, rmnmvn, rcovmvn)
w <- x[rd2 <= up, ]
out <- prcomp(w, scale = corr)
list(out = out)
}
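##Example sketch for rprcomp (hypothetical data): a tight cluster of
##outliers can distort classical PCA but is downweighted by the RMVN subset.
#x <- matrix(rnorm(200), ncol = 2)
#x[1:10, ] <- matrix(c(10, -10), nrow = 10, ncol = 2, byrow = T)
#rprcomp(x)$out$rotation   #compare with prcomp(x, scale. = T)$rotation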
rrplot2<-function(x, y, nsamps = 7)
{# In Unix, use X11() to turn on the graphics device before
# using this function. First type library(MASS).
# Makes an RR plot. Needs hbreg, mbareg, mbalata, and rmreg3.
n <- length(y)
x <- as.matrix(x)
rmat <- matrix(nrow = n, ncol = 7)
lsres <- lsfit(x, y)$residuals
print("got OLS")
almsres <- lmsreg(x, y)$resid
print("got ALMS")
altsres <- ltsreg(x, y)$residuals
print("got ALTS")
mbacoef <- mbareg(x, y, nsamp = nsamps)$coef
MBAres <- y - mbacoef[1] - x %*% mbacoef[-1]
print("got MBA")
bbcoef <- hbreg(x, y)$bbcoef
BBres <- y - bbcoef[1] - x %*% bbcoef[-1]
print("got BB")
mbalcoef <- mbalata(x, y, nsamp = nsamps)$coef
MBALATA <- y - mbalcoef[1] - x %*% mbalcoef[-1]
print("got MBALATA")
RMREG2 <- rmreg3(x,y)$res
print("got RMREG2")
rmat[, 1] <- lsres
rmat[, 2] <- almsres
rmat[, 3] <- altsres
rmat[, 4] <- MBAres
rmat[, 5] <- BBres
rmat[, 6] <- MBALATA
rmat[, 7] <- RMREG2
pairs(rmat, labels = c("OLS resid", "ALMS resid", "ALTS resid",
"MBA resid", "BB resid", "MBALATA", "RMREG2"))
}

sctplt<-function(n = 10, p = 2, steps = 5, gam = 0.4, outliers = 0,
pm = 10, jit = F)
{# Use for 1 < p < 11.
# This R function makes a DD plot and scatterplot matrix
# for a sample data set generated as in function mldsim.
# Click the left mouse button to identify points.
# Click the right mouse button after examining the DD plot
# and in R, highlight "stop".
#
# outliers = 0 for no outliers and X~N(0,diag(1,...,p)),
# 1 for outliers a point mass on the major axis (0,...,0,pm)',
# 2 for outliers a point mass on the minor axis (pm,0,...,0)',
# 3 for outliers X~N((pm,...,pm)',diag(1,...,p)),
# 4 for outliers X[i,p] = pm,
# 5 for outliers X[i,1] = pm
#
#generate data set
A <- sqrt(diag(1:p))
cloc <- 0 * (1:p)
csig <- 0 * A
val <- floor(gam * n)
x <- matrix(rnorm(n * p), ncol = p, nrow = n)
x <- x %*% A
if(outliers == 1) {
  x[1:val, ] <- 0
  x[1:val, p] <- pm
}
if(outliers == 2) {
  x[1:val, ] <- 0
  x[1:val, 1] <- pm
}
if(outliers == 3) {
  val <- floor(gam * n)
  tem <- pm + 0 * 1:p
  x[1:val, ] <- x[1:val, ] + tem
}
if(outliers == 4) {
  x[1:val, p] <- pm
}
if(outliers == 5) {
  x[1:val, 1] <- pm
}
ddplot2(x)
par(mfrow = c(1, 1))
z <- x
if(jit == T){
  z <- x + matrix(runif(n * p, max = 0.4), ncol = p, nrow = n)
}
pairs(z)
}

shorth2<-function(y, alpha = 0.05){
# computes the shorth(c) interval [Ln,Un] for c = ceiling[n(1-alpha)].
n <- length(y)
cc <- ceiling(n * (1 - alpha))
sy <- sort(y)
rup <- sy[cc]
rlow <- sy[1]
olen <- rup - rlow
if(cc < n){
  for(j in (cc + 1):n){
    zlen <- sy[j] - sy[j - cc + 1]
    if(zlen < olen) {
      olen <- zlen
      rup <- sy[j]
      rlow <- sy[j - cc + 1]
    }
  }
}
Ln <- rlow; Un <- rup
list(shorth = c(Ln, Un))
}

shorth3<-function(y, alpha = 0.05){
# computes the shorth(c) interval [Ln,Un] for c = cc.
#shorth lists Ln and Un using Frey's correction
n <- length(y)
cc <- ceiling(n * (1 - alpha + 1.12*sqrt(alpha/n)))
cc <- min(n,cc)
sy <- sort(y)
rup <- sy[cc]
rlow <- sy[1]
olen <- rup - rlow
if(cc < n){
  for(j in (cc + 1):n){
    zlen <- sy[j] - sy[j - cc + 1]
    if(zlen < olen) {
      olen <- zlen
      rup <- sy[j]
      rlow <- sy[j - cc + 1]
    }
  }
}
Ln <- rlow; Un <- rup
list(shorth = c(Ln, Un))
}
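##Example sketch: shorth3 applied to bootstrap replicates gives an
##approximate 95% CI, as in the simulation functions below
##(hypothetical data).
#y <- rnorm(100)
#bmns <- 1:200
#for(i in 1:200) bmns[i] <- mean(sample(y, 100, replace = T))
#shorth3(bmns, alpha = 0.05)$shorth   #approx 95% CI for the mean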
shorthLU<-function(y, alpha = 0.05){
# computes the shorth(c) interval [Ln,Un] for c = cc.
#shorth lists Ln and Un using Frey's correction
#also gets the left and right endpoints for the one sided lower and
#upper CIs [left,infty) and (-infty,right].
#So left is a lower bound and right an upper bound for the parameter theta.
n <- length(y)
cc <- ceiling(n * (1 - alpha + 1.12*sqrt(alpha/n)))
cc <- min(n,cc)
sy <- sort(y)
rup <- sy[cc]
rlow <- sy[1]
olen <- rup - rlow
if(cc < n){
  for(j in (cc + 1):n){
    zlen <- sy[j] - sy[j - cc + 1]
    if(zlen < olen) {
      olen <- zlen
      rup <- sy[j]
      rlow <- sy[j - cc + 1]
    }
  }
}
Ln <- rlow; Un <- rup
left <- sy[n-cc+1]
right <- sy[cc]
list(shorth = c(Ln, Un), left = left, right = right)
}

symviews<-function(x, Y)
{# Need number of predictors > 1.
# Makes trimmed views for 90, 80, ..., 0
# percent trimming and sometimes works even if m
# is symmetric about E(x^T beta) where
# y = m(x^T beta) + e.
# For workstations, activate a graphics
# device with command "X11()" or "motif()."
# Use the rightmost mouse button to advance
# the view, and in R, highlight "stop."
x <- as.matrix(x)
tem <- seq(0.1, 1, 0.1)
bols <- lsfit(x, Y)$coef
fit <- x %*% bols[-1]
temx <- x[fit > median(fit), ]
temy <- Y[fit > median(fit)]
out <- covmba(temx)
center <- out$center
cov <- out$cov
rd2 <- mahalanobis(temx, center, cov)
for(i in 1:10) {
  val <- quantile(rd2, tem[i])
  bhat <- lsfit(temx[rd2 <= val, ], temy[rd2 <= val])$coef
  ESP <- x %*% bhat[-1] + bhat[1]
  plot(ESP, Y)
  identify(ESP, Y)
  print(bhat)
}
}

tmn<-function(x, tp = 0.25)
{# computes the 100tp% trimmed mean
mean(x, trim = tp)
}

trviews<-function(x, Y, ii = 1, type = 3)
{# Trimmed views for 0, 10, ..., 90 percent trimming.
# Use ii = 10 for 0, ii = 9 for 10 then 0, ...,
# ii = 1 for 90 then 80, ..., then 0 percent trimming.
# Allows visualization of m and crude estimation of
# c beta in models of the form y = m(x^T beta) + e.
# Workstation: activate a graphics device
# with commands "X11()" or "motif()."
# R needs command "library(MASS)."
# Advance the view with the right mouse button and
# in R, highlight "stop."
x <- as.matrix(x)
if(type == 1) out <- cov.mcd(x)
if(type == 2) out <- covmba(x)
if(type == 3) out <- covfch(x)
if(type == 4) out <- covrmvn(x)
if(type <= 4){
  center <- out$center
  cov <- out$cov
}
#type = 5 for the rfch estimator
if(type == 5){
  out <- covfch(x)
  center <- out$rmnf
  cov <- out$rcovf
}
rd2 <- mahalanobis(x, center, cov)
labs <- c("90%", "80%", "70%", "60%", "50%", "40%", "30%", "20%",
"10%", "0%")
tem <- seq(0.1, 1, 0.1)
for(i in ii:10) {
  val <- quantile(rd2, tem[i])
  bhat <- lsfit(x[rd2 <= val, ], Y[rd2 <= val])$coef
  ESP <- x %*% bhat[-1] + bhat[1]
  plot(ESP, Y)
  title(labs[i])
  identify(ESP, Y)
  print(bhat)
}
}
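##Example sketch for trimmed views (hypothetical data; interactive):
##in the single index model below, the printed bhat should be roughly
##proportional to beta = (1, 2, 0)^T.
#x <- matrix(rnorm(300), ncol = 3)
#Y <- (x %*% c(1, 2, 0) + 1)^3 + rnorm(100)
#trviews(x, Y, ii = 10, type = 3)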
tvreg<-function(x, Y, ii = 1, type = 4){
# Trimmed views (TV) regression for 0, 10, ..., 90 percent
# trimming. Use ii = 10 for 0, ii = 9 for 10 then 0, ...,
# ii = 1 for 90 then 80, ..., then 0 percent trimming.
# Workstation: activate a graphics device
# with commands "X11()" or "motif()."
# R needs command "library(MASS)."
# Advance the view with the right mouse button and
# in R, highlight "stop."
x <- as.matrix(x)
if(type == 1) out <- cov.mcd(x)
if(type == 2) out <- covmba(x)
if(type == 3) out <- covfch(x)
if(type > 3) out <- covrmvn(x)
center <- out$center
cov <- out$cov
rd2 <- mahalanobis(x, center, cov)
labs <- c("90%", "80%", "70%", "60%", "50%", "40%", "30%", "20%",
"10%", "0%")
tem <- seq(0.1, 1, 0.1)
for(i in ii:10) {
  val <- quantile(rd2, tem[i])
  bhat <- lsfit(x[rd2 <= val, ], Y[rd2 <= val])$coef
  FIT <- x %*% bhat[-1] + bhat[1]
  plot(FIT, Y)
  abline(0, 1)
  title(labs[i])
  identify(FIT, Y)
  print(bhat)
}
}

tvreg2<-function(X, Y, M = 0, type = 4)
{# Trimmed views regression for M percent trimming.
# Workstation: activate a graphics device
# with commands "X11()" or "motif()."
# R needs command "library(MASS)."
X <- as.matrix(X)
if(type == 1) out <- cov.mcd(X)
if(type == 2) out <- covmba(X)
if(type == 3) out <- covfch(X)
if(type > 3) out <- covrmvn(X)
center <- out$center
cov <- out$cov
rd2 <- mahalanobis(X, center, cov)
tem <- (100 - M)/100
val <- quantile(rd2, tem)
bhat <- lsfit(X[rd2 <= val, ], Y[rd2 <= val])$coef
FIT <- X %*% bhat[-1] + bhat[1]
plot(FIT, Y)
abline(0, 1)
identify(FIT, Y)
list(coef = bhat)
}

vecw<-function(z){
#Computes vec(z): stacks the columns of z into a column vector.
z <- as.matrix(z)
vecz <- as.vector(z[,1])
p <- dim(z)[2]
if(p > 1){
  for(i in 2:p) vecz <- c(vecz,as.vector(z[,i]))
}
list(vecz=vecz)
}
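##Example sketch: the vec operator pairs with Kronecker products, as in
##the MANOVA test statistic in rmregsim above.
#z <- matrix(1:6, nrow = 2)   #2 x 3 matrix with columns (1,2), (3,4), (5,6)
#vecw(z)$vecz                 #returns 1 2 3 4 5 6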
vsbootsim<-function(n = 100, p = 4, nruns = 100, eps = 0.1, shift = 9,
type = 1, alph = 0.05){
#needs library(leaps)
#PROGRAM FAILS IF A VARIABLE IS NEVER SELECTED IN THE B BOOTSTRAPS.
#Simulates the bootstrap for variable selection using all subsets
#variable selection. So p needs to be small.
#Uses five iid error distributions:
# type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors,
# 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2)
#errors.
# constant = 1 so there are p = q+1 coefficients
#need p > 1, beta = (1, 1, ..., 1, 0, ..., 0) with k ones and p - k zeroes
q <- p-1
k <- floor(p/2)
b <- 0 * 1:q
b[1:(k-1)] <- 1 #b[1:0] acts like b[1:1] = b[1]
beta <- c(1,b)
pp1 <- p + 1
cicov <- 0*(1:pp1)
avelen <- 0*(1:pp1)
for(i in 1:nruns) {
  x <- matrix(rnorm(n * q), nrow = n, ncol = q)
  if(type == 1) {
    y <- 1 + x %*% b + rnorm(n)
  }
  if(type == 2) {
    y <- 1 + x %*% b + rt(n, df = 3)
  }
  if(type == 3) {
    y <- 1 + x %*% b + rexp(n) - 1
  }
  if(type == 4) {
    y <- 1 + x %*% b + runif(n, min = -1, max = 1)
  }
  if(type == 5) {
    err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift)
    y <- 1 + x %*% b + err
  }
  #make an MLR data set
  out <- vselboot(x,y) #bootstrap the minimum Cp model
  for(j in 1:p){
    tem <- shorth3(out$betas[,j],alpha=alph)
    if(beta[j] >= tem$shorth[1] && beta[j] <= tem$shorth[2])
    #tem <- locpi2(out$betas[,j],alpha=alph)
    #if(beta[j] >= tem$LOCPI[1] && beta[j] <= tem$LOCPI[2])
      cicov[j] <- cicov[j] + 1
    avelen[j] <- avelen[j] + tem$shorth[2] - tem$shorth[1]
    #avelen[j] <- avelen[j] + tem$LOCPI[2] - tem$LOCPI[1]
  }
  #test whether the last p - k values of beta are 0
  tem <- predreg(out$betas[,(k+1):p],alpha=alph)
  if(tem$D0 <= tem$cuplim) cicov[pp1] <- cicov[pp1] + 1
  avelen[pp1] <- avelen[pp1] + tem$cuplim
}
cicov <- cicov/nruns
avelen <- avelen/nruns
list(cicov=cicov,avelen=avelen,beta=beta,k=k)
}

vsbootsim2<-function(n = 100, p = 4, nruns = 100, eps = 0.1, shift = 9,
type = 1, alph = 0.05){
#needs library(leaps)
#PROGRAM FAILS IF A VARIABLE IS NEVER SELECTED IN THE B BOOTSTRAPS.
#Simulates the bootstrap for forward selection variable selection.
#Uses five iid error distributions:
# type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors,
# 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2)
#errors.
# constant = 1 so there are p = q+1 coefficients
#need p > 1, beta = (1, 1, ..., 1, 0, ..., 0) with k ones and p - k zeroes
q <- p-1
k <- floor(p/2)
b <- 0 * 1:q
b[1:(k-1)] <- 1 #b[1:0] acts like b[1:1] = b[1]
beta <- c(1,b)
pp1 <- p + 1
cicov <- 0*(1:pp1)
avelen <- 0*(1:pp1)
for(i in 1:nruns) {
  x <- matrix(rnorm(n * q), nrow = n, ncol = q)
  if(type == 1) {
    y <- 1 + x %*% b + rnorm(n)
  }
  if(type == 2) {
    y <- 1 + x %*% b + rt(n, df = 3)
  }
  if(type == 3) {
    y <- 1 + x %*% b + rexp(n) - 1
  }
  if(type == 4) {
    y <- 1 + x %*% b + runif(n, min = -1, max = 1)
  }
  if(type == 5) {
    err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift)
    y <- 1 + x %*% b + err
  }
  #make an MLR data set
  out <- fselboot(x,y) #bootstrap the forward sel minimum Cp model
  for(j in 1:p){
    tem <- shorth3(out$betas[,j],alpha=alph)
    if(beta[j] >= tem$shorth[1] && beta[j] <= tem$shorth[2])
    #tem <- locpi2(out$betas[,j],alpha=alph)
    #if(beta[j] >= tem$LOCPI[1] && beta[j] <= tem$LOCPI[2])
      cicov[j] <- cicov[j] + 1
    avelen[j] <- avelen[j] + tem$shorth[2] - tem$shorth[1]
    #avelen[j] <- avelen[j] + tem$LOCPI[2] - tem$LOCPI[1]
  }
  #test whether the last p - k values of beta are 0
  tem <- predreg(out$betas[,(k+1):p],alpha=alph)
  if(tem$D0 <= tem$cuplim) cicov[pp1] <- cicov[pp1] + 1
  avelen[pp1] <- avelen[pp1] + tem$cuplim
}
cicov <- cicov/nruns
avelen <- avelen/nruns
list(cicov=cicov,avelen=avelen,beta=beta,k=k)
}
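##Example sketch (needs library(leaps), vselboot, shorth3, predreg; slow):
##cicov gives the observed shorth CI coverages for the p coefficients and,
##in slot p+1, the coverage of the prediction region test that the last
##p - k coefficients are 0; each entry should be near 0.95 for alph = 0.05.
#library(leaps)
#out <- vsbootsim(n = 100, p = 4, nruns = 20)
#out$cicov; out$avelen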
vselboot<-function(x, y, B = 1000){
#needs library(leaps)
#need n and p small, 2 < p < 30
#all subsets minimum Cp regression
#does not make sense to do variable selection if there
#is only one nontrivial predictor
x <- as.matrix(x)
n <- length(y)
p <- 1 + dim(x)[2]
vars <- as.vector(1:(p-1))
#get the full model
full <- lsfit(x,y)
res <- full$resid
fit <- y - res
#get the minimum Cp submodel
out <- leaps(x,y)
mincp <- out$which[out$Cp == min(out$Cp)]
vin <- vars[mincp]
sub <- lsfit(x[,vin],y)
betas <- matrix(0,nrow=B,ncol=p)
#bootstrap the minimum Cp submodel with the residual bootstrap
for(i in 1:B){
  yb <- fit + sample(res,n,replace=T)
  out <- leaps(x,y=yb)
  mincp <- out$which[out$Cp == min(out$Cp)]
  vin <- vars[mincp]
  indx <- c(1,1+vin)
  betas[i,indx] <- lsfit(x[,vin],yb)$coef
}
list(full=full,sub=sub,betas=betas)
}

vspisim<-function(n = 100, p = 4, k = 1, nruns = 100, eps = 0.1, shift = 9,
psi = 0.0, type = 1, alpha = 0.05){
#Needs library(leaps).
#Simulates PIs for forward selection variable selection.
# 1 <= k <= p-1, k is the number of nonnoise variables
#Uses five iid error distributions:
# type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors,
# 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2)
#errors.
# constant = 1 so there are p = q+1 coefficients
#need p > 1, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones, p-k-1 zeroes
# Multiply x by A: for MVN data this results
# in a covariance matrix with eigenvector c(1, ..., 1)^T
# corresponding to the largest eigenvalue. As psi gets
# close to 1, the data clusters about the line in the
# direction of (1, ..., 1)^T. See Maronna and Zamar (2002).
# cor(X_i,X_j) = [2 psi + (q-2) psi^2]/[1 + (q-1) psi^2], i not = j,
# when the correlation exists.
set.seed(974)
corfac <- (1 + 15/n) * sqrt((n + 2*p)/(n - p))
if(alpha > 0.1) {qn <- min(1 - alpha + 0.05, 1 - alpha + p/n)}
if(alpha <= 0.1) {qn <- min(1 - alpha/2, 1 - alpha + 10*alpha*p/n)}
pn <- qn
if(pn < 1 - alpha + 0.001) qn <- 1 - alpha
alphan <- 1 - qn
pilen <- 1:nruns
ps <- pilen
opicov <- 0
q <- p-1
vmax <- min(p, as.integer(n/5))
rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2)
A <- matrix(psi,nrow=q,ncol=q)
diag(A) <- 1
b <- 0 * 1:q
b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1]
vars <- as.vector(1:(p-1))
for(i in 1:nruns) {
  x <- matrix(rnorm(n * q), nrow = n, ncol = q)
  x <- x %*% A
  xf <- rnorm(q) %*% A
  if(type == 1) {
    y <- 1 + x %*% b + rnorm(n)
    yf <- 1 + xf %*% b + rnorm(1)
  }
  if(type == 2) {
    y <- 1 + x %*% b + rt(n, df = 3)
    yf <- 1 + xf %*% b + rt(1, df = 3)
  }
  if(type == 3) {
    y <- 1 + x %*% b + rexp(n) - 1
    yf <- 1 + xf %*% b + rexp(1) - 1
  }
  if(type == 4) {
    y <- 1 + x %*% b + runif(n, min = -1, max = 1)
    yf <- 1 + xf %*% b + runif(1, min = -1, max = 1)
  }
  if(type == 5) {
    err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift)
    y <- 1 + x %*% b + err
    ef <- rnorm(1, sd = 1 + rbinom(1, 1, eps) * shift)
    yf <- 1 + xf %*% b + ef
  }
  #make an MLR data set
  #find the forward sel minimum Cp model
  tem <- regsubsets(x,y,nvmax=vmax,method="forward")
  out <- summary(tem)
  mincp <- out$which[out$cp == min(out$cp), ]
  #do not need the constant in vin
  vin <- vars[mincp[-1]]
  sub <- lsfit(x[,vin],y)
  ps[i] <- length(sub$coef)
  yfhat <- sub$coef[1] + xf[vin] %*% sub$coef[-1]
  fres <- sub$resid
  #get the asymptotically optimal PI
  sres <- sort(fres)
  cc <- ceiling(n * (1 - alphan))
  rup <- sres[cc]
  rlow <- sres[1]
  olen <- rup - rlow
  if(cc < n) {
    for(j in (cc + 1):n) {
      zlen <- sres[j] - sres[j - cc + 1]
      if(zlen < olen) {
        olen <- zlen
        rup <- sres[j]
        rlow <- sres[j - cc + 1]
      }
    }
  }
  up <- yfhat + corfac*rup
  low <- yfhat + corfac*rlow
  pilen[i] <- up - low
  if(low < yf && up > yf) opicov <- opicov + 1
}
psmn <- mean(ps) - k #near 1 if forward selection picks the optimal submodel
pimnlen <- mean(pilen)
opicov <- opicov/nruns
list(psmn=psmn, opicov=opicov, pimnlen=pimnlen)
}
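##Example sketch (needs library(leaps); slow): opicov is the observed PI
##coverage, which should be near 0.95, and psmn is near 1 when forward
##selection picks the k nonnoise variables.
#library(leaps)
#out <- vspisim(n = 100, p = 4, k = 1, nruns = 20)
#out$opicov; out$psmn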
wddplot<-function(x)
{# Shows the southwest corner of the DD plot. Need p > 1.
n <- dim(x)[1]
wt <- 0 * (1:n)
p <- dim(x)[2]
center <- apply(x, 2, mean)
cov <- var(x)
md2 <- mahalanobis(x, center, cov)
out <- covfch(x)
center <- out$center
cov <- out$cov
rd2 <- mahalanobis(x, center, cov)
md <- sqrt(md2)
rd <- sqrt(rd2)
const <- sqrt(qchisq(0.5, p))/median(rd)
rd <- const * rd
wt[rd < sqrt(qchisq(0.975, p))] <- 1
MD <- md[wt > 0]
RD <- rd[wt > 0]
plot(MD, RD)
}

wildboot<-function(x, y, B = 1000, slices = 7)
{#nonparametric, wild, and parametric bootstrap for WLS
#want n/slices > 13
n <- length(y)
x <- as.matrix(x)
p <- dim(x)[2]+1
full <- lsfit(x,y)
res <- full$resid
fit <- y - res #ESP
indx <- sort.list(fit)
vars <- 1:n
val <- as.integer(n/slices)
for(i in 1:slices){
  lo <- (i-1)*val + 1
  hi <- i*val
  if(i == slices) hi <- n
  vars[indx[lo:hi]] <- var(res[indx[lo:hi]])
}
sds <- sqrt(n*vars/(n-p))
sres <- res * sqrt(n/(n-p))
betas <- matrix(0,nrow=B,ncol=p)
wbetas <- betas
pbetas <- betas
indx <- 1:n
for(i in 1:B){
  tem <- sample(indx,n,replace=T)
  betas[i,] <- lsfit(x[tem,],y[tem])$coef #nonparametric bootstrap
  eps <- 2*rbinom(n,size=1,0.5) - 1 #wild bootstrap
  yb <- fit + eps*sres
  wbetas[i,] <- lsfit(x,yb)$coef
  yb <- fit + rnorm(n,mean=0,sd=sds)
  pbetas[i,] <- lsfit(x,yb)$coef #parametric bootstrap
}
list(betas=betas,wbetas=wbetas,pbetas=pbetas,full=full)
}
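##Example sketch for wildboot (hypothetical heteroscedastic data):
##compare the dispersion of the three bootstrap coefficient samples.
#x <- matrix(rnorm(200), ncol = 2)
#y <- 1 + x %*% c(1, 1) + abs(x[,1]) * rnorm(100)
#out <- wildboot(x, y, B = 200)
#apply(out$betas, 2, sd); apply(out$wbetas, 2, sd); apply(out$pbetas, 2, sd)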
wlsbootsim<-function(n = 100, p = 4, k = 1, nruns = 100, nslices = 7,
eps = 0.1, shift = 9, etype = 1, wtype = 1, psi = 0.0, BB = 200,
alph = 0.05)
{#calls confreg, rowboot, shorth3
#Simulates the WLS nonparametric, wild, and parametric bootstrap.
# Use 0 <= psi < 1. Want n/nslices > 13.
#Uses five iid error distributions:
# etype = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors,
# 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2)
#errors.
#wtype = 1 for OLS, 2 if err = abs(SP - 5) * e, 3 if sqrt(1 + x[,1]^2) * e,
#4 for exp[1 + log(|x_2|) + ... + log(|x_p|)] * e,
#5 for [1 + log(|x_2|) + ... + log(|x_p|)] * e,
#6 for exp[(log(|x_2|) + ... + log(|x_p|))/q] * e,
#7 for [(log(|x_2|) + ... + log(|x_p|))/q] * e
# constant = 1 so there are p = q+1 coefficients
#1 <= k <= p-2, k is the number of nonnoise variables
#need p > 2, beta = (1, 1, ..., 1, 0, ..., 0) with k+1 ones, p-k-1 zeroes
# Multiply x by A: for MVN data this results
# in a covariance matrix with eigenvector c(1, ..., 1)^T
# corresponding to the largest eigenvalue. As psi gets
# close to 1, the data clusters about the line in the
# direction of (1, ..., 1)^T. See Maronna and Zamar (2002).
# cor(X_i,X_j) = [2 psi + (p-3) psi^2]/[1 + (p-2) psi^2], i not = j, p > 2,
# when the correlation exists.
q <- p-1
b <- 0 * 1:q
dd <- b + 1
dq <- dd/q
b[1:k] <- 1 #b[1:0] acts like b[1:1] = b[1]
beta <- c(1,b)
pp6 <- p+6; pp5 <- p+5; pp4 <- p+4; pp3 <- p+3; pp1 <- p+1; pp2 <- p+2
cicov <- 0*(1:pp6)
avelen <- 0*(1:pp6)
wcicov <- cicov
wavelen <- avelen
pcicov <- cicov
pavelen <- avelen
rho <- (2*psi + (q-2)*psi^2)/(1 + (q-1)*psi^2)
A <- matrix(psi,nrow=q,ncol=q)
diag(A) <- 1
one <- as.vector(0*1:(k+1) + 1)
for(i in 1:nruns) {
  x <- matrix(rnorm(n * q), nrow = n, ncol = q)
  x <- x %*% A
  SP <- 1 + x %*% b
  if(etype == 1) err <- rnorm(n)
  if(etype == 2) err <- rt(n, df = 3)
  if(etype == 3) err <- rexp(n) - 1
  if(etype == 4) err <- runif(n, min = -1, max = 1)
  if(etype == 5) err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift)
  if(wtype == 2) err <- abs(SP - 5) * err
  if(wtype == 3) err <- sqrt(1 + x[,1]^2) * err
  if(wtype == 4) err <- exp(1 + log(abs(x)) %*% dd) * err
  if(wtype == 5) err <- (1 + log(abs(x)) %*% dd) * err
  if(wtype == 6) err <- exp(log(abs(x)) %*% dq) * err
  if(wtype == 7) err <- (log(abs(x)) %*% dq) * err
  #make an MLR data set
  y <- SP + err
  out <- wildboot(x,y,B=BB,slices=nslices) #nonparametric bootstrap
  for(j in 1:p){
    tem <- shorth3(out$betas[,j],alpha=alph)
    temw <- shorth3(out$wbetas[,j],alpha=alph)
    temp <- shorth3(out$pbetas[,j],alpha=alph)
    if(beta[j] >= tem$shorth[1] && beta[j] <= tem$shorth[2])
      cicov[j] <- cicov[j] + 1
    avelen[j] <- avelen[j] + tem$shorth[2] - tem$shorth[1]
    if(beta[j] >= temw$shorth[1] && beta[j] <= temw$shorth[2])
      wcicov[j] <- wcicov[j] + 1
    wavelen[j] <- wavelen[j] + temw$shorth[2] - temw$shorth[1]
    if(beta[j] >= temp$shorth[1] && beta[j] <= temp$shorth[2])
      pcicov[j] <- pcicov[j] + 1
    pavelen[j] <- pavelen[j] + temp$shorth[2] - temp$shorth[1]
  }
  #test whether the last p-k-1 values of beta are 0
  gg <- p - k - 1
  tstat <- out$full$coef[(k+2):p]
  tem <- confreg(out$betas[,(k+2):p],g=gg,that=tstat,alpha=alph)
  if(tem$D0 <= tem$cuplim) #pred. reg. method
    cicov[pp1] <- cicov[pp1] + 1
  avelen[pp1] <- avelen[pp1] + tem$cuplim
  if(tem$br0 <= tem$cuplim) #hybrid method
    cicov[pp2] <- cicov[pp2] + 1
  avelen[pp2] <- avelen[pp1] #same cutoff so same length
  if(tem$br0 <= tem$brlim) #Bickel and Ren method
    cicov[pp3] <- cicov[pp3] + 1
  avelen[pp3] <- avelen[pp3] + tem$brlim
  #
  temw <- confreg(out$wbetas[,(k+2):p],g=gg,that=tstat,alpha=alph)
  if(temw$D0 <= temw$cuplim) #pred. reg. method
    wcicov[pp1] <- wcicov[pp1] + 1
  wavelen[pp1] <- wavelen[pp1] + temw$cuplim
  if(temw$br0 <= temw$cuplim) #hybrid method
    wcicov[pp2] <- wcicov[pp2] + 1
  wavelen[pp2] <- wavelen[pp1] #same cutoff so same length
  if(temw$br0 <= temw$brlim) #Bickel and Ren method
    wcicov[pp3] <- wcicov[pp3] + 1
  wavelen[pp3] <- wavelen[pp3] + temw$brlim
  #
  temp <- confreg(out$pbetas[,(k+2):p],g=gg,that=tstat,alpha=alph)
  if(temp$D0 <= temp$cuplim) #pred. reg. method
    pcicov[pp1] <- pcicov[pp1] + 1
  pavelen[pp1] <- pavelen[pp1] + temp$cuplim
  if(temp$br0 <= temp$cuplim) #hybrid method
    pcicov[pp2] <- pcicov[pp2] + 1
  pavelen[pp2] <- pavelen[pp1] #same cutoff so same length
  if(temp$br0 <= temp$brlim) #Bickel and Ren method
    pcicov[pp3] <- pcicov[pp3] + 1
  pavelen[pp3] <- pavelen[pp3] + temp$brlim
  #test whether the first k+1 values of beta are 1
  gg <- k + 1
  tstat <- out$full$coef[1:(k+1)]
  tem <- confreg(out$betas[,1:(k+1)],g=gg,that=tstat,alpha=alph)
  D0 <- sqrt(mahalanobis(one, tem$center, tem$cov))
  if(D0 <= tem$cuplim) #pred. reg. method
    cicov[pp4] <- cicov[pp4] + 1
  avelen[pp4] <- avelen[pp4] + tem$cuplim
  D1 <- sqrt(mahalanobis(one, tstat, tem$cov))
  if(D1 <= tem$cuplim) #hybrid method
    cicov[pp5] <- cicov[pp5] + 1
  avelen[pp5] <- avelen[pp4] #same cutoff so same length
  if(D1 <= tem$brlim) #Bickel and Ren method
    cicov[pp6] <- cicov[pp6] + 1
  avelen[pp6] <- avelen[pp6] + tem$brlim
  #
  temw <- confreg(out$wbetas[,1:(k+1)],g=gg,that=tstat,alpha=alph)
  D0 <- sqrt(mahalanobis(one, temw$center, temw$cov))
  if(D0 <= temw$cuplim) #pred. reg. method
    wcicov[pp4] <- wcicov[pp4] + 1
  wavelen[pp4] <- wavelen[pp4] + temw$cuplim
  D1 <- sqrt(mahalanobis(one, tstat, temw$cov))
  if(D1 <= temw$cuplim) #hybrid method
    wcicov[pp5] <- wcicov[pp5] + 1
  wavelen[pp5] <- wavelen[pp4] #same cutoff so same length
  if(D1 <= temw$brlim) #Bickel and Ren method
    wcicov[pp6] <- wcicov[pp6] + 1
  wavelen[pp6] <- wavelen[pp6] + temw$brlim
  #
  temp <- confreg(out$pbetas[,1:(k+1)],g=gg,that=tstat,alpha=alph)
  D0 <- sqrt(mahalanobis(one, temp$center, temp$cov))
  if(D0 <= temp$cuplim) #pred. reg. method
    pcicov[pp4] <- pcicov[pp4] + 1
  pavelen[pp4] <- pavelen[pp4] + temp$cuplim
  D1 <- sqrt(mahalanobis(one, tstat, temp$cov))
  if(D1 <= temp$cuplim) #hybrid method
    pcicov[pp5] <- pcicov[pp5] + 1
  pavelen[pp5] <- pavelen[pp4] #same cutoff so same length
  if(D1 <= temp$brlim) #Bickel and Ren method
    pcicov[pp6] <- pcicov[pp6] + 1
  pavelen[pp6] <- pavelen[pp6] + temp$brlim
  #FIT <- y-out$full$res
  #plot(FIT,y)
  #abline(0,1)
}
#FIT <- y-out$full$res
#plot(FIT,y)
#abline(0,1)
cicov <- cicov/nruns
avelen <- avelen/nruns
wcicov <- wcicov/nruns
wavelen <- wavelen/nruns
pcicov <- pcicov/nruns
pavelen <- pavelen/nruns
list(cicov=cicov, avelen=avelen, wcicov=wcicov, wavelen=wavelen,
pcicov=pcicov, pavelen=pavelen, beta=beta, k=k)
}
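##Example sketch (needs confreg, shorth3, wildboot; slow): with wtype = 3
##the errors are heteroscedastic; cicov, wcicov, and pcicov give the
##observed shorth CI and confidence region coverages for the rowwise,
##wild, and parametric bootstraps.
#out <- wlsbootsim(n = 100, p = 4, k = 1, nruns = 20, wtype = 3)
#out$cicov; out$wcicov; out$pcicov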