## Golub.RData holds the matrix Golub & the data frame golub.info ## Functions in cvplot.Rdata are ## "balanced.sample" "calc.matrixrows.f" "cvdisc" ## "cvplot" "cvscores" "golub.info" ## "order.features" "pcp" "plot.train.test" ## "scoreplot" ## As of now (March 20, 2005), the following is the only documentation. ## The methodology is similar to, but not identical to, that in ## Maindonald & Burden (2005). Two differences are: ## (1) At present, these codes use leave-one-out cross-validation; ## (2) The methodology for forming the "global" scores is different. ## @inProceedings{Main2005, ## author = "J. H. Maindonald and C. J. Burden ", ## title = "Selection bias in plots of microarray or other data that have ## been sampled from a high-dimensional space", ## journal = {Australian & New Zealand Industrial and Applied Mathematics ## Journal}, ## volume = "46", ## pages = "C59--C74", ## year = 2005, ## booktitle = "Proceedings of 12th Computational Techniques and Applications ## Conference CTAC-2004", ## editor = "Rob May and A. J. Roberts", ## month = mar, ## note = "\protect \url {http://anziamj.austms.org.au/V46/CTAC2004/Main} ## [March 15, 2005]", ## keywords = "selection bias, microarray data, prior groupings, ## discrimination", ## } with(golub.info, table(cancer, tissue.mf)) boxplot(data.frame(Golub[,1:20])) # First 20 columns (observations) ## Random selection of 20 rows (features) boxplot(data.frame(Golub[sample(1:7129, 20), ])) library(MASS) attach(golub.info) ## For now, omit the single PB:f sample subsetB <- cancer=="allB" & tissue.mf%in%c("BM:f","BM:m","PB:m") ord15 <- order.features(Golub, cl=tissue.mf, subset=subsetB)[1:15] dfB.ord <- data.frame(t(Golub[ord15, subsetB])) # NB: transpose to observations by features, & coerce to data frame tissue.mfB <- factor(tissue.mf[subsetB]) ord15.lda <- lda(dfB.ord, grouping=tissue.mfB) scores <- predict(ord15.lda, dimen=2)$x ## Scores for the single PB:f observation df.PBf <- data.frame(t(Golub[ord15, tissue.mf=="PB:f" & cancer=="allB", drop=FALSE])) scores.PBf <- predict(ord15.lda, newdata=df.PBf, dimen=2)$x ## Warning! The plot that now follows may be misleading! scoreplot(scores=scores, cl=tissue.mfB, scores.other=scores.PBf, cl.other="PB:f") detach(golub.info) simulate.scores(nfeatures=7129, cl=rep(1:3, c(19,10,2)), cl.other=4, nf.use=15, plot.scores=TRUE) ## Use the function simulate.scores() ## Alternatively, set: ## dimen <- dim(Golub); dset <- array(rnorm(prod(dimen)), dim=dimen) ## Then replace Golub with dset, and repeat the lines above. ## Simplified version. The DAAG version has an additional parameters ## subset (to extract a subset of observations) simple.order.features <- function(dset, cl){ ## Ensure that cl is a factor & has no redundant levels cl <- factor(cl) stat <- calc.matrixrows.f(dset, cl) order(-stat) # Require largest first } attach(golub.info) Golub.BM <- Golub[, BM.PB=="BM"] cancer.BM <- cancer[BM.PB=="BM"] ## Now split each of the BTM categories between two subsets ## Uses balanced.sample(), from DAAG gp.id <- balanced.sample(cancer.BM, nset=2, seed=41) # Set seed to allow exact reproduction of the results below table(gp.id, cancer.BM) acc.train.test(dset = Golub.BM, cl = cancer.BM, traintest=gp.id) ## Use function plot.train.test() from bioDAAG plot.train.test(dset=Golub.BM, nf.use=c(5,5), cl=cancer.BM, traintest=gp.id) gp.id <- balanced.sample(cl=BTM, nset=2, seed=NULL) subsetB <- cancer=="allB" & tissue.mf%in%c("BM:f","BM:m","PB:m") tissue.mfB <- factor(tissue.mf[subsetB]) dsetB <- Golub[,subsetB] tissue.mfB.acc <- cvdisc(dsetB, cl=tissue.mfB, nf.use=1:26, selectonce=TRUE, cv=TRUE) # Accuracy measures are cv: tissue.mfB.acc$acc.cv # Resubstitution (red points): tissue.mfB.acc$acc.resub # "Select once" (gray): tissue.mfB.acc$acc.sel1 detach(golub.info) ## This code does (less than) half the required task. ## It gives biased and therefore incorrect results. maxfeatures <- 26 ord <- order.features(dsetB, tissue.mfB) selectonce.df <- data.frame(t(dsetB[ord, , drop=F])) acc.sel1 <- numeric(maxfeatures) for(nf in 1:maxfeatures){ hat.selB <- lda(tissue.mfB ~ ., data=selectonce.df[,1:nf, drop=FALSE], CV=TRUE)$class tab1 <- table(hat.selB, tissue.mfB) acc.sel1[nf] <- sum(tab1[row(tab1)==col(tab1)])/sum(tab1) } attach(golub.info) BMonly.acc <- cvdisc(Golub, cl=cancer, nf.use=1:20, subset=BM.PB=="BM") round(BMonly.acc$acc.cv, 2) tabf <- table(tissue.mfB.acc$genelist[1:3,]) nam <- names(sort(-tabf)) tab <- with(tissue.mfB.acc, table(genelist[1:3,], row(genelist[1:3,]))) tab[nam, ] ## Uses dsetB, tissue.mfB, and tissue.mfB.acc from above tissue.mfB.scores <- cvscores(cvlist = tissue.mfB.acc, nf.use = 3, cl.other = NULL) cvplot(scorelist = tissue.mfB.scores, cl.circle=NULL, prefix="B-cell subset -") detach(golub.info) BMonly.scores <- cvscores(cvlist=BMonly.acc, nf.use=13, cl.other=NULL) cvplot(scorelist=BMonly.scores, cl.circle=tissue.mf[BM.PB=="BM"], circle=tissue.mf[BM.PB=="BM"]%in%c("BM:f","BM:m"), circle.colr=c("cyan","gray"), prefix="B: BM samples -")