## Sec 11.5: *High-dimensional data, classification, and plots ## What groups are of interest? with(golub.info, table(cancer, tissue.mf)) attach(golub.info) ## Identify allB samples for that are BM:f or BM:m or PB:m subsetB <- cancer=="allB" & tissue.mf%in%c("BM:f","BM:m","PB:m") ## Form vector that identifies these as BM:f or BM:m or PB:m tissue.mfB <- factor(tissue.mf[subsetB]) ## Separate off the relevant columns of the matrix Golub GolubB <- Golub[, subsetB] detach(golub.info) ## ss 11.5.1: Classifications and associated graphs ## Preliminary data manipulation ## Footnote code ## Try e.g. boxplot(data.frame(GolubB[, 1:20])) # First 20 columns (observations) ## Random selection of 20 rows (features) boxplot(data.frame(GolubB[sample(1:7129, 20), ])) ## ss 11.5.2: Flawed graphs ## Footnote code ## Uses order.features() (DAAG); see below ord15 <- order.features(GolubB, cl=tissue.mfB)[1:15] ## Footnote code dfB.ord <- data.frame(t(GolubB[ord15, ])) ## Calculations for the left panel ## Transpose to observations by features dfB15 <- data.frame(t(GolubB[ord15, ])) library(MASS) dfB15.lda <- lda(dfB15, grouping=tissue.mfB) scores <- predict(dfB15.lda, dimen=2)$x ## Scores for the single PB:f observation attach(golub.info) df.PBf <- data.frame(t(Golub[ord15, tissue.mf=="PB:f" & cancer=="allB", drop=FALSE])) scores.PBf <- predict(dfB15.lda, newdata=df.PBf, dimen=2)$x detach(golub.info) ## Warning! The plot that now follows may be misleading! scoreplot(scores=scores, cl=tissue.mfB, scores.other=scores.PBf, cl.other="PB:f") simscores <- simulate.scores(nfeatures=7129, cl=rep(1:3, c(19,10,2)), cl.other=4, nf.use=15, seed=41) # Returns list elements: scores, cl, scores.other & cl.other with(simscores, scoreplot(scores, cl, scores.other, cl.other)) ## Footnote code ## Use the function simulate.scores() ## Alternatively, set: ## dimen <- dim(GolubB); rsetB <- array(rnorm(prod(dimen)), dim=dimen) ## Then replace GolubB with rsetB, and repeat the lines above. ## Distributional extremes ## No legal code: ## The function \texttt{order.features()} ## Footnote code ## Simplified version. The DAAG version has an additional parameters ## subset (to extract a subset of observations) "simple.order.features" <- function(dset, cl){ ## Ensure that cl is a factor & has no redundant levels cl <- factor(cl) stat <- calc.matrixrows.f(dset, cl) order(-stat) # Require largest first } ## ss 11.5.3: Accuracies and Scores for test data attach(golub.info) Golub.BM <- Golub[, BM.PB=="BM"] cancer.BM <- cancer[BM.PB=="BM"] ## Now split each of the cancer.BM categories between two subsets ## Uses balanced.sample(), from DAAG gp.id <- balanced.sample(cancer.BM, nset=2, seed=31) # Set seed to allow exact reproduction of the results below detach(golub.info) table(gp.id, cancer.BM) accboth <- acc.train.test(dset = Golub.BM, cl = cancer.BM, ## Footnote code ## Use function plot.train.test() from hddplot plot.train.test(dset=Golub.BM, nfeatures=c(7,12), cl=cancer.BM, traintest=gp.id) gp.id <- balanced.sample(cl=cancer.BM, nset=2, seed=NULL) rbind(accboth$sub1.2[1:12],accboth$sub2.1[1:12]) ## Find the order of the first list in the second, if present match(accboth$sub1.2[1:12],accboth$sub2.1[1:12]) ## Cross-validation to determine the optimum number of features ## No legal code: ## Feature selection at each fold ## This code does (less than) half the required task. ## It gives biased and therefore incorrect results. maxfeatures <- 26 ord <- order.features(GolubB, source.mfB) selectonce.df <- data.frame(t(GolubB[ord, , drop=F])) acc.sel1 <- numeric(maxfeatures) for(nf in 1:maxfeatures){ hat.selB <- lda(tissue.mfB ~ ., data=selectonce.df[, 1:nf, drop=FALSE], CV=TRUE)$class tab1 <- table(hat.selB, tissue.mfB) acc.sel1[nf] <- sum(tab1[row(tab1)==col(tab1)])/sum(tab1) } ## Cross-validation: bone marrow (\texttt{BM}) samples only ## Footnote code attach(golub.info) BMonly.acc <- cvdisc(Golub, cl=cancer, nf.use=1:20, subset=BM.PB=="BM") round(BMonly.acc$acc.cv, 2) detach(golub.info) ## ss 11.5.4: Graphs derived from the cross-validation process tabf <- table(tissue.mfB.acc$genelist[1:3,]) nam <- names(sort(-tabf)) tab <- with(tissue.mfB.acc, table(genelist[1:3,], row(genelist[1:3,]))) tab[nam, ] ## Footnote code attach(golub.info) ## Uses tissue.mfB.acc from above tissue.mfB.scores <- cvscores(cvlist = tissue.mfB.acc, nfeatures = 3, cl.other = NULL) cvplot(scorelist = tissue.mfB.scores, cl.circle=NULL, prefix="B-cell subset -") detach(golub.info) ## Footnote code BMonly.scores <- cvscores(cvlist=BMonly.acc, nfeatures=13, cl.other=NULL) cvplot(scorelist=BMonly.scores, cl.circle=tissue.mf[BM.PB=="BM"], circle=source.mf[BM.PB=="BM"]%in%c("BM:f","BM:m"), circle.colr=c("cyan","gray"), prefix="B: BM samples -")