## 1. Starting Up ## 1.1 Getting started under Windows ## 1.2 Use of an Editor Script Window ## 1.3 A Short R Session austpop <- read.table("d:/austpop.txt", header=TRUE) austpop names(austpop) ## 1.3.1 Entry of Data at the Command Line elasticband <- data.frame(stretch=c(46,54,48,50,44,42,52), distance=c(148,182,173,166,109,141,166)) ## 1.3.2 Entry and/or editing of data in an editor window elasticband <- edit(elasticband) ## 1.3.3 Options for read.table() ## 1.3.4 Options for plot() and allied functions ## 1.4 Further Notational Details ## 1.5 On-line Help help() help(plot) help.search("matrix") apropos("matrix") ## 1.6 The Loading or Attaching of Datasets attach("usingR.RData") ## 1.7 Exercises 2. An Overview of R ## 2.1 The Uses of R ## 2.1.1 R may be used as a calculator. 2+2 sqrt(10) 2*3*4*5 1000*(1+0.075)^5 - 1000 # Interest on $1000, compounded annually # at 7.5% p.a. for five years pi # R knows about pi 2*pi*6378 #Circumference of Earth at Equator, in km; radius is 6378 km sin(c(30,60,90)*pi/180) # Convert angles to radians, then take sin() ## 2.1.2 R will provide numerical or graphical summaries of data load("hills.Rdata") # Assumes hills.Rdata is in the working directory summary(hills) 2.1.3 R has extensive graphical abilities pairs(hills) ## 2.1.4 R will handle a variety of specific analyses options(digits=3) cor(hills) cor(log(hills)) plot(distance ~ stretch,data=elasticband, pch=16) elastic.lm <- lm(distance~stretch,data=elasticband) lm(distance ~stretch,data=elasticband) summary(lm(distance~stretch,data=elasticband)) ## 2.1.5 R is an Interactive Programming Language celsius <- 25:30 fahrenheit <- 9/5*celsius+32 conversion <- data.frame(Celsius=celsius, Fahrenheit=fahrenheit) print(conversion) ## 2.2 R Objects save.image() # Save contents of workspace, into the file .RData save.image(file="archive.RData") # Save into the file archive.RData save(celsius, fahrenheit, file="tempscales.RData") attach("tempscales.RData") ls(pos=2) # Check the contents of the file that has been attached ## *2.3 Looping for (i in 1:10) print(i) # Celsius to Fahrenheit for (celsius in 25:30) + print(c(celsius, 9/5*celsius + 32)) ## 2.3.1 More on looping answer <- 0 for (j in c(31,51,91)){answer <- j+answer} answer sum(c(31,51,91)) ## 2.4 Vectors c(2,3,5,2,7,1) 3:10 # The numbers 3, 4, .., 10 c(TRUE,FALSE,FALSE,FALSE,TRUE,TRUE,FALSE) c("Canberra","Sydney","Newcastle","Darwin") ## 2.4.1 Joining (concatenating) vectors x <- c(2,3,5,2,7,1) x y <- c(10,15,12) y z <- c(x, y) z ## 2.4.2 Subsets of Vectors x <- c(3,11,8,15,12) # Assign to x the values 3, 11, 8, 15, 12 x[c(2,4)] # Extract elements (rows) 2 and 4 x <- c(3,11,8,15,12) x[-c(2,3)] x>10 # This generates a vector of logical (T or F) x[x>10] ## 2.4.3 The Use of NA in Vector Subscripts y <- c(1, NA, 3, 0, NA) y[is.na(y)] <- 0 ## 2.4.4 Factors gender <- c(rep("female",691), rep("male",692)) gender <- factor(gender): levels(gender) # Assumes gender is a factor, created as above gender <- relevel(gender, ref="male") gender <- factor(gender, levels=c("male", "female")) gender <- factor(c(rep("female",691), rep("male",692))) table(gender) gender <- factor(gender, levels=c("male", "female")) table(gender) gender <- factor(gender, levels=c("Male", "female")) # Erroneous - "male" rows now hold missing values table(gender) rm(gender) # Remove gender ## 2.5 Data Frames Cars93.summary type <- Cars93.summary$abbrev type <- Cars93.summary[,4] type <- Cars93.summary[,"abbrev"] type <- Cars93.summary[[4]] # Take the object that is stored # in the fourth list element. ## 2.5.1 Data frames as lists ## 2.5.2 Inclusion of character string vectors in data frames ## 2.5.3 Built-in data sets summary(trees) ## 2.6 Common Useful Functions print() # Prints a single R object cat() # Prints multiple objects, one after the other length() # Number of elements in a vector or of a list mean() median() range() unique() # Gives the vector of distinct values diff() # Replace a vector by the vector of first differences # N. B. diff(x) has one less element than x sort() # Sort elements into order, but omitting NAs order() # x[order(x)] orders elements of x, with NAs last cumsum() cumprod() rev() # reverse the order of vector elements x <- c(1, 20, 2, NA, 22) order(x) x[order(x)] sort(x) ## 2.6.1 Applying a function to all columns of a data frame sapply(rainforest, is.factor) sapply(rainforest[,-7], range) # The final column (7) is a factor range(rainforest$branch, na.rm=TRUE) # Omit NAs, then determine the range sapply(rainforest[,-7], range, na.rm=TRUE) ## 2.7 Making Tables library(lattice) # The data frame barley accompanies lattice table(barley$year, barley$site) x <- c(1,5,NA,8) x <- factor(x) x factor(x,exclude=NULL) ## 2.7.1 Numbers of NAs in subgroups of the data table(rainforest$species, !is.na(rainforest$branch)) ## 2.8 The Search List search() library(MASS) search() names(primates) Bodywt attach(primates) # R will now know where to find Bodywt Bodywt av <- with(primates, mean(Bodywt)) ## 2.9 Functions in R ## 2.9.1 An Approximate Miles to Kilometers Conversion miles.to.km <- function(miles)miles*8/5 miles.to.km(175) # Approximate distance to Sydney, in miles miles.to.km(c(100,200,300)) ## 2.9.2 A Plotting function attach(florida) plot(BUSH, BUCHANAN, xlab="Bush", ylab="Buchanan") detach(florida) # In S-PLUS, specify detach("florida") plot.florida <- function(xvar="BUSH", yvar="BUCHANAN"){ x <- florida[,xvar] y <- florida[,yvar] plot(x, y, xlab=xvar,ylab=yvar) mtext(side=3, line=1.75, "Votes in Florida, by county, in \nthe 2000 US Presidential election") } plot.florida(yvar="NADER") # yvar="NADER" over-rides the default plot.florida(xvar="GORE", yvar="NADER") ## 2.10 More Detailed Information ## 2.11 Exercises ## ex1. ## a) answer <- 0 for (j in 3:5){ answer <- j+answer } ## b) answer<- 10 for (j in 3:5){ answer <- j+answer } ## c) answer <- 10 for (j in 3:5){ answer <- j*answer } ## 3. Plotting demo(graphics) ## 3.1 plot () and allied functions plot(y ~ x) # Use a formula to specify the graph plot(x, y) # plot((0:20)*pi/10, sin((0:20)*pi/10)) plot((1:30)*0.92, sin((1:30)*0.92)) attach(elasticband) # R now knows where to find distance & stretch plot(distance ~ stretch) plot(ACT ~ Year, data=austpop, type="l") plot(ACT ~ Year, data=austpop, type="b") attach(austpop) plot(spline(Year, ACT), type="l") # Fit smooth curve through points detach(austpop) # In S-PLUS, specify detach("austpop") ## 3.1.1 Plot methods for other classes of object plot(hills) # Has the same effect as pairs(hills) ## 3.2 Fine control - Parameter settings par(cex=1.25) # character expansion oldpar <- par(cex=1.25, mex=1.25) # mex=1.25 expands the margin by 25% attach(elasticband) oldpar <- par(cex=1.5) plot(distance ~ stretch) par(oldpar) # Restores the earlier settings detach(elasticband) oldpar <- par(cex=1.25) on.exit(par(oldpar)) ## 3.2.1 Multiple plots on the one page par(mfrow=c(2,2), pch=16) attach(Animals) # This dataset is in the MASS package, which must be attached plot(body, brain) plot(sqrt(body), sqrt(brain)) plot((body)^0.1, (brain)^0.1) plot(log(body),log(brain)) detach(Animals) par(mfrow=c(1,1), pch=1) # Restore to 1 figure per page ## 3.2.2 The shape of the graph sheet ## 3.3 Adding points, lines and text primates attach(primates) # Needed if primates is not already attached. plot(Bodywt, Brainwt) text(x=Bodywt, y=Brainwt, labels=row.names(primates), pos=4) # pos=4 positions text to the right of the point plot(x=Bodywt, y=Brainwt, pch=16, xlab="Body weight (kg)", ylab="Brain weight (g)", xlim=c(0,280), ylim=c(0,1350)) # Specify xlim so that there is room for the labels text(x=Bodywt, y=Brainwt, labels=row.names(primates), pos=4) detach(primates) text(x=Bodywt, y=Brainwt, labels=row.names(primates), pos=2) ## 3.3.1 Size, colour and choice of plotting symbol plot(1, 1, xlim=c(1, 7.5), ylim=c(1.75,5), type="n", axes=F, xlab="", ylab="") # Do not plot points box() points(1:7, rep(4.5, 7), cex=1:7, col=1:7, pch=0:6) text(1:7,rep(3.5, 7), labels=paste(0:6), cex=1:7, col=1:7) points(1:7,rep(2.5,7), pch=(0:6)+7) # Plot symbols 7 to 13 text((1:7), rep(2.5,7), paste((0:6)+7), pos=4) # Label with symbol number points(1:7,rep(2,7), pch=(0:6)+14) # Plot symbols 14 to 20 text((1:7), rep(2,7), paste((0:6)+14), pos=4) # Labels with symbol number view.colours <- function(){ plot(1, 1, xlim=c(0,14), ylim=c(0,3), type="n", axes=F, xlab="",ylab="") text(1:6, rep(2.5,6), paste(1:6), col=palette()[1:6], cex=2.5) text(10, 2.5, "Default palette", adj=0) rainchars <- c("R","O","Y","G","B","I","V") text(1:7, rep(1.5,7), rainchars, col=rainbow(7), cex=2.5) text(10, 1.5, "rainbow(7)", adj=0) cmtxt <- substring("cm.colors", 1:9,1:9) # Split "cm.colors" into its 9 characters text(1:9, rep(0.5,9), cmtxt, col=cm.colors(9), cex=3) text(10, 0.5, "cm.colors(9)", adj=0) } view.colours() ## 3.3.2 Adding Text in the Margin ## 3.4 Identification and Location on the Figure Region ## 3.4.1 identify() attach(florida) plot(BUSH, BUCHANAN, xlab="Bush", ylab="Buchanan") identify(BUSH, BUCHANAN, County) detach(florida) ## 3.4.2 locator() attach(florida) # if not already attached plot(BUSH, BUCHANAN, xlab="Bush", ylab="Buchanan") locator() detach(florida) ## 3.5 Plots that show the distribution of data values ## 3.5.1 Histograms and density plots attach(possum) here <- sex == "f" hist(totlngth[here], breaks = 72.5 + (0:5) * 5, ylim = c(0, 22), xlab="Total length", main ="A: Breaks at 72.5, 77.5, ...") detach(possum) attach(possum) plot(density(totlngth[here]),type="l") detach(possum) attach(possum) here <- sex == "f" dens <- density(totlngth[here]) xlim <- range(dens$x) ylim <- range(dens$y) hist(totlngth[here], breaks = 72.5 + (0:5) * 5, probability = TRUE, xlim = xlim, ylim = ylim, xlab="Total length", main="") lines(dens) detach(possum) ## 3.5.3 Boxplots attach(possum) boxplot(totlngth[here], horizontal=TRUE) rug(totlngth[here], side=1) detach(possum) ## 3.5.4 Normal probability plots x11(width=8, height=6) # This is a better shape for this plot attach(possum) here <- sex == "f" par(mfrow=c(2,4)) # A 2 by 4 layout of plots y <- totlngth[here] qqnorm(y,xlab="", ylab="Length", main="Possums") for(i in 1:7)qqnorm(rnorm(43),xlab="", ylab="Simulated lengths", main="Simulated") detach(possum) # Before continuing, type dev.off() ## 3.6 Other Useful Plotting Functions ## 3.6.1 Scatterplot smoothing attach(ais) here<- sex=="f" plot(pcBfat[here]~ht[here], xlab = "Height", ylab = "% Body fat") panel.smooth(ht[here],pcBfat[here]) detach(ais) ## 3.6.2 Adding lines to plots attach(ais) here<- sex=="f" plot(pcBfat[here] ~ ht[here], xlab = "Height", ylab = "% Body fat") abline(lm(pcBfat[here] ~ ht[here])) detach(ais) ## 3.6.3 Rugplots attach(ais) here <- sex == "f" oldpar <- par(mar=c(4.1,1.1,1.1, 1.1), mgp=c(2.25, 0.75,0)) boxplot(ht[here], boxwex = 0.35, ylab = "Height", horizontal=TRUE) rug(ht[here], side = 1) par(oldpar) detach(ais) ## 3.6.4 Scatterplot matrices ## 3.6.5 Dotcharts dotchart(islands) # vector of named numeric values, in the datasets base package dotchart(islands, cex=0.5) ## 3.7 Plotting Mathematical Symbols ##x <- 1:15 plot(x, y, xlab="Radius", ylab=expression(Area == pi*r^2)) demo(graphics) ## 3.8 Guidelines for Graphs 3.9 Exercises 3.10 References 4. Lattice graphics ## 4.1 Examples that Present Panels of Scatterplots - Using xyplot() xyplot(csoa ~ it | sex * agegp, data=tinting) # Simple use of xyplot() xyplot(csoa~it|sex*agegp, data=tinting, groups=target, auto.key=list(columns=2)) xyplot(csoa~it|sex*agegp, data=tinting, panel=panel.superpose, groups=target, type=c("p","smooth") xyplot(csoa~it|sex*agegp, data=tinting, groups=tint, auto.key=list(columns=3)) ## 4.2 Some further examples of lattice plots ## 4.2.1 Plotting columns in parallel library(DAAGxtras) xyplot(Beer+Spirit+Wine ~ Year | Country, outer=TRUE, data=grog) xyplot(Beer+Spirit+Wine ~ Year, groups=Country, outer=TRUE, data=grog) xyplot(Beer+Spirit+Wine ~ Year | Country, outer=FALSE, data=grog, auto.key=list(columns=3), par.settings=simpleTheme(pch=16, cex=2) ) ## 4.2.2 Fixed, sliced and free scales library(DAAG) ## scale="fixed" xyplot(BC+Alberta ~ Date, data=jobs, outer=TRUE) ## scale="sliced" - different slices of same scale xyplot(BC+Alberta ~ Date, data=jobs, outer=TRUE, scales=list(y=list(relation="sliced")) ) ## scale="free" - independent scales xyplot(BC+Alberta ~ Date, data=jobs, outer=TRUE, scales=list(y=list(relation="free")) ) ## 4.3 An incomplete list of lattice Functions splom( ~ data.frame) # Scatterplot matrix bwplot(factor ~ numeric , . .) # Box and whisker plot qqnorm(numeric , . .) # normal probability plots dotplot(factor ~ numeric , . .) # 1-dim. Display stripplot(factor ~ numeric , . .) # 1-dim. Display barchart(character ~ numeric , . .) histogram( ~ numeric , . .) densityplot( ~ numeric , . .) # Smoothed version of histogram qqmath(numeric ~ numeric , . .) # QQ plot splom( ~ dataframe, . .) # Scatterplot matrix parallel( ~ dataframe, . .) # Parallel coordinate plots cloud(numeric ~ numeric * numeric, . .) # 3-D plot contourplot(numeric ~ numeric * numeric, . .) # Contour plot levelplot(numeric ~ numeric * numeric, . .) # Variation on a contour plot ## 4.4 Exercises 5. Linear (Multiple Regression) Models and Analysis of Variance ## 5.1 The Model Formula in Straight Line Regression plot(distance ~ stretch, data=elasticband) elastic.lm <- lm(distance ~ stretch, data=elasticband) options(digits=4) summary(elastic.lm) ## 5.2 Regression Objects names(elastic.lm) coef(elastic.lm) resid(elastic.lm) x11(width=7, height=2, pointsize=10) par(mfrow = c(1, 4), mar=c(5.1,4.1,2.1,1.1)) plot(elastic.lm) par(mfrow=c(1,1)) ## 5.3 Model Formulae, and the X Matrix model.matrix(distance ~ stretch, data=elasticband) model.matrix(elastic.lm) ## 5.3.1 Model Formulae in General *5.3.2 Manipulating Model Formulae names(elasticband) nam <- names(elasticband) formds <- formula(paste(nam[1],"~",nam[2])) lm(formds, data=elasticband) ## 5.4 Multiple Linear Regression Models ## 5.4.1 The data frame Rubber library(MASS) # if needed (the dataset Rubber is in the MASS package) pairs(Rubber) Rubber.lm <- lm(loss~hard+tens, data=Rubber) options(digits=3) summary(Rubber.lm) par(mfrow=c(1,2)) termplot(Rubber.lm, partial=TRUE, smooth=panel.smooth) par(mfrow=c(1,1)) ## 5.4.2 Weights of Books oddbooks logbooks <- log(oddbooks) # We might expect weight to be # proportional to thick * height * width logbooks.lm1<-lm(weight~thick,data=logbooks) summary(logbooks.lm1)$coef logbooks.lm2<-lm(weight~thick+height,data=logbooks) summary(logbooks.lm2)$coef logbooks.lm3<-lm(weight~thick+height+width,data=logbooks) summary(logbooks.lm3)$coef ## 5.5 Polynomial and Spline Regression ## 5.5.1 Polynomial Terms in Linear Models plot(grain ~ rate, data=seedrates) # Plot the data seedrates.lm2 <- lm(grain ~ rate+I(rate^2), data=seedrates) summary(seedrates.lm2) hat <- predict(seedrates.lm2) lines(spline(seedrates$rate, hat)) # Placing the spline fit through the fitted points allows a smooth curve. # For this to work the values of seedrates$rate must be ordered. model.matrix(grain~rate+I(rate^2),data=seedrates) ## 5.5.2 What order of polynomial? seedrates.lm1<-lm(grain~rate,data=seedrates) seedrates.lm2<-lm(grain~rate+I(rate^2),data=seedrates) anova(seedrates.lm2,seedrates.lm1) # However we have an independent estimate of the error mean # square. The estimate is 0.17^2, on 35 df. 1-pf(0.16/0.17^2, 1, 35) ## 5.5.3 Pointwise confidence bounds for the fitted curve plot(grain ~ rate, data = seedrates, pch = 16, xlim = c(50, 175), ylim = c(15.5, 22),xlab="Seeding rate",ylab="Grains per head") new.df <- data.frame(rate = c((4:14) * 12.5)) seedrates.lm2 <- lm(grain ~ rate + I(rate^2), data = seedrates) pred2 <- predict(seedrates.lm2, newdata = new.df, interval="confidence") hat2 <- data.frame(fit=pred2[,"fit"],lower=pred2[,"lwr"], upper=pred2[,"upr"]) attach(new.df) lines(rate, hat2$fit) lines(rate,hat2$lower,lty=2) lines(rate, hat2$upper,lty=2) detach(new.df) ## 5.5.4 Spline Terms in Linear Models plot(dist~speed,data=cars) library(splines) cars.lm<-lm(dist~bs(speed),data=cars) # By default, there are no knots hat<-predict(cars.lm) lines(cars$speed,hat,lty=3) # NB assumes values of speed are sorted cars.lm5 <- lm(dist~bs(speed,5),data=cars) # try for a closer fit (1 knot) ci5<-predict(cars.lm5,interval="confidence",se.fit=TRUE) names(ci5) lines(cars$speed,ci5$fit[,"fit"]) lines(cars$speed,ci5$fit[,"lwr"],lty=2) lines(cars$speed,ci5$fit[,"upr"],lty=2) ## 5.6 Using Factors in R Models insulation <- factor(c(rep("without", 8), rep("with", 7))) # 8 without, then 7 with # `with' precedes `without' in alphanumeric order, & is the baseline kWh <- c(10225, 10689, 14683, 6584, 8541, 12086, 12467, 12669, 9708, 6700, 4307, 10315, 8017, 8162, 8022) insulation <- factor(c(rep("without", 8), rep("with", 7))) # 8 without, then 7 with kWh <- c(10225, 10689, 14683, 6584, 8541, 12086, 12467, + 12669, 9708, 6700, 4307, 10315, 8017, 8162, 8022) insulation.lm <- lm(kWh ~ insulation) summary(insulation.lm, corr=F) ## 5.6.1 The Model Matrix model.matrix(kWh~insulation) *5.6.2 Other Choices of Contrasts insulation <- relevel(insulation, baseline="without") # Make `without' the baseline options(contrasts = c("contr.sum", "contr.poly"), digits = 2) # Try the `sum' contrasts insulation <- factor(insulation, levels=c("without", "with")) # Make `without' the baseline insulation.lm <- lm(kWh ~ insulation) summary(insulation.lm, corr=F) insulation <- C(insulation, contr=treatment) ## 5.7 Multiple Lines - Different Regression Lines for Different Species plot(logheart ~ logweight, data=dolphins) # Plot the data options(digits=4) cet.lm1 <- lm(logheart ~ logweight, data = dolphins) summary(cet.lm1, corr=F) cet.lm2 <- lm(logheart ~ factor(species) + logweight, data=dolphins) model.matrix(cet.lm2) cet.lm3 <- lm(logheart ~ factor(species) + logweight + factor(species):logweight, data=dolphins) model.matrix(cet.lm3) anova(cet.lm1,cet.lm2,cet.lm3) ## 5.8 aov models (Analysis of Variance) ## 5.8.1 Plant Growth Example attach(PlantGrowth) # PlantGrowth is from the base datasets package boxplot(split(weight,group)) # Looks OK PlantGrowth.aov <- aov(weight~group) summary(PlantGrowth.aov) summary.lm(PlantGrowth.aov) help(cabbages) # cabbages is from the MASS package names(cabbages) coplot(HeadWt~VitC|Cult+Date,data=cabbages) VitC.aov<-aov(VitC~Cult+Date,data=cabbages) summary(VitC.aov) ## *5.8.2 Shading of Kiwifruit Vines kiwishade$shade <- relevel(kiwishade$shade, ref="none") ## Make sure that the level "none" (no shade) is used as reference kiwishade.aov<-aov(yield~block+shade+Error(block:shade),data=kiwishade) summary(kiwishade.aov) coef(kiwishade.aov) ## 5.9 Exercises ## ex6. Type hosp<-rep(c("RNC","Hunter","Mater"),2) hosp fhosp<-factor(hosp) levels(fhosp) ## 5.10 References ## 6. Multivariate and Tree-based Methods ## 6.1 Multivariate EDA, and Principal Components Analysis pairs(possum[,6:14], col=palette()[as.integer(possum$sex)]) pairs(possum[,6:14], col=palette()[as.integer(possum$site)]) here<-!is.na(possum$footlgth) # We need to exclude missing values print(sum(!here)) # Check how many values are missing possum.prc <- princomp(log(possum[here,6:14])) # Principal components # Print scores on second pc versus scores on first pc, # by populations and sex, identified by site xyplot(possum.prc$scores[,2] ~ possum.prc$scores[,1] | possum$Pop[here]+possum$sex[here], groups=possum$site, auto.key=list(columns=3)) ## 6.2 Cluster Analysis ## 6.3 Discriminant Analysis library(MASS) # Only if not already attached. here<- !is.na(possum$footlgth) possum.lda <- lda(site ~ hdlngth+skullw+totlngth+ + taillgth+footlgth+earconch+eye+chest+belly,data=possum, subset=here) options(digits=4) possum.lda$svd # Examine the singular values > plot(possum.lda, dimen=3) # Scatterplot matrix for scores on 1st 3 canonical variates, as in Figure22 ## 6.4 Decision Tree models (Tree-based models) library(rpart) # Use fgl: Forensic glass fragment data; from MASS package glass.tree <- rpart(type ~ RI+Na+Mg+Al+Si+K+Ca+Ba+Fe, data=fgl) plot(glass.tree); text(glass.tree) summary(glass.tree) ## 6.5 Exercises . ## 6.6 References ## *7. R Data Structures ## 7.1 Vectors ## 7.1.1 Subsets of Vectors c(Andreas=178, John=185, Jeff=183)[c("John","Jeff")] ## 7.1.2 Patterned Data rep(c(2,3,5),4) rep(c(2,3,5),c(4,4,4)) # An alternative is rep(c(2,3,5), each=4) ## 7.2 Missing Values x <- c(1,6,2,NA) is.na(x) # TRUE for when NA appears, and otherwise FALSE x==NA # All elements are set to NA NA==NA y[x>2] <- x[x>2] ## 7.3 Data frames ## 7.3.1 Extraction of Component Parts of Data frames names(barley) levels(barley$site) Duluth1932 <- barley[barley$year=="1932" & barley$site=="Duluth", + c("variety","yield")] ## 7.3.2 Data Sets that Accompany R Packages data(package="datasets") ## 7.4 Data Entry Issues ## 7.4.1 Idiosyncrasies ## 7.4.2 Missing values when using read.table() ## 7.4.3 Separators when using read.table() ## 7.5 Factors and Ordered Factors as.character(islandcities$country) unclass(islandcities$country) table(islandcities$country) lev <- levels(islandcities$country) lev[c(7,4,6,2,5,3,1)] country <- factor(islandcities$country, levels=lev[c(7,4,6,2,5,3,1)]) table(country) ## 7.6 Ordered Factors stress.level<-rep(c("low","medium","high"),2) ordf.stress<-ordered(stress.level, levels=c("low","medium","high")) ordf.stress ordf.stress<"medium" ordf.stress>="medium" class(ordf.stress) ## 7.7 Lists elastic.lm <- lm(distance~stretch, data=elasticband) names(elastic.lm) elastic.lm$coefficients elastic.lm[["coefficients"]] elastic.lm[[1]] elastic.lm[1] options(digits=3) elastic.lm$residuals elastic.lm$call mode(elastic.lm$call) ## *7.8 Matrices and Arrays xx <- matrix(1:6,ncol=3) # Equivalently, enter matrix(1:6,nrow=2) xx x <- as.vector(xx) dim(xx) [1] 2 3 x34 <- matrix(1:12,ncol=4) x34 x34[2:3,c(1,4)] # Extract rows 2 & 3 & columns 1 & 4 x34[2,] # Extract the second row x34[-2,] # Extract all rows except the second x34[-2,-3] # Extract the matrix obtained by omitting row 2 & column 3 ## 7.8.1 Arrays x <- 1:24 dim(x) <- c(2,12) x dim(x) <-c(3,4,2) x ## 7.8.2 Conversion of Numeric Data frames into Matrices ## 7.9 Exercises ## a) answer <- c(2, 7, 1, 5, 12, 3, 4) for (j in 2:length(answer)){ answer[j] <- max(answer[j],answer[j-1])} ## b) answer <- c(2, 7, 1, 5, 12, 3, 4) for (j in 2:length(answer)){ answer[j] <- sum(answer[j],answer[j-1])} 8. Functions ## 8.1 Functions for Confidence Intervals and Tests ## 8.1.1 The t-test and associated confidence interval ## 8.1.2 Chi-Square tests for two-way tables ## 8.2 Matching and Ordering match(, ) ## For each element of , returns the ## position of the first occurrence in order() ## Returns the vector of subscripts giving ## the order in which elements must be taken ## so that will be sorted. rank() ## Returns the ranks of the successive elements. x <- rep(1:5,rep(3,5)) x two4 <- x %in% c(2,4) two4 # Now pick out the 2s and the 4s x[two4] ## 8.3 String Functions substring(, , ) nchar() ## Returns vector of number of characters in each element. ## *8.3.1 Operations with Vectors of Text Strings - A Further Example car.brandnames <- sapply(strsplit(as.character(Cars93$Make), " ", fixed=TRUE), + function(x)x[1]) car.brandnames[1:5] ## 8.4 Application of a Function to the Columns of an Array or Data Frame apply(, , ) lapply(, ) ## N. B. A dataframe is a list. Output is a list. sapply(, ) ## As lapply(), but simplify (e.g. to a vector ## or matrix), if possible. ## 8.4.1 apply() apply(airquality,2,mean) # All elements must be numeric! apply(airquality,2,mean,na.rm=TRUE) ## 8.4.2 sapply() sapply(airquality, function(x)sum(is.na(x))) sapply(moths,is.factor) # Determine which columns are factors # How many levels does each factor have? sapply(moths, function(x)if(!is.factor(x))return(0) else length(levels(x))) ## *8.5 aggregate() and tapply() str(cabbages) attach(cabbages) aggregate(HeadWt, by=list(Cult=Cult, Date=Date), FUN=mean) *8.6 Merging Data Frames Cars93.summary new.Cars93 <- merge(x=Cars93,y=Cars93.summary[,4,drop=F], by.x="Type",by.y="row.names") ## 8.7 Dates # Electricity Billing Dates dd <- as.Date(c("2003/08/24","2003/11/23","2004/02/22","2004/05/23")) diff(dd) as.Date("1/1/1960", format="%d/%m/%Y") as.Date("1:12:1960",format="%d:%m:%Y") as.Date("1960-12-1")-as.Date("1960-1-1") as.Date("31/12/1960","%d/%m/%Y") as.integer(as.Date("1/1/1970","%d/%m/%Y") as.integer(as.Date("1/1/2000","%d/%m/%Y")) dec1 <- as.Date("2004-12-1") format(dec1, format="%b %d %Y") format(dec1, format="%a %b %d %Y") ## 8.8. Writing Functions and other Code fahrenheit2celsius <- function(fahrenheit=32:40)(fahrenheit-32)*5/9 # Now invoke the function fahrenheit2celsius(c(40,50,60)) mean.and.sd <- function(x=1:10){ + av <- mean(x) + sd <- sqrt(var(x)) + c(mean=av, SD=sd) + } > # Now invoke the function mean.and.sd() mean.and.sd(hills$climb) ## 8.8.1 Syntax and Semantics ## 8.8.2 A Function that gives Data Frame Details faclev <- function(x)if(!is.factor(x))return(0) else length(levels(x)) sapply(moths, faclev) sapply(moths, function(x)if(!is.factor(x))return(0) else length(levels(x))) check.df <- function(df=moths) sapply(df, function(x)if(!is.factor(x))return(0) else length(levels(x))) ## 8.8.3 Compare Working Directory Data Sets with a Reference Set dsetnames <- objects() additions <- function(objnames = dsetnames) { newnames <- objects(pos=1) existing <- as.logical(match(newnames, objnames, nomatch = 0)) newnames[!existing] } additions(dsetnames) ## 8.8.4 Issues for the Writing and Use of Functions ## 8.8.5 Functions as aids to Data Management attributes(elasticband)$title <- "Extent of stretch of band, and Resulting Distance" ## 8.8.6 Graphs ## 8.8.7 A Simulation Example guesses <- runif(100) correct.answers <- 1*(guesses < .2) correct.answers ## 8.8.8 Poisson Random Numbers ## 8.9 Exercises ## ex4. plot.one <- function(){ xyrange <- range(milk) # Calculates the range of all values in the data frame par(pin=c(6.75, 6.75)) # Set plotting area = 6.75 in. by 6.75 in. plot(four, one, data=milk, xlim=xyrange, ylim=xyrange, pch=16) abline(0,1) # Line where four = one } ## *9. GLM, and General Non-linear Models ## 9.1 A Taxonomy of Extensions to the Linear Model ## 9.2 Logistic Regression ## 9.2.1 Anesthetic Depth Example anaes.logit <- glm(nomove ~ conc, family = binomial(link = logit), + data = anesthetic) summary(anaes.logit) ## 9.3 glm models (Generalized Linear Regression Modelling) anaes.logit <- glm(nomove ~ conc, family = binomial(link = logit), data=anesthetic) ## 9.3.2 Data in the form of counts ## 9.3.3 The gaussian family # Dataset airquality, from datasets package air.glm<-glm(Ozone^(1/3) ~ Solar.R + Wind + Temp, data = airquality) # Assumes gaussian family, i.e. normal errors model summary(air.glm) ## 9.4 Models that Include Smooth Spline Terms ## 9.4.1 Dewpoint Data dewpoint.lm <- lm(dewpoint ~ bs(mintemp) + bs(maxtemp), data = dewpoint) options(digits=3) summary(dewpoint.lm) ## 9.5 Survival Analysis ## 9.6 Non-linear Models ## 9.7 Model Summaries methods(summary) 9.8 Further Elaborations ## 9.9 Exercises ## 9.10 References *10. Multi-level Models, Repeated Measures and Time Series ## 10.1 Multi-Level Models, Including Repeated Measures Models ## 10.1.1 The Kiwifruit Shading Data, Again library(nlme) kiwishade$plot<-factor(paste(kiwishade$block, kiwishade$shade, sep=".")) kiwishade.lme<-lme(yield~shade,random=~1|block/plot, data=kiwishade) summary(kiwishade.lme) anova(kiwishade.lme) intervals(kiwishade.lme) kiwishade.aov<-aov(yield~block+shade+Error(block:shade),data=kiwishade) summary(kiwishade.aov) ## 10.1.2 The Tinting of Car Windows itstar.lme<-lme(log(it)~tint*target*agegp*sex, random=~1|id, data=tinting,method="ML") it2.lme<-lme(log(it)~(tint+target+agegp+sex)^2, random=~1|id, data=tinting,method="ML") it1.lme<-lme(log(it)~(tint+target+agegp+sex), random=~1|id, data=tinting,method="ML") anova(itstar.lme,it2.lme,it1.lme) it2.reml<-update(it2.lme,method="REML") options(digits=3) summary(it2.reml)$tTable ## 10.1.3 The Michelson Speed of Light Data library(MASS) # if needed michelson$Run <- as.numeric(michelson$Run) # Ensure Run is a variable mich.lme1 <- lme(fixed = Speed ~ Run, data = michelson, random = ~ Run| Expt, correlation = corAR1(form = ~ 1 | Expt), weights = varIdent(form = ~ 1 | Expt)) summary(mich.lme1) ## 10.2 Time Series Models ## 10.3 Exercises ## 10.4 References ## *11. Advanced Programming Topics ## 11.1. Methods fac<-ordered(1:3) class(fac) print.ordered function (x, quote = FALSE) { if (length(x) <= 0) cat("ordered(0)\n") else NextMethod("print") cat("Levels: ", paste(levels(x), collapse = " < "), "\n") invisible(x) } ## 11.2 Extracting Arguments to Functions extract.arg <- function (a) { s <- substitute(a) as.character(s) } extract.arg(a=xy) deparse.args <- function (a) { s <- substitute (a) if(mode(s) == "call"){ # the first element of a 'call' is the function called # so we don't deparse that, just the arguments. print(paste("The function is: ", s[1],"()", collapse="")) lapply (s[-1], function (x) paste (deparse(x), collapse = "\n")) } else stop ("argument is not a function call") } deparse.args(list(x+y, foo(bar))) ## 11.3 Parsing and Evaluation of Expressions expression(mean(x+y)) my.exp <- expression(mean(x+y)) x <- 101:110 y <- 21:30 my.exp <- expression(mean(x+y)) my.txt <- expression("mean(x+y)") eval(my.exp) eval(my.txt) parse(text="mean(x+y)") expression(mean(x + y)) my.exp2 <- parse(text="mean(x+y)") eval(my.exp2) make.new.df <- function(old.df = austpop, colnames = c("NSW", "ACT")) { attach(old.df) on.exit(detach(old.df)) argtxt <- paste(colnames, collapse = ",") exprtxt <- paste("data.frame(", argtxt, ")", sep = "") expr <- parse(text = exprtxt) df <- eval(expr) names(df) <- colnames df } make.new.df() make.new.df <- function(old.df = austpop, colnames = c("NSW", "ACT")) { attach(old.df) on.exit(detach(old.df)) argtxt <- paste(colnames, collapse = ",") listexpr <- parse(text=paste("list(", argtxt, ")", sep = "")) df <- do.call("data.frame", eval(listexpr)) names(df) <- colnames df } ## 11.4 Plotting a mathematical expression plotcurve <- function(equation = "y = sqrt(1/(1+x^2))", ...){ leftright <- strsplit(equation, split = "=")[[1]] left <- leftright[1] # The part to the left of the "=" right <- leftright[2] # The part to the right of the "=" expr <- parse(text=right) xname <- all.vars(expr) if(length(xname) > 1) stop(paste("There are multiple variables, i.e.", paste(xname, collapse=" & "), "on the right of the equation")) if(length(list(...))==0)assign(xname, 1:10) else { nam <- names(list(...)) if(nam!=xname)stop("Clash of variable names") assign("x", list(...)[[1]]) assign(xname, x) } y <- eval(expr) yexpr <- parse(text=left)[[1]] xexpr <- parse(text=xname)[[1]] plot(x, y, ylab = yexpr, xlab = xexpr, type="n") lines(spline(x,y)) mainexpr <- parse(text=paste(left, "==", right)) title(main = mainexpr) } plotcurve() plotcurve("ang=asin(sqrt(p))", p=(1:49)/50) ## 11.5 Searching R functions for a specified token. mygrep <- function(str) { ## Assign the names of all objects in current R ## working directory to the string vector tempobj ## tempobj <- ls(envir=sys.frame(-1)) objstring <- character(0) for(i in tempobj) { myfunc <- get(i) if(is.function(myfunc)) if(length(grep(str, deparse(myfunc)))) objstring <- c(objstring, i) } return(objstring) } mygrep("for") # Find all functions that include a for loop ## 12. Appendix 1 ## 12.1 R Packages for Windows ## 12.2 Contributed Documents and Published Literature ## 12.3 Data Sets Referred to in these Notes ## 12.4 Answers to Selected Exercises ## Section 1.6 ## 1. plot(distance~stretch,data=elasticband) ## 2. (ii), (iii), (iv) plot(snow.cover ~ year, data = snow) hist(snow$snow.cover) hist(log(snow$snow.cover)) ## Section 2.7 ## 1. The value of answer is (a) 12, (b) 22, (c) 600. ## 2. prod(c(10,3:5)) ## 3(i) bigsum <- 0; for (i in 1:100) {bigsum <- bigsum+i }; bigsum ## 3(ii) sum(1:100) ## 4(i) bigprod <- 1; for (i in 1:50) {bigprod <- bigprod*i }; bigprod ## 4(ii) prod(1:50) ## 5. radius <- 3:20; volume <- 4*pi*radius^3/3 sphere.data <- data.frame(radius=radius, volume=volume) ## 6. sapply(tinting, is.factor) sapply(tinting[, 4:6], levels) sapply(tinting[, 4:6], is.ordered) ## Section 3.9 ## 2. par(mfrow = c(1,2)) plot(Animals$body, Animals$brain, pch=1, xlab="Body weight (kg)",ylab="Brain weight (g)") plot(log(Animals$body),log(Animals$brain),pch=1, xlab="Body weight (kg)", ylab="Brain weight (g)", axes=F) brainaxis <- 10^seq(-1,4) bodyaxis <-10^seq(-2,4) axis(1, at=log(bodyaxis), lab=bodyaxis) axis(2, at=log(brainaxis), lab=brainaxis) box() identify(log(Animals$body), log(Animals$brain), labels=row.names(Animals)) ## Section 7.9 ## 1. x <- seq(101,112) or x <- 101:112 ## 2. rep(c(4,6,3),4) ## 3. c(rep(4,8),rep(6,7),rep(3,9)) or rep(c(4,6,3),c(8,7,9)) mat64 <- matrix(c(rep(4,8),rep(6,7),rep(3,9)), nrow=6, ncol=4) ## 4. rep(seq(1,9),seq(1,9)) or rep(1:9, 1:9) ## 6. (a) Use summary(airquality) to get this information. (b) airquality[airquality$Ozone == max(airquality$Ozone),] (c) airquality$Wind[airquality$Ozone > quantile(airquality$Ozone, .75)] ## 7. mean(snow$snow.cover[seq(2,10,2)]) mean(snow$snow.cover[seq(1,9,2)]) ## 9. s summary(attitude); summary(cpus) ## 10. mtcars6<-mtcars[mtcars$cyl==6,] ## 11. Cars93[Cars93$Type=="Small"|Cars93$Type=="Sporty",]