5. Linear (Multiple Regression) Models and Analysis of Variance ## 5.1 The Model Formula in Straight Line Regression plot(distance ~ stretch, data=elasticband) elastic.lm <- lm(distance ~ stretch, data=elasticband) options(digits=4) summary(elastic.lm) ## 5.2 Regression Objects names(elastic.lm) coef(elastic.lm) resid(elastic.lm) x11(width=7, height=2, pointsize=10) par(mfrow = c(1, 4), mar=c(5.1,4.1,2.1,1.1)) plot(elastic.lm) par(mfrow=c(1,1)) ## 5.3 Model Formulae, and the X Matrix model.matrix(distance ~ stretch, data=elasticband) model.matrix(elastic.lm) ## 5.3.1 Model Formulae in General *5.3.2 Manipulating Model Formulae names(elasticband) nam <- names(elasticband) formds <- formula(paste(nam[1],"~",nam[2])) lm(formds, data=elasticband) ## 5.4 Multiple Linear Regression Models ## 5.4.1 The data frame Rubber library(MASS) # if needed (the dataset Rubber is in the MASS package) pairs(Rubber) Rubber.lm <- lm(loss~hard+tens, data=Rubber) options(digits=3) summary(Rubber.lm) par(mfrow=c(1,2)) termplot(Rubber.lm, partial=TRUE, smooth=panel.smooth) par(mfrow=c(1,1)) ## 5.4.2 Weights of Books oddbooks logbooks <- log(oddbooks) # We might expect weight to be # proportional to thick * height * width logbooks.lm1<-lm(weight~thick,data=logbooks) summary(logbooks.lm1)$coef logbooks.lm2<-lm(weight~thick+height,data=logbooks) summary(logbooks.lm2)$coef logbooks.lm3<-lm(weight~thick+height+width,data=logbooks) summary(logbooks.lm3)$coef ## 5.5 Polynomial and Spline Regression ## 5.5.1 Polynomial Terms in Linear Models plot(grain ~ rate, data=seedrates) # Plot the data seedrates.lm2 <- lm(grain ~ rate+I(rate^2), data=seedrates) summary(seedrates.lm2) hat <- predict(seedrates.lm2) lines(spline(seedrates$rate, hat)) # Placing the spline fit through the fitted points allows a smooth curve. # For this to work the values of seedrates$rate must be ordered. model.matrix(grain~rate+I(rate^2),data=seedrates) ## 5.5.2 What order of polynomial? seedrates.lm1<-lm(grain~rate,data=seedrates) seedrates.lm2<-lm(grain~rate+I(rate^2),data=seedrates) anova(seedrates.lm2,seedrates.lm1) # However we have an independent estimate of the error mean # square. The estimate is 0.17^2, on 35 df. 1-pf(0.16/0.17^2, 1, 35) ## 5.5.3 Pointwise confidence bounds for the fitted curve plot(grain ~ rate, data = seedrates, pch = 16, xlim = c(50, 175), ylim = c(15.5, 22),xlab="Seeding rate",ylab="Grains per head") new.df <- data.frame(rate = c((4:14) * 12.5)) seedrates.lm2 <- lm(grain ~ rate + I(rate^2), data = seedrates) pred2 <- predict(seedrates.lm2, newdata = new.df, interval="confidence") hat2 <- data.frame(fit=pred2[,"fit"],lower=pred2[,"lwr"], upper=pred2[,"upr"]) attach(new.df) lines(rate, hat2$fit) lines(rate,hat2$lower,lty=2) lines(rate, hat2$upper,lty=2) detach(new.df) ## 5.5.4 Spline Terms in Linear Models plot(dist~speed,data=cars) library(splines) cars.lm<-lm(dist~bs(speed),data=cars) # By default, there are no knots hat<-predict(cars.lm) lines(cars$speed,hat,lty=3) # NB assumes values of speed are sorted cars.lm5 <- lm(dist~bs(speed,5),data=cars) # try for a closer fit (1 knot) ci5<-predict(cars.lm5,interval="confidence",se.fit=TRUE) names(ci5) lines(cars$speed,ci5$fit[,"fit"]) lines(cars$speed,ci5$fit[,"lwr"],lty=2) lines(cars$speed,ci5$fit[,"upr"],lty=2) ## 5.6 Using Factors in R Models insulation <- factor(c(rep("without", 8), rep("with", 7))) # 8 without, then 7 with # `with' precedes `without' in alphanumeric order, & is the baseline kWh <- c(10225, 10689, 14683, 6584, 8541, 12086, 12467, 12669, 9708, 6700, 4307, 10315, 8017, 8162, 8022) insulation <- factor(c(rep("without", 8), rep("with", 7))) # 8 without, then 7 with kWh <- c(10225, 10689, 14683, 6584, 8541, 12086, 12467, + 12669, 9708, 6700, 4307, 10315, 8017, 8162, 8022) insulation.lm <- lm(kWh ~ insulation) summary(insulation.lm, corr=F) ## 5.6.1 The Model Matrix model.matrix(kWh~insulation) *5.6.2 Other Choices of Contrasts insulation <- relevel(insulation, baseline="without") # Make `without' the baseline options(contrasts = c("contr.sum", "contr.poly"), digits = 2) # Try the `sum' contrasts insulation <- factor(insulation, levels=c("without", "with")) # Make `without' the baseline insulation.lm <- lm(kWh ~ insulation) summary(insulation.lm, corr=F) insulation <- C(insulation, contr=treatment) ## 5.7 Multiple Lines - Different Regression Lines for Different Species plot(logheart ~ logweight, data=dolphins) # Plot the data options(digits=4) cet.lm1 <- lm(logheart ~ logweight, data = dolphins) summary(cet.lm1, corr=F) cet.lm2 <- lm(logheart ~ factor(species) + logweight, data=dolphins) model.matrix(cet.lm2) cet.lm3 <- lm(logheart ~ factor(species) + logweight + factor(species):logweight, data=dolphins) model.matrix(cet.lm3) anova(cet.lm1,cet.lm2,cet.lm3) ## 5.8 aov models (Analysis of Variance) ## 5.8.1 Plant Growth Example attach(PlantGrowth) # PlantGrowth is from the base datasets package boxplot(split(weight,group)) # Looks OK PlantGrowth.aov <- aov(weight~group) summary(PlantGrowth.aov) summary.lm(PlantGrowth.aov) help(cabbages) # cabbages is from the MASS package names(cabbages) coplot(HeadWt~VitC|Cult+Date,data=cabbages) VitC.aov<-aov(VitC~Cult+Date,data=cabbages) summary(VitC.aov) ## *5.8.2 Shading of Kiwifruit Vines kiwishade$shade <- relevel(kiwishade$shade, ref="none") ## Make sure that the level "none" (no shade) is used as reference kiwishade.aov<-aov(yield~block+shade+Error(block:shade),data=kiwishade) summary(kiwishade.aov) coef(kiwishade.aov) ## 5.9 Exercises ## ex6. Type hosp<-rep(c("RNC","Hunter","Mater"),2) hosp fhosp<-factor(hosp) levels(fhosp) ## 5.10 References