# At the beginning of class, I created a couple of made-up variables that
# will be useful for demonstrating what a bad residuals plot looks like.
# We didn't actually have time for that demonstration, so we'll take a
# look at that on Thursday:

> x_hetero <- c(
+ 50,51,52,52,53,54,56,56,57,58,59,60,63,64,65,66,66,66,66,69,70,72,74,74,74,
+ 76,76,76,77,81,82,82,83,84,84,84,85,85,85,88,91,92,92,92,93,96,97,98,98,99)
> y_hetero <- c(
+  80, 74, 76, 85, 87, 86, 86, 94, 87, 91, 94, 83, 96,101,108,104, 89,105, 91,
+ 115,104, 89,101, 95, 98,110,112,119,137,113,119,111,110,128,152,127,171,131,
+ 129,145,146,111, 77,135,115,150,135,124,177,117)

# Here's the previously run regression of Peabody on Raven:

> regout

Call:
lm(formula = CTPEA ~ CTRA)

Coefficients:
(Intercept)         CTRA
    59.0490       0.7319

> anova(regout)
Analysis of Variance Table

Response: CTPEA
          Df Sum Sq Mean Sq F value    Pr(>F)
CTRA       1 2189.0 2189.00  15.541 0.0002616 ***
Residuals 48 6760.8  140.85
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

> summary(regout)

Call:
lm(formula = CTPEA ~ CTRA)

Residuals:
    Min      1Q  Median      3Q     Max
-26.520  -8.273  -1.516   6.915  42.459

Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)  59.0490     5.9663   9.897 3.55e-13 ***
CTRA          0.7319     0.1856   3.942 0.000262 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 11.87 on 48 degrees of freedom
Multiple R-squared:  0.2446,    Adjusted R-squared:  0.2288
F-statistic: 15.54 on 1 and 48 DF,  p-value: 0.0002616

# Just to review, here's how it was created:

> attach(JackStatlab)
> regout <- lm(CTPEA~CTRA)
> regout

Call:
lm(formula = CTPEA ~ CTRA)

Coefficients:
(Intercept)         CTRA
    59.0490       0.7319

# We always want to consider a scatterplot when we use linear
# methods such as correlation and regression. Here's a bad one:

> plot(CTRA,CTPEA)

# We change the aspect ratio to be more horizontal:

> par(pin=c(6,4))

# We create increments that will help us add a bit of white
# space to the plot:

> RAinc <- .05* (max(CTRA)-min(CTRA))
> RAinc
[1] 1.7

# Note that this matches what we got in a previous class:

> xinc
[1] 1.7

# We do the same for Peabody:

> PBinc <- .05*(max(CTPEA)-min(CTPEA))
> PBinc
[1] 3.15
> yinc
[1] 3.15

# Here's a good scatterplot with a reasonable aspect ratio,
# title, axis labels, and white space:

> plot(CTRA, CTPEA, main="Regression of Peabody on Raven",
+   xlab="Raven Scores", ylab="Peabody Scores",
+   xlim=c(min(CTRA)-RAinc,max(CTRA)+RAinc),
+   ylim=c(min(CTPEA)-PBinc,max(CTPEA)+PBinc))
> abline(regout$coef)

# Here's an unimproved residuals plot:

> plot(regout$fit, regout$res)

# We make our usual improvements:

> resxinc <- .05 * (max(regout$fit)-min(regout$fit))
> resyinc <- .05 * (max(regout$res)-min(regout$res))
> plot(regout$fit, regout$res, main="Residuals vs. Fitted Values from\n Regression of Peabody on Raven",
+   xlab="Fitted Conditional Means", ylab="Residuals",
+   xlim=c(min(regout$fit)-resxinc,max(regout$fit)+resxinc),
+   ylim=c(min(regout$res)-resyinc,max(regout$res)+resyinc))

# It can be useful to add a horizontal reference line:

> abline(h=0)

# That residuals plot is useful for evaluating the assumption of
# linearity, as well as the assumption that variability in the
# errors (represented here by residuals) is the same through the
# range of fitted values.
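# (A sketch of the deferred demonstration, in case you want to try it before
# Thursday. It was not run in class, so no output is shown, and the object
# name hetout is made up for this sketch.) Regress the made-up y_hetero on
# x_hetero and build the same kind of residuals plot; the spread of the
# residuals should grow as the fitted values increase, which is what a
# violation of the constant-variance assumption looks like:

hetout <- lm(y_hetero ~ x_hetero)
plot(hetout$fit, hetout$res,
     main="Residuals vs. Fitted Values from\nRegression of y_hetero on x_hetero",
     xlab="Fitted Conditional Means", ylab="Residuals")
abline(h=0)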
# Here, we evaluate the normality of the residuals to assess
# the assumption that errors are normally distributed:

> stem(regout$res)

  The decimal point is 1 digit(s) to the right of the |

  -2 | 74
  -1 | 3322110
  -0 | 98888755544442221111
   0 | 00235677789
   1 | 112335569
   2 |
   3 |
   4 | 2

> qqnorm(regout$res);qqline(regout$res)

# Note that the plot for the residuals is similar to, but not
# identical to, the plot for Peabody:

> qqnorm(CTPEA)
> qqline(CTPEA)

# Note that the title for a Q-Q plot can be manipulated the same way
# we control the title of a scatterplot:

> qqnorm(regout$res, main="Normal Q-Q Plot for Regression\nof Peabody on Raven");qqline(regout$res)

# We take another look at the original scatterplot. There's concern about the one Peabody
# score that is much higher than we expect:

> plot(CTRA, CTPEA, main="Regression of Peabody on Raven",
+   xlab="Raven Scores", ylab="Peabody Scores",
+   xlim=c(min(CTRA)-RAinc,max(CTRA)+RAinc),
+   ylim=c(min(CTPEA)-PBinc,max(CTPEA)+PBinc))
> abline(regout$coef)

# It's the 39th case that stands out as extreme:

> CTPEA
 [1]  72  69  80  90  77  84  59  72  62  82  66  72  96  80  74  81  82  81  69
[20]  68  85  69 106  95  60  64  86  79  83  80  78  64  89  89  81  86  75  71
[39] 122 104 103  65  92 108  86  85  88  78  91 103

# A "Studentized residual" divides the residual by its standard
# error. An "externally Studentized residual" does the same thing,
# but using the standard error from a regression that omits that
# observation. R's "rstudent()" function gives externally
# Studentized residuals:

> rstudent(regout)
          1           2           3           4           5           6
-0.20628664 -0.08936128 -1.04756125  0.45132004 -0.15273225  0.50160128
          7           8           9          10          11          12
-0.96881902 -0.32988816 -0.57820932 -0.41273931 -0.72172749 -0.32988816
         13          14          15          16          17          18
 0.71801825 -0.39437460 -0.71730783 -1.09811401 -0.86962423 -0.69089540
         19          20          21          22          23          24
-1.15047096 -0.73395284 -0.15772921 -0.70970946  1.29231413  0.57023188
         25          26          27          28          29          30
-1.00236506 -2.41285366  0.99787014 -0.04565285 -0.32759810  0.03865804
         31          32          33          34          35          36
-0.31508800 -2.09524623  1.26421011  0.24317066  0.56304765  0.93133822
         37          38          39          40          41          42
-0.07487684 -0.10518522  4.19724329  1.42329478  1.14620560 -1.11932626
         43          44          45          46          47          48
 0.74591117  1.66343261  0.17438925  0.02831102  0.59136996 -0.43919159
         49          50
 0.97579609  1.14620560

# They can be viewed as t-tests of the idea that the case is an
# "outlier" (an extreme observation that, for some reason, comes from
# a different population than the other cases). The test has degrees of
# freedom one less than the degrees of freedom of the error mean
# square in the regression. In the case of simple, one-predictor
# regressions, that will be N-3. So we can view the externally Studentized
# residual for our worrisome point (4.197) as a test for outlier status.
# Here's the p value:

> 2*pt(-abs(4.19724329), 47)
[1] 0.0001190733

# That's dramatic evidence that this point doesn't behave like the others.
# I'm not too worried that it has skewed the regression, though, because
# the corresponding Raven score is near the mean of the Raven distribution:

> mean(CTRA)
[1] 30.84
> CTRA[39]
[1] 28
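# (Not run in class; just a sketch, and the object name outlier.p is made up
# for illustration.) The same outlier test can be applied to every case at
# once by feeding the whole vector of externally Studentized residuals to
# pt(), using the same N - 3 = 47 degrees of freedom. The 39th case should
# stand out with by far the smallest p value:

outlier.p <- 2 * pt(-abs(rstudent(regout)), 47)
round(outlier.p, 4)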
# I demonstrate the idea that the closer a predictor is to the mean
# of the predictor values, the less influence the case has on the
# regression slope. Here's the original regression:

> regout

Call:
lm(formula = CTPEA ~ CTRA)

Coefficients:
(Intercept)         CTRA
    59.0490       0.7319

# If I add a ridiculously large Peabody score that has a Raven
# score exactly at the mean...

> Y <- c(CTPEA,10000)
> X <- c(CTRA,30.84)

# ...the slope is absolutely unchanged:

> lm(Y~X)

Call:
lm(formula = Y ~ X)

Coefficients:
(Intercept)            X
   253.5271       0.7319

# A statistic called "leverage" measures the potential influence
# of a data point. Its calculation involves matrix algebra, which
# I'm not even going to try to explain here:

> X <- cbind(rep(1,50),CTRA)
> X
        CTRA
 [1,] 1   21
 [2,] 1   15
 [3,] 1   45
 [4,] 1   35
 [5,] 1   27
 [6,] 1   26
 [7,] 1   15
 [8,] 1   23
 [9,] 1   13
[10,] 1   38
[11,] 1   21
[12,] 1   23
[13,] 1   39
[14,] 1   35
[15,] 1   32
[16,] 1   47
[17,] 1   45
[18,] 1   41
[19,] 1   32
[20,] 1   24
[21,] 1   38
[22,] 1   25
[23,] 1   44
[24,] 1   40
[25,] 1   17
[26,] 1   43
[27,] 1   21
[28,] 1   28
[29,] 1   38
[30,] 1   28
[31,] 1   31
[32,] 1   39
[33,] 1   21
[34,] 1   37
[35,] 1   21
[36,] 1   22
[37,] 1   23
[38,] 1   18
[39,] 1   28
[40,] 1   39
[41,] 1   42
[42,] 1   26
[43,] 1   33
[44,] 1   41
[45,] 1   34
[46,] 1   35
[47,] 1   30
[48,] 1   33
[49,] 1   28
[50,] 1   42
> H <- X%*%solve(t(X)%*%X)%*%t(X)

# The result is a huge matrix, but the elements on the diagonal
# of the matrix represent this idea of potential influence:

> diag(H)
 [1] 0.04369274 0.08139535 0.06906272 0.02423459 0.02360817 0.02573213
 [7] 0.08139535 0.03504033 0.09787800 0.03254444 0.04369274 0.03504033
[13] 0.03629316 0.02423459 0.02032926 0.08390103 0.06906272 0.04525879
[19] 0.02032926 0.03144820 0.03254444 0.02834547 0.06237765 0.04053128
[25] 0.06687025 0.05618197 0.04369274 0.02197361 0.03254444 0.02197361
[31] 0.02000626 0.03629316 0.04369274 0.02928510 0.04369274 0.03912184
[37] 0.03504033 0.06034179 0.02197361 0.03629316 0.05047569 0.02573213
[43] 0.02114165 0.04525879 0.02244343 0.02423459 0.02017266 0.02114165
[49] 0.02197361 0.05047569

# The leverage for the 39th case (0.02197361) is small compared to most
# of the others, because the Raven value is near the mean.

# If we compare the actual regression...

> regout

Call:
lm(formula = CTPEA ~ CTRA)

Coefficients:
(Intercept)         CTRA
    59.0490       0.7319

# ...to the regression with that case dropped, we see that the estimated
# slope doesn't change very much when the extreme case is eliminated:

> X <- c(CTRA[1:38],CTRA[40:50])
> Y <- c(CTPEA[1:38],CTPEA[40:50])
> lm(Y~X)

Call:
lm(formula = Y ~ X)

Coefficients:
(Intercept)            X
     57.250        0.762

# Cook's distance provides a measure of how much the estimates actually do
# change when a point is dropped. Cook's distance for the 39th case is among
# the very highest, but that is partly because the case has a large effect on
# the intercept, which we care less about than the slope:

> cooks.distance(regout)
           1            2            3            4            5            6
9.919149e-04 3.612512e-04 4.062299e-02 2.572142e-03 2.878708e-04 3.375276e-03
           7            8            9           10           11           12
4.163725e-02 2.013267e-03 1.839183e-02 2.915678e-03 1.201946e-02 2.013267e-03
          13           14           15           16           17           18
9.806778e-03 1.966015e-03 5.393087e-03 5.498335e-02 2.819471e-02 1.143844e-02
          19           20           21           22           23           24
1.364096e-02 8.830257e-03 4.271236e-04 7.423640e-03 5.478813e-02 6.965970e-03
          25           26           27           28           29           30
3.599730e-02 1.574593e-01 2.274932e-02 2.391007e-05 1.839295e-03 1.714471e-05
          31           32           33           34           35           36
1.032769e-03 7.721153e-02 3.606128e-02 9.097974e-04 7.346762e-03 1.770669e-02
          37           38           39           40           41           42
1.039479e-04 3.627178e-04 1.470092e-01 3.734706e-02 3.469296e-02 1.645882e-02
          43           44           45           46           47           48
6.064509e-03 6.325537e-02 3.563027e-04 1.016500e-05 3.649437e-03 2.118662e-03
          49           50
1.070710e-02 3.469296e-02
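# (Not run in class; a sketch only, with the object names h.direct and
# h.formula made up for illustration.) R will compute these leverages for us
# with the built-in hatvalues() function, and for a one-predictor regression
# the diagonal of H also has a simple closed form. Both versions should
# reproduce the diag(H) values above, including 0.02197361 for the 39th case:

h.direct  <- hatvalues(regout)
h.formula <- 1/50 + (CTRA - mean(CTRA))^2 / sum((CTRA - mean(CTRA))^2)
h.direct[39]
h.formula[39]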
# We turn now to viewing familiar statistics as special cases of regression.
# The sample mean and variance...

> mean(CTPEA)
[1] 81.62
> sd(CTPEA); var(CTPEA)
[1] 13.51475
[1] 182.6486

# ...are equivalent to the intercept and error mean square from a regression
# of Peabody on a constant:

> temp <- lm(CTPEA~1)   # Note: that's a one, not a lower case ell
> temp

Call:
lm(formula = CTPEA ~ 1)

Coefficients:
(Intercept)
      81.62

> anova(temp)
Analysis of Variance Table

Response: CTPEA
          Df Sum Sq Mean Sq F value Pr(>F)
Residuals 49 8949.8  182.65

# A two-sample t test is equivalent to a regression on a dummy (0, 1)
# variable that indicates group membership (girl=0, boy=1).
# Here's the t test:

> t.test(CTPEA~CBSEX, var.equal=TRUE)

        Two Sample t-test

data:  CTPEA by CBSEX
t = -2.8494, df = 48, p-value = 0.006434
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -17.373374  -2.998421
sample estimates:
mean in group 0 mean in group 1
       76.73077        86.91667

# Here's a dummy variable for sex:

> CBSEX
 [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1
[39] 1 1 1 1 1 1 1 1 1 1 1 1

# The intercept is the girls' mean:

> lm(CTPEA~CBSEX)$coef
(Intercept)       CBSEX
   76.73077    10.18590

# And the intercept plus the slope is the boys' mean:

> 76.73077 + 10.18590
[1] 86.91667

# So the slope is what lets the boys' mean differ from the girls',
# and if we test the null hypothesis that the slope equals zero,
# we are testing exactly the same thing that the t test tested:

> lm(CTPEA~CBSEX) -> temp
> summary(temp)

Call:
lm(formula = CTPEA ~ CBSEX)

Residuals:
    Min      1Q  Median      3Q     Max
-22.917  -7.870  -0.917   5.223  35.083

Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)   76.731      2.477  30.982  < 2e-16 ***
CBSEX         10.186      3.575   2.849  0.00643 **
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 12.63 on 48 degrees of freedom
Multiple R-squared:  0.1447,    Adjusted R-squared:  0.1269
F-statistic: 8.119 on 1 and 48 DF,  p-value: 0.006434

# Note that the t statistic for the slope exactly matches the result from
# the t.test() function (apart from the sign, which just reflects which
# group mean gets subtracted from which).
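# (A quick check, not run in class; just a sketch, and the object name
# dummyfit is made up for illustration.) The conditional means implied by
# the dummy-coded regression can be recovered straight from the fitted
# object and compared with the group means computed directly:

dummyfit <- lm(CTPEA ~ CBSEX)
tapply(CTPEA, CBSEX, mean)   # girls' and boys' means computed directly
coef(dummyfit)[1]            # the intercept: the girls' mean
sum(coef(dummyfit))          # intercept + slope: the boys' mean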
# Actually, ANY artificially created variable that uses two
# distinct values to indicate membership in two different
# groups can give us the same test. Here, for example, we
# create what's called an "effects coding" scheme, but using
# -1's and 1's in place of dummy coding's 0's and 1's:

> table(CBSEX)
CBSEX
 0  1
26 24
> SexEffect <- c(rep(-1,26),rep(1,24))
> summary(lm(CTPEA~SexEffect))

Call:
lm(formula = CTPEA ~ SexEffect)

Residuals:
    Min      1Q  Median      3Q     Max
-22.917  -7.870  -0.917   5.223  35.083

Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)   81.824      1.787  45.779  < 2e-16 ***
SexEffect      5.093      1.787   2.849  0.00643 **
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 12.63 on 48 degrees of freedom
Multiple R-squared:  0.1447,    Adjusted R-squared:  0.1269
F-statistic: 8.119 on 1 and 48 DF,  p-value: 0.006434

# We got the same test because the regression is still identifying
# the same two groups:

> cbind(CBSEX,SexEffect)
      CBSEX SexEffect
 [1,]     0        -1
 [2,]     0        -1
 [3,]     0        -1
 [4,]     0        -1
 [5,]     0        -1
 [6,]     0        -1
 [7,]     0        -1
 [8,]     0        -1
 [9,]     0        -1
[10,]     0        -1
[11,]     0        -1
[12,]     0        -1
[13,]     0        -1
[14,]     0        -1
[15,]     0        -1
[16,]     0        -1
[17,]     0        -1
[18,]     0        -1
[19,]     0        -1
[20,]     0        -1
[21,]     0        -1
[22,]     0        -1
[23,]     0        -1
[24,]     0        -1
[25,]     0        -1
[26,]     0        -1
[27,]     1         1
[28,]     1         1
[29,]     1         1
[30,]     1         1
[31,]     1         1
[32,]     1         1
[33,]     1         1
[34,]     1         1
[35,]     1         1
[36,]     1         1
[37,]     1         1
[38,]     1         1
[39,]     1         1
[40,]     1         1
[41,]     1         1
[42,]     1         1
[43,]     1         1
[44,]     1         1
[45,]     1         1
[46,]     1         1
[47,]     1         1
[48,]     1         1
[49,]     1         1
[50,]     1         1
>
> summary(lm(CTPEA~SexEffect))

Call:
lm(formula = CTPEA ~ SexEffect)

Residuals:
    Min      1Q  Median      3Q     Max
-22.917  -7.870  -0.917   5.223  35.083

Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)   81.824      1.787  45.779  < 2e-16 ***
SexEffect      5.093      1.787   2.849  0.00643 **
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 12.63 on 48 degrees of freedom
Multiple R-squared:  0.1447,    Adjusted R-squared:  0.1269
F-statistic: 8.119 on 1 and 48 DF,  p-value: 0.006434

# This time, the value indicating "girl" is -1 instead of
# zero, so the conditional mean for girls is:

> 81.824 + (-1)*5.093
[1] 76.731

# And for boys:

> 81.824 + (1)*5.093
[1] 86.917

# The intercept is actually the mean of the two group means.
# If we had a data set where the number of boys and girls was
# the same, this would be the overall (unconditional) mean
# of Peabody.

> mean(c(76.731,86.917))
[1] 81.824
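# (Not run in class; a sketch only.) With 26 girls and 24 boys, the ordinary
# mean of Peabody weights the girls a bit more heavily, so it is not quite
# the same as the effects-coding intercept, which is the unweighted mean of
# the two group means:

mean(CTPEA)                       # the overall (weighted) mean, 81.62
mean(tapply(CTPEA, CBSEX, mean))  # the unweighted mean of the group means, 81.824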
# Here we create yet another coding system that uses two
# distinct values on a single variable to indicate group membership:

> nonsense <- c(rep(pi,26),rep(1384,24))
> cbind(CBSEX, SexEffect, nonsense)
      CBSEX SexEffect    nonsense
 [1,]     0        -1    3.141593
 [2,]     0        -1    3.141593
 [3,]     0        -1    3.141593
 [4,]     0        -1    3.141593
 [5,]     0        -1    3.141593
 [6,]     0        -1    3.141593
 [7,]     0        -1    3.141593
 [8,]     0        -1    3.141593
 [9,]     0        -1    3.141593
[10,]     0        -1    3.141593
[11,]     0        -1    3.141593
[12,]     0        -1    3.141593
[13,]     0        -1    3.141593
[14,]     0        -1    3.141593
[15,]     0        -1    3.141593
[16,]     0        -1    3.141593
[17,]     0        -1    3.141593
[18,]     0        -1    3.141593
[19,]     0        -1    3.141593
[20,]     0        -1    3.141593
[21,]     0        -1    3.141593
[22,]     0        -1    3.141593
[23,]     0        -1    3.141593
[24,]     0        -1    3.141593
[25,]     0        -1    3.141593
[26,]     0        -1    3.141593
[27,]     1         1 1384.000000
[28,]     1         1 1384.000000
[29,]     1         1 1384.000000
[30,]     1         1 1384.000000
[31,]     1         1 1384.000000
[32,]     1         1 1384.000000
[33,]     1         1 1384.000000
[34,]     1         1 1384.000000
[35,]     1         1 1384.000000
[36,]     1         1 1384.000000
[37,]     1         1 1384.000000
[38,]     1         1 1384.000000
[39,]     1         1 1384.000000
[40,]     1         1 1384.000000
[41,]     1         1 1384.000000
[42,]     1         1 1384.000000
[43,]     1         1 1384.000000
[44,]     1         1 1384.000000
[45,]     1         1 1384.000000
[46,]     1         1 1384.000000
[47,]     1         1 1384.000000
[48,]     1         1 1384.000000
[49,]     1         1 1384.000000
[50,]     1         1 1384.000000

# Even though the values of the predictor are totally nonsensical
# (so the intercept and slope have no interesting meaning at all),
# the conditional means for the boys and girls are the same as before,
# and the test on the slope is still equivalent to the t.test() results:

> summary(lm(CTPEA~nonsense))

Call:
lm(formula = CTPEA ~ nonsense)

Residuals:
    Min      1Q  Median      3Q     Max
-22.917  -7.870  -0.917   5.223  35.083

Coefficients:
             Estimate Std. Error t value Pr(>|t|)
(Intercept) 76.707595   2.482287  30.902  < 2e-16 ***
nonsense     0.007376   0.002589   2.849  0.00643 **
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 12.63 on 48 degrees of freedom
Multiple R-squared:  0.1447,    Adjusted R-squared:  0.1269
F-statistic: 8.119 on 1 and 48 DF,  p-value: 0.006434

>
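# (Not run in class; a sketch only.) One way to see why any two-valued coding
# gives the same test: the "nonsense" variable is just a linear recoding of
# the dummy variable, so the two fitted conditional means (and therefore the
# t statistic) cannot change; only the intercept and slope get rescaled.

all.equal(nonsense, pi + (1384 - pi)*CBSEX)  # should be TRUE: same grouping, new labels
10.18590 / (1384 - pi)                       # should reproduce the nonsense slope, about 0.00738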