# We start with a bad scatterplot of Raven against Peabody
# to help us assess linearity:

> attach(JackStatlab)
> plot(CTPEA,CTRA)

# Here's a relationship that just looks like a random blob.
# It is a common misconception to describe this as nonlinear
# because there doesn't seem to be a strong linear trend.
# The point, though, is that there isn't a NONlinear trend
# either; so you can think of this as a linear relationship
# in which the slope is zero.

> plot(rnorm(50),rnorm(50))

# Recall that we had previously saved the results from
# the regression of Raven on Peabody:

> regout

Call:
lm(formula = Raven ~ Peabody)

Coefficients:
(Intercept)      Peabody  
     -7.589        0.496  

# The residuals plot shows residuals on the vertical axis
# and estimated conditional means on the horizontal axis. It
# is another useful tool for assessing linearity. Here's a
# poorly done one:

> plot(regout$fit, regout$res)

# So let's improve that. (This is how you should do residuals
# plots for the purposes of your homework.)

> par(pin=c(6,4))
> plot(regout$fit, regout$res)
> xinc <- .05*(max(regout$fit)-min(regout$fit))
> yinc <- .05*(max(regout$res)-min(regout$res))
> plot(regout$fit, regout$res, main="Residuals Plot for Regression\nof Raven on Peabody",
+      xlab="Fitted Value",ylab="Residual",
+      xlim=c(min(regout$fit)-xinc,max(regout$fit)+xinc),
+      ylim=c(min(regout$res)-yinc,max(regout$res)+yinc))

# We can evaluate linearity in the residuals plot by considering
# whether there are ranges of the horizontal axis for which points
# on the vertical axis tend to be systematically above or below zero.

# Sometimes it can be much easier to see problems with linearity
# in the residuals plot than in the raw scatterplot. For example,
# this relationship looks linear:

> plot(x,y)
> temp <- lm(y~x)
> temp

Call:
lm(formula = y ~ x)

Coefficients:
(Intercept)            x  
    -0.1367       1.1308  

# But here's the residuals plot:

> plot(temp$fit, temp$res)

# If we go back to the original plot and add the regression
# line, we can see the subtle nonlinearity that was obvious
# in the residuals plot:

> plot(x,y)
> abline(temp$coef)

# If you want to replicate this example, here's how x and y
# were created:   x <- seq(.5,1,.01); y <- x^1.2

# Now we repeat the real residuals plot, this time using it to
# evaluate the assumption of homoscedasticity (aka homogeneity
# of variance). Imagine that as you move from left to right in
# the plot, you are walking from one edge of a field to the other,
# scattering seeds. Are the seeds being scattered in such a way
# that they are reaching about as far above and below the center
# of the field, regardless of how far you've walked? If so, there
# is no evidence of a problem with this assumption.

> plot(regout$fit, regout$res, main="Residuals Plot for Regression\nof Raven on Peabody",
+      xlab="Fitted Value",ylab="Residual",
+      xlim=c(min(regout$fit)-xinc,max(regout$fit)+xinc),
+      ylim=c(min(regout$res)-yinc,max(regout$res)+yinc))

# On the other hand, if you see something like this, there IS a problem.

> temp <- lm(y_hetero~x_hetero)
> plot(temp$fit, temp$res)
> plot(x_hetero, y_hetero)
> plot(temp$fit, temp$res)

# That won't work if you paste it from the transcript, because you
# don't have the x_hetero and y_hetero data.
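# (If you just want to see what a fan-shaped residuals plot looks like without
# typing in data, one way to SIMULATE data with increasing spread is sketched
# below. This is only an illustration; it is not how x_hetero and y_hetero were
# actually built, and the names and constants here are made up.)

> x_sim <- runif(50, 50, 100)                         # predictor between 50 and 100
> y_sim <- 20 + 1.2*x_sim + rnorm(50, sd=.15*x_sim)   # error SD grows with x
> simout <- lm(y_sim ~ x_sim)
> plot(simout$fit, simout$res)                        # spread should widen from left to right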
# But here are the data if you want to try to replicate the example:
#
# > x_hetero
#  [1] 50 51 52 52 53 54 56 56 57 58 59 60 63 64 65 66 66 66 66 69 70 72 74 74 74
# [26] 76 76 76 77 81 82 82 83 84 84 84 85 85 85 88 91 92 92 92 93 96 97 98 98 99
# > y_hetero
#  [1]  80  74  76  85  87  86  86  94  87  91  94  83  96 101 108 104  89 105  91
# [20] 115 104  89 101  95  98 110 112 119 137 113 119 111 110 128 152 127 171 131
# [39] 129 145 146 111  77 135 115 150 135 124 177 117

# Here, we use standard techniques to evaluate the normality of the residuals
# (thereby indirectly evaluating the assumption of normally distributed errors):

> qqnorm(regout$res); qqline(regout$res)
> stem(regout$res)

  The decimal point is 1 digit(s) to the right of the |

  -1 | 65
  -1 | 44442
  -0 | 9888776
  -0 | 433321110
   0 | 000111122333444
   0 | 667899
   1 | 344
   1 | 567

# Often problems with one assumption can create problems with another.
# For example, the data I created to illustrate problems with heterogeneity
# of variance also have problems with normality as a consequence:

> plot(temp$fit,temp$res)
> qqnorm(temp$res);qqline(temp$res)

# In both the residuals plot and the raw scatterplot, we've noticed
# an observation off to the right that might be unduly influencing
# the regression estimation:

> plot(regout$fit, regout$res, main="Residuals Plot for Regression\nof Raven on Peabody",
+      xlab="Fitted Value",ylab="Residual",
+      xlim=c(min(regout$fit)-xinc,max(regout$fit)+xinc),
+      ylim=c(min(regout$res)-yinc,max(regout$res)+yinc))
> plot(CTPEA,CTRA)

# The extreme Peabody score (123) is the 35th case in the data set:

> CTPEA
 [1]  81  94  84  74  71  59  86  63  81 100  63  78  87  75  62  64  85  76  83
[20]  82  77  76 102  75  77  68  75  86  78  88 100  70  85  74 123  81  69  94
[39]  78  99  77  93  65  65  94  69  78  82  66  89

# I do some matrix magic to calculate an index called "leverage," which measures
# how much influence a data point COULD exert on the regression:

> X <- cbind(rep(1,50),CTPEA)
> X
        CTPEA
 [1,] 1    81
 [2,] 1    94
 [3,] 1    84
 [4,] 1    74
 [5,] 1    71
 [6,] 1    59
 [7,] 1    86
 [8,] 1    63
 [9,] 1    81
[10,] 1   100
[11,] 1    63
[12,] 1    78
[13,] 1    87
[14,] 1    75
[15,] 1    62
[16,] 1    64
[17,] 1    85
[18,] 1    76
[19,] 1    83
[20,] 1    82
[21,] 1    77
[22,] 1    76
[23,] 1   102
[24,] 1    75
[25,] 1    77
[26,] 1    68
[27,] 1    75
[28,] 1    86
[29,] 1    78
[30,] 1    88
[31,] 1   100
[32,] 1    70
[33,] 1    85
[34,] 1    74
[35,] 1   123
[36,] 1    81
[37,] 1    69
[38,] 1    94
[39,] 1    78
[40,] 1    99
[41,] 1    77
[42,] 1    93
[43,] 1    65
[44,] 1    65
[45,] 1    94
[46,] 1    69
[47,] 1    78
[48,] 1    82
[49,] 1    66
[50,] 1    89
> H <- X%*%solve(t(X)%*%X)%*%t(X)
> diag(H)
 [1] 0.02012576 0.04559132 0.02207417 0.02474538 0.03065348 0.07785538
 [7] 0.02468253 0.05793128 0.02012576 0.07227202 0.05793128 0.02053429
[13] 0.02637954 0.02329979 0.06251948 0.05360496 0.02324741 0.02211607
[19] 0.02116282 0.02051334 0.02119424 0.02211607 0.08326066 0.02329979
[25] 0.02119424 0.03891853 0.02329979 0.02468253 0.02053429 0.02833843
[31] 0.07227202 0.03314661 0.02324741 0.02474538 0.26188624 0.02012576
[37] 0.03590163 0.04559132 0.02053429 0.06717053 0.02119424 0.04206113
[43] 0.04954053 0.04954053 0.04559132 0.03590163 0.02053429 0.02051334
[49] 0.04573797 0.03055920

# Notice that the 35th case has leverage that is about 10 times as large
# as the leverage of more typical observations. This point is FAR from the
# Peabody mean.
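# (A shortcut worth knowing: R will compute these same leverage values directly
# from a fitted model, so you don't have to build the hat matrix by hand. A
# minimal sketch, assuming regout was fit to the same 50 cases used above:)

> lev <- hatvalues(regout)          # same values as diag(H) above
> plot(lev, ylab="Leverage")        # case 35 sits far above the rest
> abline(h=2*mean(lev), lty=2)      # common rule of thumb: flag leverage above 2p/n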
# So let's drop that case and see how much the regression
# estimates change:

> badPea <- c(CTPEA[1:34],CTPEA[36:50])
> badRA <- c(CTRA[1:34],CTRA[36:50])
> plot(badPea,badRA)

# Here's the original regression:

> regout

Call:
lm(formula = Raven ~ Peabody)

Coefficients:
(Intercept)      Peabody  
     -7.589        0.496  

# Here's the regression with the extreme case removed:

> lm(badRA~badPea)

Call:
lm(formula = badRA ~ badPea)

Coefficients:
(Intercept)       badPea  
   -11.3313       0.5449  

# There was a pretty big change in the slope when we dropped the case.

# Another index that can be useful in this context is Cook's distance.
# It measures not how much the regression estimates COULD change due to
# the extreme point but, rather, how much they DO change:

> cooks.distance(regout)
           1            2            3            4            5            6 
2.379862e-05 6.532398e-02 9.608828e-03 1.432132e-02 9.491560e-03 2.702617e-02 
           7            8            9           10           11           12 
1.954099e-04 2.536848e-02 8.093853e-04 4.974805e-03 3.774918e-02 3.586707e-02 
          13           14           15           16           17           18 
3.844107e-02 3.126711e-04 3.796268e-03 5.865927e-03 2.984917e-05 2.958541e-02 
          19           20           21           22           23           24 
2.605809e-05 3.153208e-02 1.294741e-02 1.224631e-04 3.215120e-02 3.119128e-03 
          25           26           27           28           29           30 
4.437259e-02 1.849196e-02 6.024741e-05 3.902945e-02 1.361400e-03 2.891978e-02 
          31           32           33           34           35           36 
5.531048e-08 3.891902e-02 2.973477e-02 1.135106e-02 1.312020e-01 1.786266e-03 
          37           38           39           40           41           42 
1.037246e-02 1.180526e-02 2.159492e-03 1.129594e-03 3.568509e-02 3.640436e-03 
          43           44           45           46           47           48 
6.771192e-02 6.621437e-04 1.372185e-03 4.757985e-02 5.130702e-04 1.345923e-03 
          49           50 
4.372463e-04 1.405554e-03 

# Note that Cook's distance is much higher for the 35th case than for any other.

# Here's a final way to understand the correlation coefficient. We standardize
# both variables:

> PeaStd <- scale(CTPEA)
> RaStd <- scale(CTRA)
> mean(PeaStd)
[1] 3.16406e-16
> sd(PeaStd)
[1] 1

# The scatterplot looks the same as it did with the unstandardized variables:

> plot(PeaStd, RaStd)

# But now the regression intercept is zero, and the slope...

> lm(RaStd~PeaStd)

Call:
lm(formula = RaStd ~ PeaStd)

Coefficients:
(Intercept)       PeaStd  
 -3.175e-16    5.844e-01  

# ...is equal to the correlation coefficient:

> cor(CTRA,CTPEA)
[1] 0.5844355

# And, since the correlation of x with y is the same as the
# correlation of y with x, if we reverse the order of the
# regression, we still get the same slope (which, as we
# know, is not generally true for regression).

> lm(PeaStd~RaStd)

Call:
lm(formula = PeaStd ~ RaStd)

Coefficients:
(Intercept)        RaStd  
  4.039e-16    5.844e-01  
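# (One last connection, for reference: in simple regression the slope equals
# r * sd(y)/sd(x), which is exactly why standardizing both variables, making
# both standard deviations 1, leaves the correlation itself as the slope. A
# minimal check, assuming CTRA and CTPEA are still attached:)

> r <- cor(CTRA,CTPEA)
> b <- r*sd(CTRA)/sd(CTPEA)         # the algebra says this reproduces the Peabody slope, 0.496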