# We continued the example of collinearity that we had begun last time. # Recall that we had already created two variables, z1 and z2, that # were correlated .8: > cor(z1,z2) [1] 0.8008038 # We generate y to fit a regression model: > y <- 10 + 10*z1 - 10*z2 # We created some noise: > noise <- rnorm(100, 0, 100) # And we fit the regression of y (with the noise) on the two predictors: > summary(lm(y+noise~z1+z2)) Call: lm(formula = y + noise ~ z1 + z2) Residuals: Min 1Q Median 3Q Max -254.7833 -46.5296 -0.3359 71.3679 205.1611 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 23.966 51.738 0.463 0.644 z1 9.068 1.710 5.304 7.15e-07 *** z2 -9.138 1.671 -5.467 3.55e-07 *** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 97.8 on 97 degrees of freedom Multiple R-squared: 0.2497, Adjusted R-squared: 0.2342 F-statistic: 16.14 on 2 and 97 DF, p-value: 8.903e-07 # Note that the slopes were pretty close to their true values. Moreover, # if we repeat the process with a new noise sample, the estimates are # pretty stable. Here are four repetitions of the process, to make # that point. Note that the slope estimates stay close to 10 and -10: > noise <- rnorm(100, 0, 100) > summary(lm(y+noise~z1+z2)) Call: lm(formula = y + noise ~ z1 + z2) Residuals: Min 1Q Median 3Q Max -231.762 -52.723 -8.535 55.583 222.862 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) -16.800 49.715 -0.338 0.736 z1 9.691 1.643 5.899 5.35e-08 *** z2 -9.366 1.606 -5.832 7.21e-08 *** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 93.97 on 97 degrees of freedom Multiple R-squared: 0.2826, Adjusted R-squared: 0.2679 F-statistic: 19.11 on 2 and 97 DF, p-value: 1.007e-07 > noise <- rnorm(100, 0, 100) > summary(lm(y+noise~z1+z2)) Call: lm(formula = y + noise ~ z1 + z2) Residuals: Min 1Q Median 3Q Max -214.608 -61.809 1.382 67.693 218.164 Coefficients: Estimate Std. 
Error t value Pr(>|t|) (Intercept) -28.319 49.961 -0.567 0.572 z1 11.181 1.651 6.772 9.77e-10 *** z2 -10.401 1.614 -6.444 4.49e-09 *** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 94.44 on 97 degrees of freedom Multiple R-squared: 0.3346, Adjusted R-squared: 0.3209 F-statistic: 24.39 on 2 and 97 DF, p-value: 2.631e-09 > noise <- rnorm(100, 0, 100) > summary(lm(y+noise~z1+z2)) Call: lm(formula = y + noise ~ z1 + z2) Residuals: Min 1Q Median 3Q Max -273.5096 -62.0025 -0.6437 61.2825 271.0550 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 64.920 51.753 1.254 0.213 z1 8.212 1.710 4.802 5.71e-06 *** z2 -9.407 1.672 -5.627 1.78e-07 *** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 97.82 on 97 degrees of freedom Multiple R-squared: 0.2475, Adjusted R-squared: 0.232 F-statistic: 15.95 on 2 and 97 DF, p-value: 1.024e-06 > noise <- rnorm(100, 0, 100) > summary(lm(y+noise~z1+z2)) Call: lm(formula = y + noise ~ z1 + z2) Residuals: Min 1Q Median 3Q Max -218.386 -70.161 -4.683 55.634 238.015 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) -21.823 52.077 -0.419 0.676 z1 10.328 1.721 6.002 3.37e-08 *** z2 -9.655 1.682 -5.739 1.09e-07 *** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 98.44 on 97 degrees of freedom Multiple R-squared: 0.2838, Adjusted R-squared: 0.2691 F-statistic: 19.22 on 2 and 97 DF, p-value: 9.283e-08 # Now we create a third predictor that is actually the ratio # of the other two: > z3 <- z1/z2 # If we add this third variable to the regression model, the # estimates become quite unstable. 
Notice how, in the following # replications of the regression, the parameter estimates are # quite unstable, and the significance levels vary wildly: > summary(lm(y+noise~z1+z2+z3)) Call: lm(formula = y + noise ~ z1 + z2 + z3) Residuals: Min 1Q Median 3Q Max -222.813 -62.897 1.509 55.799 241.542 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 331.233 251.904 1.315 0.19167 z1 17.785 5.481 3.245 0.00162 ** z2 -17.508 5.733 -3.054 0.00292 ** z3 -330.953 231.088 -1.432 0.15535 --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 97.91 on 96 degrees of freedom Multiple R-squared: 0.2988, Adjusted R-squared: 0.2769 F-statistic: 13.64 on 3 and 96 DF, p-value: 1.753e-07 > noise <- rnorm(100, 0, 100) > summary(lm(y+noise~z1+z2+z3)) Call: lm(formula = y + noise ~ z1 + z2 + z3) Residuals: Min 1Q Median 3Q Max -216.37 -64.70 -13.14 65.94 279.02 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) -274.954 252.711 -1.088 0.279 z1 4.312 5.498 0.784 0.435 z2 -3.782 5.751 -0.658 0.512 z3 263.147 231.828 1.135 0.259 Residual standard error: 98.22 on 96 degrees of freedom Multiple R-squared: 0.2985, Adjusted R-squared: 0.2766 F-statistic: 13.62 on 3 and 96 DF, p-value: 1.792e-07 > noise <- rnorm(100, 0, 100) > summary(lm(y+noise~z1+z2+z3)) Call: lm(formula = y + noise ~ z1 + z2 + z3) Residuals: Min 1Q Median 3Q Max -318.01 -60.49 5.38 74.58 236.55 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) -80.875 266.062 -0.304 0.762 z1 9.463 5.789 1.635 0.105 z2 -8.531 6.055 -1.409 0.162 z3 31.739 244.076 0.130 0.897 Residual standard error: 103.4 on 96 degrees of freedom Multiple R-squared: 0.2573, Adjusted R-squared: 0.2341 F-statistic: 11.09 on 3 and 96 DF, p-value: 2.587e-06 > noise <- rnorm(100, 0, 100) > summary(lm(y+noise~z1+z2+z3)) Call: lm(formula = y + noise ~ z1 + z2 + z3) Residuals: Min 1Q Median 3Q Max -228.817 -53.238 -4.891 60.117 271.161 Coefficients: Estimate Std. 
Error t value Pr(>|t|) (Intercept) -496.892 251.015 -1.980 0.0506 . z1 3.730 5.461 0.683 0.4962 z2 -1.130 5.713 -0.198 0.8436 z3 376.250 230.272 1.634 0.1055 --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 97.56 on 96 degrees of freedom Multiple R-squared: 0.3607, Adjusted R-squared: 0.3407 F-statistic: 18.05 on 3 and 96 DF, p-value: 2.281e-09 > noise <- rnorm(100, 0, 100) > summary(lm(y+noise~z1+z2+z3)) Call: lm(formula = y + noise ~ z1 + z2 + z3) Residuals: Min 1Q Median 3Q Max -264.794 -59.015 -1.465 79.317 157.664 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) -569.6448 246.7130 -2.309 0.0231 * z1 2.9018 5.3679 0.541 0.5900 z2 -0.4568 5.6147 -0.081 0.9353 z3 444.5589 226.3257 1.964 0.0524 . --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 95.89 on 96 degrees of freedom Multiple R-squared: 0.4003, Adjusted R-squared: 0.3815 F-statistic: 21.36 on 3 and 96 DF, p-value: 1.112e-10 # That is fairly typical behavior for a regression with collinearity. # We calculated the simplest form of the condition number. Note that # the magnitude of the index is very large: > X <- cbind(rep(1,100),z1,z2,z3) > sqrt( eigen(t(X)%*%X)$values[1] / eigen(t(X)%*%X)$values[4]) [1] 2375.445 # Next, we illustrated the problem of model misspecification. 
First, # we created two variables that were correlated .6: > xx1 <- rnorm(50,0,1) > xx2 <- rnorm(50,0,1) > x1 <- xx1*10+50 > x2 <- 100 + 20*(.6*xx1 + sqrt(1-.6^2)*xx2) > cor(x1,x2) [1] 0.569278 # We create a variable that perfectly fits a regression model, # add some noise, and estimate the regression: > y <- 10 + 5*x1 + 10*x2 > noise <- rnorm(50, 0, 100) > lm(y+noise~x1+x2) Call: lm(formula = y + noise ~ x1 + x2) Coefficients: (Intercept) x1 x2 146.827 3.858 9.185 # Note that across several replications, the slope estimates # stay close to the true values: > noise <- rnorm(50, 0, 100) > lm(y+noise~x1+x2) Call: lm(formula = y + noise ~ x1 + x2) Coefficients: (Intercept) x1 x2 25.317 5.715 9.475 > noise <- rnorm(50, 0, 100) > lm(y+noise~x1+x2) Call: lm(formula = y + noise ~ x1 + x2) Coefficients: (Intercept) x1 x2 -201.95 4.13 12.52 > noise <- rnorm(50, 0, 100) > lm(y+noise~x1+x2) Call: lm(formula = y + noise ~ x1 + x2) Coefficients: (Intercept) x1 x2 -85.948 4.833 11.174 # (We can also print out the regression with testing information:) > summary(lm(y+noise~x1+x2)) Call: lm(formula = y + noise ~ x1 + x2) Residuals: Min 1Q Median 3Q Max -252.72 -38.66 15.41 56.18 235.54 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) -85.9483 77.8973 -1.103 0.27549 x1 4.8331 1.6702 2.894 0.00576 ** x2 11.1745 0.8419 13.273 < 2e-16 *** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 88.02 on 47 degrees of freedom Multiple R-squared: 0.8778, Adjusted R-squared: 0.8726 F-statistic: 168.9 on 2 and 47 DF, p-value: < 2.2e-16 # Next, we removed one of the predictors (x2) even though it # was in fact a cause of y. (We know this not because it was # significant in the regression, but because we used it when # we created y.) # Notice that across several repetitions, the estimate of # the slope for x1 has nothing to do with the true slope for # x1. 
This is because, without x2, the model is misspecified, # which will tend to bias the estimates of other coefficients. > summary(lm(y+noise~x1)) Call: lm(formula = y + noise ~ x1) Residuals: Min 1Q Median 3Q Max -400.242 -82.745 4.404 105.238 649.647 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 399.994 148.262 2.698 0.0096 ** x1 17.453 2.961 5.895 3.63e-07 *** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 189.8 on 48 degrees of freedom Multiple R-squared: 0.4199, Adjusted R-squared: 0.4078 F-statistic: 34.75 on 1 and 48 DF, p-value: 3.631e-07 > noise <- rnorm(50, 0, 100) > summary(lm(y+noise~x1)) Call: lm(formula = y + noise ~ x1) Residuals: Min 1Q Median 3Q Max -299.16 -128.69 17.24 116.85 504.24 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 473.600 118.099 4.010 0.000211 *** x1 15.506 2.359 6.574 3.31e-08 *** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 151.2 on 48 degrees of freedom Multiple R-squared: 0.4738, Adjusted R-squared: 0.4628 F-statistic: 43.22 on 1 and 48 DF, p-value: 3.308e-08 > noise <- rnorm(50, 0, 100) > summary(lm(y+noise~x1)) Call: lm(formula = y + noise ~ x1) Residuals: Min 1Q Median 3Q Max -420.304 -124.494 4.501 118.328 537.300 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 398.813 141.973 2.809 0.00717 ** x1 17.133 2.835 6.043 2.16e-07 *** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 181.7 on 48 degrees of freedom Multiple R-squared: 0.432, Adjusted R-squared: 0.4202 F-statistic: 36.51 on 1 and 48 DF, p-value: 2.158e-07 > noise <- rnorm(50, 0, 100) > summary(lm(y+noise~x1)) Call: lm(formula = y + noise ~ x1) Residuals: Min 1Q Median 3Q Max -329.247 -111.173 6.969 78.393 502.848 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 473.005 129.065 3.665 0.000618 *** x1 15.561 2.578 6.037 2.2e-07 *** --- Signif. 
codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 165.2 on 48 degrees of freedom Multiple R-squared: 0.4316, Adjusted R-squared: 0.4198 F-statistic: 36.45 on 1 and 48 DF, p-value: 2.201e-07 > noise <- rnorm(50, 0, 100) > summary(lm(y+noise~x1)) Call: lm(formula = y + noise ~ x1) Residuals: Min 1Q Median 3Q Max -383.157 -130.784 5.397 112.582 659.183 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 317.871 147.178 2.160 0.0358 * x1 18.561 2.939 6.315 8.27e-08 *** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 188.4 on 48 degrees of freedom Multiple R-squared: 0.4538, Adjusted R-squared: 0.4424 F-statistic: 39.88 on 1 and 48 DF, p-value: 8.267e-08 # Finally, we reviewed some diagnostic graphics for the Grades data set: > grades Grade Education Homework 1 78 13 2 2 79 14 6 3 79 13 1 4 89 13 5 5 82 16 3 6 77 13 4 7 88 13 5 8 70 13 3 9 86 15 5 10 80 14 5 11 76 16 1 12 72 13 5 13 66 12 3 14 79 14 4 15 76 10 4 16 80 20 6 17 91 15 8 18 85 16 6 19 79 13 8 20 82 14 3 21 94 12 8 22 91 12 6 23 80 14 5 24 73 14 7 25 77 12 5 26 76 16 5 27 84 12 3 28 81 16 5 29 97 16 10 30 80 15 6 31 74 16 5 32 83 15 11 33 78 13 6 34 64 17 7 35 82 13 4 36 81 17 1 37 73 13 4 38 87 13 7 39 81 14 2 40 86 11 4 41 92 14 4 42 79 13 1 43 69 15 4 44 66 14 5 45 67 13 7 46 76 13 7 47 83 13 6 48 73 15 3 49 85 15 6 50 80 17 5 51 83 13 4 52 82 15 3 53 90 12 7 54 69 12 5 55 95 17 8 56 79 13 2 57 72 13 3 58 83 16 7 59 81 12 4 60 86 15 5 61 78 11 2 62 84 13 9 63 81 14 5 64 72 12 4 65 64 12 5 66 83 14 8 67 94 14 5 68 77 13 6 69 80 14 5 70 74 18 5 71 89 16 8 72 87 15 6 73 82 16 7 74 87 15 4 75 71 12 2 76 88 17 5 77 80 13 7 78 89 16 6 79 87 15 2 80 72 10 5 81 79 17 4 82 93 14 7 83 100 18 7 84 90 13 4 85 69 10 4 86 85 14 5 87 75 14 7 88 82 12 2 89 72 10 2 90 87 14 5 91 81 15 7 92 80 15 9 93 79 17 7 94 79 14 5 95 94 17 8 96 71 11 4 97 77 14 8 98 68 15 4 99 87 16 6 100 74 12 4 # If we save the regression output as an object in R, we 
can easily # access components such as predicted values ("fit") and residuals ("res"). > regout <- lm(grades$Grade~ grades$Education + grades$Homework) > regout$fit 1 2 3 4 5 6 7 8 76.52082 81.34282 75.53297 79.48435 80.12053 78.49651 79.48435 77.50866 9 10 11 12 13 14 15 16 81.22560 80.35498 78.14484 79.48435 76.63804 79.36713 75.88464 86.56656 17 18 19 20 21 22 23 24 84.18914 83.08407 82.44789 78.37928 81.57727 79.60157 80.35498 82.33067 25 26 27 28 29 30 31 32 78.61373 82.09622 76.63804 82.09622 87.03545 82.21344 82.09622 87.15267 33 34 35 36 37 38 39 40 80.47220 84.94254 78.49651 79.01546 78.49651 81.46004 77.39144 76.75526 41 42 43 44 45 46 47 48 79.36713 75.53297 80.23775 80.35498 81.46004 81.46004 80.47220 79.24991 49 50 51 52 53 54 55 56 82.21344 82.96684 78.49651 79.24991 80.58942 78.61373 85.93038 76.52082 57 58 59 60 61 62 63 64 77.50866 84.07191 77.62588 81.22560 74.77957 83.43573 80.35498 77.62588 65 66 67 68 69 70 71 72 78.61373 83.31851 80.35498 80.47220 80.35498 83.83747 85.05976 82.21344 73 74 75 76 77 78 79 80 84.07191 80.23775 75.65019 82.96684 81.46004 83.08407 78.26206 76.87248 81 82 83 84 85 86 87 88 81.97900 82.33067 85.81316 78.49651 75.88464 80.35498 82.33067 75.65019 89 90 91 92 93 94 95 96 73.90895 80.35498 83.20129 85.17698 84.94254 80.35498 85.93038 76.75526 97 98 99 100 83.31851 80.23775 83.08407 77.62588 > regout$res 1 2 3 4 5 6 1.4791847 -2.3428208 3.4670303 9.5156478 1.8794699 -1.4965065 7 8 9 10 11 12 8.5156478 -7.5086609 4.7744018 -0.3549752 -2.1448388 -7.4843522 13 14 15 16 17 18 -10.6380379 -0.3671296 0.1153626 -6.5665591 6.8108649 1.9159331 19 20 21 22 23 24 -3.4478890 3.6207160 12.4227340 11.3984253 -0.3549752 -9.3306664 25 26 27 28 29 30 -1.6137291 -6.0962213 7.3619621 -1.0962213 9.9645507 -2.2134439 31 32 33 34 35 36 -8.0962213 -4.1526719 -2.4721978 -20.9425356 3.5034935 1.9845381 37 38 39 40 41 42 -5.4965065 5.5399566 3.6085616 9.2447395 12.6328704 3.4670303 43 44 45 46 47 48 -11.2377526 -14.3549752 -14.4600434 -5.4600434 
2.5278022 -6.2499070 49 50 51 52 53 54 2.7865561 -2.9668443 4.5034935 2.7500930 9.4105797 -9.6137291 55 56 57 58 59 60 9.0696188 2.4791847 -5.5086609 -1.0719125 3.3741165 4.7744018 61 62 63 64 65 66 3.2204308 0.5642654 0.6450248 -5.6258835 -14.6137291 -0.3185120 67 68 69 70 71 72 13.6450248 -3.4721978 -0.3549752 -9.8374674 3.9402419 4.7865561 73 74 75 76 77 78 -2.0719125 6.7622474 -4.6501923 5.0331557 -1.4600434 5.9159331 79 80 81 82 83 84 8.7379386 -4.8724830 -2.9789987 10.6693336 14.1868414 11.5034935 85 86 87 88 89 90 -6.8846374 4.6450248 -7.3306664 6.3498077 -1.9089462 6.6450248 91 92 93 94 95 96 -2.2012895 -5.1769807 -5.9425356 -1.3549752 8.0696188 -5.7552605 97 98 99 100 -6.3185120 -12.2377526 3.9159331 -3.6258835 # Using those, we could easily (or perhaps NOT so easily, if we want really # nice graphics with clean labels, sufficient white space, etc.) create a # plot of residuals versus predicted values. We didn't actually do that # in class. # The plot we DID do was a normal quantile-quantile plot of the residuals: > qqnorm(regout$res) > qqline(regout$res) # That plot is useful for evaluating the assumption that errors are # normally distributed. (They look pretty good here.) # We returned to R much later in the class when we were discussing # externally Studentized residuals. # Here, we calculate a p-value for the residual for observation number 34 # in the Grades data. The original df for the error term in the # regression was 97. For the outlier test, we use 96 because the external # Studentization is equivalent to dropping the observation or creating # a dummy code for it, both of which would reduce the error df by 1. > 2*pt(-3.14360, 96) [1] 0.002220702 # Finally, we used our Peabody and Raven data set to illustrate the idea # of leverage. We create an artificial data set that adds a fictitious # observation precisely at the mean of the Raven distribution (where the # leverage is zero), but with an outrageously large value for Peabody. 
# Note that the slope estimate is completely unchanged: > plot(Raven,Peabody) > mean(Raven) [1] 32.525 > newRaven <- c(Raven,32.525) > newPeabody <- c(Peabody,10000000) > lm(Peabody~Raven) Call: lm(formula = Peabody ~ Raven) Coefficients: (Intercept) Raven 63.7942 0.5498 > lm(newPeabody~newRaven) Call: lm(formula = newPeabody ~ newRaven) Coefficients: (Intercept) newRaven 2.440e+05 5.498e-01 >