# Here is how we performed the regression of Raven on Peabody
# during our last class:

> lm(Raven~Peabody)

Call:
lm(formula = Raven ~ Peabody)

Coefficients:
(Intercept)      Peabody
     -7.589        0.496

# There's actually a lot more information in the lm() output
# than what it shows by default. We'll need some of that later
# on, so let's save the output:

> lm(Raven~Peabody) -> regout

# If I enter the name of that object, it will show me the same
# default display, but there's actually more there:

> regout

Call:
lm(formula = Raven ~ Peabody)

Coefficients:
(Intercept)      Peabody
     -7.589        0.496

# For example, we can directly access the regression coefficients:

> regout$coef
(Intercept)     Peabody
 -7.5894791   0.4959945
> regout$coef[1]
(Intercept)
  -7.589479
> regout$coef[2]
  Peabody
0.4959945

# So I could use the accessible coefficients to calculate
# the predicted conditional means:

> predicted <- regout$coef[1] + regout$coef[2]*Peabody

# Here's a table showing the Peabody scores (predictors) with
# the corresponding conditional means:

> cbind(Peabody,predicted)
      Peabody predicted
 [1,]      81  32.58607
 [2,]      94  39.03400
 [3,]      84  34.07406
 [4,]      74  29.11411
 [5,]      71  27.62613
 [6,]      59  21.67420
 [7,]      86  35.06605
 [8,]      63  23.65817
 [9,]      81  32.58607
[10,]     100  42.00997
[11,]      63  23.65817
[12,]      78  31.09809
[13,]      87  35.56204
[14,]      75  29.61011
[15,]      62  23.16218
[16,]      64  24.15417
[17,]      85  34.57005
[18,]      76  30.10610
[19,]      83  33.57806
[20,]      82  33.08207
[21,]      77  30.60210
[22,]      76  30.10610
[23,]     102  43.00196
[24,]      75  29.61011
[25,]      77  30.60210
[26,]      68  26.13815
[27,]      75  29.61011
[28,]      86  35.06605
[29,]      78  31.09809
[30,]      88  36.05804
[31,]     100  42.00997
[32,]      70  27.13014
[33,]      85  34.57005
[34,]      74  29.11411
[35,]     123  53.41784
[36,]      81  32.58607
[37,]      69  26.63414
[38,]      94  39.03400
[39,]      78  31.09809
[40,]      99  41.51398
[41,]      77  30.60210
[42,]      93  38.53801
[43,]      65  24.65016
[44,]      65  24.65016
[45,]      94  39.03400
[46,]      69  26.63414
[47,]      78  31.09809
[48,]      82  33.08207
[49,]      66  25.14616
[50,]      89  36.55403

# The residuals are estimates of the error component of
# the model:

> residual <- Raven - predicted
> cbind(Peabody,Raven,predicted,residual)
      Peabody Raven predicted     residual
 [1,]      81    33  32.58607   0.41392540
 [2,]      94    25  39.03400 -14.03400297
 [3,]      84    42  34.07406   7.92594193
 [4,]      74    20  29.11411  -9.11411317
 [5,]      71    21  27.62613  -6.62612970
 [6,]      59    15  21.67420  -6.67419582
 [7,]      86    34  35.06605  -1.06604705
 [8,]      63    16  23.65817  -7.65817378
 [9,]      81    35  32.58607   2.41392540
[10,]     100    45  42.00997   2.99003009
[11,]      63    33  23.65817   9.34182622
[12,]      78    47  31.09809  15.90190887
[13,]      87    50  35.56204  14.43795846
[14,]      75    31  29.61011   1.38989234
[15,]      62    26  23.16218   2.83782071
[16,]      64    28  24.15417   3.84583173
[17,]      85    35  34.57005   0.42994744
[18,]      76    44  30.10610  13.89389785
[19,]      83    34  33.57806   0.42193642
[20,]      82    48  33.08207  14.91793091
[21,]      77    40  30.60210   9.39790336
[22,]      76    31  30.10610   0.89389785
[23,]     102    50  43.00196   6.99804111
[24,]      75    34  29.61011   4.38989234
[25,]      77    48  30.60210  17.39790336
[26,]      68    18  26.13815  -8.13814623
[27,]      75    29  29.61011  -0.61010766
[28,]      86    20  35.06605 -15.06604705
[29,]      78    28  31.09809  -3.09809113
[30,]      88    24  36.05804 -12.05803603
[31,]     100    42  42.00997  -0.00996991
[32,]      70    40  27.13014  12.86986479
[33,]      85    21  34.57005 -13.57005256
[34,]      74    21  29.11411  -8.11411317
[35,]     123    47  53.41784  -6.41784318
[36,]      81    29  32.58607  -3.58607460
[37,]      69    33  26.63414   6.36585928
[38,]      94    45  39.03400   5.96599703
[39,]      78    35  31.09809   3.90190887
[40,]      99    43  41.51398   1.48602458
[41,]      77    15  30.60210 -15.60209664
[42,]      93    42  38.53801   3.46199152
[43,]      65    11  24.65016 -13.65016276
[44,]      65    26  24.65016   1.34983724
[45,]      94    37  39.03400  -2.03400297
[46,]      69    13  26.63414 -13.63414072
[47,]      78    33  31.09809   1.90190887
[48,]      82    30  33.08207  -3.08206909
[49,]      66    24  25.14616  -1.14615725
[50,]      89    34  36.55403  -2.55403052
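
# [Added aside, not part of the class transcript] A minimal sketch of how you
# could picture what we just computed, assuming the Raven, Peabody, regout,
# and predicted objects above are still in the workspace: plot the raw data,
# overlay the fitted line, and draw each residual as the vertical gap between
# a point and its predicted conditional mean. These base-R plotting calls
# draw a figure but print no console output.

plot(Peabody, Raven)                          # scatterplot of the raw scores
abline(regout)                                # the estimated regression line
segments(Peabody, predicted, Peabody, Raven)  # residuals as vertical segments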
# Here is the sum of squared deviations from the mean
# of the variable whose distribution we are interested in:

> sum((Raven-mean(Raven))^2)
[1] 5500.5

# Here's a more efficient way to calculate that:

> 49*var(Raven)
[1] 5500.5

# Let's save it:

> SSy <- 49*var(Raven)

# Here's the sum of squares for the predicted conditional
# means. Think of that as an expression of how much variation
# the model helps us understand:

> SSm <- 49*var(predicted)

# Note that there's a lot more variation we COULD understand...

> SSy
[1] 5500.5

# ...than variation we DO understand:

> SSm
[1] 1878.778

# Finally, there's the variation we DON'T understand. The mean of
# the residuals will always be zero:

> mean(residual)
[1] 2.394468e-14

# So we can calculate the sum of squared deviations from the mean
# like this (no need to subtract zero):

> SSe <- sum(residual^2)
> SSe
[1] 3621.722

# Note that these sums of squares have an additive relationship.
# The total sum of squares minus the model sum of squares is the
# error sum of squares:

> SSy - SSm
[1] 3621.722

# And the sum of squares that we DO understand plus the sum of
# squares we DON'T understand adds up to the sum of squares
# that we COULD understand (if every Raven were exactly at its
# predicted conditional mean):

> SSm + SSe
[1] 5500.5

# The estimated regression coefficients are those that minimize
# the sum of squared residuals:

> SSe
[1] 3621.722
> regout$coef
(Intercept)     Peabody
 -7.5894791   0.4959945

# Here, we use a different set of values for the coefficients:

> badpred <- -7 + .55*Peabody
> badres <- Raven - badpred

# The sum of squared residuals is larger:

> sum(badres^2)
[1] 4849.893

# Even if we choose values that are very close to the real estimates...

> badpred <- -7.6 + .5*Peabody

# The sum of squared residuals will be larger:

> badres <- Raven - badpred
> sum(badres^2)
[1] 3626.65

# Here's the sum of squares for the model again:

> SSm
[1] 1878.778

# If we take the ratio of the sum of squares that we DO
# understand over the sum of squares that we COULD understand,
# the resulting "coefficient of determination" represents an
# estimate of the proportion of variability that we understand
# using the model:

> SSm/SSy
[1] 0.3415649

# The square root of that ratio, with the sign of the slope attached
# to it, is identical to the correlation coefficient:

> sqrt(SSm/SSy)
[1] 0.5844355
> cor(Raven,Peabody)
[1] 0.5844355

# As an aside, it is important that you understand that the
# correlation of Raven with Peabody is identical to the
# correlation of Peabody with Raven:

> cor(Peabody,Raven)
[1] 0.5844355

# But the regression of Raven on Peabody...

> regout

Call:
lm(formula = Raven ~ Peabody)

Coefficients:
(Intercept)      Peabody
     -7.589        0.496

# ...is NOT the same as the regression of Peabody on Raven:

> lm(Peabody~Raven)

Call:
lm(formula = Peabody ~ Raven)

Coefficients:
(Intercept)        Raven
    57.9144       0.6886
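
# [Added aside, not part of the class transcript] A sketch of why the two
# regressions give different slopes: each slope rescales the same correlation
# by a different ratio of standard deviations. Output omitted here, but
# assuming the Raven and Peabody vectors above, the two values should match
# the slopes shown above, up to rounding.

cor(Raven, Peabody) * sd(Raven) / sd(Peabody)   # slope for Raven ~ Peabody (about 0.496)
cor(Raven, Peabody) * sd(Peabody) / sd(Raven)   # slope for Peabody ~ Raven (about 0.689)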
# Here's our object representing the regression of Raven
# on Peabody:

> regout

Call:
lm(formula = Raven ~ Peabody)

Coefficients:
(Intercept)      Peabody
     -7.589        0.496

# Previously, to aid our understanding, we calculated the residuals
# from this regression manually. But there's actually an object in
# the regression output that already contains those values:

> regout$res
           1            2            3            4            5            6
  0.41392540 -14.03400297   7.92594193  -9.11411317  -6.62612970  -6.67419582
           7            8            9           10           11           12
 -1.06604705  -7.65817378   2.41392540   2.99003009   9.34182622  15.90190887
          13           14           15           16           17           18
 14.43795846   1.38989234   2.83782071   3.84583173   0.42994744  13.89389785
          19           20           21           22           23           24
  0.42193642  14.91793091   9.39790336   0.89389785   6.99804111   4.38989234
          25           26           27           28           29           30
 17.39790336  -8.13814623  -0.61010766 -15.06604705  -3.09809113 -12.05803603
          31           32           33           34           35           36
 -0.00996991  12.86986479 -13.57005256  -8.11411317  -6.41784318  -3.58607460
          37           38           39           40           41           42
  6.36585928   5.96599703   3.90190887   1.48602458 -15.60209664   3.46199152
          43           44           45           46           47           48
-13.65016276   1.34983724  -2.03400297 -13.63414072   1.90190887  -3.08206909
          49           50
 -1.14615725  -2.55403052

# Note that they are the same as what we calculated ourselves:

> cbind(residual,regout$res)
       residual
1    0.41392540   0.41392540
2  -14.03400297 -14.03400297
3    7.92594193   7.92594193
4   -9.11411317  -9.11411317
5   -6.62612970  -6.62612970
6   -6.67419582  -6.67419582
7   -1.06604705  -1.06604705
8   -7.65817378  -7.65817378
9    2.41392540   2.41392540
10   2.99003009   2.99003009
11   9.34182622   9.34182622
12  15.90190887  15.90190887
13  14.43795846  14.43795846
14   1.38989234   1.38989234
15   2.83782071   2.83782071
16   3.84583173   3.84583173
17   0.42994744   0.42994744
18  13.89389785  13.89389785
19   0.42193642   0.42193642
20  14.91793091  14.91793091
21   9.39790336   9.39790336
22   0.89389785   0.89389785
23   6.99804111   6.99804111
24   4.38989234   4.38989234
25  17.39790336  17.39790336
26  -8.13814623  -8.13814623
27  -0.61010766  -0.61010766
28 -15.06604705 -15.06604705
29  -3.09809113  -3.09809113
30 -12.05803603 -12.05803603
31  -0.00996991  -0.00996991
32  12.86986479  12.86986479
33 -13.57005256 -13.57005256
34  -8.11411317  -8.11411317
35  -6.41784318  -6.41784318
36  -3.58607460  -3.58607460
37   6.36585928   6.36585928
38   5.96599703   5.96599703
39   3.90190887   3.90190887
40   1.48602458   1.48602458
41 -15.60209664 -15.60209664
42   3.46199152   3.46199152
43 -13.65016276 -13.65016276
44   1.34983724   1.34983724
45  -2.03400297  -2.03400297
46 -13.63414072 -13.63414072
47   1.90190887   1.90190887
48  -3.08206909  -3.08206909
49  -1.14615725  -1.14615725
50  -2.55403052  -2.55403052
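
# [Added aside, not part of the class transcript] regout$res works because R
# partially matches it to the full component name regout$residuals (and
# regout$fit to regout$fitted.values). The usual extractor functions pull out
# the same pieces; output omitted, but the values should be identical to
# regout$res and regout$fit.

residuals(regout)   # the residuals (same values as regout$res)
fitted(regout)      # the predicted conditional means (same values as regout$fit)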
# The regression object also contains the predicted conditional means.
# You will see that they (regout$fit) are identical to the values
# we calculated before (predicted):

> cbind(predicted, regout$fit)
   predicted
1   32.58607 32.58607
2   39.03400 39.03400
3   34.07406 34.07406
4   29.11411 29.11411
5   27.62613 27.62613
6   21.67420 21.67420
7   35.06605 35.06605
8   23.65817 23.65817
9   32.58607 32.58607
10  42.00997 42.00997
11  23.65817 23.65817
12  31.09809 31.09809
13  35.56204 35.56204
14  29.61011 29.61011
15  23.16218 23.16218
16  24.15417 24.15417
17  34.57005 34.57005
18  30.10610 30.10610
19  33.57806 33.57806
20  33.08207 33.08207
21  30.60210 30.60210
22  30.10610 30.10610
23  43.00196 43.00196
24  29.61011 29.61011
25  30.60210 30.60210
26  26.13815 26.13815
27  29.61011 29.61011
28  35.06605 35.06605
29  31.09809 31.09809
30  36.05804 36.05804
31  42.00997 42.00997
32  27.13014 27.13014
33  34.57005 34.57005
34  29.11411 29.11411
35  53.41784 53.41784
36  32.58607 32.58607
37  26.63414 26.63414
38  39.03400 39.03400
39  31.09809 31.09809
40  41.51398 41.51398
41  30.60210 30.60210
42  38.53801 38.53801
43  24.65016 24.65016
44  24.65016 24.65016
45  39.03400 39.03400
46  26.63414 26.63414
47  31.09809 31.09809
48  33.08207 33.08207
49  25.14616 25.14616
50  36.55403 36.55403

# The following calculations relate to the ANOVA table in
# the PowerPoint. Here is the "total" sum of squares:

> 49*var(Raven)
[1] 5500.5

# Here is the model sum of squares:

> 49*var(regout$fit)
[1] 1878.778

# And here is the error sum of squares:

> sum(regout$res^2)
[1] 3621.722

# The model SS has 1 degree of freedom (df), so the mean
# square is the same as the sum of squares:

> MSm <- 1878.778 / 1
> MSm
[1] 1878.778

# The error sum of squares has N-2 = 48 df, so the mean square is:

> MSe <- sum(regout$res^2) / 48
> MSe
[1] 75.45255

# The mean square representing variation we DO understand is about 24
# times larger than the mean square that represents variation we
# DON'T understand:

> F <- MSm / MSe
> F
[1] 24.90013

# That's much larger than the critical value of the F distribution
# with 1 df in the numerator and 48 df in the denominator, so we
# reject the null hypothesis that the slope is zero:

> qf(.95, 1, 48)
[1] 4.042652

# Alternatively, we can compare an exact p value to our alpha level
# and reach the same conclusion:

> 1-pf(F, 1, 48)
[1] 8.340838e-06

# Here's our regression output again:

> regout

Call:
lm(formula = Raven ~ Peabody)

Coefficients:
(Intercept)      Peabody
     -7.589        0.496

# We can apply an anova "wrapper" to the output to get the
# ANOVA table:

> anova(regout)
Analysis of Variance Table

Response: Raven
          Df Sum Sq Mean Sq F value    Pr(>F)
Peabody    1 1878.8 1878.78    24.9 8.341e-06 ***
Residuals 48 3621.7   75.45
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

# We can also apply a "summary" wrapper to get the equivalent
# t test:

> summary(regout)

Call:
lm(formula = Raven ~ Peabody)

Residuals:
     Min       1Q   Median       3Q      Max
-15.6021  -6.5741   0.4259   4.2679  17.3979

Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)  -7.5895     8.0481  -0.943     0.35
Peabody       0.4960     0.0994   4.990 8.34e-06 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 8.686 on 48 degrees of freedom
Multiple R-squared:  0.3416,    Adjusted R-squared:  0.3278
F-statistic:  24.9 on 1 and 48 DF,  p-value: 8.341e-06
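
# [Added aside, not part of the class transcript] Two of the numbers in the
# summary can be tied back to the sums of squares we built by hand above,
# assuming MSe, SSm, and SSy are still in the workspace. Output omitted, but
# each line should match the summary, up to rounding.

sqrt(MSe)   # "Residual standard error" (about 8.686) is the square root of the error mean square
SSm / SSy   # "Multiple R-squared" (about 0.3416) is the coefficient of determination from earlier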
# Note that the t of 4.99 is just the square root of the
# F of 24.9:

> 4.99^2
[1] 24.9001
> F
[1] 24.90013

# We can use the information from the summary() output to
# construct a 95% confidence interval for the slope:

> qt(.975, 48)
[1] 2.010635
> .496 - 2.01*.0994
[1] 0.296206
> .496 + 2.01*.0994
[1] 0.695794
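
# [Added aside, not part of the class transcript] R will also compute the
# confidence interval for the slope directly from the regression object.
# Because confint() uses the unrounded estimate and standard error, its
# limits should be close to, but not exactly equal to, the hand calculation
# above. Output omitted.

confint(regout, "Peabody", level = 0.95)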