# We continued discussing the regression of Peabody on Raven.

> attach(JackStatlab)
> lm(CTPEA~CTRA)

Call:
lm(formula = CTPEA ~ CTRA)

Coefficients:
(Intercept)         CTRA
    59.0490       0.7319

# We reviewed computational formulas for the slope and intercept.
# (There's no need to calculate them this way, given that we have
# the lm() function, but this is what's happening behind the scenes.)

> sum((CTPEA-mean(CTPEA))*(CTRA-mean(CTRA))) / sum((CTRA-mean(CTRA))^2)
[1] 0.731873

> lm(CTPEA~CTRA)$coef
(Intercept)        CTRA
  59.049037    0.731873

> mean(CTPEA) - 0.731873*mean(CTRA)
[1] 59.04904

# We talked about how variation in Peabody can be broken down
# into a predicted value (conditional mean) that we can understand
# using the model...

> PRED <- 59.049037 + 0.731873*CTRA
> PRED
 [1] 74.41837 70.02713 91.98332 84.66459 78.80961 78.07773 70.02713 75.88212
 [9] 68.56339 86.86021 74.41837 75.88212 87.59208 84.66459 82.46897 93.44707
[17] 91.98332 89.05583 82.46897 76.61399 86.86021 77.34586 91.25145 88.32396
[25] 71.49088 90.51958 74.41837 79.54148 86.86021 79.54148 81.73710 87.59208
[33] 74.41837 86.12834 74.41837 75.15024 75.88212 72.22275 79.54148 87.59208
[41] 89.78770 78.07773 83.20085 89.05583 83.93272 84.66459 81.00523 83.20085
[49] 79.54148 89.78770

# ...and the leftover or "residual" part that we don't understand:

> RESIDUAL <- CTPEA - PRED

# If we look at these values in tabular form, we see that Peabody (leftmost
# column) is equal to what we understand plus what we don't understand, so
# that the sum of PRED and RESIDUAL (rightmost column) is equal to Peabody:

> cbind(CTPEA, PRED, RESIDUAL, PRED+RESIDUAL)
      CTPEA     PRED   RESIDUAL
 [1,]    72 74.41837  -2.418370  72
 [2,]    69 70.02713  -1.027132  69
 [3,]    80 91.98332 -11.983322  80
 [4,]    90 84.66459   5.335408  90
 [5,]    77 78.80961  -1.809608  77
 [6,]    84 78.07773   5.922265  84
 [7,]    59 70.02713 -11.027132  59
 [8,]    72 75.88212  -3.882116  72
 [9,]    62 68.56339  -6.563386  62
[10,]    82 86.86021  -4.860211  82
[11,]    66 74.41837  -8.418370  66
[12,]    72 75.88212  -3.882116  72
[13,]    96 87.59208   8.407916  96
[14,]    80 84.66459  -4.664592  80
[15,]    74 82.46897  -8.468973  74
[16,]    81 93.44707 -12.447068  81
[17,]    82 91.98332  -9.983322  82
[18,]    81 89.05583  -8.055830  81
[19,]    69 82.46897 -13.468973  69
[20,]    68 76.61399  -8.613989  68
[21,]    85 86.86021  -1.860211  85
[22,]    69 77.34586  -8.345862  69
[23,]   106 91.25145  14.748551 106
[24,]    95 88.32396   6.676043  95
[25,]    60 71.49088 -11.490878  60
[26,]    64 90.51958 -26.519576  64
[27,]    86 74.41837  11.581630  86
[28,]    79 79.54148  -0.541481  79
[29,]    83 86.86021  -3.860211  83
[30,]    80 79.54148   0.458519  80
[31,]    78 81.73710  -3.737100  78
[32,]    64 87.59208 -23.592084  64
[33,]    89 74.41837  14.581630  89
[34,]    89 86.12834   2.871662  89
[35,]    81 74.41837   6.581630  81
[36,]    86 75.15024  10.849757  86
[37,]    75 75.88212  -0.882116  75
[38,]    71 72.22275  -1.222751  71
[39,]   122 79.54148  42.458519 122
[40,]   104 87.59208  16.407916 104
[41,]   103 89.78770  13.212297 103
[42,]    65 78.07773 -13.077735  65
[43,]    92 83.20085   8.799154  92
[44,]   108 89.05583  18.944170 108
[45,]    86 83.93272   2.067281  86
[46,]    85 84.66459   0.335408  85
[47,]    88 81.00523   6.994773  88
[48,]    78 83.20085  -5.200846  78
[49,]    91 79.54148  11.458519  91
[50,]   103 89.78770  13.212297 103

# Now we consider why the regression estimates are what they are.
# It makes sense to think that the best regression would be the one
# that minimizes the typical residual. But we have a problem. If we
# define "typical" as the mean, the mean residual for the regression
# estimates is always zero.
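# (A small side sketch, not from the original session: because the least-squares
# intercept is mean(CTPEA) - slope*mean(CTRA), the fitted line passes through
# the point of means, which forces the residuals to sum to zero apart from
# rounding. The residuals that lm() itself stores show this:)

> sum(resid(lm(CTPEA~CTRA)))     # essentially zero (floating-point noise only)
> mean(resid(lm(CTPEA~CTRA)))    # so the mean residual can't identify a "best" line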
> mean(RESIDUAL)
[1] -3.2e-07

# (Ours isn't exactly zero only because we rounded the coefficients when we
# computed PRED by hand.)

# We could think about considering the mean absolute value of the
# residuals, but, as was the case when we thought about variance and
# standard deviation, we reject that idea because it's too hard to do
# math when absolute values are involved. So instead, we make all the
# residuals positive by squaring them:

> sum(RESIDUAL^2)
[1] 6760.777

# We're going to define the "best" regression equation as the one
# that minimizes the sum of squared residuals. Here are the regression
# estimates again:

> lm(CTPEA~CTRA)

Call:
lm(formula = CTPEA ~ CTRA)

Coefficients:
(Intercept)         CTRA
    59.0490       0.7319

# If we choose any other values for the intercept and slope, the
# sum of squared residuals for those values will be higher than the
# one we got from our real regression estimates:

> BADRES <- CTPEA - (62 + .6*CTRA)
> sum(BADRES^2)
[1] 6894.12
> BADRES <- CTPEA - (60 + .7*CTRA)
> sum(BADRES^2)
[1] 6764.98
> BADRES <- CTPEA - (59 + .72*CTRA)
> sum(BADRES^2)
[1] 6769.973

# Here's the real sum of squares again. Note that each of those
# other "regressions" resulted in a higher sum of squares.

> sum(RESIDUAL^2)
[1] 6760.777

# We repeat a good scatterplot of the relationship. (This was pasted from
# the 2/28/2021 transcript; that's where "xinc" and "yinc" were calculated.)

> par(pin=c(6,4))
> plot(CTRA, CTPEA, main="Peabody Conditioned on Raven", xlab="Raven", ylab="Peabody",
+      xlim = c(min(CTRA)-xinc, max(CTRA)+xinc), ylim = c(min(CTPEA)-yinc, max(CTPEA)+yinc))

# Here's the regression line:

> abline(lm(CTPEA~CTRA)$coef)

# The point nearest the lower left corner of that plot has the smallest
# Raven value:

> min(CTRA)
[1] 13

# The modeled conditional mean Peabody for that Raven score is:

> 59.0490 + 0.7319*13
[1] 68.5637

# We see that the observed Peabody score for the Raven=13 case is 62:

> cbind(CTPEA, CTRA)
      CTPEA CTRA
 [1,]    72   21
 [2,]    69   15
 [3,]    80   45
 [4,]    90   35
 [5,]    77   27
 [6,]    84   26
 [7,]    59   15
 [8,]    72   23
 [9,]    62   13   # <-- Here it is
[10,]    82   38
[11,]    66   21
[12,]    72   23
[13,]    96   39
[14,]    80   35
[15,]    74   32
[16,]    81   47
[17,]    82   45
[18,]    81   41
[19,]    69   32
[20,]    68   24
[21,]    85   38
[22,]    69   25
[23,]   106   44
[24,]    95   40
[25,]    60   17
[26,]    64   43
[27,]    86   21
[28,]    79   28
[29,]    83   38
[30,]    80   28
[31,]    78   31
[32,]    64   39
[33,]    89   21
[34,]    89   37
[35,]    81   21
[36,]    86   22
[37,]    75   23
[38,]    71   18
[39,]   122   28
[40,]   104   39
[41,]   103   42
[42,]    65   26
[43,]    92   33
[44,]   108   41
[45,]    86   34
[46,]    85   35
[47,]    88   30
[48,]    78   33
[49,]    91   28
[50,]   103   42

# So the residual is:

> 62 - (59.0490 + 0.7319*13)
[1] -6.5637

# ...and we can see that the Peabody score in question is about
# 6-1/2 points below the line.

# Near the middle of the Raven values, we have a very large (122)
# Peabody score, with a corresponding Raven of 28, so the residual
# for that case is:

> 122 - (59.0490 + 0.7319*28)
[1] 42.4578

# When we calculate the regression using lm(), we can save the
# results in an appropriately named variable. (Here, I have chosen
# "regout" to remind me that this is regression output.)

> regout <- lm(CTPEA~CTRA)

# If we look at that object, we see the same thing we see when
# we use the lm() function:

> regout

Call:
lm(formula = CTPEA ~ CTRA)

Coefficients:
(Intercept)         CTRA
    59.0490       0.7319

# But the output actually contains a lot more than that.
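# (A quick way to see everything that lm() saved is names(); this is just a
# sketch using the regout object from above. The abbreviations $coef, $fit,
# and $res used below work because R matches partial component names after $.)

> names(regout)    # includes "coefficients", "residuals", "fitted.values", and more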
# For example, we can see more precise values for the intercept
# and slope (the "coefficients"):

> lm(CTPEA~CTRA)$coef
(Intercept)        CTRA
  59.049037    0.731873
> regout$coef
(Intercept)        CTRA
  59.049037    0.731873

# The lm() function automatically calculates the estimated
# conditional means:

> regout$fit
       1        2        3        4        5        6        7        8
74.41837 70.02713 91.98332 84.66459 78.80961 78.07773 70.02713 75.88212
       9       10       11       12       13       14       15       16
68.56339 86.86021 74.41837 75.88212 87.59208 84.66459 82.46897 93.44707
      17       18       19       20       21       22       23       24
91.98332 89.05583 82.46897 76.61399 86.86021 77.34586 91.25145 88.32396
      25       26       27       28       29       30       31       32
71.49088 90.51958 74.41837 79.54148 86.86021 79.54148 81.73710 87.59208
      33       34       35       36       37       38       39       40
74.41837 86.12834 74.41837 75.15024 75.88212 72.22275 79.54148 87.59208
      41       42       43       44       45       46       47       48
89.78770 78.07773 83.20085 89.05583 83.93272 84.66459 81.00523 83.20085
      49       50
79.54148 89.78770

# It also calculates the residuals:

> regout$res
          1           2           3           4           5           6
 -2.4183697  -1.0271318 -11.9833216   5.3354083  -1.8096077   5.9222653
          7           8           9          10          11          12
-11.0271318  -3.8821157  -6.5633858  -4.8602106  -8.4183697  -3.8821157
         13          14          15          16          17          18
  8.4079164  -4.6645917  -8.4689727 -12.4470676  -9.9833216  -8.0558296
         19          20          21          22          23          24
-13.4689727  -8.6139887  -1.8602106  -8.3458617  14.7485514   6.6760434
         25          26          27          28          29          30
-11.4908778 -26.5195756  11.5816303  -0.5414807  -3.8602106   0.4585193
         31          32          33          34          35          36
 -3.7370997 -23.5920836  14.5816303   2.8716624   6.5816303  10.8497573
         37          38          39          40          41          42
 -0.8821157  -1.2227508  42.4585193  16.4079164  13.2122974 -13.0777347
         43          44          45          46          47          48
  8.7991543  18.9441704   2.0672813   0.3354083   6.9947733  -5.2008457
         49          50
 11.4585193  13.2122974

# So the sum of squared residuals is the same as what we've seen before:

> sum(regout$res^2)
[1] 6760.777

# And the "fit" and "res" values add up to the Peabody values:

> regout$fit + regout$res
  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20
 72  69  80  90  77  84  59  72  62  82  66  72  96  80  74  81  82  81  69  68
 21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40
 85  69 106  95  60  64  86  79  83  80  78  64  89  89  81  86  75  71 122 104
 41  42  43  44  45  46  47  48  49  50
103  65  92 108  86  85  88  78  91 103

# Here's the sum of squared deviations from the mean of the "fit"
# values (which represent what we DO understand about Peabody using Raven):

> sum((regout$fit - mean(regout$fit))^2)
[1] 2189.003

# We could just as easily calculate that sum of squares this way:

> 49*var(regout$fit)
[1] 2189.003

# Here's the sum of squared residuals, which is what we minimized by
# calculating the regression coefficients the way we did:

> sum(regout$res^2)
[1] 6760.777

# Those add up to the total sum of squares that we might HOPE
# to understand using regression:

> 2189.003 + 6760.777
[1] 8949.78

# Which is the same as the sum of squares for Peabody:

> sum( (CTPEA-mean(CTPEA))^2 )
[1] 8949.78

# If we take the ratio of the sum of squares we understand over the
# sum of squares that we might hope to understand, we see that the
# regression model is helping us understand about 1/4 of the variation
# in Peabody:

> 2189.003 / 8949.78
[1] 0.2445874

# That ratio, called the coefficient of determination or R^2,
# is often used to assess how well the regression model is doing.
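# (R will also compute that ratio for us: summary() of an lm object stores it
# as r.squared. A minimal check, using the regout object from above; the value
# should match 2189.003 / 8949.78.)

> summary(regout)$r.squared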
# It also happens to be the square of the correlation coefficient:

> sqrt( 2189.003 / 8949.78 )
[1] 0.4945577
> cor(CTPEA, CTRA)
[1] 0.4945577
> cor(CTPEA, CTRA)^2
[1] 0.2445873

# The correlation of Peabody with Raven is the same as the correlation
# of Raven with Peabody:

> cor(CTRA, CTPEA)
[1] 0.4945577

# The regression of Raven on Peabody is very different from the
# regression of Peabody on Raven:

> lm(CTRA~CTPEA)

Call:
lm(formula = CTRA ~ CTPEA)

Coefficients:
(Intercept)        CTPEA
     3.5631       0.3342

# But the coefficient of determination for this regression is
# the same as for the regression of Peabody on Raven:

> temp <- lm(CTRA~CTPEA)$fit
> sum((temp-mean(temp))^2)
[1] 999.56
> sum((CTRA-mean(CTRA))^2)
[1] 4086.72
> 999.56 / 4086.72
[1] 0.2445873
> sqrt(999.56 / 4086.72)
[1] 0.4945577

# Usually when we calculate a sum of squares, we divide it
# by degrees of freedom to get a variance estimate:

> sum((CTPEA-mean(CTPEA))^2)
[1] 8949.78
> sum((CTPEA-mean(CTPEA))^2) / 49
[1] 182.6486
> var(CTPEA)
[1] 182.6486

# The same idea holds in the ANOVA table for inference in
# regression. The model sum of squares really only used one
# piece of information (the estimated slope), so it has one
# degree of freedom, and the variance estimate (called a
# "mean square" in this context) is the same as the sum of squares:

> ModelSS <- sum((regout$fit-mean(regout$fit))^2)
> ModelMS <- ModelSS / 1
> ModelSS
[1] 2189.003
> ModelMS
[1] 2189.003

# The error sum of squares has N - 2 degrees of freedom:

> ErrorSS <- sum(regout$res^2)
> ErrorSS
[1] 6760.777
> ErrorMS <- ErrorSS / 48
> ErrorMS
[1] 140.8495

# It looks like we understand about 15-1/2 times as much
# variation (per degree of freedom) as we don't understand:

> F <- ModelMS / ErrorMS
> F
[1] 15.54143

# If the null hypothesis is true and all of the assumptions
# are met, that statistic has an F distribution with 1 df in
# the numerator and 48 df in the denominator. The probability
# of getting a value below 15.54 from that distribution is:

> pf(F, 1, 48)
[1] 0.9997384

# So the probability of getting an F larger than 15.54 is only
# about 3 in 10000:

> 1 - pf(F, 1, 48)
[1] 0.0002615643

# It would be very rare to get a value that big (much rarer than
# our usual criterion of .05), so we reject the null hypothesis
# and conclude that it's not plausible that the true slope is zero.
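# (As a closing check, not run above: R will build this same ANOVA table and
# F test for us. anova(regout) should reproduce the model and error sums of
# squares, the mean squares, F = 15.54, and the p-value of about .0003, and
# summary(regout) reports the same F test along with the coefficients and R^2.)

> anova(regout)
> summary(regout)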