# At the beginning of class, I created a couple of made-up variables that
# will be useful for demonstrating what a bad residuals plot looks like.
# We didn't actually have time for that demonstration, so we'll take a
# look at that on Thursday:

> x_hetero <- c(
+ 50,51,52,52,53,54,56,56,57,58,59,60,63,64,65,66,66,66,66,69,70,72,74,74,74,
+ 76,76,76,77,81,82,82,83,84,84,84,85,85,85,88,91,92,92,92,93,96,97,98,98,99)
> y_hetero <- c(
+  80, 74, 76, 85, 87, 86, 86, 94, 87, 91, 94, 83, 96,101,108,104, 89,105, 91,
+ 115,104, 89,101, 95, 98,110,112,119,137,113,119,111,110,128,152,127,171,131,
+ 129,145,146,111, 77,135,115,150,135,124,177,117)

# Here's the previously run regression of Peabody on Raven:

> regout

Call:
lm(formula = CTPEA ~ CTRA)

Coefficients:
(Intercept)         CTRA
    59.0490       0.7319

> anova(regout)
Analysis of Variance Table

Response: CTPEA
          Df Sum Sq Mean Sq F value    Pr(>F)
CTRA       1 2189.0 2189.00  15.541 0.0002616 ***
Residuals 48 6760.8  140.85
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

> summary(regout)

Call:
lm(formula = CTPEA ~ CTRA)

Residuals:
    Min      1Q  Median      3Q     Max
-26.520  -8.273  -1.516   6.915  42.459

Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)  59.0490     5.9663   9.897 3.55e-13 ***
CTRA          0.7319     0.1856   3.942 0.000262 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 11.87 on 48 degrees of freedom
Multiple R-squared:  0.2446,    Adjusted R-squared:  0.2288
F-statistic: 15.54 on 1 and 48 DF,  p-value: 0.0002616

# Just to review, here's how it was created:

> attach(JackStatlab)
> regout <- lm(CTPEA~CTRA)
> regout

Call:
lm(formula = CTPEA ~ CTRA)

Coefficients:
(Intercept)         CTRA
    59.0490       0.7319

# We always want to consider a scatterplot when we use linear
# methods such as correlation and regression. Here's a bad one:

> plot(CTRA,CTPEA)

# We change the aspect ratio to be more horizontal:

> par(pin=c(6,4))

# We create increments that will help us add a bit of white
# space to the plot:

> RAinc <- .05* (max(CTRA)-min(CTRA))
> RAinc
[1] 1.7

# Note that this matches what we got in a previous class:

> xinc
[1] 1.7

# We do the same for Peabody:

> PBinc <- .05*(max(CTPEA)-min(CTPEA))
> PBinc
[1] 3.15
> yinc
[1] 3.15

# Here's a good scatterplot with a reasonable aspect ratio,
# title, axis labels, and white space:

> plot(CTRA, CTPEA, main="Regression of Peabody on Raven",
+   xlab="Raven Scores", ylab="Peabody Scores",
+   xlim=c(min(CTRA)-RAinc,max(CTRA)+RAinc),
+   ylim=c(min(CTPEA)-PBinc,max(CTPEA)+PBinc))
> abline(regout$coef)

# Here's an unimproved residuals plot:

> plot(regout$fit, regout$res)

# We make our usual improvements:

> resxinc <- .05 * (max(regout$fit)-min(regout$fit))
> resyinc <- .05 * (max(regout$res)-min(regout$res))
> plot(regout$fit, regout$res, main="Residuals vs. Fitted Values from\n Regression of Peabody on Raven",
+   xlab="Fitted Conditional Means", ylab="Residuals",
+   xlim=c(min(regout$fit)-resxinc,max(regout$fit)+resxinc),
+   ylim=c(min(regout$res)-resyinc,max(regout$res)+resyinc))

# It can be useful to add a horizontal reference line:

> abline(h=0)

# That residuals plot is useful for evaluating the assumption of
# linearity, as well as the assumption that variability in the
# errors (represented here by residuals) is the same through the
# range of fitted values.
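# (A sketch of the deferred demonstration, in case you want to try it before
# Thursday. It was not run in class, so no output is shown, and the object
# name hetout is made up for this sketch.) Regress the made-up y_hetero on
# x_hetero and build the same kind of residuals plot; the spread of the
# residuals should grow as the fitted values increase, which is what a
# violation of the constant-variance assumption looks like:

hetout <- lm(y_hetero ~ x_hetero)
plot(hetout$fit, hetout$res,
     main="Residuals vs. Fitted Values from\nRegression of y_hetero on x_hetero",
     xlab="Fitted Conditional Means", ylab="Residuals")
abline(h=0)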
# Here, we evaluate the normality of the residuals to assess
# the assumption that errors are normally distributed:

> stem(regout$res)

  The decimal point is 1 digit(s) to the right of the |

  -2 | 74
  -1 | 3322110
  -0 | 98888755544442221111
   0 | 00235677789
   1 | 112335569
   2 |
   3 |
   4 | 2

> qqnorm(regout$res);qqline(regout$res)

# Note that the plot for the residuals is similar to, but not
# identical to, the plot for Peabody:

> qqnorm(CTPEA)
> qqline(CTPEA)

# Note that the title for a Q-Q plot can be manipulated the same way
# we control the title of a scatterplot:

> qqnorm(regout$res, main="Normal Q-Q Plot for Regression\nof Peabody on Raven");qqline(regout$res)

# We take another look at the original scatterplot. There's concern about the one Peabody
# score that is much higher than we expect:

> plot(CTRA, CTPEA, main="Regression of Peabody on Raven",
+   xlab="Raven Scores", ylab="Peabody Scores",
+   xlim=c(min(CTRA)-RAinc,max(CTRA)+RAinc),
+   ylim=c(min(CTPEA)-PBinc,max(CTPEA)+PBinc))
> abline(regout$coef)

# It's the 39th case that stands out as extreme:

> CTPEA
 [1]  72  69  80  90  77  84  59  72  62  82  66  72  96  80  74  81  82  81  69
[20]  68  85  69 106  95  60  64  86  79  83  80  78  64  89  89  81  86  75  71
[39] 122 104 103  65  92 108  86  85  88  78  91 103

# A "Studentized residual" divides the residual by its standard
# error. An "externally Studentized residual" does the same thing,
# but using the standard error from a regression that omits that
# observation. R's "rstudent()" function gives externally
# Studentized residuals:

> rstudent(regout)
          1           2           3           4           5           6
-0.20628664 -0.08936128 -1.04756125  0.45132004 -0.15273225  0.50160128
          7           8           9          10          11          12
-0.96881902 -0.32988816 -0.57820932 -0.41273931 -0.72172749 -0.32988816
         13          14          15          16          17          18
 0.71801825 -0.39437460 -0.71730783 -1.09811401 -0.86962423 -0.69089540
         19          20          21          22          23          24
-1.15047096 -0.73395284 -0.15772921 -0.70970946  1.29231413  0.57023188
         25          26          27          28          29          30
-1.00236506 -2.41285366  0.99787014 -0.04565285 -0.32759810  0.03865804
         31          32          33          34          35          36
-0.31508800 -2.09524623  1.26421011  0.24317066  0.56304765  0.93133822
         37          38          39          40          41          42
-0.07487684 -0.10518522  4.19724329  1.42329478  1.14620560 -1.11932626
         43          44          45          46          47          48
 0.74591117  1.66343261  0.17438925  0.02831102  0.59136996 -0.43919159
         49          50
 0.97579609  1.14620560

# They can be viewed as t-tests of the idea that the case is an
# "outlier" (an extreme observation that, for some reason, comes from
# a different population than the other cases). The test has degrees of
# freedom one less than the degrees of freedom of the error mean
# square in the regression. In the case of simple, one-predictor
# regressions, that will be N-3. So we can view the externally Studentized
# residual for our worrisome point (4.197) as a test for outlier status.
# Here's the p value:

> 2*pt(-abs(4.19724329), 47)
[1] 0.0001190733

# That's dramatic evidence that this point doesn't behave like the others.
# I'm not too worried that it has skewed the regression, though, because
# the corresponding Raven score is near the mean of the Raven distribution:

> mean(CTRA)
[1] 30.84
> CTRA[39]
[1] 28
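# (Not run in class; just a sketch, and the object name outlier.p is made up
# for illustration.) The same outlier test can be applied to every case at
# once by feeding the whole vector of externally Studentized residuals to
# pt(), using the same N - 3 = 47 degrees of freedom. The 39th case should
# stand out with by far the smallest p value:

outlier.p <- 2 * pt(-abs(rstudent(regout)), 47)
round(outlier.p, 4)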
# I demonstrate the idea that the closer a predictor is to the mean
# of the predictor values, the less influence the case has on the
# regression slope. Here's the original regression:

> regout

Call:
lm(formula = CTPEA ~ CTRA)

Coefficients:
(Intercept)         CTRA
    59.0490       0.7319

# If I add a ridiculously large Peabody score that has a Raven
# score exactly at the mean...

> Y <- c(CTPEA,10000)
> X <- c(CTRA,30.84)

# ...the slope is absolutely unchanged:

> lm(Y~X)

Call:
lm(formula = Y ~ X)

Coefficients:
(Intercept)            X
   253.5271       0.7319

# A statistic called "leverage" measures the potential influence
# of a data point. Its calculation involves matrix algebra, which
# I'm not even going to try to explain here:

> X <- cbind(rep(1,50),CTRA)
> X
        CTRA
 [1,] 1   21
 [2,] 1   15
 [3,] 1   45
 [4,] 1   35
 [5,] 1   27
 [6,] 1   26
 [7,] 1   15
 [8,] 1   23
 [9,] 1   13
[10,] 1   38
[11,] 1   21
[12,] 1   23
[13,] 1   39
[14,] 1   35
[15,] 1   32
[16,] 1   47
[17,] 1   45
[18,] 1   41
[19,] 1   32
[20,] 1   24
[21,] 1   38
[22,] 1   25
[23,] 1   44
[24,] 1   40
[25,] 1   17
[26,] 1   43
[27,] 1   21
[28,] 1   28
[29,] 1   38
[30,] 1   28
[31,] 1   31
[32,] 1   39
[33,] 1   21
[34,] 1   37
[35,] 1   21
[36,] 1   22
[37,] 1   23
[38,] 1   18
[39,] 1   28
[40,] 1   39
[41,] 1   42
[42,] 1   26
[43,] 1   33
[44,] 1   41
[45,] 1   34
[46,] 1   35
[47,] 1   30
[48,] 1   33
[49,] 1   28
[50,] 1   42
> H <- X%*%solve(t(X)%*%X)%*%t(X)

# The result is a huge matrix, but the elements on the diagonal
# of the matrix represent this idea of potential influence:

> diag(H)
 [1] 0.04369274 0.08139535 0.06906272 0.02423459 0.02360817 0.02573213
 [7] 0.08139535 0.03504033 0.09787800 0.03254444 0.04369274 0.03504033
[13] 0.03629316 0.02423459 0.02032926 0.08390103 0.06906272 0.04525879
[19] 0.02032926 0.03144820 0.03254444 0.02834547 0.06237765 0.04053128
[25] 0.06687025 0.05618197 0.04369274 0.02197361 0.03254444 0.02197361
[31] 0.02000626 0.03629316 0.04369274 0.02928510 0.04369274 0.03912184
[37] 0.03504033 0.06034179 0.02197361 0.03629316 0.05047569 0.02573213
[43] 0.02114165 0.04525879 0.02244343 0.02423459 0.02017266 0.02114165
[49] 0.02197361 0.05047569

# The leverage for the 39th case (0.02197361) is small compared to most
# of the others, because the Raven value is near the mean.

# If we compare the actual regression...

> regout

Call:
lm(formula = CTPEA ~ CTRA)

Coefficients:
(Intercept)         CTRA
    59.0490       0.7319

# ...to the regression with that case dropped, we see that the estimated
# slope doesn't change very much when the extreme case is eliminated:

> X <- c(CTRA[1:38],CTRA[40:50])
> Y <- c(CTPEA[1:38],CTPEA[40:50])
> lm(Y~X)

Call:
lm(formula = Y ~ X)

Coefficients:
(Intercept)            X
     57.250        0.762

# Cook's distance provides a measure of how much the estimates actually do
# change when a point is dropped. Cook's distance for the 39th case is among
# the very highest, but that is partly because the case has a large effect on
# the intercept, which we care less about than the slope:

> cooks.distance(regout)
           1            2            3            4            5            6
9.919149e-04 3.612512e-04 4.062299e-02 2.572142e-03 2.878708e-04 3.375276e-03
           7            8            9           10           11           12
4.163725e-02 2.013267e-03 1.839183e-02 2.915678e-03 1.201946e-02 2.013267e-03
          13           14           15           16           17           18
9.806778e-03 1.966015e-03 5.393087e-03 5.498335e-02 2.819471e-02 1.143844e-02
          19           20           21           22           23           24
1.364096e-02 8.830257e-03 4.271236e-04 7.423640e-03 5.478813e-02 6.965970e-03
          25           26           27           28           29           30
3.599730e-02 1.574593e-01 2.274932e-02 2.391007e-05 1.839295e-03 1.714471e-05
          31           32           33           34           35           36
1.032769e-03 7.721153e-02 3.606128e-02 9.097974e-04 7.346762e-03 1.770669e-02
          37           38           39           40           41           42
1.039479e-04 3.627178e-04 1.470092e-01 3.734706e-02 3.469296e-02 1.645882e-02
          43           44           45           46           47           48
6.064509e-03 6.325537e-02 3.563027e-04 1.016500e-05 3.649437e-03 2.118662e-03
          49           50
1.070710e-02 3.469296e-02
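# (Not run in class; a sketch only, with the object names h.direct and
# h.formula made up for illustration.) R will compute these leverages for us
# with the built-in hatvalues() function, and for a one-predictor regression
# the diagonal of H also has a simple closed form. Both versions should
# reproduce the diag(H) values above, including 0.02197361 for the 39th case:

h.direct  <- hatvalues(regout)
h.formula <- 1/50 + (CTRA - mean(CTRA))^2 / sum((CTRA - mean(CTRA))^2)
h.direct[39]
h.formula[39]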
# We turn now to viewing familiar statistics as special cases of regression.
# The sample mean and variance...

> mean(CTPEA)
[1] 81.62
> sd(CTPEA); var(CTPEA)
[1] 13.51475
[1] 182.6486

# ...are equivalent to the intercept and error mean square from a regression
# of Peabody on a constant:

> temp <- lm(CTPEA~1)   # Note: that's a one, not a lower case ell
> temp

Call:
lm(formula = CTPEA ~ 1)

Coefficients:
(Intercept)
      81.62

> anova(temp)
Analysis of Variance Table

Response: CTPEA
          Df Sum Sq Mean Sq F value Pr(>F)
Residuals 49 8949.8  182.65

# A two-sample t test is equivalent to a regression on a dummy (0, 1)
# variable that indicates group membership (girl=0, boy=1).
# Here's the t test:

> t.test(CTPEA~CBSEX, var.equal=TRUE)

        Two Sample t-test

data:  CTPEA by CBSEX
t = -2.8494, df = 48, p-value = 0.006434
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -17.373374  -2.998421
sample estimates:
mean in group 0 mean in group 1
       76.73077        86.91667

# Here's a dummy variable for sex:

> CBSEX
 [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1
[39] 1 1 1 1 1 1 1 1 1 1 1 1

# The intercept is the girls' mean:

> lm(CTPEA~CBSEX)$coef
(Intercept)       CBSEX
   76.73077    10.18590

# And the intercept plus the slope is the boys' mean:

> 76.73077 + 10.18590
[1] 86.91667

# So the slope is what lets the boys' mean differ from the girls',
# and if we test the null hypothesis that the slope equals zero,
# we are testing exactly the same thing that the t test tested:

> lm(CTPEA~CBSEX) -> temp
> summary(temp)

Call:
lm(formula = CTPEA ~ CBSEX)

Residuals:
    Min      1Q  Median      3Q     Max
-22.917  -7.870  -0.917   5.223  35.083

Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)   76.731      2.477  30.982  < 2e-16 ***
CBSEX         10.186      3.575   2.849  0.00643 **
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 12.63 on 48 degrees of freedom
Multiple R-squared:  0.1447,    Adjusted R-squared:  0.1269
F-statistic: 8.119 on 1 and 48 DF,  p-value: 0.006434

# Note that the t statistic for the slope exactly matches the result from
# the t.test() function (apart from the sign, which just reflects which
# group mean gets subtracted from which).
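# (A quick check, not run in class; just a sketch, and the object name
# dummyfit is made up for illustration.) The conditional means implied by
# the dummy-coded regression can be recovered straight from the fitted
# object and compared with the group means computed directly:

dummyfit <- lm(CTPEA ~ CBSEX)
tapply(CTPEA, CBSEX, mean)   # girls' and boys' means computed directly
coef(dummyfit)[1]            # the intercept: the girls' mean
sum(coef(dummyfit))          # intercept + slope: the boys' mean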
# Actually, ANY artificially created variable that uses two
# distinct values to indicate membership in two different
# groups can give us the same test. Here, for example, we
# create what's called an "effects coding" scheme, but using
# -1's and 1's in place of dummy coding's 0's and 1's:

> table(CBSEX)
CBSEX
 0  1
26 24
> SexEffect <- c(rep(-1,26),rep(1,24))
> summary(lm(CTPEA~SexEffect))

Call:
lm(formula = CTPEA ~ SexEffect)

Residuals:
    Min      1Q  Median      3Q     Max
-22.917  -7.870  -0.917   5.223  35.083

Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)   81.824      1.787  45.779  < 2e-16 ***
SexEffect      5.093      1.787   2.849  0.00643 **
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 12.63 on 48 degrees of freedom
Multiple R-squared:  0.1447,    Adjusted R-squared:  0.1269
F-statistic: 8.119 on 1 and 48 DF,  p-value: 0.006434

# We got the same test because the regression is still identifying
# the same two groups:

> cbind(CBSEX,SexEffect)
      CBSEX SexEffect
 [1,]     0        -1
 [2,]     0        -1
 [3,]     0        -1
 [4,]     0        -1
 [5,]     0        -1
 [6,]     0        -1
 [7,]     0        -1
 [8,]     0        -1
 [9,]     0        -1
[10,]     0        -1
[11,]     0        -1
[12,]     0        -1
[13,]     0        -1
[14,]     0        -1
[15,]     0        -1
[16,]     0        -1
[17,]     0        -1
[18,]     0        -1
[19,]     0        -1
[20,]     0        -1
[21,]     0        -1
[22,]     0        -1
[23,]     0        -1
[24,]     0        -1
[25,]     0        -1
[26,]     0        -1
[27,]     1         1
[28,]     1         1
[29,]     1         1
[30,]     1         1
[31,]     1         1
[32,]     1         1
[33,]     1         1
[34,]     1         1
[35,]     1         1
[36,]     1         1
[37,]     1         1
[38,]     1         1
[39,]     1         1
[40,]     1         1
[41,]     1         1
[42,]     1         1
[43,]     1         1
[44,]     1         1
[45,]     1         1
[46,]     1         1
[47,]     1         1
[48,]     1         1
[49,]     1         1
[50,]     1         1
>
> summary(lm(CTPEA~SexEffect))

Call:
lm(formula = CTPEA ~ SexEffect)

Residuals:
    Min      1Q  Median      3Q     Max
-22.917  -7.870  -0.917   5.223  35.083

Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)   81.824      1.787  45.779  < 2e-16 ***
SexEffect      5.093      1.787   2.849  0.00643 **
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 12.63 on 48 degrees of freedom
Multiple R-squared:  0.1447,    Adjusted R-squared:  0.1269
F-statistic: 8.119 on 1 and 48 DF,  p-value: 0.006434

# This time, the value indicating "girl" is -1 instead of
# zero, so the conditional mean for girls is:

> 81.824 + (-1)*5.093
[1] 76.731

# And for boys:

> 81.824 + (1)*5.093
[1] 86.917

# The intercept is actually the mean of the two group means.
# If we had a data set where the number of boys and girls was
# the same, this would be the overall (unconditional) mean
# of Peabody.

> mean(c(76.731,86.917))
[1] 81.824
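# (Not run in class; a sketch only.) With 26 girls and 24 boys, the ordinary
# mean of Peabody weights the girls a bit more heavily, so it is not quite
# the same as the effects-coding intercept, which is the unweighted mean of
# the two group means:

mean(CTPEA)                       # the overall (weighted) mean, 81.62
mean(tapply(CTPEA, CBSEX, mean))  # the unweighted mean of the group means, 81.824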
# Here we create yet another coding system that uses two
# distinct values on a single variable to indicate group membership:

> nonsense <- c(rep(pi,26),rep(1384,24))
> cbind(CBSEX, SexEffect, nonsense)
      CBSEX SexEffect    nonsense
 [1,]     0        -1    3.141593
 [2,]     0        -1    3.141593
 [3,]     0        -1    3.141593
 [4,]     0        -1    3.141593
 [5,]     0        -1    3.141593
 [6,]     0        -1    3.141593
 [7,]     0        -1    3.141593
 [8,]     0        -1    3.141593
 [9,]     0        -1    3.141593
[10,]     0        -1    3.141593
[11,]     0        -1    3.141593
[12,]     0        -1    3.141593
[13,]     0        -1    3.141593
[14,]     0        -1    3.141593
[15,]     0        -1    3.141593
[16,]     0        -1    3.141593
[17,]     0        -1    3.141593
[18,]     0        -1    3.141593
[19,]     0        -1    3.141593
[20,]     0        -1    3.141593
[21,]     0        -1    3.141593
[22,]     0        -1    3.141593
[23,]     0        -1    3.141593
[24,]     0        -1    3.141593
[25,]     0        -1    3.141593
[26,]     0        -1    3.141593
[27,]     1         1 1384.000000
[28,]     1         1 1384.000000
[29,]     1         1 1384.000000
[30,]     1         1 1384.000000
[31,]     1         1 1384.000000
[32,]     1         1 1384.000000
[33,]     1         1 1384.000000
[34,]     1         1 1384.000000
[35,]     1         1 1384.000000
[36,]     1         1 1384.000000
[37,]     1         1 1384.000000
[38,]     1         1 1384.000000
[39,]     1         1 1384.000000
[40,]     1         1 1384.000000
[41,]     1         1 1384.000000
[42,]     1         1 1384.000000
[43,]     1         1 1384.000000
[44,]     1         1 1384.000000
[45,]     1         1 1384.000000
[46,]     1         1 1384.000000
[47,]     1         1 1384.000000
[48,]     1         1 1384.000000
[49,]     1         1 1384.000000
[50,]     1         1 1384.000000

# Even though the values of the predictor are totally nonsensical
# (so the intercept and slope have no interesting meaning at all),
# the conditional means for the boys and girls are the same as before,
# and the test on the slope is still equivalent to the t.test() results:

> summary(lm(CTPEA~nonsense))

Call:
lm(formula = CTPEA ~ nonsense)

Residuals:
    Min      1Q  Median      3Q     Max
-22.917  -7.870  -0.917   5.223  35.083

Coefficients:
             Estimate Std. Error t value Pr(>|t|)
(Intercept) 76.707595   2.482287  30.902  < 2e-16 ***
nonsense     0.007376   0.002589   2.849  0.00643 **
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 12.63 on 48 degrees of freedom
Multiple R-squared:  0.1447,    Adjusted R-squared:  0.1269
F-statistic: 8.119 on 1 and 48 DF,  p-value: 0.006434

>
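# (Not run in class; a sketch only.) One way to see why any two-valued coding
# gives the same test: the "nonsense" variable is just a linear recoding of
# the dummy variable, so the two fitted conditional means (and therefore the
# t statistic) cannot change; only the intercept and slope get rescaled.

all.equal(nonsense, pi + (1384 - pi)*CBSEX)  # should be TRUE: same grouping, new labels
10.18590 / (1384 - pi)                       # should reproduce the nonsense slope, about 0.00738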