# We demonstrated the central limit theorem using the class's
# mean Raven scores, based on their samples of N=50.

# Recall that we are regarding the large Statlab data set
# as a population. The mean, variance, and standard deviation
# of that population are:

> attach(Statlab)
> mean(CTRA)
[1] 30.95062
> var(CTRA)
[1] 99.48791
> sd(CTRA)
[1] 9.974362

# Here are the means of class members' samples:

> stem(RAmeans)

  The decimal point is at the |

  29 | 44
  30 | 017
  31 | 38
  32 | 01267
  33 | 14

# According to the central limit theorem, the mean of those
# means should be near 31:

> mean(RAmeans)
[1] 31.48857

# The variance and standard deviation of the means should be
# about 2 and 1.4, respectively:

> var(CTRA)/50
[1] 1.989758
> sd(CTRA)/sqrt(50)
[1] 1.410588

# And they are indeed near those values:

> var(RAmeans)
[1] 1.828598
> sd(RAmeans)
[1] 1.352257

# We collected estimates of elapsed time, for use next Tuesday:

> seconds <- c(20, 14, 25, 18, 15, 23, 30, 13, 40,12,25, 32, 30, 12)
 
# The critical value of a Z statistic is the value that separates
# off the bottom and top alpha/2 proportion of the standard normal
# curve. A Z statistic that exceeds this value in absolute value
# is considered significant. Here, we calculate the critical value
# for an alpha level of .05:

> qnorm(.025)
[1] -1.959964
> qnorm(.975)
[1] 1.959964
 
# Our Z statistic's value was 2.5, so we reject the null hypothesis
# and conclude that our treatment does, indeed, increase the mean.

# Another option for interpreting the Z statistic is to calculate
# an exact p-value.

> Z <- 2.5

# The probability of gettting a test statistic below -2.5 if the
# null hypothesis is true is:

> pnorm(-2.5)
[1] 0.006209665

# And the probability of getting a value above 2.5 is:

> 1 - pnorm(2.5)
[1] 0.006209665

# So, if the null is true, the probability of getting a test statistic
# at least as large as 2.5 is the sum of those two:

> 2*pnorm(-abs(Z))
[1] 0.01241933
> 
> 
> detach(Statlab)
> attach(JackStatlab)
The following object is masked _by_ .GlobalEnv:

    X

# A stem-and-leaf plot of my Raven sample appears plausibly
# normally distributed:

> stem(CTRA)

  The decimal point is 1 digit(s) to the right of the |

  1 | 13
  1 | 5568
  2 | 0011144
  2 | 5668899
  3 | 01133334444
  3 | 5557
  4 | 0022234
  4 | 557788
  5 | 00

# Another way to evaluate normality is to perform a normal quantile-quantile
# plot. The idea is that we calculate the quantiles of an idealized sample
# of 50 from a normal distribution: 
> rank <- ((1:50)-1/2)/50
> rank
 [1] 0.01 0.03 0.05 0.07 0.09 0.11 0.13 0.15 0.17 0.19 0.21 0.23 0.25 0.27 0.29
[16] 0.31 0.33 0.35 0.37 0.39 0.41 0.43 0.45 0.47 0.49 0.51 0.53 0.55 0.57 0.59
[31] 0.61 0.63 0.65 0.67 0.69 0.71 0.73 0.75 0.77 0.79 0.81 0.83 0.85 0.87 0.89
[46] 0.91 0.93 0.95 0.97 0.99
> quantiles <- qnorm(rank)
> quantiles
 [1] -2.32634787 -1.88079361 -1.64485363 -1.47579103 -1.34075503 -1.22652812
 [7] -1.12639113 -1.03643339 -0.95416525 -0.87789630 -0.80642125 -0.73884685
[13] -0.67448975 -0.61281299 -0.55338472 -0.49585035 -0.43991317 -0.38532047
[19] -0.33185335 -0.27931903 -0.22754498 -0.17637416 -0.12566135 -0.07526986
[25] -0.02506891  0.02506891  0.07526986  0.12566135  0.17637416  0.22754498
[31]  0.27931903  0.33185335  0.38532047  0.43991317  0.49585035  0.55338472
[37]  0.61281299  0.67448975  0.73884685  0.80642125  0.87789630  0.95416525
[43]  1.03643339  1.12639113  1.22652812  1.34075503  1.47579103  1.64485363
[49]  1.88079361  2.32634787

# Here's why we had to shift things sideways by 1/2:

> badrank <- (1:50)/50
> qnorm(badrank)
 [1] -2.05374891 -1.75068607 -1.55477359 -1.40507156 -1.28155157 -1.17498679
 [7] -1.08031934 -0.99445788 -0.91536509 -0.84162123 -0.77219321 -0.70630256
[13] -0.64334541 -0.58284151 -0.52440051 -0.46769880 -0.41246313 -0.35845879
[19] -0.30548079 -0.25334710 -0.20189348 -0.15096922 -0.10043372 -0.05015358
[25]  0.00000000  0.05015358  0.10043372  0.15096922  0.20189348  0.25334710
[31]  0.30548079  0.35845879  0.41246313  0.46769880  0.52440051  0.58284151
[37]  0.64334541  0.70630256  0.77219321  0.84162123  0.91536509  0.99445788
[43]  1.08031934  1.17498679  1.28155157  1.40507156  1.55477359  1.75068607
[49]  2.05374891         Inf

# We sort the Raven scores from smallest to largest:

> sorted <- sort(CTRA)
> sorted
 [1] 11 13 15 15 16 18 20 20 21 21 21 24 24 25 26 26 28 28 29 29 30 31 31 33 33
[26] 33 33 34 34 34 34 35 35 35 37 40 40 42 42 42 43 44 45 45 47 47 48 48 50 50

# A plot of the sorted scores against the ideal quantiles should fall on a
# straight line if the sample came from a normal population:

> par(pin=c(6,4))
> plot(quantiles,sorted)

# Here's the easy way to do that in R:

> qqnorm(CTRA)

# And we can add a reference line to help evaluate the plot:

> qqline(CTRA)

# Here's an example of a Q-Q plot for a highly non-normal distribution:

> temp <- rchisq(50,2)
> qqnorm(temp); qqline(temp)
> stem(temp)

  The decimal point is at the |

  0 | 011112222333444555566899
  1 | 01345679
  2 | 4466778
  3 | 1235
  4 | 39
  5 | 22
  6 | 1
  7 | 3
  8 | 
  9 | 3

# Apart from mild departures in the tails consistent with the
# idea that the tails are a little light compared to those of
# a normal distribution, our Raven Q-Q plot looks pretty good:

> qqnorm(CTRA);qqline(CTRA)

# Here's how you can build up experience evaluating these plots
# for a particular sample size:

> temp <- rnorm(50); qqnorm(temp); qqline(temp)
> temp <- rnorm(50); qqnorm(temp); qqline(temp)
> temp <- rnorm(50); qqnorm(temp); qqline(temp)
> temp <- rnorm(50); qqnorm(temp); qqline(temp)
> temp <- rnorm(50); qqnorm(temp); qqline(temp)
> temp <- rnorm(50); qqnorm(temp); qqline(temp)
> temp <- rnorm(50); qqnorm(temp); qqline(temp)

# And here's an ideal plot of a very large sample from a
# distribution that is truly normal:

> temp <- rnorm(50000); qqnorm(temp); qqline(temp)