# We demonstrated the central limit theorem using the class's # mean Raven scores, based on their samples of N=50. # Recall that we are regarding the large Statlab data set # as a population. The mean, variance, and standard deviation # of that population are: > attach(Statlab) > mean(CTRA) [1] 30.95062 > var(CTRA) [1] 99.48791 > sd(CTRA) [1] 9.974362 # Here are the means of class members' samples: > stem(RAmeans) The decimal point is at the | 29 | 44 30 | 017 31 | 38 32 | 01267 33 | 14 # According to the central limit theorem, the mean of those # means should be near 31: > mean(RAmeans) [1] 31.48857 # The variance and standard deviation of the means should be # about 2 and 1.4, respectively: > var(CTRA)/50 [1] 1.989758 > sd(CTRA)/sqrt(50) [1] 1.410588 # And they are indeed near those values: > var(RAmeans) [1] 1.828598 > sd(RAmeans) [1] 1.352257 # We collected estimates of elapsed time, for use next Tuesday: > seconds <- c(20, 14, 25, 18, 15, 23, 30, 13, 40,12,25, 32, 30, 12) # The critical value of a Z statistic is the value that separates # off the bottom and top alpha/2 proportion of the standard normal # curve. A Z statistic that exceeds this value in absolute value # is considered significant. Here, we calculate the critical value # for an alpha level of .05: > qnorm(.025) [1] -1.959964 > qnorm(.975) [1] 1.959964 # Our Z statistic's value was 2.5, so we reject the null hypothesis # and conclude that our treatment does, indeed, increase the mean. # Another option for interpreting the Z statistic is to calculate # an exact p-value. > Z <- 2.5 # The probability of gettting a test statistic below -2.5 if the # null hypothesis is true is: > pnorm(-2.5) [1] 0.006209665 # And the probability of getting a value above 2.5 is: > 1 - pnorm(2.5) [1] 0.006209665 # So, if the null is true, the probability of getting a test statistic # at least as large as 2.5 is the sum of those two: > 2*pnorm(-abs(Z)) [1] 0.01241933 > > > detach(Statlab) > attach(JackStatlab) The following object is masked _by_ .GlobalEnv: X # A stem-and-leaf plot of my Raven sample appears plausibly # normally distributed: > stem(CTRA) The decimal point is 1 digit(s) to the right of the | 1 | 13 1 | 5568 2 | 0011144 2 | 5668899 3 | 01133334444 3 | 5557 4 | 0022234 4 | 557788 5 | 00 # Another way to evaluate normality is to perform a normal quantile-quantile # plot. The idea is that we calculate the quantiles of an idealized sample # of 50 from a normal distribution: > rank <- ((1:50)-1/2)/50 > rank [1] 0.01 0.03 0.05 0.07 0.09 0.11 0.13 0.15 0.17 0.19 0.21 0.23 0.25 0.27 0.29 [16] 0.31 0.33 0.35 0.37 0.39 0.41 0.43 0.45 0.47 0.49 0.51 0.53 0.55 0.57 0.59 [31] 0.61 0.63 0.65 0.67 0.69 0.71 0.73 0.75 0.77 0.79 0.81 0.83 0.85 0.87 0.89 [46] 0.91 0.93 0.95 0.97 0.99 > quantiles <- qnorm(rank) > quantiles [1] -2.32634787 -1.88079361 -1.64485363 -1.47579103 -1.34075503 -1.22652812 [7] -1.12639113 -1.03643339 -0.95416525 -0.87789630 -0.80642125 -0.73884685 [13] -0.67448975 -0.61281299 -0.55338472 -0.49585035 -0.43991317 -0.38532047 [19] -0.33185335 -0.27931903 -0.22754498 -0.17637416 -0.12566135 -0.07526986 [25] -0.02506891 0.02506891 0.07526986 0.12566135 0.17637416 0.22754498 [31] 0.27931903 0.33185335 0.38532047 0.43991317 0.49585035 0.55338472 [37] 0.61281299 0.67448975 0.73884685 0.80642125 0.87789630 0.95416525 [43] 1.03643339 1.12639113 1.22652812 1.34075503 1.47579103 1.64485363 [49] 1.88079361 2.32634787 # Here's why we had to shift things sideways by 1/2: > badrank <- (1:50)/50 > qnorm(badrank) [1] -2.05374891 -1.75068607 -1.55477359 -1.40507156 -1.28155157 -1.17498679 [7] -1.08031934 -0.99445788 -0.91536509 -0.84162123 -0.77219321 -0.70630256 [13] -0.64334541 -0.58284151 -0.52440051 -0.46769880 -0.41246313 -0.35845879 [19] -0.30548079 -0.25334710 -0.20189348 -0.15096922 -0.10043372 -0.05015358 [25] 0.00000000 0.05015358 0.10043372 0.15096922 0.20189348 0.25334710 [31] 0.30548079 0.35845879 0.41246313 0.46769880 0.52440051 0.58284151 [37] 0.64334541 0.70630256 0.77219321 0.84162123 0.91536509 0.99445788 [43] 1.08031934 1.17498679 1.28155157 1.40507156 1.55477359 1.75068607 [49] 2.05374891 Inf # We sort the Raven scores from smallest to largest: > sorted <- sort(CTRA) > sorted [1] 11 13 15 15 16 18 20 20 21 21 21 24 24 25 26 26 28 28 29 29 30 31 31 33 33 [26] 33 33 34 34 34 34 35 35 35 37 40 40 42 42 42 43 44 45 45 47 47 48 48 50 50 # A plot of the sorted scores against the ideal quantiles should fall on a # straight line if the sample came from a normal population: > par(pin=c(6,4)) > plot(quantiles,sorted) # Here's the easy way to do that in R: > qqnorm(CTRA) # And we can add a reference line to help evaluate the plot: > qqline(CTRA) # Here's an example of a Q-Q plot for a highly non-normal distribution: > temp <- rchisq(50,2) > qqnorm(temp); qqline(temp) > stem(temp) The decimal point is at the | 0 | 011112222333444555566899 1 | 01345679 2 | 4466778 3 | 1235 4 | 39 5 | 22 6 | 1 7 | 3 8 | 9 | 3 # Apart from mild departures in the tails consistent with the # idea that the tails are a little light compared to those of # a normal distribution, our Raven Q-Q plot looks pretty good: > qqnorm(CTRA);qqline(CTRA) # Here's how you can build up experience evaluating these plots # for a particular sample size: > temp <- rnorm(50); qqnorm(temp); qqline(temp) > temp <- rnorm(50); qqnorm(temp); qqline(temp) > temp <- rnorm(50); qqnorm(temp); qqline(temp) > temp <- rnorm(50); qqnorm(temp); qqline(temp) > temp <- rnorm(50); qqnorm(temp); qqline(temp) > temp <- rnorm(50); qqnorm(temp); qqline(temp) > temp <- rnorm(50); qqnorm(temp); qqline(temp) # And here's an ideal plot of a very large sample from a # distribution that is truly normal: > temp <- rnorm(50000); qqnorm(temp); qqline(temp)