# CBSEX in my data set is sorted girls (0) first, then boys (1).
# We can find out how many boys there were by summing the variable:

> attach(JackStatlab)
> sum(CBSEX)
[1] 18

# So the first 32 cases are girls, and the remaining 18 are boys:

> 50-18
[1] 32

# The girls' mean is lower than the boys':

> mean(CTPEA[1:32])
[1] 78.75
> mean(CTPEA[33:50])
[1] 82.27778

# We can also get the means this way (which doesn't depend on the
# data being sorted by sex):

> tapply(CTPEA, CBSEX, mean)
       0        1 
78.75000 82.27778 

# Here, we get the variances for both groups:

> tapply(CTPEA, CBSEX, var)
       0        1 
121.4194 219.3889 

# Here's the pooled variance estimate (a weighted average
# of the two variances, weighting by the degrees of freedom):

> s2p <- (31*121.4194 + 17*219.3889) / (31 + 17)
> s2p
[1] 156.1169

# And here's the standard error of the difference between means:

> se <- sqrt(s2p/32 + s2p/18)
> se
[1] 3.681279

# Finally, here is a t statistic comparing the two groups:

> t <- (mean(CTPEA[1:32]) - mean(CTPEA[33:50])) / se
> t
[1] -0.9583021

# The df is (n1-1) + (n2-1), or n1+n2-2:

> 31 + 17
[1] 48

# So here's the critical value of the test:

> qt(.975, 48)
[1] 2.010635

# And here is an exact p-value:

> 2*pt(-abs(t), 48)
[1] 0.3427126

# Notice that the t.test() function gives us a different result.
# That's because, by default, R uses the Welch version of the test,
# also known as the Satterthwaite approximation, which doesn't pool
# the variances:

> t.test(CTPEA[1:32], CTPEA[33:50])

        Welch Two Sample t-test

data:  CTPEA[1:32] and CTPEA[33:50]
t = -0.88242, df = 27.757, p-value = 0.3851
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -11.720188   4.664633
sample estimates:
mean of x mean of y 
 78.75000  82.27778 

> t
[1] -0.9583021

# But we can force R to use the pooled-variance version of the test.
# Note that the value of t and the p-value match our manual results:

> t.test(CTPEA[1:32], CTPEA[33:50], var.equal=TRUE)

        Two Sample t-test

data:  CTPEA[1:32] and CTPEA[33:50]
t = -0.9583, df = 48, p-value = 0.3427
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -10.929485   3.873929
sample estimates:
mean of x mean of y 
 78.75000  82.27778 

# And here's another way to do it, which doesn't rely on the
# data being sorted into two groups:

> t.test(CTPEA~CBSEX, var.equal=TRUE)

        Two Sample t-test

data:  CTPEA by CBSEX
t = -0.9583, df = 48, p-value = 0.3427
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -10.929485   3.873929
sample estimates:
mean in group 0 mean in group 1 
       78.75000        82.27778 
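# A minimal sketch (not part of the original session): the pooled-variance
# calculations above, wrapped in one small function so the arithmetic can be
# reused for any two groups. The function name is my own; the formulas are
# exactly the ones worked out above.

pooled.t <- function(x, y) {
  n1 <- length(x); n2 <- length(y)
  s2p <- ((n1 - 1)*var(x) + (n2 - 1)*var(y)) / (n1 + n2 - 2)  # pooled variance
  se  <- sqrt(s2p/n1 + s2p/n2)        # SE of the difference between means
  t   <- (mean(x) - mean(y)) / se
  df  <- n1 + n2 - 2
  c(t = t, df = df, p = 2*pt(-abs(t), df))
}

# For example, pooled.t(CTPEA[CBSEX == 0], CTPEA[CBSEX == 1]) should reproduce
# the t, df, and p-value shown above.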
# Next, we manually calculate the Satterthwaite approximation, so
# that we can confirm that R's default version of the test matches
# what is in our PowerPoint. First, we'll need the variances of
# the two groups:

> variances <- tapply(CTPEA, CBSEX, var)
> variances
       0        1 
121.4194 219.3889 
> variances[1]
       0 
121.4194 
> variances[2]
       1 
219.3889 

# Here are the sample sizes:

> ns <- tapply(CTPEA, CBSEX, length)
> ns
 0  1 
32 18 

> dfapprox <- ((variances[1]/ns[1])+variances[2]/ns[2])^2 /
+   (((variances[1]/ns[1])^2 / (ns[1]-1)) + ((variances[2]/ns[2])^2 / (ns[2]-1)))

# The approximate df matches what we saw before in R's t.test output:

> dfapprox
       0 
27.75696 

# Here, we calculate the standard error of the difference between
# means using the unpooled variances:

> se <- sqrt(var(CTPEA[1:32])/32 + var(CTPEA[33:50])/18)
> se
[1] 3.997828

# Our t matches R's output:

> newt <- (mean(CTPEA[1:32]) - mean(CTPEA[33:50])) / se
> newt
[1] -0.8824237
> dfapprox
       0 
27.75696 

# As does our p-value:

> 2*pt(-abs(newt), dfapprox)
[1] 0.3851262

# A student asked whether the two versions of the test would
# give identical results if the variability was identical in
# the samples. I created a data set with different means but
# identical standard deviations:

> x1 <- rnorm(25, 50, 10)
> x2 <- rnorm(25, 60, 10)
> sd(x1)
[1] 9.174232
> x2 <- (x2-mean(x2))/sd(x2)
> x2 <- x2*sd(x1) + 60
> mean(x2)
[1] 60
> mean(x1)
[1] 50.84808
> sd(x1)
[1] 9.174232
> sd(x2)
[1] 9.174232

# And the two versions of the test do indeed produce
# identical results:

> t.test(x1,x2, var.equal=
+ TRUE)

        Two Sample t-test

data:  x1 and x2
t = -3.5269, df = 48, p-value = 0.0009367
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -14.369246  -3.934596
sample estimates:
mean of x mean of y 
 50.84808  60.00000 

> t.test(x1,x2)

        Welch Two Sample t-test

data:  x1 and x2
t = -3.5269, df = 48, p-value = 0.0009367
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -14.369246  -3.934596
sample estimates:
mean of x mean of y 
 50.84808  60.00000 
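# A minimal sketch (my own helper, not part of the original session) of the
# rescaling trick used just above: standardize a vector, then give it exactly
# the mean and sd we want. The function name and arguments are made up for
# illustration.

match.sd <- function(x, new.mean, new.sd) {
  z <- (x - mean(x)) / sd(x)   # standardize to mean 0, sd 1
  z * new.sd + new.mean        # rescale to the target mean and sd
}

# For example, x2 <- match.sd(x2, 60, sd(x1)) does the same thing as the two
# lines used above.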
# We check the assumption that population variances are the
# same by comparing the sample standard deviations:

> tapply(CTPEA, CBSEX, sd)
       0        1 
11.01905 14.81178 

# Those look pretty similar. A pooled sd would be about 12.5:

> s2p
[1] 156.1169
> sqrt(s2p)
[1] 12.49468

# By taking a few samples of size 18 from a normal distribution with that sd,
# it's easy to convince ourselves that the observed sd of 14.8 for the boys'
# group is quite consistent with a true sd of 12.5:

> sd(rnorm(18, 0, sqrt(s2p)))
[1] 11.04908
> sd(rnorm(18, 0, sqrt(s2p)))
[1] 16.17077
> sd(rnorm(18, 0, sqrt(s2p)))
[1] 13.65993
> sd(rnorm(18, 0, sqrt(s2p)))
[1] 12.71699
> sd(rnorm(18, 0, sqrt(s2p)))
[1] 15.56267

# We can compare the distributions visually:

> boxplot(CTPEA~CBSEX)

# And we can evaluate the normality of each population by
# checking the normality of the samples:

> par(mfrow=c(2,1))
> tapply(CTPEA,CBSEX,qqnorm)
$`0`
$`0`$x
 [1]  0.19709908  1.22985876  0.53340971 -0.62609901 -0.72451438 -2.15387469
 [7]  0.72451438 -1.41779714  0.27769044  1.41779714 -1.22985876  0.03917609
[13]  0.94678176 -0.53340971 -1.67593972 -1.07751557  0.62609901 -0.27769044
[19]  0.44509652  0.36012989 -0.11776987 -0.19709908  2.15387469 -0.44509652
[25] -0.03917609 -0.94678176 -0.36012989  0.83051088  0.11776987  1.07751557
[31]  1.67593972 -0.83051088

$`0`$y
 [1]  81  94  84  74  71  59  86  63  81 100  63  78  87  75  62  64  85  76  83
[20]  82  77  76 102  75  77  68  75  86  78  88 100  70


$`1`
$`1`$x
 [1]  0.35549042 -0.50848806  1.91450583  0.06968492 -0.86163412  0.86163412
 [7] -0.21042839  1.38299413 -0.35549042  0.67448975 -1.91450583 -1.38299413
[13]  1.08532491 -0.67448975 -0.06968492  0.21042839 -1.08532491  0.50848806

$`1`$y
 [1]  85  74 123  81  69  94  78  99  77  93  65  65  94  69  78  82  66  89

> 
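# A minimal sketch (not part of the original session): the same per-group
# normality check, but adding a reference line to each Q-Q plot and avoiding
# the long printout of coordinates that tapply() returns. The loop below is my
# own; qqnorm() and qqline() are standard R functions.

groups <- split(CTPEA, CBSEX)
par(mfrow = c(2, 1))
for (g in names(groups)) {
  qqnorm(groups[[g]], main = paste("CBSEX =", g))  # Q-Q plot for this group
  qqline(groups[[g]])                              # reference line through the quartiles
}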