# CBSEX in my data set is sorted girls (0) first, then boys (1).
# We can find out how many boys there were by summing the variable:

> attach(JackStatlab)
> sum(CBSEX)
[1] 18

# So the first 32 cases are girls, and the remaining 18 are boys:

> 50-18
[1] 32

# The girls' mean is lower than the boys':

> mean(CTPEA[1:32])
[1] 78.75
> mean(CTPEA[33:50])
[1] 82.27778

# We can also get the means this way (which doesn't depend on the
# data being sorted by sex):

> tapply(CTPEA, CBSEX, mean)
       0        1 
78.75000 82.27778 

# Here, we get the variances for both groups:

> tapply(CTPEA, CBSEX, var)
       0        1 
121.4194 219.3889 

# Here's the pooled variance estimate (a weighted average
# of the two variances, weighting by the degrees of freedom):

> s2p <- (31*121.4194 + 17*219.3889) / (31 + 17)
> s2p
[1] 156.1169

# And here's the standard error of the difference between means:

> se <- sqrt(s2p/32 + s2p/18)
> se
[1] 3.681279

# Finally, here is a t statistic comparing the two groups:

> t <- (mean(CTPEA[1:32]) - mean(CTPEA[33:50])) / se
> t
[1] -0.9583021

# The df is (n1-1) + (n2-1), or n1+n2-2:

> 31 + 17
[1] 48

# So here's the critical value of the test:

> qt(.975, 48)
[1] 2.010635

# And here is an exact p-value:

> 2*pt(-abs(t), 48)
[1] 0.3427126

# Notice that the t.test() function gives us a different result.
# That's because, by default, R uses the Welch version of the test,
# also known as the Satterthwaite approximation, which doesn't pool
# the variances:

> t.test(CTPEA[1:32], CTPEA[33:50])

        Welch Two Sample t-test

data:  CTPEA[1:32] and CTPEA[33:50]
t = -0.88242, df = 27.757, p-value = 0.3851
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -11.720188   4.664633
sample estimates:
mean of x mean of y 
 78.75000  82.27778 

> t
[1] -0.9583021

# But we can force R to use the pooled-variance version of the test.
# Note that the value of t and the p-value match our manual results:

> t.test(CTPEA[1:32], CTPEA[33:50], var.equal=TRUE)

        Two Sample t-test

data:  CTPEA[1:32] and CTPEA[33:50]
t = -0.9583, df = 48, p-value = 0.3427
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -10.929485   3.873929
sample estimates:
mean of x mean of y 
 78.75000  82.27778 

# And here's another way to do it, which doesn't rely on the
# data being sorted into two groups:

> t.test(CTPEA~CBSEX, var.equal=TRUE)

        Two Sample t-test

data:  CTPEA by CBSEX
t = -0.9583, df = 48, p-value = 0.3427
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -10.929485   3.873929
sample estimates:
mean in group 0 mean in group 1 
       78.75000        82.27778 
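# A minimal sketch (not part of the original session): the pooled-variance
# calculations above, wrapped in one small function so the arithmetic can be
# reused for any two groups. The function name is my own; the formulas are
# exactly the ones worked out above.

pooled.t <- function(x, y) {
  n1 <- length(x); n2 <- length(y)
  s2p <- ((n1 - 1)*var(x) + (n2 - 1)*var(y)) / (n1 + n2 - 2)  # pooled variance
  se  <- sqrt(s2p/n1 + s2p/n2)        # SE of the difference between means
  t   <- (mean(x) - mean(y)) / se
  df  <- n1 + n2 - 2
  c(t = t, df = df, p = 2*pt(-abs(t), df))
}

# For example, pooled.t(CTPEA[CBSEX == 0], CTPEA[CBSEX == 1]) should reproduce
# the t, df, and p-value shown above.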
# Next, we manually calculate the Satterthwaite approximation, so
# that we can confirm that R's default version of the test matches
# what is in our PowerPoint. First, we'll need the variances of
# the two groups:

> variances <- tapply(CTPEA, CBSEX, var)
> variances
       0        1 
121.4194 219.3889 
> variances[1]
       0 
121.4194 
> variances[2]
       1 
219.3889 

# Here are the sample sizes:

> ns <- tapply(CTPEA, CBSEX, length)
> ns
 0  1 
32 18 

> dfapprox <- ((variances[1]/ns[1])+variances[2]/ns[2])^2 /
+   (((variances[1]/ns[1])^2 / (ns[1]-1)) + ((variances[2]/ns[2])^2 / (ns[2]-1)))

# The approximate df matches what we saw before in R's t.test output:

> dfapprox
       0 
27.75696 

# Here, we calculate the standard error of the difference between
# means using the unpooled variances:

> se <- sqrt(var(CTPEA[1:32])/32 + var(CTPEA[33:50])/18)
> se
[1] 3.997828

# Our t matches R's output:

> newt <- (mean(CTPEA[1:32]) - mean(CTPEA[33:50])) / se
> newt
[1] -0.8824237
> dfapprox
       0 
27.75696 

# As does our p-value:

> 2*pt(-abs(newt), dfapprox)
[1] 0.3851262

# A student asked whether the two versions of the test would
# give identical results if the variability was identical in
# the samples. I created a data set with different means but
# identical standard deviations:

> x1 <- rnorm(25, 50, 10)
> x2 <- rnorm(25, 60, 10)
> sd(x1)
[1] 9.174232
> x2 <- (x2-mean(x2))/sd(x2)
> x2 <- x2*sd(x1) + 60
> mean(x2)
[1] 60
> mean(x1)
[1] 50.84808
> sd(x1)
[1] 9.174232
> sd(x2)
[1] 9.174232

# And the two versions of the test do indeed produce
# identical results:

> t.test(x1,x2, var.equal=
+ TRUE)

        Two Sample t-test

data:  x1 and x2
t = -3.5269, df = 48, p-value = 0.0009367
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -14.369246  -3.934596
sample estimates:
mean of x mean of y 
 50.84808  60.00000 

> t.test(x1,x2)

        Welch Two Sample t-test

data:  x1 and x2
t = -3.5269, df = 48, p-value = 0.0009367
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -14.369246  -3.934596
sample estimates:
mean of x mean of y 
 50.84808  60.00000 
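# A minimal sketch (my own helper, not part of the original session) of the
# rescaling trick used just above: standardize a vector, then give it exactly
# the mean and sd we want. The function name and arguments are made up for
# illustration.

match.sd <- function(x, new.mean, new.sd) {
  z <- (x - mean(x)) / sd(x)   # standardize to mean 0, sd 1
  z * new.sd + new.mean        # rescale to the target mean and sd
}

# For example, x2 <- match.sd(x2, 60, sd(x1)) does the same thing as the two
# lines used above.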
# We check the assumption that population variances are the
# same by comparing the sample standard deviations:

> tapply(CTPEA, CBSEX, sd)
       0        1 
11.01905 14.81178 

# Those look pretty similar. A pooled sd would be about 12.5:

> s2p
[1] 156.1169
> sqrt(s2p)
[1] 12.49468

# By taking a few samples of size 18 from a normal distribution with that sd,
# it's easy to convince ourselves that the observed sd of 14.8 for the boys'
# group is quite consistent with a true sd of 12.5:

> sd(rnorm(18, 0, sqrt(s2p)))
[1] 11.04908
> sd(rnorm(18, 0, sqrt(s2p)))
[1] 16.17077
> sd(rnorm(18, 0, sqrt(s2p)))
[1] 13.65993
> sd(rnorm(18, 0, sqrt(s2p)))
[1] 12.71699
> sd(rnorm(18, 0, sqrt(s2p)))
[1] 15.56267

# We can compare the distributions visually:

> boxplot(CTPEA~CBSEX)

# And we can evaluate the normality of each population by
# checking the normality of the samples:

> par(mfrow=c(2,1))
> tapply(CTPEA,CBSEX,qqnorm)
$`0`
$`0`$x
 [1]  0.19709908  1.22985876  0.53340971 -0.62609901 -0.72451438 -2.15387469
 [7]  0.72451438 -1.41779714  0.27769044  1.41779714 -1.22985876  0.03917609
[13]  0.94678176 -0.53340971 -1.67593972 -1.07751557  0.62609901 -0.27769044
[19]  0.44509652  0.36012989 -0.11776987 -0.19709908  2.15387469 -0.44509652
[25] -0.03917609 -0.94678176 -0.36012989  0.83051088  0.11776987  1.07751557
[31]  1.67593972 -0.83051088

$`0`$y
 [1]  81  94  84  74  71  59  86  63  81 100  63  78  87  75  62  64  85  76  83
[20]  82  77  76 102  75  77  68  75  86  78  88 100  70


$`1`
$`1`$x
 [1]  0.35549042 -0.50848806  1.91450583  0.06968492 -0.86163412  0.86163412
 [7] -0.21042839  1.38299413 -0.35549042  0.67448975 -1.91450583 -1.38299413
[13]  1.08532491 -0.67448975 -0.06968492  0.21042839 -1.08532491  0.50848806

$`1`$y
 [1]  85  74 123  81  69  94  78  99  77  93  65  65  94  69  78  82  66  89

> 
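# A minimal sketch (not part of the original session): the same per-group
# normality check, but adding a reference line to each Q-Q plot and avoiding
# the long printout of coordinates that tapply() returns. The loop below is my
# own; qqnorm() and qqline() are standard R functions.

groups <- split(CTPEA, CBSEX)
par(mfrow = c(2, 1))
for (g in names(groups)) {
  qqnorm(groups[[g]], main = paste("CBSEX =", g))  # Q-Q plot for this group
  qqline(groups[[g]])                              # reference line through the quartiles
}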