# Last time, we calculated a t test manually: > length(seconds) [1] 14 > t <- (mean(seconds)-23) / (sd(seconds)/sqrt(14)) > t [1] -0.3991615 > 2*pt(-abs(t), 13) [1] 0.6962541 # R has a built-in t test function that will do this for us. # By default, it tests the null hypothesis that mu = 0, so it # doesn't give us the same result... > t.test(seconds) One Sample t-test data: seconds t = 9.4878, df = 13, p-value = 3.3e-07 alternative hypothesis: true mean is not equal to 0 95 percent confidence interval: 17.04575 27.09711 sample estimates: mean of x 22.07143 # ...but we can specify a particular null hypothesis and get # exactly the same result that we did from our manual calculation: > t.test(seconds, mu=23) One Sample t-test data: seconds t = -0.39916, df = 13, p-value = 0.6963 alternative hypothesis: true mean is not equal to 23 95 percent confidence interval: 17.04575 27.09711 sample estimates: mean of x 22.07143 # Note that the t.test() function also, by default, gives us # a 95% confidence interval for the mean. You can think of the # confidence interval as defining the range of possible null # hypotheses that would not be rejected. 
So if we test a null # hypothesis that is just outside that range, it will be significant: > t.test(seconds, mu=17) One Sample t-test data: seconds t = 2.18, df = 13, p-value = 0.04823 alternative hypothesis: true mean is not equal to 17 95 percent confidence interval: 17.04575 27.09711 sample estimates: mean of x 22.07143 > t.test(seconds, mu=27.2) One Sample t-test data: seconds t = -2.2046, df = 13, p-value = 0.04611 alternative hypothesis: true mean is not equal to 27.2 95 percent confidence interval: 17.04575 27.09711 sample estimates: mean of x 22.07143 # If we test a null hypothesis that is inside the range of # the confidence interval, it will be non-significant: > t.test(seconds, mu=18) One Sample t-test data: seconds t = 1.7502, df = 13, p-value = 0.1036 alternative hypothesis: true mean is not equal to 18 95 percent confidence interval: 17.04575 27.09711 sample estimates: mean of x 22.07143 # And if we test null hypotheses that are exactly at the # edges of the interval, the p-value will be exactly at # the decision boundary: > t.test(seconds, mu=17.04575) One Sample t-test data: seconds t = 2.1604, df = 13, p-value = 0.05 alternative hypothesis: true mean is not equal to 17.04575 95 percent confidence interval: 17.04575 27.09711 sample estimates: mean of x 22.07143 # The confidence interval can be thought of as providing a # range of reasonable values for the mean. Here's how to # calculate one manually (we had already calculated the # standard error last class): > se [1] 2.326305 > qt(.975, 13) -> tcrit > tcrit [1] 2.160369 > mean(seconds) - tcrit*se [1] 17.04575 > mean(seconds) + tcrit*se [1] 27.09711 # Here's how we could do a 99% interval. 
We calculate # tcrit assuming an alpha level of .01: > tcrit <- qt(.995, 13) > mean(seconds) - tcrit*se [1] 15.06396 > mean(seconds) + tcrit*se [1] 29.0789 # And here's how to make the t.test() function do the # 99% interval: > t.test(seconds, mu=23, conf.level=.99) One Sample t-test data: seconds t = -0.39916, df = 13, p-value = 0.6963 alternative hypothesis: true mean is not equal to 23 99 percent confidence interval: 15.06396 29.07890 sample estimates: mean of x 22.07143 # Here is a little simulation that illustrates the idea that # in the long run, confidence intervals will capture the true # mean (50 in this case) 100*(1-alpha) percent of the time # (95%, in this case). Intervals that capture the true mean # are plotted in black; intervals that miss are plotted in red. # When we did this in class, 94 of the 100 intervals captured # the true mean (which is about 95%). If you repeat this at home, # you will get a different result, but in the long run, 95% of # the intervals should succeed. > par(pin=c(6,4)) > plot(c(40,60), c(0,101), type='n', xlab="",ylab="",axes=F) > tcrit <- qt(.975, 19) > tcrit [1] 2.093024 > abline(v=50) > for(i in 1:100) { + x <- rnorm(20, 50, 10) + se <- sd(x)/sqrt(20) + low <- mean(x) - tcrit*se + high <- mean(x) + tcrit*se + gotit <- 1 + if(50 < low) gotit <- 0 + if(50 > high) gotit <- 0 + par(col=1) + if(gotit==0) par(col=2) + lines(c(low,high), c(i,i)) + Sys.sleep(1/2) + } # Here, we illustrate the idea of bootstrapping, starting with a # case where we actually know the theoretical sampling distribution # so that we can compare our conventional results to our bootstrapped # results.
> output <- NULL > output NULL > attach(JackStatlab) The following object is masked _by_ .GlobalEnv: X # We take a large number of samples of N=50 from our sample of N=50, # sampling with replacement, and calculate the mean each time: > for(i in 1:100000) output[i] <- mean(sample(CTRA, 50, replace=TRUE)) # The resulting empirical sampling distribution of the mean looks normal, # which is not surprising, as that is what theory predicts: > hist(output) > qqnorm(output); qqline(output) # Here's a 95% confidence interval based on the theoretical sampling # distribution of the mean: > t.test(CTRA) One Sample t-test data: CTRA t = 21.423, df = 49, p-value < 2.2e-16 alternative hypothesis: true mean is not equal to 0 95 percent confidence interval: 29.08892 35.11108 sample estimates: mean of x 32.1 # And here is a confidence interval based on our bootstrapped # sampling distribution. Note that the result is almost the same: > quantile(output, .025) 2.5% 29.18 > quantile(output, .975) 97.5% 35 # Here's a statistic for which we don't have sampling theory: > pskew(CTRA) [1] -0.254836 # We can still get an estimate of the sampling distribution # by bootstrapping: > for(i in 1:100000) output[i] <- pskew(sample(CTRA, 50, replace=TRUE)) > hist(output) # Here's a 95% confidence interval for Pearson's index, based on the # bootstrapped sampling distribution: > quantile(output, .025); quantile(output, .975) 2.5% -0.6816927 97.5% 0.4878823 # Note that the interval includes zero, so it is perfectly plausible # that the population distribution is not skewed, even though the skew # index of my sample was negative. > stem(CTRA) The decimal point is 1 digit(s) to the right of the | 1 | 13 1 | 5568 2 | 0011144 2 | 5668899 3 | 01133334444 3 | 5557 4 | 0022234 4 | 557788 5 | 00 # The idea of bootstrapping could also be used for a statistic # that DOES have relevant sampling theory but for which the # assumptions might not be met. 
For example, our time distribution # doesn't look particularly normal: > stem(seconds) The decimal point is 1 digit(s) to the right of the | 1 | 223458 2 | 0355 3 | 002 4 | 0 # Here's the 95% CI using conventional sampling theory: > t.test(seconds) One Sample t-test data: seconds t = 9.4878, df = 13, p-value = 3.3e-07 alternative hypothesis: true mean is not equal to 0 95 percent confidence interval: 17.04575 27.09711 sample estimates: mean of x 22.07143 # Here's one based on a bootstrapped sampling distribution. The # results are pretty similar: > for(i in 1:100000) output[i] <- mean(sample(seconds, 14, replace=TRUE)) > quantile(output, .025); quantile(output, .975) 2.5% 17.85714 97.5% 26.57143 >