# Today we demonstrated the calculation of confidence intervals for # a regression slope. We pasted one of the slopes and its standard # error from the grades regression output in SAS: > 0.98784561 0.36088447 Error: unexpected numeric constant in " 0.98784561 0.36088447" # The regression had 97 degrees of freedom; we found the correct # critical value of t: > qt(.975, 97) [1] 1.984723 # Then we calculated the slope plus or minus its standard error # times the critical t value: > 0.98784561 + 0.36088447*1.984723 [1] 1.704101 > 0.98784561 - 0.36088447*1.984723 [1] 0.2715899 # Next, we demonstrated the performance of such confidence intervals # using a simulation. We generate data to fit a simple regression with # a slope of 20: > x <- rnorm(100) > y <- 10 + 20*x > y <- y + rnorm(100, 0, 40) # We calculate a confidence interval. This time, df = 98: we still have 100 cases, # but only one predictor plus the intercept, so df = 100 - 2 = 98. > tcrit <- qt(.975, 98) > regout <- lm(y~x) > se <- sqrt(var(regout$res)/98/var(x)) > low <- regout$coef[2] - se*tcrit > high <- regout$coef[2] + se*tcrit > low x 19.72490 > high x 38.03909 # Note that this particular confidence interval was successful, inasmuch as # it contained the true value of the slope. # Next, we simulated a set of 100 such intervals, producing graphics so that # the intervals that missed are color coded red. 
We repeated the exercise # several times, and noted that we tended to get about 5 misses in each # set of 100 intervals: > plot( c(0,40), c(0,101), type='n', axes=FALSE, xlab="", ylab="") > axis(side=1, at=seq(0,40,5)) > lines(c(20,20),c(0,101)) > > for( i in 1:100) { + par(col=1) + x <- rnorm(100) + y <- 10 + 20*x + y <- y + rnorm(100,0,40) + regout <- lm(y~x) + se <- sqrt(var(regout$res)/98/var(x)) + low <- regout$coef[2] - tcrit*se + high <- regout$coef[2] + tcrit*se + if(low > 20) par(col=2) + if(high < 20) par(col=2) + lines(c(low,high),c(i,i)) + } > plot( c(0,40), c(0,101), type='n', axes=FALSE, xlab="", ylab="") > axis(side=1, at=seq(0,40,5)) > lines(c(20,20),c(0,101)) > > for( i in 1:100) { + par(col=1) + x <- rnorm(100) + y <- 10 + 20*x + y <- y + rnorm(100,0,40) + regout <- lm(y~x) + se <- sqrt(var(regout$res)/98/var(x)) + low <- regout$coef[2] - tcrit*se + high <- regout$coef[2] + tcrit*se + if(low > 20) par(col=2) + if(high < 20) par(col=2) + lines(c(low,high),c(i,i)) + } > plot( c(0,40), c(0,101), type='n', axes=FALSE, xlab="", ylab="") > axis(side=1, at=seq(0,40,5)) > lines(c(20,20),c(0,101)) > > for( i in 1:100) { + par(col=1) + x <- rnorm(100) + y <- 10 + 20*x + y <- y + rnorm(100,0,40) + regout <- lm(y~x) + se <- sqrt(var(regout$res)/98/var(x)) + low <- regout$coef[2] - tcrit*se + high <- regout$coef[2] + tcrit*se + if(low > 20) par(col=2) + if(high < 20) par(col=2) + lines(c(low,high),c(i,i)) + } > plot( c(0,40), c(0,101), type='n', axes=FALSE, xlab="", ylab="") > axis(side=1, at=seq(0,40,5)) > lines(c(20,20),c(0,101)) > > for( i in 1:100) { + par(col=1) + x <- rnorm(100) + y <- 10 + 20*x + y <- y + rnorm(100,0,40) + regout <- lm(y~x) + se <- sqrt(var(regout$res)/98/var(x)) + low <- regout$coef[2] - tcrit*se + high <- regout$coef[2] + tcrit*se + if(low > 20) par(col=2) + if(high < 20) par(col=2) + lines(c(low,high),c(i,i)) + } > plot( c(0,40), c(0,101), type='n', axes=FALSE, xlab="", ylab="") > axis(side=1, at=seq(0,40,5)) > 
lines(c(20,20),c(0,101)) > > for( i in 1:100) { + par(col=1) + x <- rnorm(100) + y <- 10 + 20*x + y <- y + rnorm(100,0,40) + regout <- lm(y~x) + se <- sqrt(var(regout$res)/98/var(x)) + low <- regout$coef[2] - tcrit*se + high <- regout$coef[2] + tcrit*se + if(low > 20) par(col=2) + if(high < 20) par(col=2) + lines(c(low,high),c(i,i)) + } # Next, we demonstrated how the standard error of a conditional mean # is assembled from the variances and covariances of the (X'X)^-1 matrix, # using the formula for the variance of a linear combination of # random variables: > se <- sqrt(50.290426 * ( + 1^2 * .54593567 + 13^2 * 0.0029356502 + + 2^2 * 0.0025897097 + + 2 * (1 * 13 * -0.037302111 + + 1*2 * -0.002472899 + + 13 * 2 * -0.000763273 + ))) > se [1] 1.287911 # We calculated the predicted value by pasting in the intercept # and slopes from SAS: > pred <- 63.22702455 > pred <- pred + 13 * 0.87062304 > pred <- pred + 2* 0.98784561 > pred [1] 76.52082 > tcrit <- qt(.975,97) > pred - tcrit*se [1] 73.96467 > pred + tcrit*se [1] 79.07696 # Note that those values match the numbers produced by SAS. (This transcript corrects a false # start caused by a misplaced parenthesis in the standard error calculation.)