# Last time, we considered a multiple regression that was
# significant overall, but in which both predictors were nonsignificant:

> attach(JackStatlab)
> summary(lm(CTPEA~FIT+MBAG))

Call:
lm(formula = CTPEA ~ FIT + MBAG)

Residuals:
    Min      1Q  Median      3Q     Max 
-22.468  -8.929   1.052   6.418  29.929 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 55.30811    9.86828   5.605 1.06e-06 ***
FIT          0.04940    0.02549   1.938   0.0587 .  
MBAG         0.72118    0.37013   1.948   0.0573 .  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 12.64 on 47 degrees of freedom
Multiple R-squared:  0.1612,	Adjusted R-squared:  0.1255 
F-statistic: 4.515 on 2 and 47 DF,  p-value: 0.01609

# One way of addressing such a situation is to attempt to create
# a composite variable that captures most of the variability associated
# with the multiple variables. Principal components analysis is one
# such method. It seeks the linear combination of the variables that
# maximizes the variability accounted for; then it seeks further such
# combinations that maximize the remaining variability while being
# uncorrelated with the earlier component(s). There can be as many
# components as there are variables, so here the principal components
# analysis gives us two principal components, and we hope that the
# first one captures most of the variability.

# Here's the analysis:

> components <- princomp(cbind(FIT,MBAG))
> head(components$scores)
         Comp.1    Comp.2
[1,] -53.280640 12.117400
[2,]  21.782498  6.967123
[3,] -43.202010  5.231094
[4,]  -7.204318  5.638776
[5,] -57.189788  4.072615
[6,]  19.805275  4.944602

# Note that the two sets of principal components scores are
# perfectly uncorrelated:

> cor(components$scores)
             Comp.1       Comp.2
Comp.1 1.000000e+00 1.634274e-17
Comp.2 1.634274e-17 1.000000e+00

# If we regress on the two components, the overall regression
# accounts for exactly the same variation as before (because
# the component scores are just linear recombinations of the
# variables that were on the right hand side of the regression
# equation):

> summary(lm(CTPEA~components$scores[,1]+components$scores[,2]))

Call:
lm(formula = CTPEA ~ components$scores[, 1] + components$scores[, 2])

Residuals:
    Min      1Q  Median      3Q     Max 
-22.468  -8.929   1.052   6.418  29.929 

Coefficients:
                       Estimate Std. Error t value Pr(>|t|)    
(Intercept)            81.62000    1.78736  45.665   <2e-16 ***
components$scores[, 1]  0.05757    0.02515   2.289   0.0266 *  
components$scores[, 2] -0.72058    0.37015  -1.947   0.0576 .  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 12.64 on 47 degrees of freedom
Multiple R-squared:  0.1612,	Adjusted R-squared:  0.1255 
F-statistic: 4.515 on 2 and 47 DF,  p-value: 0.01609

# Note also that the first principal component IS significant
# in the regression. With any luck, it can represent a general
# socioeconomic status variable, and the second component won't
# be needed. However, in this particular case, much of the
# variance accounted for by the regression is still associated
# with the second component. Note that when we use only the
# first principal component, the multiple R^2 drops from .16
# to .09:

> summary(lm(CTPEA~components$scores[,1]))

Call:
lm(formula = CTPEA ~ components$scores[, 1])

Residuals:
     Min       1Q   Median       3Q      Max 
-23.2996 -10.0820  -0.6279   8.4220  29.6663 

Coefficients:
                       Estimate Std. Error t value Pr(>|t|)    
(Intercept)            81.62000    1.83856  44.393   <2e-16 ***
components$scores[, 1]  0.05757    0.02587   2.225   0.0308 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 13 on 48 degrees of freedom
Multiple R-squared:  0.09352,	Adjusted R-squared:  0.07464 
F-statistic: 4.952 on 1 and 48 DF,  p-value: 0.03079
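# Aside (a minimal sketch, not part of the original session): princomp()
# stores the weights that define the components in its loadings element,
# and each score is just the mean-centered predictors multiplied by those
# weights. The scratch names L and centered below are illustrative only;
# the result should reproduce components$scores:

> L <- unclass(loadings(components))                  # 2 x 2 weight matrix
> centered <- scale(cbind(FIT, MBAG), center=TRUE, scale=FALSE)
> head(centered %*% L)                                # compare with head(components$scores)

# Because this analysis works from the covariance matrix, the predictor with
# the larger variance tends to dominate the first component. If that is a
# concern, princomp(cbind(FIT,MBAG), cor=TRUE) does the same thing on the
# correlation matrix, putting the two variables on an equal footing.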
# We considered the process of transforming variables to fix
# problems with regression analysis. Here is a data set that
# contains information about the arsenic content of water in
# wells, as well as the arsenic content of toenail clippings
# from humans who drink the water from those wells:

> arsenic <- read.csv("http://faculty.ucmerced.edu/jvevea/classes/105/data/arsenic.csv")
> arsenic
     wells  toes
1  0.00087 0.119
2  0.00021 0.118
3  0.00115 0.118
4  0.00013 0.080
5  0.00069 0.158
6  0.00039 0.310
7  0.04600 0.832
8  0.01940 0.517
9  0.13700 2.252
10 0.02140 0.851
11 0.01750 0.269
12 0.07640 0.433
13 0.01650 0.275
14 0.00012 0.135
15 0.00410 0.175

> attach(arsenic)
> plot(wells, toes)

# The regression of arsenic in humans on arsenic in wells
# is informative and highly significant:

> arsenicout <- lm(toes~wells)
> summary(arsenicout)

Call:
lm(formula = toes ~ wells)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.71332 -0.05312 -0.02252  0.10166  0.42645 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  0.14372    0.07814   1.839   0.0888 .  
wells       13.12301    1.80360   7.276 6.22e-06 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.2574 on 13 degrees of freedom
Multiple R-squared:  0.8029,	Adjusted R-squared:  0.7877 
F-statistic: 52.94 on 1 and 13 DF,  p-value: 6.217e-06

# But the variability of the residuals is much smaller at the
# left side of the residuals plot:

> plot(arsenicout$fit, arsenicout$res)

# The normality also doesn't look so great, although it's
# difficult to say much about normality in this small a
# data set:

> qqnorm(arsenicout$res); qqline(arsenicout$res)

# Sometimes transformations can help with regression assumptions.
# Most often, we transform the variable on the left hand side
# of the regression equation, but in this particular case, a log
# transformation of both variables helps a lot:

> logwells <- log(wells)
> logtoes <- log(toes)
> plot(logwells, logtoes)

# The regression is still informative and significant:

> logarsenicout <- lm(logtoes~logwells)
> summary(logarsenicout)

Call:
lm(formula = logtoes ~ logwells)

Residuals:
    Min      1Q  Median      3Q     Max 
-0.5451 -0.4883 -0.0200  0.3328  0.9135 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  0.54579    0.35325   1.545    0.146    
logwells     0.32573    0.05796   5.620 8.34e-05 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.5235 on 13 degrees of freedom
Multiple R-squared:  0.7084,	Adjusted R-squared:  0.686 
F-statistic: 31.58 on 1 and 13 DF,  p-value: 8.344e-05

# But now, the residuals plot looks great:

> plot(logarsenicout$fit, logarsenicout$res)

# Normality is still not great, but it remains hard to evaluate
# given the small data set:

> qqnorm(logarsenicout$res); qqline(logarsenicout$res)
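# Aside (a sketch, not part of the original session): because the fitted
# model says log(toes) = b0 + b1*log(wells), it implies
# toes = exp(b0)*wells^b1 on the original scale, so we can draw the
# back-transformed fit on the original scatterplot (b is a scratch name):

> b <- coef(logarsenicout)
> plot(wells, toes)
> curve(exp(b[1]) * x^b[2], add=TRUE)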
# The commonly used transformations are the log, the square
# root, and the negative reciprocal. These can be expressed
# as a power to which the variable is raised. For example,
# the square root is equivalent to raising a value to the
# 1/2 power:

> y <- 2
> sqrt(y)
[1] 1.414214
> y^(1/2)
[1] 1.414214

# Similarly, the cube root (not a commonly used transformation)
# is equivalent to a 1/3 power:

> y^(1/3)
[1] 1.259921

# The reciprocal is the same as raising to the -1 power:

> y^(-1)
[1] 0.5

# And, of course, no transformation at all is the same as
# raising to the first power:

> y^1
[1] 2

# What doesn't quite fit the pattern is the 0th power, because
# any value raised to the 0th power is just 1.0:

> y^0
[1] 1
> 3^0
[1] 1
> 22^0
[1] 1
> 3487^0
[1] 1

# But in the world of transformations, we use the log transformation
# where we would expect to use the 0th power. (The log fills that slot
# because the rescaled power transformation (y^lambda - 1)/lambda
# approaches log(y) as lambda approaches 0.)

# It is possible to use maximum likelihood to find the optimal
# transformation for normalizing the residuals from a regression;
# this is the Box-Cox approach. Here's the relevant likelihood function:

> lambdalike
function(y,X,lambda)
{
   if(min(y) <= 0) return(NA)
   ytemp = y^lambda
   if (lambda==0) ytemp = log(y)
   res = lm(ytemp~X)$res
   rss = sum(res^2)
   n = length(y)
   like = n*log(abs(lambda)) - n/2 * log(rss) + n*(lambda-1)*mean(log(y))
   if (lambda==0) like = -n/2 * log(rss) - n*mean(log(y))
   return(like)
}

# The 'X' that is fed to the function consists of a column of 1's
# followed by columns containing whatever variables we have in the
# regression (here, that's just well arsenic):

> length(toes)
[1] 15
> X <- cbind(rep(1,15), wells)
> X
          wells
 [1,] 1 0.00087
 [2,] 1 0.00021
 [3,] 1 0.00115
 [4,] 1 0.00013
 [5,] 1 0.00069
 [6,] 1 0.00039
 [7,] 1 0.04600
 [8,] 1 0.01940
 [9,] 1 0.13700
[10,] 1 0.02140
[11,] 1 0.01750
[12,] 1 0.07640
[13,] 1 0.01650
[14,] 1 0.00012
[15,] 1 0.00410

# We consider a range of possible values for lambda:

> lambda <- seq(-1,1,.01)
> like <- NULL

# For each value of lambda, we calculate the likelihood:

> for(i in 1:length(lambda)) like[i] <- lambdalike(toes, X, lambda[i])
> plot(lambda, like, type='l')

# It looks like that curve peaked somewhere between .1 and .2, so we
# try to narrow it down:

> lambda <- seq(.1,.2,.001)
> like <- NULL
> for(i in 1:length(lambda)) like[i] <- lambdalike(toes, X, lambda[i])
> plot(lambda, like, type='l')

# Oops, that missed. Try again:

> lambda <- seq(.05,.15,.001)
> like <- NULL
> for(i in 1:length(lambda)) like[i] <- lambdalike(toes, X, lambda[i])
> plot(lambda, like, type='l')

# That curve peaked near .1.
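# Aside (a sketch, not part of the original session): instead of narrowing
# the grid by hand, we could hand the likelihood to optimize() and let it
# locate the peak directly; the result should agree with the table below:

> optimize(function(l) lambdalike(toes, X, l), interval=c(-1, 1), maximum=TRUE)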
# We can see exactly where in this table: at 0.092:

> cbind(lambda,like)
       lambda     like
  [1,]  0.050 8.631151
  [2,]  0.051 8.631662
  [3,]  0.052 8.632161
  [4,]  0.053 8.632648
  [5,]  0.054 8.633123
  [6,]  0.055 8.633586
  [7,]  0.056 8.634038
  [8,]  0.057 8.634477
  [9,]  0.058 8.634905
 [10,]  0.059 8.635321
 [11,]  0.060 8.635724
 [12,]  0.061 8.636116
 [13,]  0.062 8.636496
 [14,]  0.063 8.636864
 [15,]  0.064 8.637220
 [16,]  0.065 8.637564
 [17,]  0.066 8.637895
 [18,]  0.067 8.638215
 [19,]  0.068 8.638522
 [20,]  0.069 8.638818
 [21,]  0.070 8.639101
 [22,]  0.071 8.639372
 [23,]  0.072 8.639631
 [24,]  0.073 8.639878
 [25,]  0.074 8.640113
 [26,]  0.075 8.640335
 [27,]  0.076 8.640545
 [28,]  0.077 8.640743
 [29,]  0.078 8.640928
 [30,]  0.079 8.641102
 [31,]  0.080 8.641262
 [32,]  0.081 8.641411
 [33,]  0.082 8.641547
 [34,]  0.083 8.641671
 [35,]  0.084 8.641783
 [36,]  0.085 8.641882
 [37,]  0.086 8.641968
 [38,]  0.087 8.642042
 [39,]  0.088 8.642104
 [40,]  0.089 8.642153
 [41,]  0.090 8.642190
 [42,]  0.091 8.642214
 [43,]  0.092 8.642225
 [44,]  0.093 8.642224
 [45,]  0.094 8.642211
 [46,]  0.095 8.642185
 [47,]  0.096 8.642146
 [48,]  0.097 8.642094
 [49,]  0.098 8.642030
 [50,]  0.099 8.641953
 [51,]  0.100 8.641864
 [52,]  0.101 8.641761
 [53,]  0.102 8.641646
 [54,]  0.103 8.641518
 [55,]  0.104 8.641378
 [56,]  0.105 8.641224
 [57,]  0.106 8.641058
 [58,]  0.107 8.640879
 [59,]  0.108 8.640687
 [60,]  0.109 8.640482
 [61,]  0.110 8.640265
 [62,]  0.111 8.640034
 [63,]  0.112 8.639790
 [64,]  0.113 8.639534
 [65,]  0.114 8.639264
 [66,]  0.115 8.638982
 [67,]  0.116 8.638686
 [68,]  0.117 8.638377
 [69,]  0.118 8.638056
 [70,]  0.119 8.637721
 [71,]  0.120 8.637373
 [72,]  0.121 8.637012
 [73,]  0.122 8.636638
 [74,]  0.123 8.636250
 [75,]  0.124 8.635850
 [76,]  0.125 8.635436
 [77,]  0.126 8.635009
 [78,]  0.127 8.634569
 [79,]  0.128 8.634115
 [80,]  0.129 8.633649
 [81,]  0.130 8.633168
 [82,]  0.131 8.632675
 [83,]  0.132 8.632168
 [84,]  0.133 8.631648
 [85,]  0.134 8.631115
 [86,]  0.135 8.630568
 [87,]  0.136 8.630007
 [88,]  0.137 8.629433
 [89,]  0.138 8.628846
 [90,]  0.139 8.628245
 [91,]  0.140 8.627631
 [92,]  0.141 8.627003
 [93,]  0.142 8.626362
 [94,]  0.143 8.625707
 [95,]  0.144 8.625038
 [96,]  0.145 8.624356
 [97,]  0.146 8.623660
 [98,]  0.147 8.622950
 [99,]  0.148 8.622227
[100,]  0.149 8.621490
[101,]  0.150 8.620739

# No one would likely use .092. Rather, they would say
# "That's about zero; let's use a log transformation."

> temp <- lm(log(toes)~wells)
> temp

Call:
lm(formula = log(toes) ~ wells)

Coefficients:
(Intercept)        wells  
     -1.741       19.864  

> summary(temp)

Call:
lm(formula = log(toes) ~ wells)

Residuals:
    Min      1Q  Median      3Q     Max 
-0.7873 -0.4026 -0.1178  0.3422  1.1546 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  -1.7410     0.1722 -10.109 1.59e-07 ***
wells        19.8641     3.9753   4.997 0.000244 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.5673 on 13 degrees of freedom
Multiple R-squared:  0.6576,	Adjusted R-squared:  0.6313 
F-statistic: 24.97 on 1 and 13 DF,  p-value: 0.0002443

# The log transformation does improve the residuals plot some
# (but not as much as when we applied it to the predictor as
# well as to the outcome variable):

> plot(temp$fit, temp$res)
> qqnorm(temp$res); qqline(temp$res)
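# Aside (a sketch, not part of the original session): the boxcox() function
# in the MASS package computes essentially this same profile likelihood
# directly from a fitted lm object and plots it, along with a confidence
# interval for lambda:

> library(MASS)
> boxcox(lm(toes ~ wells), lambda=seq(-1, 1, .01))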
# In practice, it is probably a bad idea to apply this approach
# with small data sets. So let's try it with a really big data set:

> detach(JackStatlab)
> attach(Statlab)

# We regress income on the Raven score for the full N=1296 Statlab data set:

> incomeout <- lm(FIT~CTRA)
> incomeout

Call:
lm(formula = FIT ~ CTRA)

Coefficients:
(Intercept)         CTRA  
      115.0          1.3  

> summary(incomeout)

Call:
lm(formula = FIT ~ CTRA)

Residuals:
    Min      1Q  Median      3Q     Max 
-156.56  -43.58  -12.16   31.04  647.34 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 114.9633     6.0721  18.933  < 2e-16 ***
CTRA          1.2998     0.1867   6.961 5.36e-12 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 67.03 on 1294 degrees of freedom
Multiple R-squared:  0.03609,	Adjusted R-squared:  0.03535 
F-statistic: 48.45 on 1 and 1294 DF,  p-value: 5.36e-12

# The residuals plot isn't too bad:

> plot(incomeout$fit, incomeout$res)

# But the residuals are non-normal:

> qqnorm(incomeout$res); qqline(incomeout$res)

# We try a set of values for lambda:

> lambda <- seq(-1,1,.01)
> like <- NULL
> for(i in 1:length(lambda)) like[i] <- lambdalike(FIT, CTRA, lambda[i])

# It didn't work:

> head(like)
[1] NA

# That's because we have an income of 0 in the data set:

> min(FIT)
[1] 0

# So we add 1 to make the variable strictly positive:

> income <- FIT+1
> X <- cbind(rep(1,1296), CTRA)
> for(i in 1:length(lambda)) like[i] <- lambdalike(income, X, lambda[i])
> head(like)
[1] -12886.66 -12836.41 -12786.34 -12736.46 -12686.78 -12637.29
> plot(lambda,like,type='l')

# The plot suggests a square root transformation:

> transformed <- lm(sqrt(FIT)~CTRA)
> summary(transformed)

Call:
lm(formula = sqrt(FIT) ~ CTRA)

Residuals:
    Min      1Q  Median      3Q     Max 
-12.243  -1.652  -0.255   1.432  16.197 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 10.580565   0.228884  46.227  < 2e-16 ***
CTRA         0.051943   0.007039   7.379 2.83e-13 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 2.527 on 1294 degrees of freedom
Multiple R-squared:  0.04038,	Adjusted R-squared:  0.03964 
F-statistic: 54.45 on 1 and 1294 DF,  p-value: 2.835e-13

# The normality is somewhat (but not that much) improved:

> qqnorm(transformed$res); qqline(transformed$res)
> plot(transformed$fit, transformed$res)

# Even though we think of the Box-Cox process as being tied
# to regression, we can use it for a single variable by
# thinking about the regression of that variable on the
# constant 1.0.

# Here, we create a variable that is the square of a normal
# variable (so that we KNOW the optimal transformation is the
# square root):

> y <- rnorm(1000, 50, 10)^2
> hist(y)
> hist(sqrt(y))
> qqnorm(sqrt(y)); qqline(sqrt(y))
> qqnorm(y); qqline(y)

# We apply Box-Cox:

> X <- cbind(rep(1,1000))
> like <- NULL
> for(i in 1:length(lambda)) like[i] <- lambdalike(y,X,lambda[i])
> head(like)
[1] -10862.01 -10854.03 -10846.12 -10838.30 -10830.55 -10822.88

# And the likelihood peaks at 1/2, implying the square root transformation:

> plot(lambda, like, type='l')
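# Aside (a sketch, not part of the original session): the "regress on the
# constant 1.0" trick works because the residuals of lm(y ~ 1) are just the
# deviations of y from its mean, so Box-Cox is simply choosing the power
# that makes y itself look as normal as possible. The check below should
# return TRUE, and MASS::boxcox(lm(y ~ 1)) would produce the same kind of
# profile plot we drew by hand:

> all.equal(as.numeric(lm(y ~ 1)$res), as.numeric(y - mean(y)))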