# Last time, we considered a multiple regression that was
# significant overall, but in which both predictors were nonsignificant:

> attach(JackStatlab)
> summary(lm(CTPEA~FIT+MBAG))

Call:
lm(formula = CTPEA ~ FIT + MBAG)

Residuals:
    Min      1Q  Median      3Q     Max 
-22.468  -8.929   1.052   6.418  29.929 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 55.30811    9.86828   5.605 1.06e-06 ***
FIT          0.04940    0.02549   1.938   0.0587 .  
MBAG         0.72118    0.37013   1.948   0.0573 .  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 12.64 on 47 degrees of freedom
Multiple R-squared:  0.1612,	Adjusted R-squared:  0.1255 
F-statistic: 4.515 on 2 and 47 DF,  p-value: 0.01609

# One way of addressing such a situation is to attempt to create
# a composite variable that captures most of the variability associated
# with the multiple variables. Principal components analysis is one
# such method. It seeks the linear combination of the variables that
# maximizes the variability accounted for; then it seeks further such
# combinations that maximize the remaining variability while being
# uncorrelated with the earlier component(s). There can be as many
# components as there are variables, so here the principal components
# analysis gives us two principal components, and we hope that the
# first one captures most of the variability.

# Here's the analysis:

> components <- princomp(cbind(FIT,MBAG))
> head(components$scores)
         Comp.1    Comp.2
[1,] -53.280640 12.117400
[2,]  21.782498  6.967123
[3,] -43.202010  5.231094
[4,]  -7.204318  5.638776
[5,] -57.189788  4.072615
[6,]  19.805275  4.944602

# Note that the two sets of principal components scores are
# perfectly uncorrelated:

> cor(components$scores)
             Comp.1       Comp.2
Comp.1 1.000000e+00 1.634274e-17
Comp.2 1.634274e-17 1.000000e+00

# If we regress on the two components, the overall regression
# accounts for exactly the same variation as before (because
# the component scores are just linear recombinations of the
# variables that were on the right hand side of the regression
# equation):

> summary(lm(CTPEA~components$scores[,1]+components$scores[,2]))

Call:
lm(formula = CTPEA ~ components$scores[, 1] + components$scores[, 2])

Residuals:
    Min      1Q  Median      3Q     Max 
-22.468  -8.929   1.052   6.418  29.929 

Coefficients:
                       Estimate Std. Error t value Pr(>|t|)    
(Intercept)            81.62000    1.78736  45.665   <2e-16 ***
components$scores[, 1]  0.05757    0.02515   2.289   0.0266 *  
components$scores[, 2] -0.72058    0.37015  -1.947   0.0576 .  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 12.64 on 47 degrees of freedom
Multiple R-squared:  0.1612,	Adjusted R-squared:  0.1255 
F-statistic: 4.515 on 2 and 47 DF,  p-value: 0.01609

# Note also that the first principal component IS significant
# in the regression. With any luck, it can represent a general
# socioeconomic status variable, and the second component won't
# be needed. However, in this particular case, much of the
# variance accounted for by the regression is still associated
# with the second component. Note that when we use only the
# first principal component, the multiple R^2 drops from .16
# to .09:

> summary(lm(CTPEA~components$scores[,1]))

Call:
lm(formula = CTPEA ~ components$scores[, 1])

Residuals:
     Min       1Q   Median       3Q      Max 
-23.2996 -10.0820  -0.6279   8.4220  29.6663 

Coefficients:
                       Estimate Std. Error t value Pr(>|t|)    
(Intercept)            81.62000    1.83856  44.393   <2e-16 ***
components$scores[, 1]  0.05757    0.02587   2.225   0.0308 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 13 on 48 degrees of freedom
Multiple R-squared:  0.09352,	Adjusted R-squared:  0.07464 
F-statistic: 4.952 on 1 and 48 DF,  p-value: 0.03079
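# Aside (a minimal sketch, not part of the original session): princomp()
# stores the weights that define the components in its loadings element,
# and each score is just the mean-centered predictors multiplied by those
# weights. The scratch names L and centered below are illustrative only;
# the result should reproduce components$scores:

> L <- unclass(loadings(components))                  # 2 x 2 weight matrix
> centered <- scale(cbind(FIT, MBAG), center=TRUE, scale=FALSE)
> head(centered %*% L)                                # compare with head(components$scores)

# Because this analysis works from the covariance matrix, the predictor with
# the larger variance tends to dominate the first component. If that is a
# concern, princomp(cbind(FIT,MBAG), cor=TRUE) does the same thing on the
# correlation matrix, putting the two variables on an equal footing.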
# We considered the process of transforming variables to fix
# problems with regression analysis. Here is a data set that
# contains information about the arsenic content of water in
# wells, as well as the arsenic content of toenail clippings
# from humans who drink the water from those wells:

> arsenic <- read.csv("http://faculty.ucmerced.edu/jvevea/classes/105/data/arsenic.csv")
> arsenic
     wells  toes
1  0.00087 0.119
2  0.00021 0.118
3  0.00115 0.118
4  0.00013 0.080
5  0.00069 0.158
6  0.00039 0.310
7  0.04600 0.832
8  0.01940 0.517
9  0.13700 2.252
10 0.02140 0.851
11 0.01750 0.269
12 0.07640 0.433
13 0.01650 0.275
14 0.00012 0.135
15 0.00410 0.175

> attach(arsenic)
> plot(wells, toes)

# The regression of arsenic in humans on arsenic in wells
# is informative and highly significant:

> arsenicout <- lm(toes~wells)
> summary(arsenicout)

Call:
lm(formula = toes ~ wells)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.71332 -0.05312 -0.02252  0.10166  0.42645 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  0.14372    0.07814   1.839   0.0888 .  
wells       13.12301    1.80360   7.276 6.22e-06 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.2574 on 13 degrees of freedom
Multiple R-squared:  0.8029,	Adjusted R-squared:  0.7877 
F-statistic: 52.94 on 1 and 13 DF,  p-value: 6.217e-06

# But the variability of the residuals is much smaller at the
# left side of the residuals plot:

> plot(arsenicout$fit, arsenicout$res)

# The normality also doesn't look so great, although it's
# difficult to say much about normality in this small a
# data set:

> qqnorm(arsenicout$res); qqline(arsenicout$res)

# Sometimes transformations can help with regression assumptions.
# Most often, we transform the variable on the left hand side
# of the regression equation, but in this particular case, a log
# transformation of both variables helps a lot:

> logwells <- log(wells)
> logtoes <- log(toes)
> plot(logwells, logtoes)

# The regression is still informative and significant:

> logarsenicout <- lm(logtoes~logwells)
> summary(logarsenicout)

Call:
lm(formula = logtoes ~ logwells)

Residuals:
    Min      1Q  Median      3Q     Max 
-0.5451 -0.4883 -0.0200  0.3328  0.9135 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  0.54579    0.35325   1.545    0.146    
logwells     0.32573    0.05796   5.620 8.34e-05 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.5235 on 13 degrees of freedom
Multiple R-squared:  0.7084,	Adjusted R-squared:  0.686 
F-statistic: 31.58 on 1 and 13 DF,  p-value: 8.344e-05

# But now, the residuals plot looks great:

> plot(logarsenicout$fit, logarsenicout$res)

# Normality is still not great, but it remains hard to evaluate
# given the small data set:

> qqnorm(logarsenicout$res); qqline(logarsenicout$res)
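# Aside (a sketch, not part of the original session): because the fitted
# model says log(toes) = b0 + b1*log(wells), it implies
# toes = exp(b0)*wells^b1 on the original scale, so we can draw the
# back-transformed fit on the original scatterplot (b is a scratch name):

> b <- coef(logarsenicout)
> plot(wells, toes)
> curve(exp(b[1]) * x^b[2], add=TRUE)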
# The commonly used transformations are the log, the square
# root, and the negative reciprocal. These can be expressed
# as a power to which the variable is raised. For example,
# the square root is equivalent to raising a value to the
# 1/2 power:

> y <- 2
> sqrt(y)
[1] 1.414214
> y^(1/2)
[1] 1.414214

# Similarly, the cube root (not a commonly used transformation)
# is equivalent to a 1/3 power:

> y^(1/3)
[1] 1.259921

# The reciprocal is the same as raising to the -1 power:

> y^(-1)
[1] 0.5

# And, of course, no transformation at all is the same as
# raising to the first power:

> y^1
[1] 2

# What doesn't quite fit the pattern is the 0th power, because
# any value raised to the 0th power is just 1.0:

> y^0
[1] 1
> 3^0
[1] 1
> 22^0
[1] 1
> 3487^0
[1] 1

# But in the world of transformations, we use the log transformation
# where we would expect to use the 0th power. (The log fills that slot
# because the rescaled power transformation (y^lambda - 1)/lambda
# approaches log(y) as lambda approaches 0.)

# It is possible to use maximum likelihood to find the optimal
# transformation for normalizing the residuals from a regression;
# this is the Box-Cox approach. Here's the relevant likelihood function:

> lambdalike
function(y,X,lambda)
{
   if(min(y) <= 0) return(NA)
   ytemp = y^lambda
   if (lambda==0) ytemp = log(y)
   res = lm(ytemp~X)$res
   rss = sum(res^2)
   n = length(y)
   like = n*log(abs(lambda)) - n/2 * log(rss) + n*(lambda-1)*mean(log(y))
   if (lambda==0) like = -n/2 * log(rss) - n*mean(log(y))
   return(like)
}

# The 'X' that is fed to the function consists of a column of 1's
# followed by columns containing whatever variables we have in the
# regression (here, that's just well arsenic):

> length(toes)
[1] 15
> X <- cbind(rep(1,15), wells)
> X
          wells
 [1,] 1 0.00087
 [2,] 1 0.00021
 [3,] 1 0.00115
 [4,] 1 0.00013
 [5,] 1 0.00069
 [6,] 1 0.00039
 [7,] 1 0.04600
 [8,] 1 0.01940
 [9,] 1 0.13700
[10,] 1 0.02140
[11,] 1 0.01750
[12,] 1 0.07640
[13,] 1 0.01650
[14,] 1 0.00012
[15,] 1 0.00410

# We consider a range of possible values for lambda:

> lambda <- seq(-1,1,.01)
> like <- NULL

# For each value of lambda, we calculate the likelihood:

> for(i in 1:length(lambda)) like[i] <- lambdalike(toes, X, lambda[i])
> plot(lambda, like, type='l')

# It looks like that curve peaked somewhere between .1 and .2, so we
# try to narrow it down:

> lambda <- seq(.1,.2,.001)
> like <- NULL
> for(i in 1:length(lambda)) like[i] <- lambdalike(toes, X, lambda[i])
> plot(lambda, like, type='l')

# Oops, that missed. Try again:

> lambda <- seq(.05,.15,.001)
> like <- NULL
> for(i in 1:length(lambda)) like[i] <- lambdalike(toes, X, lambda[i])
> plot(lambda, like, type='l')

# That curve peaked near .1.
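# Aside (a sketch, not part of the original session): instead of narrowing
# the grid by hand, we could hand the likelihood to optimize() and let it
# locate the peak directly; the result should agree with the table below:

> optimize(function(l) lambdalike(toes, X, l), interval=c(-1, 1), maximum=TRUE)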
# We can see exactly where in this table: at 0.092:

> cbind(lambda,like)
       lambda     like
  [1,]  0.050 8.631151
  [2,]  0.051 8.631662
  [3,]  0.052 8.632161
  [4,]  0.053 8.632648
  [5,]  0.054 8.633123
  [6,]  0.055 8.633586
  [7,]  0.056 8.634038
  [8,]  0.057 8.634477
  [9,]  0.058 8.634905
 [10,]  0.059 8.635321
 [11,]  0.060 8.635724
 [12,]  0.061 8.636116
 [13,]  0.062 8.636496
 [14,]  0.063 8.636864
 [15,]  0.064 8.637220
 [16,]  0.065 8.637564
 [17,]  0.066 8.637895
 [18,]  0.067 8.638215
 [19,]  0.068 8.638522
 [20,]  0.069 8.638818
 [21,]  0.070 8.639101
 [22,]  0.071 8.639372
 [23,]  0.072 8.639631
 [24,]  0.073 8.639878
 [25,]  0.074 8.640113
 [26,]  0.075 8.640335
 [27,]  0.076 8.640545
 [28,]  0.077 8.640743
 [29,]  0.078 8.640928
 [30,]  0.079 8.641102
 [31,]  0.080 8.641262
 [32,]  0.081 8.641411
 [33,]  0.082 8.641547
 [34,]  0.083 8.641671
 [35,]  0.084 8.641783
 [36,]  0.085 8.641882
 [37,]  0.086 8.641968
 [38,]  0.087 8.642042
 [39,]  0.088 8.642104
 [40,]  0.089 8.642153
 [41,]  0.090 8.642190
 [42,]  0.091 8.642214
 [43,]  0.092 8.642225
 [44,]  0.093 8.642224
 [45,]  0.094 8.642211
 [46,]  0.095 8.642185
 [47,]  0.096 8.642146
 [48,]  0.097 8.642094
 [49,]  0.098 8.642030
 [50,]  0.099 8.641953
 [51,]  0.100 8.641864
 [52,]  0.101 8.641761
 [53,]  0.102 8.641646
 [54,]  0.103 8.641518
 [55,]  0.104 8.641378
 [56,]  0.105 8.641224
 [57,]  0.106 8.641058
 [58,]  0.107 8.640879
 [59,]  0.108 8.640687
 [60,]  0.109 8.640482
 [61,]  0.110 8.640265
 [62,]  0.111 8.640034
 [63,]  0.112 8.639790
 [64,]  0.113 8.639534
 [65,]  0.114 8.639264
 [66,]  0.115 8.638982
 [67,]  0.116 8.638686
 [68,]  0.117 8.638377
 [69,]  0.118 8.638056
 [70,]  0.119 8.637721
 [71,]  0.120 8.637373
 [72,]  0.121 8.637012
 [73,]  0.122 8.636638
 [74,]  0.123 8.636250
 [75,]  0.124 8.635850
 [76,]  0.125 8.635436
 [77,]  0.126 8.635009
 [78,]  0.127 8.634569
 [79,]  0.128 8.634115
 [80,]  0.129 8.633649
 [81,]  0.130 8.633168
 [82,]  0.131 8.632675
 [83,]  0.132 8.632168
 [84,]  0.133 8.631648
 [85,]  0.134 8.631115
 [86,]  0.135 8.630568
 [87,]  0.136 8.630007
 [88,]  0.137 8.629433
 [89,]  0.138 8.628846
 [90,]  0.139 8.628245
 [91,]  0.140 8.627631
 [92,]  0.141 8.627003
 [93,]  0.142 8.626362
 [94,]  0.143 8.625707
 [95,]  0.144 8.625038
 [96,]  0.145 8.624356
 [97,]  0.146 8.623660
 [98,]  0.147 8.622950
 [99,]  0.148 8.622227
[100,]  0.149 8.621490
[101,]  0.150 8.620739

# No one would likely use .092. Rather, they would say
# "That's about zero; let's use a log transformation."

> temp <- lm(log(toes)~wells)
> temp

Call:
lm(formula = log(toes) ~ wells)

Coefficients:
(Intercept)        wells  
     -1.741       19.864  

> summary(temp)

Call:
lm(formula = log(toes) ~ wells)

Residuals:
    Min      1Q  Median      3Q     Max 
-0.7873 -0.4026 -0.1178  0.3422  1.1546 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  -1.7410     0.1722 -10.109 1.59e-07 ***
wells        19.8641     3.9753   4.997 0.000244 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.5673 on 13 degrees of freedom
Multiple R-squared:  0.6576,	Adjusted R-squared:  0.6313 
F-statistic: 24.97 on 1 and 13 DF,  p-value: 0.0002443

# The log transformation does improve the residuals plot some
# (but not as much as when we applied it to the predictor as
# well as to the outcome variable):

> plot(temp$fit, temp$res)
> qqnorm(temp$res); qqline(temp$res)
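# Aside (a sketch, not part of the original session): the boxcox() function
# in the MASS package computes essentially this same profile likelihood
# directly from a fitted lm object and plots it, along with a confidence
# interval for lambda:

> library(MASS)
> boxcox(lm(toes ~ wells), lambda=seq(-1, 1, .01))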
# In practice, it is probably a bad idea to apply this approach
# with small data sets. So let's try it with a really big data set:

> detach(JackStatlab)
> attach(Statlab)

# We regress income on the Raven score for the full N=1296 Statlab data set:

> incomeout <- lm(FIT~CTRA)
> incomeout

Call:
lm(formula = FIT ~ CTRA)

Coefficients:
(Intercept)         CTRA  
      115.0          1.3  

> summary(incomeout)

Call:
lm(formula = FIT ~ CTRA)

Residuals:
    Min      1Q  Median      3Q     Max 
-156.56  -43.58  -12.16   31.04  647.34 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 114.9633     6.0721  18.933  < 2e-16 ***
CTRA          1.2998     0.1867   6.961 5.36e-12 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 67.03 on 1294 degrees of freedom
Multiple R-squared:  0.03609,	Adjusted R-squared:  0.03535 
F-statistic: 48.45 on 1 and 1294 DF,  p-value: 5.36e-12

# The residuals plot isn't too bad:

> plot(incomeout$fit, incomeout$res)

# But the residuals are non-normal:

> qqnorm(incomeout$res); qqline(incomeout$res)

# We try a set of values for lambda:

> lambda <- seq(-1,1,.01)
> like <- NULL
> for(i in 1:length(lambda)) like[i] <- lambdalike(FIT, CTRA, lambda[i])

# It didn't work:

> head(like)
[1] NA

# That's because we have an income of 0 in the data set:

> min(FIT)
[1] 0

# So we add 1 to make the variable strictly positive:

> income <- FIT+1
> X <- cbind(rep(1,1296), CTRA)
> for(i in 1:length(lambda)) like[i] <- lambdalike(income, X, lambda[i])
> head(like)
[1] -12886.66 -12836.41 -12786.34 -12736.46 -12686.78 -12637.29
> plot(lambda,like,type='l')

# The plot suggests a square root transformation:

> transformed <- lm(sqrt(FIT)~CTRA)
> summary(transformed)

Call:
lm(formula = sqrt(FIT) ~ CTRA)

Residuals:
    Min      1Q  Median      3Q     Max 
-12.243  -1.652  -0.255   1.432  16.197 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 10.580565   0.228884  46.227  < 2e-16 ***
CTRA         0.051943   0.007039   7.379 2.83e-13 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 2.527 on 1294 degrees of freedom
Multiple R-squared:  0.04038,	Adjusted R-squared:  0.03964 
F-statistic: 54.45 on 1 and 1294 DF,  p-value: 2.835e-13

# The normality is somewhat (but not that much) improved:

> qqnorm(transformed$res); qqline(transformed$res)
> plot(transformed$fit, transformed$res)

# Even though we think of the Box-Cox process as being tied
# to regression, we can use it for a single variable by
# thinking about the regression of that variable on the
# constant 1.0.

# Here, we create a variable that is the square of a normal
# variable (so that we KNOW the optimal transformation is the
# square root):

> y <- rnorm(1000, 50, 10)^2
> hist(y)
> hist(sqrt(y))
> qqnorm(sqrt(y)); qqline(sqrt(y))
> qqnorm(y); qqline(y)

# We apply Box-Cox:

> X <- cbind(rep(1,1000))
> like <- NULL
> for(i in 1:length(lambda)) like[i] <- lambdalike(y,X,lambda[i])
> head(like)
[1] -10862.01 -10854.03 -10846.12 -10838.30 -10830.55 -10822.88

# And the likelihood peaks at 1/2, implying the square root transformation:

> plot(lambda, like, type='l')
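# Aside (a sketch, not part of the original session): the "regress on the
# constant 1.0" trick works because the residuals of lm(y ~ 1) are just the
# deviations of y from its mean, so Box-Cox is simply choosing the power
# that makes y itself look as normal as possible. The check below should
# return TRUE, and MASS::boxcox(lm(y ~ 1)) would produce the same kind of
# profile plot we drew by hand:

> all.equal(as.numeric(lm(y ~ 1)$res), as.numeric(y - mean(y)))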