# Next, we took up the subject of collinearity. First, we showed that
# collinearity and correlation are not the same thing.
# Here, we generate highly correlated data that are not problematic
# with respect to collinearity:
> cormat <- matrix(c(
+ 1, .1, .2, .3, .4,
+ .1, 1, .1, .1, .0,
+ .2, .1, 1, .2, .95,
+ .3, .1, .2, 1, .4,
+ .4, 0, .95,.4, 1),5,5,byrow=T)
> cormat
     [,1] [,2] [,3] [,4] [,5]
[1,]  1.0  0.1 0.20  0.3 0.40
[2,]  0.1  1.0 0.10  0.1 0.00
[3,]  0.2  0.1 1.00  0.2 0.95
[4,]  0.3  0.1 0.20  1.0 0.40
[5,]  0.4  0.0 0.95  0.4 1.00
>
> x1 <- rnorm(50)
> x2 <- rnorm(50)
> x3 <- rnorm(50)
> x4 <- rnorm(50)
> x5 <- rnorm(50)
> x <- cbind(x1,x2,x3,x4,x5)
> newx <- x%*%chol(cormat)
> cor(newx)
          [,1]      [,2]      [,3]      [,4]      [,5]
[1,] 1.0000000 0.1713720 0.2830376 0.3753486 0.4630247
[2,] 0.1713720 1.0000000 0.1923732 0.1546041 0.1217239
[3,] 0.2830376 0.1923732 1.0000000 0.1591590 0.9492125
[4,] 0.3753486 0.1546041 0.1591590 1.0000000 0.3976179
[5,] 0.4630247 0.1217239 0.9492125 0.3976179 1.0000000
# Note that the third and fifth variables (columns 3 and 5 of newx)
# are correlated .95.
# Now, we define y to be the first variable in that matrix,
# and define x1-x4 as the remaining four variables
# (so the highly correlated pair becomes x2 and x4):
> y <- newx[,1]
> x1 <- newx[,2]
> x2 <- newx[,3]
> x3 <- newx[,4]
> x4 <- newx[,5]
>
# Here's the estimated regression of y on x1-x4:
> summary(lm(y~x1+x2+x3+x4))

Call:
lm(formula = y ~ x1 + x2 + x3 + x4)

Residuals:
     Min       1Q   Median       3Q      Max
-1.06748 -0.39833 -0.01049  0.33287  1.36406

Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.01150    0.08626  -0.133    0.894
x1           0.55121    0.10708   5.148 5.62e-06 ***
x2          -3.60373    0.44373  -8.121 2.30e-10 ***
x3          -0.68670    0.13149  -5.222 4.37e-06 ***
x4           4.09404    0.46870   8.735 3.01e-11 ***
---
Signif.
codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.6051 on 45 degrees of freedom
Multiple R-squared: 0.7025, Adjusted R-squared: 0.676
F-statistic: 26.56 on 4 and 45 DF, p-value: 2.401e-11

# If the predictors were collinear, we would expect to see
# large standard errors, various predictors to vacillate between
# significance and nonsignificance, and unstable slope estimates.
# However, as we repeat the simulation a number of times, we see
# that things remain relatively stable:
> x1 <- rnorm(50)
> x2 <- rnorm(50)
> x3 <- rnorm(50)
> x4 <- rnorm(50)
> x5 <- rnorm(50)
> x <- cbind(x1,x2,x3,x4,x5)
> newx <- x%*%chol(cormat)
> y <- newx[,1]
> x1 <- newx[,2]
> x2 <- newx[,3]
> x3 <- newx[,4]
> x4 <- newx[,5]
>
> summary(lm(y~x1+x2+x3+x4))

Call:
lm(formula = y ~ x1 + x2 + x3 + x4)

Residuals:
     Min       1Q   Median       3Q      Max
-1.10555 -0.34178 -0.02253  0.43776  1.35495

Coefficients:
             Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.006491   0.081401  -0.080    0.937
x1           0.616399   0.080958   7.614 1.26e-09 ***
x2          -4.075541   0.426358  -9.559 2.10e-12 ***
x3          -0.778704   0.122707  -6.346 9.55e-08 ***
x4           4.522205   0.435174  10.392 1.54e-13 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.5423 on 45 degrees of freedom
Multiple R-squared: 0.7405, Adjusted R-squared: 0.7175
F-statistic: 32.11 on 4 and 45 DF, p-value: 1.158e-12