# Next, we took up the subject of collinearity.  First, we showed that
# collinearity and correlation are not the same thing.


# Here, we generate highly correlated data that are not problematic
# with respect to collinearity:

# Target correlation matrix for five variables (symmetric, unit diagonal).
# Variables 3 and 5 are given a correlation of .95; every other
# off-diagonal entry is .4 or smaller.  byrow is spelled TRUE rather
# than T because T is an ordinary variable that can be reassigned.
> cormat <- matrix(c(
+        1, .1, .2, .3, .4,
+       .1,  1, .1, .1, .0,
+       .2, .1,  1, .2, .95,
+       .3, .1, .2,  1, .4,
+       .4,  0, .95,.4,  1), 5, 5, byrow = TRUE)
> cormat
     [,1] [,2] [,3] [,4] [,5]
[1,]  1.0  0.1 0.20  0.3 0.40
[2,]  0.1  1.0 0.10  0.1 0.00
[3,]  0.2  0.1 1.00  0.2 0.95
[4,]  0.3  0.1 0.20  1.0 0.40
[5,]  0.4  0.0 0.95  0.4 1.00
> 
# Draw five independent standard-normal samples of size 50 and bind
# them as the columns of a 50 x 5 matrix.
> x1 <- rnorm(50)
> x2 <- rnorm(50)
> x3 <- rnorm(50)
> x4 <- rnorm(50)
> x5 <- rnorm(50)
> x <- cbind(x1,x2,x3,x4,x5)
# chol() returns the upper-triangular factor R with t(R) %*% R = cormat,
# so post-multiplying the independent columns by R gives columns whose
# population correlation matrix is cormat.
> newx <- x%*%chol(cormat)
# Sample correlations of the transformed columns.  No seed is set in the
# commands shown, so these values change from run to run and only
# approximate cormat.
> cor(newx)
          [,1]      [,2]      [,3]      [,4]      [,5]
[1,] 1.0000000 0.1713720 0.2830376 0.3753486 0.4630247
[2,] 0.1713720 1.0000000 0.1923732 0.1546041 0.1217239
[3,] 0.2830376 0.1923732 1.0000000 0.1591590 0.9492125
[4,] 0.3753486 0.1546041 0.1591590 1.0000000 0.3976179
[5,] 0.4630247 0.1217239 0.9492125 0.3976179 1.0000000

# Note that columns 3 and 5 of newx (renamed x2 and x4 below) are
# correlated .95.

# Now, we define y to be the first variable in that matrix,
# and define x1-x4 as the remaining four variables:
# Column 1 of newx becomes the response; columns 2-5 become the
# predictors.  The names x1-x4 are reused here, overwriting the raw
# draws made earlier, so columns 3 and 5 of newx are now called x2
# and x4.
> y <- newx[,1]
> x1 <- newx[,2]
> x2 <- newx[,3]
> x3 <- newx[,4]
> x4 <- newx[,5]
> 

# Here's the estimated regression of y on x1-x4:

# Least-squares fit of y on the four predictors (intercept included);
# summary() prints residual quantiles, the coefficient table, and the
# overall fit statistics.
> summary(lm(y~x1+x2+x3+x4))

Call:
lm(formula = y ~ x1 + x2 + x3 + x4)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.06748 -0.39833 -0.01049  0.33287  1.36406 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) -0.01150    0.08626  -0.133    0.894    
x1           0.55121    0.10708   5.148 5.62e-06 ***
x2          -3.60373    0.44373  -8.121 2.30e-10 ***
x3          -0.68670    0.13149  -5.222 4.37e-06 ***
x4           4.09404    0.46870   8.735 3.01e-11 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 

Residual standard error: 0.6051 on 45 degrees of freedom
Multiple R-squared: 0.7025,     Adjusted R-squared: 0.676 
F-statistic: 26.56 on 4 and 45 DF,  p-value: 2.401e-11 

# If the predictors were collinear, we would expect to see
# large standard errors, various predictors vacillating between
# significance and nonsignificance, and unstable slope estimates.
# However, as we repeat the simulation a number of times, we see
# that things remain relatively stable:

# Second replication: draw a fresh sample of five independent
# standard-normal columns (n = 50) ...
> x1 <- rnorm(50)
> x2 <- rnorm(50)
> x3 <- rnorm(50)
> x4 <- rnorm(50)
> x5 <- rnorm(50)
> x <- cbind(x1,x2,x3,x4,x5)
# ... impose the same target correlations through the Cholesky factor
# of cormat ...
> newx <- x%*%chol(cormat)
# ... and again take column 1 as the response and columns 2-5 as the
# predictors x1-x4.
> y <- newx[,1]
> x1 <- newx[,2]
> x2 <- newx[,3]
> x3 <- newx[,4]
> x4 <- newx[,5]
> 
# Refit the same model on the new sample so its coefficient table can
# be compared with the first fit.
> summary(lm(y~x1+x2+x3+x4))

Call:
lm(formula = y ~ x1 + x2 + x3 + x4)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.10555 -0.34178 -0.02253  0.43776  1.35495 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) -0.006491   0.081401  -0.080    0.937    
x1           0.616399   0.080958   7.614 1.26e-09 ***
x2          -4.075541   0.426358  -9.559 2.10e-12 ***
x3          -0.778704   0.122707  -6.346 9.55e-08 ***
x4           4.522205   0.435174  10.392 1.54e-13 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 

Residual standard error: 0.5423 on 45 degrees of freedom
Multiple R-squared: 0.7405,     Adjusted R-squared: 0.7175 
F-statistic: 32.11 on 4 and 45 DF,  p-value: 1.158e-12