# Here are some made-up data in which the relationship does not have
# homogeneity of variance:
> x_hetero <- c(
+ 50,51,52,52,53,54,56,56,57,58,59,60,63,64,65,66,66,66,66,69,70,72,74,74,74,
+ 76,76,76,77,81,82,82,83,84,84,84,85,85,85,88,91,92,92,92,93,96,97,98,98,99)
> y_hetero <- c(
+  80, 74, 76, 85, 87, 86, 86, 94, 87, 91, 94, 83, 96,101,108,104, 89,105, 91,
+ 115,104, 89,101, 95, 98,110,112,119,137,113,119,111,110,128,152,127,171,131,
+ 129,145,146,111, 77,135,115,150,135,124,177,117)

# The scatterplot looks linear, but we can see the increase in variability
# for larger values of x:
> plot(x_hetero, y_hetero)

# We estimate the regression:
> lm(y_hetero ~ x_hetero) -> temp
> temp

Call:
lm(formula = y_hetero ~ x_hetero)

Coefficients:
(Intercept)     x_hetero
     17.773        1.244

> abline(temp$coef)

# The residuals plot very clearly shows the increasing variability as
# we move from left to right in the plot:
> plot(temp$fit, temp$res)

# Even though I created the fake data with normally distributed
# errors, by the time the heterogeneity is added, we also end up
# with non-normal residuals:
> stem(temp$res)

  The decimal point is 1 digit(s) to the right of the |

  -4 | 5
  -2 | 41
  -0 | 8865211999976632211100
   0 | 011333344555677791358
   2 | 307
   4 | 7

> qqnorm(temp$res); qqline(temp$res)

# Here, we illustrate what happens to correlation and regression when
# we restrict the range of the variables. First, I create a fake data
# set that has a linear relationship:
> x <- c(1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9)
> y <- c(1,3,2,4,3,5,4,6,5,7,6,8,7,9,8,10,9,11)

# Here's the plot:
> plot(x,y)
> abline(lm(y~x)$coef)

# The regression has an intercept of 1.0 and a slope of 1.0:
> lm(y~x)

Call:
lm(formula = y ~ x)

Coefficients:
(Intercept)            x
          1            1

# The correlation is very high:
> cor(x,y)
[1] 0.9325048

# And the regression is highly significant:
> anova(lm(y~x))
Analysis of Variance Table

Response: y
          Df Sum Sq Mean Sq F value   Pr(>F)
x          1    120 120.000  106.67 1.75e-08 ***
Residuals 16     18   1.125
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

# Now, if we cut out the lowest values...
> plot(x[5:18], y[5:18])

# The regression estimates remain the same:
> lm(y[5:18]~x[5:18])

Call:
lm(formula = y[5:18] ~ x[5:18])

Coefficients:
(Intercept)      x[5:18]
          1            1

# But the significance of the regression goes down:
> anova(lm(y[5:18]~x[5:18]))
Analysis of Variance Table

Response: y[5:18]
          Df Sum Sq Mean Sq F value    Pr(>F)
x[5:18]    1     56  56.000      48 1.587e-05 ***
Residuals 12     14   1.167
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

# And the correlation goes down:
> cor(x,y)
[1] 0.9325048
> cor(x[5:18], y[5:18])
[1] 0.8944272

# If we restrict the range further...
> plot(x[9:18], y[9:18])

# ...the regression estimates are still unbiased:
> lm(y[9:18]~x[9:18])

Call:
lm(formula = y[9:18] ~ x[9:18])

Coefficients:
(Intercept)      x[9:18]
          1            1

# But the significance of the slope is lowered again:
> anova(lm(y[9:18]~x[9:18]))
Analysis of Variance Table

Response: y[9:18]
          Df Sum Sq Mean Sq F value  Pr(>F)
x[9:18]    1     20   20.00      16 0.00395 **
Residuals  8     10    1.25
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

# And the correlation is biased further downwards:
> cor(x[9:18], y[9:18])
[1] 0.8164966

# In summary, when the range of a linear relationship is restricted, the
# correlation is biased downwards. The regression estimates remain unbiased,
# but the significance of the regression is reduced.
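# (To watch the attenuation happen step by step, here is a quick sketch we
#  didn't actually run above; it computes the correlation for tighter and
#  tighter cutoffs, and the particular cutoffs 1, 5, 9, and 13 are just
#  arbitrary choices:
#
#      sapply(c(1, 5, 9, 13), function(k) cor(x[k:18], y[k:18]))
#
#  Each correlation comes out lower than the one before it, as more of the
#  low end of the range is cut away.)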
# Next, we look at ways to generate fake regression data.

# We need some predictor values:
> x <- rnorm(50, 50, 10)
> x
 [1] 62.59335 72.76818 63.27385 38.19079 50.19154 49.70449 31.89649 68.75344
 [9] 56.14301 44.14596 56.40272 51.96820 67.15902 43.67797 41.41311 48.86068
[17] 65.87632 55.27406 35.69603 63.01907 46.03423 60.12913 37.72627 33.19086
[25] 45.56889 48.98451 39.22056 45.55720 46.39967 50.54503 45.61082 52.02128
[33] 51.03519 62.89976 50.26502 30.32364 52.38987 53.76446 48.11898 53.05711
[41] 47.68748 62.46523 47.88082 50.95061 57.64587 51.07593 47.24121 37.82576
[49] 34.63126 54.90486

# That's ugly, so let's round them off:
> x <- round(x)
> x
 [1] 63 73 63 38 50 50 32 69 56 44 56 52 67 44 41 49 66 55 36 63 46 60 38 33 46
[26] 49 39 46 46 51 46 52 51 63 50 30 52 54 48 53 48 62 48 51 58 51 47 38 35 55

# We create a linear relationship:
> y <- 2 + 1/2*x

# Every point falls exactly on the line, because we haven't added any error:
> plot(x,y)

# So the estimated regression intercept and slope are exactly what we created:
> lm(y~x)

Call:
lm(formula = y ~ x)

Coefficients:
(Intercept)            x
        2.0          0.5

# But we can't do inference, because the fit to the regression line is perfect:
> anova(lm(y~x))
Analysis of Variance Table

Response: y
          Df Sum Sq Mean Sq    F value    Pr(>F)
x          1 1205.4  1205.4 1.0616e+31 < 2.2e-16 ***
Residuals 48    0.0     0.0
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Warning message:
In anova.lm(lm(y ~ x)) :
  ANOVA F-tests on an essentially perfect fit are unreliable

# So let's try adding some error:
> error <- rnorm(50, 0, 5)
> error
 [1]   5.3481187  -0.2659994  -0.4962413   1.8207603   9.5616765   1.0267017
 [7]   4.2608351   6.3256839  13.4609861   1.9036581  -1.1707599   1.7704495
[13]   2.9951151 -10.0783957  -4.1162844   2.0221208   5.4677749  -4.1819411
[19]   7.0989025   4.9035483  -4.2600607  -4.7113426   2.7634036   6.8725428
[25]   3.1614830   3.4056975  -0.8617369  -1.4994942  -5.7819456   4.9908298
[31]  -2.1483157  -1.6784286   5.2145310   6.3112618  -8.0834603  -0.7492575
[37]  -3.3657858  -6.4812447   6.0610681   0.6604929  -0.8268999   4.5050535
[43]   0.5807796  -2.8298184  -1.8476654  -2.9946100   7.1851965   6.9659110
[49]  -2.8430277  -8.0431861

# Again, that's ugly, so we round:
> error <- round(error)
> error
 [1]   5   0   0   2  10   1   4   6  13   2  -1   2   3 -10  -4   2   5  -4   7
[20]   5  -4  -5   3   7   3   3  -1  -1  -6   5  -2  -2   5   6  -8  -1  -3  -6
[39]   6   1  -1   5   1  -3  -2  -3   7   7  -3  -8

# If we add those errors to y, the scatterplot looks like a genuine data set
# with a linear relationship:
> plot(x, y+error)

# I'm happy with that, so I go ahead and add the errors to y:
> y <- y+error
> plot(x,y)
> lm(y~x)

Call:
lm(formula = y ~ x)

Coefficients:
(Intercept)            x
     1.3701       0.5316

> anova(lm(y~x))
Analysis of Variance Table

Response: y
          Df Sum Sq Mean Sq F value    Pr(>F)
x          1 1362.8 1362.75  55.951 1.356e-09 ***
Residuals 48 1169.1   24.36
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

# My data were created in such a way that all of the assumptions
# were met, so the residuals plot and Q-Q plot look pretty good:
> plot(lm(y~x)$fit, lm(y~x)$res)
> qqnorm(lm(y~x)$res); qqline(lm(y~x)$res)
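# (A quick sanity check on fake data like this: if we repeat the recipe many
#  times, the estimated slopes should average out very close to the true
#  value of 0.5. Here is a sketch we didn't actually run; the choice of 1000
#  replications is arbitrary:
#
#      slopes <- replicate(1000, {
#        yy <- 2 + 1/2*x + rnorm(50, 0, 5)
#        coef(lm(yy ~ x))[2]
#      })
#      mean(slopes)   # should come out very close to the true slope of 0.5
#  )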
# Now, we create an example where the assumptions are violated.

# First, let's get a nonlinear relationship:
> y <- 2 + 2 * x^3
> plot(x,y)

# The values of y are huge:
> min(y)
[1] 54002
> max(y)
[1] 778036

# I'm going to want my predictors to be sorted into ascending
# order:
> x <- sort(x)

# So I create my y's again, and this time I divide by 70000 so the
# numbers aren't so huge:
> y <- round((2 + 2 * x^3)/70000)
> plot(x,y)

# Here are my first 10 x's:
> x[1:10]
 [1] 30 32 33 35 36 38 38 38 39 41

# I try generating errors for those first 10 cases that are normal
# with a mean of zero and a standard deviation of 2; I keep the
# error zero for the remaining 40 cases:
> error <- c(round(rnorm(10, 0, 2)), rep(0,40))
> error
 [1] -1  0 -1  2 -3  5 -3 -2  0  3  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
[26]  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0

# That looks like a bit more variation at the left edge of the plot
# than I actually intended to have:
> plot(x, y+error)

# So I try a smaller standard deviation:
> error <- c(round(rnorm(10, 0, 1/2)), rep(0,40))
> plot(x, y+error)

# Looks good, so I go ahead and incorporate those errors into y:
> y <- y+error

# Now I work on the next 10 cases (somewhat larger x's). I've already
# added my error for the first 10 cases, so I'll start by adding
# rep(0, 10), then use a sd of 3/4 for the next 10 cases, and leave
# the remaining 30 with zero error:
> error <- c(rep(0,10), round(rnorm(10, 0, 3/4)), rep(0,30))
> plot(x,y+error)

# Looks good, so we incorporate those errors into y:
> y <- y+error

# We add some even more variable error to the next 10 cases:
> error <- c(rep(0,20), round(rnorm(10, 0, 1.5)), rep(0,20))
> plot(x,y+error)

# Looks good, so we'll keep it:
> y <- y+error

# Now we work on the 31st to 40th cases:
> error <- c(rep(0,30), round(rnorm(10, 0, 2)), rep(0,10))
> plot(x,y+error)

# Hmmm, I think I want a bit more variability there:
> error <- c(rep(0,30), round(rnorm(10, 0, 3)), rep(0,10))
> plot(x,y+error)

# That's a keeper:
> y <- y + error

# And finally we work on the last 10 cases:
> error <- c(rep(0,40), round(rnorm(10, 0, 9)))
> plot(x, y+error)

# Oops, that's more variation than I intended; let's
# try a smaller sd:
> error <- c(rep(0,40), round(rnorm(10, 0, 5)))
> plot(x, y+error)

# Still too much! Let's go a little lower on the sd:
> error <- c(rep(0,40), round(rnorm(10, 0, 4)))
> plot(x, y+error)

# I like that, so I keep it:
> y <- y+error
> plot(x,y)

# We do the regression:
> badreg <- lm(y~x)

# We can clearly see the pattern of increasing variability
# in the residuals plot. (We can also see subtle evidence of
# the underlying nonlinearity.)
> plot(badreg$fit, badreg$res)

# In principle, the pattern of increasing variability should
# also give us non-normal residuals, but it's actually not
# such a bad Q-Q plot:
> qqnorm(badreg$res); qqline(badreg$res)

# Here, we go back to a truly linear relationship:
> y <- 2 + 1/2*x
> plot(x,y)

# This time, we apply non-normally distributed errors:
> error <- round(runif(50, -5, 5))
> plot(x,y+error)
> y <- y+error

# We do the regression:
> badreg <- lm(y~x)

# The residuals plot shows no problems (because the relationship
# is linear and we didn't mess around with different variability
# in different ranges):
> plot(badreg$fit, badreg$res)
> qqnorm(badreg$res); qqline(badreg$res)

# But the Q-Q plot bends away from the line at both ends, showing the
# short (light) tails we would expect from uniform errors. (Actually,
# this isn't such a horrible Q-Q; it's hard to show clear non-normality
# in smallish data sets.)
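# (When a Q-Q plot is this ambiguous, one thing we could add is a formal
#  normality test on the residuals; this is a suggestion we didn't actually
#  run here:
#
#      shapiro.test(badreg$res)
#
#  With only 50 cases, though, such a test will often fail to flag mild
#  non-normality, just like the Q-Q plot.)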