# Here are some made-up data in which the relationship does not have
# homogeneity of variance:
> x_hetero <- c(
+ 50,51,52,52,53,54,56,56,57,58,59,60,63,64,65,66,66,66,66,69,70,72,74,74,74,
+ 76,76,76,77,81,82,82,83,84,84,84,85,85,85,88,91,92,92,92,93,96,97,98,98,99)
> y_hetero <- c(
+  80, 74, 76, 85, 87, 86, 86, 94, 87, 91, 94, 83, 96,101,108,104, 89,105, 91,
+ 115,104, 89,101, 95, 98,110,112,119,137,113,119,111,110,128,152,127,171,131,
+ 129,145,146,111, 77,135,115,150,135,124,177,117)

# The scatterplot looks linear, but we can see the increase in variability
# for larger values of x:
> plot(x_hetero, y_hetero)

# We estimate the regression:
> lm(y_hetero ~ x_hetero) -> temp
> temp

Call:
lm(formula = y_hetero ~ x_hetero)

Coefficients:
(Intercept)     x_hetero
     17.773        1.244

> abline(temp$coef)

# The residuals plot very clearly shows the increasing variability as
# we move from left to right in the plot:
> plot(temp$fit, temp$res)

# Even though I created the fake data with normally distributed
# errors, by the time the heterogeneity is added, we also end up
# with non-normal residuals:
> stem(temp$res)

  The decimal point is 1 digit(s) to the right of the |

  -4 | 5
  -2 | 41
  -0 | 8865211999976632211100
   0 | 011333344555677791358
   2 | 307
   4 | 7

> qqnorm(temp$res); qqline(temp$res)

# Here, we illustrate what happens to correlation and regression when
# we restrict the range of the variables. First, I create a fake data
# set that has a linear relationship:
> x <- c(1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9)
> y <- c(1,3,2,4,3,5,4,6,5,7,6,8,7,9,8,10,9,11)

# Here's the plot:
> plot(x,y)
> abline(lm(y~x)$coef)

# The regression has an intercept of 1.0 and a slope of 1.0:
> lm(y~x)

Call:
lm(formula = y ~ x)

Coefficients:
(Intercept)            x
          1            1

# The correlation is very high:
> cor(x,y)
[1] 0.9325048

# And the regression is highly significant:
> anova(lm(y~x))
Analysis of Variance Table

Response: y
          Df Sum Sq Mean Sq F value   Pr(>F)
x          1    120 120.000  106.67 1.75e-08 ***
Residuals 16     18   1.125
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

# Now, if we cut out the lowest values...
> plot(x[5:18], y[5:18])

# The regression estimates remain the same:
> lm(y[5:18]~x[5:18])

Call:
lm(formula = y[5:18] ~ x[5:18])

Coefficients:
(Intercept)      x[5:18]
          1            1

# But the significance of the regression goes down:
> anova(lm(y[5:18]~x[5:18]))
Analysis of Variance Table

Response: y[5:18]
          Df Sum Sq Mean Sq F value    Pr(>F)
x[5:18]    1     56  56.000      48 1.587e-05 ***
Residuals 12     14   1.167
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

# And the correlation goes down:
> cor(x,y)
[1] 0.9325048
> cor(x[5:18], y[5:18])
[1] 0.8944272

# If we restrict the range further...
> plot(x[9:18], y[9:18])

# ...the regression estimates are still unbiased:
> lm(y[9:18]~x[9:18])

Call:
lm(formula = y[9:18] ~ x[9:18])

Coefficients:
(Intercept)      x[9:18]
          1            1

# But the significance of the slope is lowered again:
> anova(lm(y[9:18]~x[9:18]))
Analysis of Variance Table

Response: y[9:18]
          Df Sum Sq Mean Sq F value  Pr(>F)
x[9:18]    1     20   20.00      16 0.00395 **
Residuals  8     10    1.25
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

# And the correlation is biased further downwards:
> cor(x[9:18], y[9:18])
[1] 0.8164966

# In summary, when the range of a linear relationship is restricted, the
# correlation is biased downwards. The regression estimates remain unbiased,
# but the significance of the regression is reduced.
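# (To watch the attenuation happen step by step, here is a quick sketch we
#  didn't actually run above; it computes the correlation for tighter and
#  tighter cutoffs, and the particular cutoffs 1, 5, 9, and 13 are just
#  arbitrary choices:
#
#      sapply(c(1, 5, 9, 13), function(k) cor(x[k:18], y[k:18]))
#
#  Each correlation comes out lower than the one before it, as more of the
#  low end of the range is cut away.)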
# Next, we look at ways to generate fake regression data.

# We need some predictor values:
> x <- rnorm(50, 50, 10)
> x
 [1] 62.59335 72.76818 63.27385 38.19079 50.19154 49.70449 31.89649 68.75344
 [9] 56.14301 44.14596 56.40272 51.96820 67.15902 43.67797 41.41311 48.86068
[17] 65.87632 55.27406 35.69603 63.01907 46.03423 60.12913 37.72627 33.19086
[25] 45.56889 48.98451 39.22056 45.55720 46.39967 50.54503 45.61082 52.02128
[33] 51.03519 62.89976 50.26502 30.32364 52.38987 53.76446 48.11898 53.05711
[41] 47.68748 62.46523 47.88082 50.95061 57.64587 51.07593 47.24121 37.82576
[49] 34.63126 54.90486

# That's ugly, so let's round them off:
> x <- round(x)
> x
 [1] 63 73 63 38 50 50 32 69 56 44 56 52 67 44 41 49 66 55 36 63 46 60 38 33 46
[26] 49 39 46 46 51 46 52 51 63 50 30 52 54 48 53 48 62 48 51 58 51 47 38 35 55

# We create a linear relationship:
> y <- 2 + 1/2*x

# Every point falls exactly on the line, because we haven't added any error:
> plot(x,y)

# So the estimated regression intercept and slope are exactly what we created:
> lm(y~x)

Call:
lm(formula = y ~ x)

Coefficients:
(Intercept)            x
        2.0          0.5

# But we can't do inference, because the fit to the regression line is perfect:
> anova(lm(y~x))
Analysis of Variance Table

Response: y
          Df Sum Sq Mean Sq    F value    Pr(>F)
x          1 1205.4  1205.4 1.0616e+31 < 2.2e-16 ***
Residuals 48    0.0     0.0
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Warning message:
In anova.lm(lm(y ~ x)) :
  ANOVA F-tests on an essentially perfect fit are unreliable

# So let's try adding some error:
> error <- rnorm(50, 0, 5)
> error
 [1]   5.3481187  -0.2659994  -0.4962413   1.8207603   9.5616765   1.0267017
 [7]   4.2608351   6.3256839  13.4609861   1.9036581  -1.1707599   1.7704495
[13]   2.9951151 -10.0783957  -4.1162844   2.0221208   5.4677749  -4.1819411
[19]   7.0989025   4.9035483  -4.2600607  -4.7113426   2.7634036   6.8725428
[25]   3.1614830   3.4056975  -0.8617369  -1.4994942  -5.7819456   4.9908298
[31]  -2.1483157  -1.6784286   5.2145310   6.3112618  -8.0834603  -0.7492575
[37]  -3.3657858  -6.4812447   6.0610681   0.6604929  -0.8268999   4.5050535
[43]   0.5807796  -2.8298184  -1.8476654  -2.9946100   7.1851965   6.9659110
[49]  -2.8430277  -8.0431861

# Again, that's ugly, so we round:
> error <- round(error)
> error
 [1]   5   0   0   2  10   1   4   6  13   2  -1   2   3 -10  -4   2   5  -4   7
[20]   5  -4  -5   3   7   3   3  -1  -1  -6   5  -2  -2   5   6  -8  -1  -3  -6
[39]   6   1  -1   5   1  -3  -2  -3   7   7  -3  -8

# If we add those errors to y, the scatterplot looks like a genuine data set
# with a linear relationship:
> plot(x, y+error)

# I'm happy with that, so I go ahead and add the errors to y:
> y <- y+error
> plot(x,y)
> lm(y~x)

Call:
lm(formula = y ~ x)

Coefficients:
(Intercept)            x
     1.3701       0.5316

> anova(lm(y~x))
Analysis of Variance Table

Response: y
          Df Sum Sq Mean Sq F value    Pr(>F)
x          1 1362.8 1362.75  55.951 1.356e-09 ***
Residuals 48 1169.1   24.36
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

# My data were created in such a way that all of the assumptions
# were met, so the residuals plot and Q-Q plot look pretty good:
> plot(lm(y~x)$fit, lm(y~x)$res)
> qqnorm(lm(y~x)$res); qqline(lm(y~x)$res)
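# (A quick sanity check on fake data like this: if we repeat the recipe many
#  times, the estimated slopes should average out very close to the true
#  value of 0.5. Here is a sketch we didn't actually run; the choice of 1000
#  replications is arbitrary:
#
#      slopes <- replicate(1000, {
#        yy <- 2 + 1/2*x + rnorm(50, 0, 5)
#        coef(lm(yy ~ x))[2]
#      })
#      mean(slopes)   # should come out very close to the true slope of 0.5
#  )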
# Now, we create an example where the assumptions are violated.

# First, let's get a nonlinear relationship:
> y <- 2 + 2 * x^3
> plot(x,y)

# The values of y are huge:
> min(y)
[1] 54002
> max(y)
[1] 778036

# I'm going to want my predictors to be sorted into ascending
# order:
> x <- sort(x)

# So I create my y's again, and this time I divide by 70000 so the
# numbers aren't so huge:
> y <- round((2 + 2 * x^3)/70000)
> plot(x,y)

# Here are my first 10 x's:
> x[1:10]
 [1] 30 32 33 35 36 38 38 38 39 41

# I try generating errors for those first 10 cases that are normal
# with a mean of zero and a standard deviation of 2; I keep the
# error zero for the remaining 40 cases:
> error <- c(round(rnorm(10, 0, 2)), rep(0,40))
> error
 [1] -1  0 -1  2 -3  5 -3 -2  0  3  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
[26]  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0

# That looks like a bit more variation at the left edge of the plot
# than I actually intended to have:
> plot(x, y+error)

# So I try a smaller standard deviation:
> error <- c(round(rnorm(10, 0, 1/2)), rep(0,40))
> plot(x, y+error)

# Looks good, so I go ahead and incorporate those errors into y:
> y <- y+error

# Now I work on the next 10 cases (somewhat larger x's). I've already
# added my error for the first 10 cases, so I'll start by adding
# rep(0, 10), then use a sd of 3/4 for the next 10 cases, and leave
# the remaining 30 with zero error:
> error <- c(rep(0,10), round(rnorm(10, 0, 3/4)), rep(0,30))
> plot(x,y+error)

# Looks good, so we incorporate those errors into y:
> y <- y+error

# We add some even more variable error to the next 10 cases:
> error <- c(rep(0,20), round(rnorm(10, 0, 1.5)), rep(0,20))
> plot(x,y+error)

# Looks good, so we'll keep it:
> y <- y+error

# Now we work on the 31st to 40th cases:
> error <- c(rep(0,30), round(rnorm(10, 0, 2)), rep(0,10))
> plot(x,y+error)

# Hmmm, I think I want a bit more variability there:
> error <- c(rep(0,30), round(rnorm(10, 0, 3)), rep(0,10))
> plot(x,y+error)

# That's a keeper:
> y <- y + error

# And finally we work on the last 10 cases:
> error <- c(rep(0,40), round(rnorm(10, 0, 9)))
> plot(x, y+error)

# Oops, that's more variation than I intended; let's
# try a smaller sd:
> error <- c(rep(0,40), round(rnorm(10, 0, 5)))
> plot(x, y+error)

# Still too much! Let's go a little lower on the sd:
> error <- c(rep(0,40), round(rnorm(10, 0, 4)))
> plot(x, y+error)

# I like that, so I keep it:
> y <- y+error
> plot(x,y)

# We do the regression:
> badreg <- lm(y~x)

# We can clearly see the pattern of increasing variability
# in the residuals plot. (We can also see subtle evidence of
# the underlying nonlinearity.)
> plot(badreg$fit, badreg$res)

# In principle, the pattern of increasing variability should
# also give us non-normal residuals, but it's actually not
# such a bad Q-Q plot:
> qqnorm(badreg$res); qqline(badreg$res)

# Here, we go back to a truly linear relationship:
> y <- 2 + 1/2*x
> plot(x,y)

# This time, we apply non-normally distributed errors:
> error <- round(runif(50, -5, 5))
> plot(x,y+error)
> y <- y+error

# We do the regression:
> badreg <- lm(y~x)

# The residuals plot shows no problems (because the relationship
# is linear and we didn't mess around with different variability
# in different ranges):
> plot(badreg$fit, badreg$res)
> qqnorm(badreg$res); qqline(badreg$res)

# But the Q-Q plot bends away from the line at both ends, showing the
# short (light) tails we would expect from uniform errors. (Actually,
# this isn't such a horrible Q-Q; it's hard to show clear non-normality
# in smallish data sets.)
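# (When a Q-Q plot is this ambiguous, one thing we could add is a formal
#  normality test on the residuals; this is a suggestion we didn't actually
#  run here:
#
#      shapiro.test(badreg$res)
#
#  With only 50 cases, though, such a test will often fail to flag mild
#  non-normality, just like the Q-Q plot.)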