# We began with a demonstration of some of the efficiencies afforded by # matrix manipulations. # Recall our Raven variable: > Raven [1] 15 21 40 14 27 44 39 41 41 27 42 39 41 42 36 29 16 14 18 30 29 13 21 47 37 [26] 50 53 16 38 34 29 45 49 42 52 24 23 21 41 21 # If we needed to calculate the variance by hand, we would most likely use the # computational formula for the sum of squared deviations from the mean: > sum(Raven^2) - (sum(Raven)^2)/40 [1] 5563.975 # Divide that by N-1 to get the variance: > (sum(Raven^2) - (sum(Raven)^2)/40) / 39 [1] 142.6660 # Here, we confirm that our variance calculation was correct: > var(Raven) [1] 142.6660 # Now, in order to do the hand calculation, we needed to know N, sum(x), and sum(x^2). # If we construct an appropriate matrix, we can get all three of those quantities # in a single operation: > > Ravmat <- cbind( rep(1,40), Raven) > Ravmat Raven [1,] 1 15 [2,] 1 21 [3,] 1 40 [4,] 1 14 [5,] 1 27 [6,] 1 44 [7,] 1 39 [8,] 1 41 [9,] 1 41 [10,] 1 27 [11,] 1 42 [12,] 1 39 [13,] 1 41 [14,] 1 42 [15,] 1 36 [16,] 1 29 [17,] 1 16 [18,] 1 14 [19,] 1 18 [20,] 1 30 [21,] 1 29 [22,] 1 13 [23,] 1 21 [24,] 1 47 [25,] 1 37 [26,] 1 50 [27,] 1 53 [28,] 1 16 [29,] 1 38 [30,] 1 34 [31,] 1 29 [32,] 1 45 [33,] 1 49 [34,] 1 42 [35,] 1 52 [36,] 1 24 [37,] 1 23 [38,] 1 21 [39,] 1 41 [40,] 1 21 > t(Ravmat)%*%Ravmat Raven 40 1301 Raven 1301 47879 # We turned our attention to the text's first regression example. 
Here, we read in the data: > homework <- read.table("c:/documents and settings/jvevea/desktop/Chap 1/homework.txt", + col.names=c("homework","score")) > homework homework score 1 2 54 2 0 53 3 4 53 4 0 56 5 2 59 6 0 30 7 1 49 8 0 54 9 3 37 10 0 49 11 4 55 12 7 50 13 3 45 14 1 44 15 1 60 16 0 36 17 3 53 18 0 22 19 1 56 20 3 54 21 3 75 22 0 37 23 2 42 24 0 64 25 1 61 26 4 57 27 1 53 28 4 63 29 3 75 30 1 53 31 1 54 32 3 64 33 3 54 34 5 59 35 2 48 36 5 65 37 1 57 38 3 48 39 2 54 40 2 62 41 0 53 42 2 61 43 5 56 44 0 32 45 2 50 46 2 44 47 1 62 48 2 39 49 1 61 50 4 51 51 0 36 52 0 38 53 4 38 54 10 59 55 1 46 56 1 55 57 2 48 58 2 54 59 1 41 60 2 61 61 2 52 62 1 63 63 2 25 64 2 64 65 3 72 66 2 65 67 0 41 68 0 29 69 3 39 70 2 70 71 5 68 72 5 38 73 1 59 74 0 29 75 0 49 76 2 39 77 2 45 78 3 74 79 0 41 80 2 53 81 4 71 82 0 41 83 3 63 84 1 34 85 2 61 86 3 51 87 4 62 88 4 55 89 4 64 90 2 47 91 1 48 92 6 47 93 3 44 94 5 52 95 6 50 96 4 48 97 3 58 98 2 39 99 2 41 100 1 51 # We review how to extract individual components of that table: > homework$score [1] 54 53 53 56 59 30 49 54 37 49 55 50 45 44 60 36 53 22 56 54 75 37 42 64 61 [26] 57 53 63 75 53 54 64 54 59 48 65 57 48 54 62 53 61 56 32 50 44 62 39 61 51 [51] 36 38 38 59 46 55 48 54 41 61 52 63 25 64 72 65 41 29 39 70 68 38 59 29 49 [76] 39 45 74 41 53 71 41 63 34 61 51 62 55 64 47 48 47 44 52 50 48 58 39 41 51 # It will save time and effort if we extract the objects of interest into # new variables: > Homework <- homework$homework > Score <- homework$score > Homework [1] 2 0 4 0 2 0 1 0 3 0 4 7 3 1 1 0 3 0 1 3 3 0 2 0 1 [26] 4 1 4 3 1 1 3 3 5 2 5 1 3 2 2 0 2 5 0 2 2 1 2 1 4 [51] 0 0 4 10 1 1 2 2 1 2 2 1 2 2 3 2 0 0 3 2 5 5 1 0 0 [76] 2 2 3 0 2 4 0 3 1 2 3 4 4 4 2 1 6 3 5 6 4 3 2 2 1 > Score [1] 54 53 53 56 59 30 49 54 37 49 55 50 45 44 60 36 53 22 56 54 75 37 42 64 61 [26] 57 53 63 75 53 54 64 54 59 48 65 57 48 54 62 53 61 56 32 50 44 62 39 61 51 [51] 36 38 38 59 46 55 48 54 41 61 52 63 25 64 72 65 41 29 39 70 68 38 59 29 49 
[76] 39 45 74 41 53 71 41 63 34 61 51 62 55 64 47 48 47 44 52 50 48 58 39 41 51 # We review the production of a scatterplot: > plot(Homework,Score) # Here, we expand the white space in the plot a little. First, we lay out # an empty plot with the desired dimensions: > 10*.05 [1] 0.5 > plot( c(-0.5,10.5), c(20-2.75, 75+2.75) ) # Then we add the points we want: > points(Homework,Score) # Problem: when we laid out the plot, two points that aren't in the actual # data set were plotted. Solution: "type='n'" suppresses the actual plotting # of the points: > plot( c(-0.5,10.5), c(20-2.75, 75+2.75), xlab="Homework Hours", ) > plot( c(-0.5,10.5), c(20-2.75, 75+2.75), xlab="Homework Hours", + ylab="Exam Score", type='n') # Note that we also improved our axis labels in that example. Let's also # add a main title: > plot( c(-0.5,10.5), c(20-2.75, 75+2.75), xlab="Homework Hours", + ylab="Exam Score", type='n', main="Exam Performance Given Homework Hours") # This might look better with the title printed on two lines: > plot( c(-0.5,10.5), c(20-2.75, 75+2.75), xlab="Homework Hours", + ylab="Exam Score", type='n', main="Exam Performance\nGiven Homework Hours") > points(Homework,Score) # Here, we regress Score on Homework hours... > lm(Score~Homework) Call: lm(formula = Score ~ Homework) Coefficients: (Intercept) Homework 47.03 1.99 # ...so that we can easily add the regression line to the plot: > abline(lm(Score~Homework)$coef) # Next, we considered the simulation of regression data. The key steps are (1) generate # data that exactly fit the desired regression (in this example, one with a slope of # -3); then (2) add random noise with a mean of zero, the more variable the noise, # the lower the correlation. 
# We generate a predictor: > x <- rnorm(50, 100, 15) > x [1] 111.38336 96.61980 84.30112 96.91442 77.63840 98.40860 99.00900 [8] 93.66727 114.97316 85.71269 95.18870 127.52641 90.72060 92.02172 [15] 112.84834 79.75204 76.04453 97.31886 98.31864 135.12414 105.84658 [22] 71.01548 97.74633 102.39825 80.90685 60.79648 76.43511 88.79224 [29] 107.29763 92.30516 109.73811 106.91989 115.72511 101.38114 123.26260 [36] 106.22662 84.51966 82.38424 111.20876 85.21334 115.78542 108.83951 [43] 88.97307 78.15761 130.40367 92.29673 99.31268 84.85949 95.91643 [50] 97.69406 # Let's make our predictor look cleaner: > x <- round(x) > x [1] 111 97 84 97 78 98 99 94 115 86 95 128 91 92 113 80 76 97 98 [20] 135 106 71 98 102 81 61 76 89 107 92 110 107 116 101 123 106 85 82 [39] 111 85 116 109 89 78 130 92 99 85 96 98 # Then we calculate Y to fit the desired regression: > y <- 42 - 3*x > y [1] -291 -249 -210 -249 -192 -252 -255 -240 -303 -216 -243 -342 -231 -234 -297 [16] -198 -186 -249 -252 -363 -276 -171 -252 -264 -201 -141 -186 -225 -279 -234 [31] -288 -279 -306 -261 -327 -276 -213 -204 -291 -213 -306 -285 -225 -192 -348 [46] -234 -255 -213 -246 -252 > plot(x,y) > abline(c(42,-3)) # I decide to make all the Y's positive. Note that this will affect only # the intercept; the slope is still -3. > mean(y) [1] -249.9 > y <- y +500 > y [1] 209 251 290 251 308 248 245 260 197 284 257 158 269 266 203 302 314 251 248 [20] 137 224 329 248 236 299 359 314 275 221 266 212 221 194 239 173 224 287 296 [39] 209 287 194 215 275 308 152 266 245 287 254 248 > plot(x,y) > lm(y~x) Call: lm(formula = y ~ x) Coefficients: (Intercept) x 542 -3 # Note that the regression is still a perfect fit. If we attempt to look # at inferential results, we get t's approaching infinity. > summary(lm(y~x)) Call: lm(formula = y ~ x) Residuals: Min 1Q Median 3Q Max -1.127e-13 8.314e-16 3.264e-15 4.298e-15 2.123e-14 Coefficients: Estimate Std. 
Error t value Pr(>|t|) (Intercept) 5.420e+02 1.719e-14 3.154e+16 <2e-16 *** x -3.000e+00 1.745e-16 -1.720e+16 <2e-16 *** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 1.904e-14 on 48 degrees of freedom Multiple R-squared: 1, Adjusted R-squared: 1 F-statistic: 2.957e+32 on 1 and 48 DF, p-value: < 2.2e-16 # Let's add some noise: > noise <- rnorm(50, 0, 125) > plot(x,y+noise) > cor(x,y+noise) [1] -0.4037959 # We wanted a correlation closer to .5, so we tried a bit less # variable noise. Results were confusing, because our first example # actually was a bit of a fluke (the correlation should have been # weaker than -.4). > noise <- rnorm(50, 0, 115) > plot(x,y+noise) > cor(x,y+noise) [1] -0.3177537 > noise <- rnorm(50, 0, 115) > plot(x,y+noise) > cor(x,y+noise) [1] -0.324854 > noise <- rnorm(50, 0, 100) > plot(x,y+noise) > cor(x,y+noise) [1] -0.2114077 > noise <- rnorm(50, 0, 1) > cor(x,y+noise) [1] -0.9998329 > noise <- rnorm(50, 0, 100) > cor(x,y+noise) [1] -0.4231648 > noise <- rnorm(50, 0, 90) > cor(x,y+noise) [1] -0.5658167 > noise <- rnorm(50, 0, 100) > cor(x,y+noise) [1] -0.191936 > noise <- rnorm(50, 0, 100) > cor(x,y+noise) [1] -0.3548271 # Eventually, through trial and error, we got the correlation # near our target value: > noise <- rnorm(50, 0, 100) > cor(x,y+noise) [1] -0.5190591 # We added the noise to Y, and then rounded the result: > y <- y + noise > y [1] 66.11544 89.22383 398.82113 213.54687 217.93069 284.12229 408.17350 [8] 189.55112 35.41043 114.40286 135.25061 67.07718 232.00830 329.70065 [15] 211.22154 293.95278 501.74476 376.30122 253.36845 80.98438 140.88300 [22] 292.02359 152.83190 214.41028 267.82185 512.79812 293.08933 218.63401 [29] 144.99284 219.28710 257.07171 178.98266 224.54737 325.72655 238.98793 [36] 243.91844 187.95744 115.98001 294.39173 278.64972 154.02858 161.37559 [43] 293.18968 131.94481 141.42928 378.18182 109.71603 333.57933 307.64551 [50] 155.53494 > y <- round(y) > plot(x,y) > 
lm(y~x) Call: lm(formula = y ~ x) Coefficients: (Intercept) x 578.206 -3.585 > cor(x,y) [1] -0.5194568 >