# We did a simulation to demonstrate what we mean when we say that # the population variance is biased in small samples, whereas the # sample variance is unbiased. # First, we need a place to store results: > pop <- NULL > pop NULL > samp <- NULL # Here's a loop. Everything between the curly braces will happen # 500,000 times. Each time through the loop, "i" will be incremented # by 1, so that, for example, samp[i] will be the first, then the second, # then the third element of samp. > for( i in 1:500000 ) { + x <- rnorm(10, 50, 10) + samp[i] <- var(x) + pop[i] <- 9*var(x) / 10 } # The "rnorm" call generates a sample of 10 draws from a normal distribution # with a mean of 50 and a standard deviation of 10. Because the variance is # sd squared, the true value of the population variance is 100. # Here are some results; note that in each case, the sample variance estimate # is larger than the population estimate: > head(pop) [1] 45.70579 224.90874 59.47954 48.60493 189.25521 118.24044 > head(samp) [1] 50.78421 249.89860 66.08837 54.00547 210.28356 131.37827 > length(pop) [1] 500000 # The sample estimate is unbiased. That is to say, its mean is almost exactly # the true value: > mean(samp) [1] 100.0362 # The population formula systematically underestimates the variance: > mean(pop) [1] 90.03258 # Recall that we had observed some evidence that our Peabody sample # is negatively skewed: > stem(Peabody) The decimal point is 1 digit(s) to the right of the | 5 | 7 6 | 14 6 | 55799 7 | 12 7 | 6679 8 | 01113444 8 | 566799 9 | 00112234 9 | 556 10 | 0 # (Disclaimer: we really shouldn't be taking visual evidence of skew # very seriously in a sample this small.) # The longer negative tail has pulled the mean downward, so that the # mean is lower than the median: > median(Peabody) [1] 84 > mean(Peabody) [1] 81.675 > 84-81.675 [1] 2.325 # However, if we change the scale of Peabody (here, multiplying it by 10), # the magnitude of the difference changes: > median(Peabody*10) [1] 840 > mean(Peabody*10) [1] 816.75 > 840 - 816.75 [1] 23.25 # Yet the visual impression of skew is identical regardless of scale: > stem(Peabody) The decimal point is 1 digit(s) to the right of the | 5 | 7 6 | 14 6 | 55799 7 | 12 7 | 6679 8 | 01113444 8 | 566799 9 | 00112234 9 | 556 10 | 0 > stem(Peabody*10) The decimal point is 2 digit(s) to the right of the | 5 | 7 6 | 14 6 | 55799 7 | 12 7 | 6679 8 | 01113444 8 | 566799 9 | 00112234 9 | 556 10 | 0 # Pearson proposed the following statistic to create a skew index # that is independent of scale: > 3 * (mean(Peabody*10) - median(Peabody*10))/sd(Peabody*10) [1] -0.6387249 # That idea could be useful, so let's write a function that calculates # Pearson's skew index: > pskew <- function(x)return(3*(mean(x)-median(x))/sd(x)) > pskew function(x)return(3*(mean(x)-median(x))/sd(x)) > pskew(Peabody) [1] -0.6387249 # We confirm that the new function is there in our R space: > ls() [1] "CTPB" "i" "iqr" "Peabody" "pop" "pskew" "samp" [8] "x" # (And get rid of some old stuff we no longer care about.) > rm(CTPB,i,pop,samp,x) > ls() [1] "iqr" "Peabody" "pskew" # Here, we read in the entire Statlab data set: > Statlab <- read.csv("http://faculty.ucmerced.edu/jvevea/classes/202a/data/statlab (abridged).csv") > head(Statlab) CODE CBSEX CBLGTH CBWGT CTHGHT CTWGT CTPEA CTRA MBAG MBWGT MTHGHT MTWGT FBAG 1 1111 0 20.0 6.6 55.7 85 85 34 17 119 66.0 130 19 2 1112 0 20.0 6.4 48.9 59 74 34 17 130 62.8 159 23 3 1113 0 19.8 6.1 54.9 70 64 25 18 134 66.1 138 21 4 1114 0 19.5 7.0 53.6 88 87 43 18 135 61.8 123 26 5 1115 0 19.5 7.9 53.4 68 87 40 18 130 62.8 146 21 6 1116 0 22.0 9.5 59.9 93 83 37 18 104 63.4 116 17 FTHGHT FTWGT FIB FIT 1 70.1 171 33 150 2 65.0 130 40 175 3 70.0 175 44 116 4 71.8 196 42 112 5 68.0 163 50 129 6 74.0 180 0 214 # We can use "$" notation to get to individual variables in the data set: > head(Statlab$FIT) [1] 150 175 116 112 129 214 # But that can get tedious. If we "attach" the Statlab object (called a # "data frame" in R), we can access the individual variables by name: > attach(Statlab) > FIT [1] 150 175 116 112 129 214 142 120 104 194 114 170 125 170 176 96 84 124 [19] 150 28 216 108 89 179 180 100 207 114 180 220 120 120 140 120 150 0 [37] 83 133 156 114 122 168 120 150 126 144 180 172 72 108 144 130 140 150 [55] 200 75 120 190 180 97 45 152 100 220 213 148 180 100 177 140 91 150 . . . [1207] 120 200 200 216 120 220 90 120 45 100 230 120 144 160 132 193 120 174 [1225] 90 65 380 128 200 130 78 266 108 52 160 77 100 118 158 240 72 200 [1243] 150 100 188 35 68 135 145 90 200 184 312 240 230 90 200 96 144 118 [1261] 120 74 130 190 115 100 100 110 64 54 156 99 150 135 120 132 110 150 [1279] 322 292 160 180 190 120 100 146 120 170 127 87 228 66 132 175 105 121 # Even though a stem-and-leaf plot doesn't work well for a data set this large, # we can see that the FIT (family income) distribution is positively skewed: > stem(FIT) The decimal point is 2 digit(s) to the right of the | 0 | 002233334444444 0 | 55555556666666666666666677777777777777777777777777777777777788888888+67 1 | 00000000000000000000000000000000000000000000000000000000000000000000+422 1 | 55555555555555555555555555555555555555555555555555555555555555555555+258 2 | 00000000000000000000000000000000000000000000000000000000000000000000+106 2 | 55555555555555555555566666667777777777888888889999999 3 | 00000000000000000111111122222233334 3 | 555567888 4 | 0013 4 | 5 5 | 0004 5 | 6 | 0 6 | 7 | 7 | 8 | 0 # Pearson's skew index agrees: > pskew(FIT) [1] 0.4921092 # Here's a function that calculates the skew index that is based on # the third moment. (See today's Powerpoint for the forumula, as well as # for cautions about applying this to small samples.) > mskew <- function(x) { + meanX <- mean(x) + result <- sum((x-meanX)^3) + sd3 <- sd(x)^3 + result <- result/sd3 + N <- length(x) + result <- N*result / (N-1) / (N-2) + return(result) } # The moment-based index agrees that there is positive skew. (Note that # Pearson's index and the third moment index are not in the same metric; # don't try to compare their values.) > mskew(FIT) [1] 1.98027 # If we express income in dollars instead of hundreds of dollars, the # value of the index is not affected: > Income <- FIT*100 > mskew(Income) [1] 1.98027 > >