# We did a simulation to demonstrate what we mean when we say that
# the population variance is biased in small samples, whereas the
# sample variance is unbiased.

# First, we need a place to store results:

> pop <- NULL
> pop
NULL
> samp <- NULL

# Here's a loop. Everything between the curly braces will happen
# 500,000 times. Each time through the loop, "i" will be incremented
# by 1, so that, for example, samp[i] will be the first, then the second,
# then the third element of samp.

> for( i in 1:500000 ) {
+    x <- rnorm(10, 50, 10)
+    samp[i] <- var(x)
+    pop[i] <- 9*var(x) / 10 }

# The "rnorm" call generates a sample of 10 draws from a normal distribution
# with a mean of 50 and a standard deviation of 10. Because the variance is
# sd squared, the true value of the population variance is 100.

# Here are some results; note that in each case, the sample variance estimate
# is larger than the population estimate:

> head(pop)
[1]  45.70579 224.90874  59.47954  48.60493 189.25521 118.24044
> head(samp)
[1]  50.78421 249.89860  66.08837  54.00547 210.28356 131.37827
> length(pop)
[1] 500000

# The sample estimate is unbiased. That is to say, its mean is almost exactly
# the true value:

> mean(samp)
[1] 100.0362

# The population formula systematically underestimates the variance:

> mean(pop)
[1] 90.03258

# Recall that we had observed some evidence that our Peabody sample
# is negatively skewed:

> stem(Peabody)

  The decimal point is 1 digit(s) to the right of the |

   5 | 7
   6 | 14
   6 | 55799
   7 | 12
   7 | 6679
   8 | 01113444
   8 | 566799
   9 | 00112234
   9 | 556
  10 | 0

# (Disclaimer: we really shouldn't be taking visual evidence of skew
# very seriously in a sample this small.)

# The longer negative tail has pulled the mean downward, so that the
# mean is lower than the median:

> median(Peabody)
[1] 84
> mean(Peabody)
[1] 81.675

> 84-81.675
[1] 2.325

# However, if we change the scale of Peabody (here, multiplying it by 10),
# the magnitude of the difference changes:

> median(Peabody*10)
[1] 840
> mean(Peabody*10)
[1] 816.75
> 840 - 816.75
[1] 23.25

# Yet the visual impression of skew is identical regardless of scale:

> stem(Peabody)

  The decimal point is 1 digit(s) to the right of the |

   5 | 7
   6 | 14
   6 | 55799
   7 | 12
   7 | 6679
   8 | 01113444
   8 | 566799
   9 | 00112234
   9 | 556
  10 | 0

> stem(Peabody*10)

  The decimal point is 2 digit(s) to the right of the |

   5 | 7
   6 | 14
   6 | 55799
   7 | 12
   7 | 6679
   8 | 01113444
   8 | 566799
   9 | 00112234
   9 | 556
  10 | 0

# Pearson proposed the following statistic to create a skew index
# that is independent of scale:

> 3 * (mean(Peabody*10) - median(Peabody*10))/sd(Peabody*10)
[1] -0.6387249
 
# That idea could be useful, so let's write a function that calculates
# Pearson's skew index:

> pskew <- function(x)return(3*(mean(x)-median(x))/sd(x))
> pskew
function(x)return(3*(mean(x)-median(x))/sd(x))
> pskew(Peabody)
[1] -0.6387249

# We confirm that the new function is there in our R space:

> ls()
[1] "CTPB"    "i"       "iqr"     "Peabody" "pop"     "pskew"   "samp"   
[8] "x"      

# (And get rid of some old stuff we no longer care about.)

> rm(CTPB,i,pop,samp,x)
> ls()
[1] "iqr"     "Peabody" "pskew"  
 
# Here, we read in the entire Statlab data set:

> Statlab <- read.csv("http://faculty.ucmerced.edu/jvevea/classes/202a/data/statlab (abridged).csv")
> head(Statlab)
  CODE CBSEX CBLGTH CBWGT CTHGHT CTWGT CTPEA CTRA MBAG MBWGT MTHGHT MTWGT FBAG
1 1111     0   20.0   6.6   55.7    85    85   34   17   119   66.0   130   19
2 1112     0   20.0   6.4   48.9    59    74   34   17   130   62.8   159   23
3 1113     0   19.8   6.1   54.9    70    64   25   18   134   66.1   138   21
4 1114     0   19.5   7.0   53.6    88    87   43   18   135   61.8   123   26
5 1115     0   19.5   7.9   53.4    68    87   40   18   130   62.8   146   21
6 1116     0   22.0   9.5   59.9    93    83   37   18   104   63.4   116   17
  FTHGHT FTWGT FIB FIT
1   70.1   171  33 150
2   65.0   130  40 175
3   70.0   175  44 116
4   71.8   196  42 112
5   68.0   163  50 129
6   74.0   180   0 214

# We can use "$" notation to get to individual variables in the data set:

> head(Statlab$FIT)
[1] 150 175 116 112 129 214

# But that can get tedious. If we "attach" the Statlab object (called a
# "data frame" in R), we can access the individual variables by name:

> attach(Statlab)
> FIT
   [1] 150 175 116 112 129 214 142 120 104 194 114 170 125 170 176  96  84 124
  [19] 150  28 216 108  89 179 180 100 207 114 180 220 120 120 140 120 150   0
  [37]  83 133 156 114 122 168 120 150 126 144 180 172  72 108 144 130 140 150
  [55] 200  75 120 190 180  97  45 152 100 220 213 148 180 100 177 140  91 150
                             .
                             .
                             .
[1207] 120 200 200 216 120 220  90 120  45 100 230 120 144 160 132 193 120 174
[1225]  90  65 380 128 200 130  78 266 108  52 160  77 100 118 158 240  72 200
[1243] 150 100 188  35  68 135 145  90 200 184 312 240 230  90 200  96 144 118
[1261] 120  74 130 190 115 100 100 110  64  54 156  99 150 135 120 132 110 150
[1279] 322 292 160 180 190 120 100 146 120 170 127  87 228  66 132 175 105 121

# Even though a stem-and-leaf plot doesn't work well for a data set this large,
# we can see that the FIT (family income) distribution is positively skewed:

> stem(FIT)

  The decimal point is 2 digit(s) to the right of the |

  0 | 002233334444444
  0 | 55555556666666666666666677777777777777777777777777777777777788888888+67
  1 | 00000000000000000000000000000000000000000000000000000000000000000000+422
  1 | 55555555555555555555555555555555555555555555555555555555555555555555+258
  2 | 00000000000000000000000000000000000000000000000000000000000000000000+106
  2 | 55555555555555555555566666667777777777888888889999999
  3 | 00000000000000000111111122222233334
  3 | 555567888
  4 | 0013
  4 | 5
  5 | 0004
  5 | 
  6 | 0
  6 | 
  7 | 
  7 | 
  8 | 0

# Pearson's skew index agrees:

> pskew(FIT)
[1] 0.4921092
 
# Here's a function that calculates the skew index that is based on
# the third moment. (See today's Powerpoint for the forumula, as well as
# for cautions about applying this to small samples.)

> mskew <- function(x) {
+    meanX <- mean(x)
+    result <- sum((x-meanX)^3)
+    sd3 <- sd(x)^3
+    result <- result/sd3
+    N <- length(x)
+    result <- N*result / (N-1) / (N-2)
+    return(result) }

# The moment-based index agrees that there is positive skew. (Note that
# Pearson's index and the third moment index are not in the same metric;
# don't try to compare their values.)

> mskew(FIT)
[1] 1.98027

# If we express income in dollars instead of hundreds of dollars, the
# value of the index is not affected:

> Income <- FIT*100
> mskew(Income)
[1] 1.98027
>  
>