# We demonstrated the "sample()" function by sampling from our
# Peabody variable.

> attach(PeabodyFrame)
> Peabody
 [1]  69  72  94  64  80  77  96  86  89  69  92  71  81  90  84  76 100  57  61
[20]  84  81  65  87  92  89  79  91  65  91  81  86  85  95  93  83  76  84  90
[39]  95  67

# Here, we take 5 draws at random from that variable:

> sample(Peabody, 5)
[1] 90 81 94 89 69

# Here are 29 draws:

> sample(Peabody, 29)
 [1]  79  83  89  84  77  84  85  69  67  65  92  91  72  95  90  71 100  84  95
[20]  80  81  94  65  87  93  89  81  76  86

# If we take 40 draws, we actually get the complete variable, but in a random order.
# That's because, by default, R samples without replacement: once a case has been drawn,
# it can't be drawn again. So by the time we've taken 39 samples, only one remains, and
# we end up getting all 40 values:

> sample(Peabody, 40)
 [1]  85  84  69  92  94  64  57  81  72  81  79 100  81  76  77  89  95  83  80
[20]  91  71  61  89  69  84  92  91  67  96  93  84  90  65  76  65  95  86  90
[39]  87  86

# If we try to sample more than 40 values, R doesn't know what to do because there
# are only 40 cases:

> sample(Peabody, 41)
Error in sample.int(length(x), size, replace, prob) : 
  cannot take a sample larger than the population when 'replace = FALSE'

# If we want to sample WITH replacement, we can, but there will be duplicate draws:

> sample(Peabody, 50, replace=TRUE)
 [1]  67  65  89  92  57  81  87  90  76  65  71  79  89  86  76  95  84  81  81
[20]  89 100  84  93  95  72  90  65 100  83  65  61  95  89  85  84  84  72  95
[39]  92  77  96  84  93  87  83  90  80  90  90  95
 
# Let's do a little housecleaning:

> ls()
[1] "i"            "iqr"          "mskew"        "PeabodyFrame" "pskew"       
[6] "results"      "Statlab"      "statlab2"     "x"           
> 
> rm(i, results, x, statlab2, Statlab)
> ls()
[1] "iqr"          "mskew"        "PeabodyFrame" "pskew"       
 
# Here, I begin my demonstration of how to approach Homework One. The first thing
# you need to do is read in the full Statlab data set. If you have downloaded the
# csv file from the class webpage and saved it on your computer, you can read it 
# into R like this:

> Statlab <- read.csv( file.choose() )

# The file.choose() function lets you browse to find the file.

> head(Statlab)
  CODE CBSEX CBLGTH CBWGT CTHGHT CTWGT CTPEA CTRA MBAG MBWGT MTHGHT MTWGT FBAG
1 1111     0   20.0   6.6   55.7    85    85   34   17   119   66.0   130   19
2 1112     0   20.0   6.4   48.9    59    74   34   17   130   62.8   159   23
3 1113     0   19.8   6.1   54.9    70    64   25   18   134   66.1   138   21
4 1114     0   19.5   7.0   53.6    88    87   43   18   135   61.8   123   26
5 1115     0   19.5   7.9   53.4    68    87   40   18   130   62.8   146   21
6 1116     0   22.0   9.5   59.9    93    83   37   18   104   63.4   116   17
  FTHGHT FTWGT FIB FIT
1   70.1   171  33 150
2   65.0   130  40 175
3   70.0   175  44 116
4   71.8   196  42 112
5   68.0   163  50 129
6   74.0   180   0 214

# Here's another way to get the data:

> Statlab <- read.csv( "http://faculty.ucmerced.edu/jvevea/classes/105/data/statlab (abridged).csv")
 
# We can refer to specific elements of that data frame by using [row, column] notation. For
# example, if you look back at the results of our "head(Statlab)" command, you'll see that
# (ignoring column names) the number in the first row and first column is 1111. We can access
# that directly like this:

> Statlab[1,1]
[1] 1111

# Here are a few other example. Again, you can confirm that these match what we see in the
# "head(Statlab)" results:

> Statlab[1,3]
[1] 20
> Statlab[1,4]
[1] 6.6
> Statlab[6,5]
[1] 59.9

# If we specify just a row with no column, we get the entire row:

> Statlab[1,]
  CODE CBSEX CBLGTH CBWGT CTHGHT CTWGT CTPEA CTRA MBAG MBWGT MTHGHT MTWGT FBAG
1 1111     0     20   6.6   55.7    85    85   34   17   119     66   130   19
  FTHGHT FTWGT FIB FIT
1   70.1   171  33 150
> Statlab[33,]
   CODE CBSEX CBLGTH CBWGT CTHGHT CTWGT CTPEA CTRA MBAG MBWGT MTHGHT MTWGT FBAG
33 1163     0   18.5   4.6   50.1    56    82   39   20    97   64.6   116   21
   FTHGHT FTWGT FIB FIT
33     66   145  50 140

# Similarly, if we specify a column but no row, we get the entire column:

> Statlab[,2]
   [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  [38] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  [75] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 [112] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 [149] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 [186] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 [223] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 [260] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 [297] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 [334] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 [371] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 [408] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 [445] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 [482] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 [519] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 [556] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 [593] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 [630] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 [667] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 [704] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 [741] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 [778] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 [815] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 [852] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 [889] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 [926] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 [963] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[1000] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[1037] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[1074] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[1111] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[1148] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[1185] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[1222] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[1259] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[1296] 1
 
# Now, let's look back at the sample command. Before, we sampled from the Peabody
# variable. But we can also sample from a list of numbers. Here, for example, we
# take several samples of size 3 from the whole numbers 1 to 10:
 
> sample(1:10, 3)
[1]  3 10  5
> sample(1:10, 3)
[1] 2 7 9
> sample(1:10, 3)
[1] 3 2 1
> sample(1:10, 3)
[1] 8 5 1
> sample(1:10, 3)
[1]  9 10  7

# Once again, sampling, by default, is without replacement, so if we try to sample
# more digits than 10, R chokes:

> sample(1:10, 15)
Error in sample.int(length(x), size, replace, prob) : 
  cannot take a sample larger than the population when 'replace = FALSE'
 
# Recall that there are 1296 rows in the Statlab data frame. So if we sample
# from the numbers 1:1296, we can think of those as representing rows. Here,
# I take a sample of 50 numbers from 1 to 1296:

> sample(1:1296, 50) -> temp
> temp
 [1]   35 1081  342  417  935 1176 1071  746  433  817  756  960   86 1220  496
[16] 1063  729   68  755  795  119  267  285   28 1167 1253  109   24  827 1163
[31]  224  174 1051  717   78  977  186  413  163    9  124  690  804  395  980
[46]  188  967  307   69  209

# It will be convenient to have these in the same order in which they appear
# in the data frame, so I sort them:

> myrows <- sort(temp)

# So these are the rows that my personal Statlab sample will contain:

> myrows
 [1]    9   24   28   35   68   69   78   86  109  119  124  163  174  186  188
[16]  209  224  267  285  307  342  395  413  417  433  496  690  717  729  746
[31]  755  756  795  804  817  827  935  960  967  977  980 1051 1063 1071 1081
[46] 1163 1167 1176 1220 1253

# My first sampled row number is 9. Here's the 9th row of the data frame:

> Statlab[9,]
  CODE CBSEX CBLGTH CBWGT CTHGHT CTWGT CTPEA CTRA MBAG MBWGT MTHGHT MTWGT FBAG
9 1123     0   21.5   8.2   56.8    68    72   21   13   128   65.4   141   23
  FTHGHT FTWGT FIB FIT
9     71   150  34 104

# Here, I extract all 50 rows that resulted from my sampling from 1:1296.

> JackStatlab <- Statlab[myrows,]

# You can see that the first row of my "JackStatlab" sample is the same as
# the 9th row of the Statlab data frame, because the first element of
# "myrows" was 9:

> JackStatlab[1,]
  CODE CBSEX CBLGTH CBWGT CTHGHT CTWGT CTPEA CTRA MBAG MBWGT MTHGHT MTWGT FBAG
9 1123     0   21.5   8.2   56.8    68    72   21   13   128   65.4   141   23
  FTHGHT FTWGT FIB FIT
9     71   150  34 104

# Similarly, the second row of my sample is the same as the 24th row of the larger
# data frame, because the second value of "myrows" was 24:

> JackStatlab[2,]
   CODE CBSEX CBLGTH CBWGT CTHGHT CTWGT CTPEA CTRA MBAG MBWGT MTHGHT MTWGT FBAG
24 1146     0   19.5   6.6   53.1    68    69   15   19   129   63.8   150   23
   FTHGHT FTWGT FIB FIT
24   73.5   154  32 179
 
# Using the dim() function, I can see that my new data frame has 50 rows (corresponding
# to the draws represented by "myrows") and 17 columns (corresponding to the 17 distinct
# variables in the Statlab data frame):

> dim(JackStatlab)
[1] 50 17

# And here are the first and second columns:

> JackStatlab[,1]
 [1] 1123 1146 1154 1165 1262 1263 1316 1332 1411 1425 1434 1541 1556 1616 1622
[16] 1655 2122 2233 2263 2341 2436 2565 2635 2643 3111 3254 4216 4263 4323 4352
[31] 4365 4366 4513 4526 4551 4565 5265 5346 5361 5415 5422 5621 5641 5653 6111
[46] 6325 6333 6346 6462 6555
> JackStatlab[,2]
 [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1
[39] 1 1 1 1 1 1 1 1 1 1 1 1
 
# Remember that the codebook posted on the class data page explains what these
# variables are.

# In your homework assignment, you are asked to save and upload a csv file
# of your sample. Here's one way to save it. The file.choose() function lets you
# browse to a location and enter the name of the csv file you want to create.
# (Be sure to use the extension ".csv" in the name you choose.)

> write.csv(JackStatlab, file.choose() )

# You can also specify an explicit path (this will be different on a Mac,
# but I can't demonstrate because I don't have a Mac):

> write.csv(JackStatlab, "c:/users/jvevea/desktop/JackStatlab2.csv")
 
# (At this point I demonstrated how to upload the csv file to Catcourses.)

# The homework assignment asks you to work with the Raven's Progressive Matrices
# variable. It will save a lot of typing if we extract that variable from the
# data frame and give it a name that will look attractive in graphics:
 
> Raven <- JackStatlab$CTRA
> Raven
 [1] 21 15 45 35 27 26 15 23 13 38 21 23 39 35 32 47 45 41 32 24 38 25 44 40 17
[26] 43 21 28 38 28 31 39 21 37 21 22 23 18 28 39 42 26 33 41 34 35 30 33 28 42

# The assignment asks you to do a good histogram of the variable. Very often, R's
# stem-and-leaf plot will make a good grouping decision (which is the first step
# in producing a good histogram):

> stem(Raven)

  The decimal point is 1 digit(s) to the right of the |

  1 | 3
  1 | 5578
  2 | 1111123334
  2 | 56678888
  3 | 0122334
  3 | 5557888999
  4 | 0112234
  4 | 557

# I like that! R has divided things into low teens, high teens, low twenties,
# high twenties, and so on. This results in 8 categories, which nicely matches
# our goal of having somewhere between 7 and 15 categories.

# R's default histogram would not get you a very good grade on the homework:

> hist(Raven)

# Let's make the aspect ratio wider:

> par(pin=c(6,4))
> hist(Raven)

# OK, but R has chosen really weird breakpoints. Let's force the hist() function
# to use the same breakpoints that we identified using the stem() function. Remember
# that the "real limits" of 10-14, 15-19, 20-24, 25-29 (etc.) are 9.5, 14.5, 19.5, 24.5,
# and so on. Here, we make R use those breakpoints:

> hist(Raven, breaks=seq(9.5,49.5,5))

# But now the labels on the X axis are in strange locations. Let's suppress the
# axis...

> hist(Raven, breaks=seq(9.5,49.5,5), axes=FALSE)

# ...and redraw it labeling the midpoints of the intervals:

> axis(side=1, at=seq(12, 47, 5))

# That's better, but the axis label is hanging in mid-air. Let's make it go exactly
# across the bottom of the bars:

> hist(Raven, breaks=seq(9.5,49.5,5), axes=FALSE)
> axis(side=1, at=seq(12, 47, 5), pos=0)

# We add the y-axis labels:

> axis(side=2, at=seq(0,10,2))

# And finally, we give the bars a line to sit on:

> abline(h=0)

# The homework also wants to see a description of the distribution
# using descriptive statistics. See the sample homework that will
# appear later today as a link from the syllabus for an example of
# how to use these in the assignment.

> mean(Raven)
[1] 30.84
> sd(Raven)
[1] 9.132494
> median(Raven)
[1] 31.5

# The mean is below the median, which might suggest negative skew
# (although that really wasn't evident in our histogram). So let's
# try the pskew() function we created:

> pskew
function(x) {
   return( 3 * (mean(x)-median(x))/sd(x)) }
<bytecode: 0x000000001a3c4ee0>

> pskew(Raven)
[1] -0.2168082
 

> median(Raven)
[1] 31.5

# If we are talking about the median for central tendency, we
# should use the IQR to talk about variability. Here's the
# function we created for that:

> iqr
function(x) {
   return( IQR(x, type=2)) }

> iqr(Raven)
[1] 16

# If we look directly at the third and first quartiles...

> quantile(Raven,.75, type=2)
75% 
 39 
> quantile(Raven,.25, type=2)
25% 
 23 

# ...we can see that they are about equidistant from the median,
# which is another indicator that the distribution is pretty symmetric:

> 31.5-23
[1] 8.5
> 39-31.5
[1] 7.5

# A box-and-whisker plot can also give us a good look at symmetry:

> boxplot(Raven)
> boxplot(Raven, main="Boxplot of Raven Distribution")