# Last time, we were able to get boys' and girls' subsets
# easily because the data set was already ordered girls first,
# then boys. Here's another way to extract subsets that
# doesn't depend on order:

> attach(JackStatlab)
> BoysRA <- subset(CTRA, CBSEX==1)
> BoysRA
 [1] 21 21 47 29 33 45 35 43 15 42 11 26 37 13 33 30 24 34
> GirlsRA <- subset(CTRA, CBSEX==0)
> GirlsRA
 [1] 33 25 42 20 21 15 34 16 35 45 33 47 50 31 26 28 35 44 34 48 40 31 50 34 48
[26] 18 29 20 28 24 42 40

# Here, we see that the data we got with the subser command
# was identical to what we extracted last time. (The cbind()
# function "binds" columns together.)

> cbind(BoysRA, BoysRaven)
      BoysRA BoysRaven
 [1,]     21        21
 [2,]     21        21
 [3,]     47        47
 [4,]     29        29
 [5,]     33        33
 [6,]     45        45
 [7,]     35        35
 [8,]     43        43
 [9,]     15        15
[10,]     42        42
[11,]     11        11
[12,]     26        26
[13,]     37        37
[14,]     13        13
[15,]     33        33
[16,]     30        30
[17,]     24        24
[18,]     34        34
 
# We are interested in the conditional distribution of Raven given Peabody.
# We begin to think about this by using the infamous median split, in which
# we define low Peabody scores as scores that fall at or below the median
# and high Peabody scores as scores above the median. (For reasons discussed
# in class, this is a VERY BAD idea. We're just doing it here to help us think
# about conditioning on a continuous variable.)

> CTPEA
 [1]  81  94  84  74  71  59  86  63  81 100  63  78  87  75  62  64  85  76  83
[20]  82  77  76 102  75  77  68  75  86  78  88 100  70  85  74 123  81  69  94
[39]  78  99  77  93  65  65  94  69  78  82  66  89
> median(CTPEA)
[1] 78

# We create subsets of Raven scores that depend on whether the corresponding
# Peabody score is above the median:

> LowRaven <- subset(CTRA, CTPEA <= median(CTPEA))
> HighRaven <- subset(CTRA, CTPEA > median(CTPEA))
> stem(LowRaven)

  The decimal point is 1 digit(s) to the right of the |

  1 | 13
  1 | 5568
  2 | 0114
  2 | 66889
  3 | 113334
  3 | 5
  4 | 004
  4 | 78

> stem(HighRaven)

  The decimal point is 1 digit(s) to the right of the |

  2 | 01459
  3 | 03444557
  4 | 22235578
  5 | 00

# It will be easier to compare the distributions if we force
# the same grouping:

> stem(LowRaven)

  The decimal point is 1 digit(s) to the right of the |

  1 | 13
  1 | 5568
  2 | 0114
  2 | 66889
  3 | 113334
  3 | 5
  4 | 004
  4 | 78

> stem(HighRaven, 2)

  The decimal point is 1 digit(s) to the right of the |

  2 | 014
  2 | 59
  3 | 03444
  3 | 557
  4 | 2223
  4 | 5578
  5 | 00

> mean(LowRaven)
[1] 28.14815
> mean(HighRaven)
[1] 36.73913
> median(LowRaven)
[1] 28
> median(HighRaven)
[1] 35
> mean(subset(CTRA, CTPEA <= median(CTPEA))
+ )
[1] 28.14815

# Here's another way to orchestrate a split. First, we create
# a logical variable that indicates whether the case is or is
# not above the median:

> Split <- CTPEA <= median(CTPEA)
> Split
 [1] FALSE FALSE FALSE  TRUE  TRUE  TRUE FALSE  TRUE FALSE FALSE  TRUE  TRUE
[13] FALSE  TRUE  TRUE  TRUE FALSE  TRUE FALSE FALSE  TRUE  TRUE FALSE  TRUE
[25]  TRUE  TRUE  TRUE FALSE  TRUE FALSE FALSE  TRUE FALSE  TRUE FALSE FALSE
[37]  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE  TRUE FALSE  TRUE  TRUE FALSE
[49]  TRUE FALSE

# Once we have that variable, we can use the tapply() function to get descriptive
# statistics for the subgroups. The syntax of tapply() is tapply(variable of interest,
# grouping variable, desired function):

> tapply(CTRA, Split, mean)
   FALSE     TRUE 
36.73913 28.14815 
> tapply(CTRA, Split, sd)
    FALSE      TRUE 
 9.091541 10.313187 
> tapply(CTRA, Split, median)
FALSE  TRUE 
   35    28 

# We can use this even with functions that we have created ourselves...

> tapply(CTRA, Split, iqr)
FALSE  TRUE 
   15    14 

# ...and with graphics functions:

> tapply(CTRA, Split, stem)

  The decimal point is 1 digit(s) to the right of the |

  2 | 01459
  3 | 03444557
  4 | 22235578
  5 | 00


  The decimal point is 1 digit(s) to the right of the |

  1 | 13
  1 | 5568
  2 | 0114
  2 | 66889
  3 | 113334
  3 | 5
  4 | 004
  4 | 78

$`FALSE`
NULL

$`TRUE`
NULL

# We can do a conditional boxplot using the "Split" variable that
# we created:
 
> boxplot(CTRA~Split)

# However, the result put the high-Peabody group at the left and the
# low-Peabody group at the right. We can correct that. Even though the
# "Split" variable is labeled FALSE or TRUE, those logical values are
# actually represented behind the scenes with 0's and 1's. So if we do
# this...

> Split <- 1-Split

# ...what was a 0 is now a 1, and what was a 1 is now a zero. (The values
# appear now as 0's and 1's because applying an arithmetic operation to the
# logical variable implied to R that we are thinking of it as a numerical
# variable.)

> Split
 [1] 1 1 1 0 0 0 1 0 1 1 0 0 1 0 0 0 1 0 1 1 0 0 1 0 0 0 0 1 0 1 1 0 1 0 1 1 0 1
[39] 0 1 0 1 0 0 1 0 0 1 0 1

# And now our conditional boxplot places the low and high Peabody samples
# in the more logical order:

> boxplot(CTRA~Split)

# After discussing all of the problems with using the median split, we decided
# to consider a quartile split. The following command creates a variable called
# Quartile that identifies which quartile of Peabody the child is in. To see how
# this works, do a "desk check" of, say, a case in the second quartile. It's not
# above the the 75th percentile, so the first line results in 0. It's not above
# the median, so the second line adds 0. It IS above the 25th percentile, so the
# third line adds 1, and then the "+1" adds another 1, resulting in 2 (i.e., the
# case is in the second quartile).

> Quartile <- as.numeric(CTPEA > quantile(CTPEA, .75, type=2)) +
+    as.numeric(CTPEA > median(CTPEA)) +
+    as.numeric(CTPEA > quantile(CTPEA, .25, type=2)) + 1
> Quartile
 [1] 3 4 3 2 1 1 3 1 3 4 1 2 4 2 1 1 3 2 3 3 2 2 4 2 2 1 2 3 2 4 4 1 3 2 4 3 1 4
[39] 2 4 2 4 1 1 4 1 2 3 1 4

# As expected, we get roughly one quarter of the cases in each quartile:

> table(Quartile)
Quartile
 1  2  3  4 
13 14 11 12 

# Here, we can see a similar trend in which the conditional mean of Raven
# increases as Peabody moves from lower to higher, and the variability stays
# pretty stable. 

> boxplot(CTRA~Quartile)

# We could imagine doing a Raven boxplot for each unique value of Peabody.
# We can see the same trend for central tendency of Raven conditioned on Peabody,
# but in most cases, there aren't enough Raven scores to do a proper boxplot:

> boxplot(CTRA~CTPEA)

# So why not just do a scatterplot, but keep on thinking the same way? What is
# happening to the conditional mean of Raven as we move continously from lower
# to higher Peabody scores? (It's going up.) What is happening to the variability
# of Raven as we move from low to high Peabody? (It's staying pretty stable.)

> plot(CTPEA, CTRA)

# A student asked to have a trend line added to the plot. Don't worry about
# this syntax; we'll deal with it a bit later in the semester.

> abline(lm(CTRA~CTPEA)$coef)

# Just as was the case with the hist() function, R doesn't do such a great job
# of scatterplots without some additional tweaks. First, let's get a more
# horizontal aspect ratio:

> par(pin=c(6,4))
> plot(CTPEA,CTRA)

# Let's get rid of those messy variable names:

> plot(CTPEA,CTRA, main="Conditional Distribution of Raven Given Peabody",
+    xlab="Peabody", ylab="Raven")

# The appearance of a scatterplot can differ greatly depending on how much white
# space is included around the cloud of points. I happen to think that R's
# scatterplots are more effective if we expand the white space by about 5% beyond
# R's default values. So let's figure out what 5% of the range of each variable
# is:

> max(Peabody)
[1] 100
> max(CTPEA)
[1] 123
> min(CTPEA)
[1] 59
> 123-59
[1] 64

# I create an "X increment" that is 5% of the range of the X-axis variable:

> xinc <- .05*64

# And I do the same for the Y axis:

> yinc <- .05*(max(CTRA)-min(CTRA))

> xinc
[1] 3.2
> yinc
[1] 1.95

# Here, we increase the x limit and y limit of the scatterplot, using
# those 5% increments:

> plot(CTPEA,CTRA, main="Conditional Distribution of Raven Given Peabody",
+    xlab="Peabody", ylab="Raven", xlim=c(min(CTPEA)-xinc, max(CTPEA)+xinc) ,
+    ylim=c(min(CTRA)-yinc,max(CTRA)+yinc))

# Sometimes, there will be more than one data point that corresponds to a point
# on the plot. For my data set, that's not the case. Notice that if I create a
# table of Raven by Peabody, every count is either 0 or 1:

> table(CTRA,CTPEA)
    CTPEA
CTRA 59 62 63 64 65 66 68 69 70 71 74 75 76 77 78 81 82 83 84 85 86 87 88 89 93
  11  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  13  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  15  1  0  0  0  0  0  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0
  16  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  18  0  0  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  20  0  0  0  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0  0  0  1  0  0  0  0
  21  0  0  0  0  0  0  0  0  0  1  1  0  0  0  0  0  0  0  0  1  0  0  0  0  0
  24  0  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1  0  0
  25  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  26  0  1  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  28  0  0  0  1  0  0  0  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0
  29  0  0  0  0  0  0  0  0  0  0  0  1  0  0  0  1  0  0  0  0  0  0  0  0  0
  30  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0  0
  31  0  0  0  0  0  0  0  0  0  0  0  1  1  0  0  0  0  0  0  0  0  0  0  0  0
  33  0  0  1  0  0  0  0  1  0  0  0  0  0  0  1  1  0  0  0  0  0  0  0  0  0
  34  0  0  0  0  0  0  0  0  0  0  0  1  0  0  0  0  0  1  0  0  1  0  0  1  0
  35  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1  1  0  0  0  1  0  0  0  0  0
  37  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  40  0  0  0  0  0  0  0  0  1  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0
  42  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1  0  0  0  0  0  1
  43  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  44  0  0  0  0  0  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0
  45  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  47  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0
  48  0  0  0  0  0  0  0  0  0  0  0  0  0  1  0  0  1  0  0  0  0  0  0  0  0
  50  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1  0  0  0
    CTPEA
CTRA 94 99 100 102 123
  11  0  0   0   0   0
  13  0  0   0   0   0
  15  0  0   0   0   0
  16  0  0   0   0   0
  18  0  0   0   0   0
  20  0  0   0   0   0
  21  0  0   0   0   0
  24  0  0   0   0   0
  25  1  0   0   0   0
  26  0  0   0   0   0
  28  0  0   0   0   0
  29  0  0   0   0   0
  30  0  0   0   0   0
  31  0  0   0   0   0
  33  0  0   0   0   0
  34  0  0   0   0   0
  35  0  0   0   0   0
  37  1  0   0   0   0
  40  0  0   0   0   0
  42  0  0   1   0   0
  43  0  1   0   0   0
  44  0  0   0   0   0
  45  1  0   1   0   0
  47  0  0   0   0   1
  48  0  0   0   0   0
  50  0  0   0   1   0
 
# So here, to help illustrate a point, I add three identical cases to each
# variable:
 
> BadY <- c(CTRA, 30,30,30)
> BadX <- c(CTPEA, 85,85,85)

# If we do a scatterplot, we can't see that there are three points at
# Raven=30, Peabody=85:

> plot(BadX,BadY)

# Here, I create a little random noise for each axis. The first variable,
# ditherX, is sampled from a distribution in which any value between 0 and .2
# is equally likely. For the second variable, I used a smaller range, because
# the range of the Y-axis in the plot is smaller:

> ditherX <- runif(53, 0, .2)
> ditherY <- runif(53, 0, .1)
> ditherX
 [1] 0.087505983 0.155213970 0.068911170 0.014341557 0.109004363 0.034809976
 [7] 0.198280350 0.001970042 0.129341416 0.082249190 0.007078972 0.169864852
[13] 0.078051344 0.040840729 0.166545040 0.072559730 0.070068321 0.020116517
[19] 0.041777283 0.011802019 0.013846586 0.036482137 0.120888217 0.021230882
[25] 0.151896093 0.181786545 0.150030885 0.036551312 0.178787642 0.038823494
[31] 0.199155044 0.152443872 0.030771229 0.004071232 0.007310041 0.171050587
[37] 0.126734024 0.197034745 0.111080671 0.034843857 0.118768002 0.117787012
[43] 0.113331037 0.030746198 0.193877134 0.107055871 0.149829070 0.161127284
[49] 0.090026812 0.118356441 0.078827333 0.199853330 0.006419483

# If we add that bit of random noise to each point, we'll be able to see that
# there are more than one cases with Raven=30 and Peabody=85:

> plot(BadX+ditherX, BadY+ditherY)

# We may need to tweak the amount of noise a bit to get a satisfactory appearance:

> ditherY <- runif(53, 0, .15)
> plot(BadX+ditherX, BadY+ditherY)
 
# Note: if you DO use a dithered scatterplot, you should mention that it is
# dithered in the figure caption.