# Here is the object we created last time. It contains a variable
# named "Peabody."

> ls()
[1] "PeabodyFrame"

# If we "attach" the data frame, we can refer to Peabody directly
# rather than extracting it using PeabodyFrame$Peabody (which would,
# for example, lead to a pretty weird histogram title).

> attach(PeabodyFrame)
> Peabody
 [1]  69  72  94  64  80  77  96  86  89  69  92  71  81  90  84  76 100  57  61
[20]  84  81  65  87  92  89  79  91  65  91  81  86  85  95  93  83  76  84  90
[39]  95  67

# Here's the stem-and-leaf plot we did last time:

> stem(Peabody)

  The decimal point is 1 digit(s) to the right of the |

   5 | 7
   6 | 14
   6 | 55799
   7 | 12
   7 | 6679
   8 | 01113444
   8 | 566799
   9 | 00112234
   9 | 556
  10 | 0

# R can do histograms, but there are some problems with the default
# settings. It tends to produce square plots. It chooses poor breakpoints
# and labels the edges of the histobars rather than the midpoints. It also
# leaves the histogram hanging in mid-air.

> hist(Peabody)

# We can force a more horizontal aspect ratio; we want the figure to be
# about 2/3 as tall as it is wide. Here, "pin" stands for "plot inches,"
# so we are telling R that we want a figure that is 6 inches wide and
# 4 inches tall. (We may need to drag the edge of the plot window in order
# to fit everything.)

> par(pin=c(6,4))

# The aspect ratio is improved, but the other problems remain:

> hist(Peabody)

# A look at the help page for the histogram function shows us that there's
# a parameter called "breaks" that determines the breakpoints.

> help(hist)
starting httpd help server ... done

# Here, we explicitly list the breakpoints we want:

> hist(Peabody, breaks=c(54.5,59.5,64.5,69.5,74.5,79.5,84.5,89.5,94.5,99.5,104.5))

# We can actually do that with less typing by telling R we want them
# to be a sequence from 54.5 to 104.5 in steps of 5:

> hist(Peabody, breaks=seq(54.5,104.5,5))

# But the X-axis label is still hanging in mid-air, and the midpoints
# of the intervals are not labeled.
# We can take control of the axes
# by suppressing them altogether...

> hist(Peabody, breaks=seq(54.5,104.5,5),axes=FALSE)

# ...and then adding them where we want them. Here, side=1 refers to the
# bottom horizontal side.

> axis(side=1, at=seq(57,102,5))

# That's better, except that the axis label is still hanging in mid-air.
# If we specify "pos=0" R will draw the axis line at Y=0, which is where
# we actually want it:

> hist(Peabody, breaks=seq(54.5,104.5,5),axes=FALSE)
> axis(side=1, at=seq(57,102,5), pos=0)

# Now we add the left vertical axis:

> axis(side=2,at=seq(0,8,2))

# Finally, we give the whole histogram a line to sit on:

> abline(h=0)

# We can specify a different title from the default:

> hist(Peabody, breaks=seq(54.5,104.5,5),axes=FALSE, main="Histogram of Peabody Picture Vocabulary Scores")

# If we make the title too long, it won't fit:

> hist(Peabody, breaks=seq(54.5,104.5,5),axes=FALSE, main="Histogram of Peabody Picture Vocabulary Scores of 40 10-year-olds in Oakland, California")

# But we can make it fit by adding a line feed (look for the "\n" roughly halfway through the title):

> hist(Peabody, breaks=seq(54.5,104.5,5),axes=FALSE, main="Histogram of Peabody Picture Vocabulary Scores\nof 40 10-year-olds in Oakland, California")
> axis(side=1, at=seq(57,102,5), pos=0)
> axis(side=2,at=seq(0,8,2))
> abline(h=0)

# Now THAT'S a nice histogram!

# We reviewed calculation of the mean and median:

> mean(Peabody)
[1] 81.675
> median(Peabody)
[1] 84

# The 17th case has the value 100.
# Let's suppose that someone accidentally
# entered that as 1000:

> Peabody
 [1]  69  72  94  64  80  77  96  86  89  69  92  71  81  90  84  76 100  57  61
[20]  84  81  65  87  92  89  79  91  65  91  81  86  85  95  93  83  76  84  90
[39]  95  67
> Peabody[17] <- 1000
> Peabody
 [1]   69   72   94   64   80   77   96   86   89   69   92   71   81   90   84
[16]   76 1000   57   61   84   81   65   87   92   89   79   91   65   91   81
[31]   86   85   95   93   83   76   84   90   95   67

# The mean is highly affected by the bad data point:

> mean(Peabody)
[1] 104.175

# But the median is unchanged:

> median(Peabody)
[1] 84

# Now we restore the correct value and confirm that the mean returns
# to what it was before:

> Peabody[17] <- 100
> mean(Peabody)
[1] 81.675

# That illustrates the principle of "resistance" that can help us choose
# a measure of central tendency. The median is said to be resistant to the
# effects of extreme data or skewed distributions.