4 Statistics
4.1 Data Types
The most common types are
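The outputs below can be reproduced with code along these lines; the object names d1 through d5 are assumptions (though d1 and d4 also appear in the data.frame example further down).
d1 <- c(1, 2, 3)                                      # numeric vector
d2 <- c('hello world', 'hi mom')                      # character vector
d3 <- factor(c('A','B','C'), ordered=TRUE)            # ordered factor
d4 <- factor(c('Leipzig','Los Angeles','Logan'))      # unordered factor
d5 <- list(d1, d2, list(d3, list('...inception...'))) # list (can nest)
d1
d2
d3
d4
d5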
## [1] 1 2 3
## [1] "hello world" "hi mom"
## [1] A B C
## Levels: A < B < C
## [1] Leipzig Los Angeles Logan
## Levels: Leipzig Logan Los Angeles
## [[1]]
## [1] 1 2 3
##
## [[2]]
## [1] "hello world" "hi mom"
##
## [[3]]
## [[3]][[1]]
## [1] A B C
## Levels: A < B < C
##
## [[3]][[2]]
## [[3]][[2]][[1]]
## [1] "...inception..."
# data.frames: your most common data type
# like a matrix, but columns can hold different data types
# (technically, a list of equal-length vectors)
d0 <- data.frame(y=d1, x=d4)
d0
## y x
## 1 1 Leipzig
## 2 2 Los Angeles
## 3 3 Logan
4.2 Random Variables
The different types of data can all be generated randomly on your computer. Random variables are vectors that appear to be generated by a probabilistic process.
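The printed draws below can come from R's standard generator functions; the particular calls shown here are assumptions inferred from the output, and your own draws will differ.
rbinom(1, size=1, prob=0.5)  # one Bernoulli (0/1) draw
rbinom(4, size=1, prob=0.5)  # four Bernoulli draws
rnorm(4)                     # four standard-normal draws
runif(4)                     # four uniform draws on [0,1]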
## [1] 1
## [1] 1 0 1 0
## [1] -0.3731743 1.1680266 -1.1113642 -0.6227684
## [1] 0.58304825 0.03237704 0.59186274 0.56172962
4.3 Functions of Data
Two definitions to remember:
- statistic: a function of data
- sampling distribution: how a statistic varies from sample to sample
The mean is a statistic
# compute the mean of a random sample
x <- runif(100)
hist(x, border=NA, main=NA)
m <- mean(x)
abline(v=m, col=2, lwd=2)
title(paste0('mean= ', round(m,2)))
# is m close to its true value of (0+1)/2 = 0.5?
# what about mean(runif(1000)) ?
# what about mean( rbinom(100, 1, 0.5) )?
See how the mean varies from sample to sample to sample
# Three Sample Example
par(mfrow=c(1,3))
sapply(1:3, function(i){
x <- runif(100)
m <- mean(x)
hist(x,
breaks=seq(0,1,by=.1), #for comparability
main=NA, border=NA)
abline(v=m, col=2, lwd=2)
title(paste0('mean= ', round(m,2)))
return(m)
})
## [1] 0.4924445 0.5253882 0.5129768
Examine the sampling distribution of the mean
sample_means <- sapply(1:1000, function(i){
m <- mean(runif(100))
return(m)
})
hist(sample_means, breaks=50, border=NA,
col=2, main='Sampling Distribution of the mean')
Examine the sampling distribution of the standard deviation
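The three values printed below are consistent with the standard deviations of three separate uniform samples, analogous to the three-sample mean example above; a call along these lines (an assumption) would produce them.
sapply(1:3, function(i) sd(runif(100)) )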
## [1] 0.2802642 0.2832961 0.2909738
sample_sds <- sapply(1:1000, function(i){
s <- sd(runif(100))
return(s)
})
hist(sample_sds, breaks=50, border=NA,
col=4, main='Sampling Distribution of the sd')
Examine the sampling distribution of “order statistics”
# Draw a 1000-by-300 matrix of uniform variables; each row below is treated as one sample of size 300
x <- sapply(1:300, function(i) runif(1000) )
# Median also looks normal
xmed <- apply(x,1,quantile, probs=.5)
hist(xmed, breaks=100, border=NA,
main='Sampling Distribution of the median')
# Maximum and Minimum do not!
xmin <- apply(x,1,quantile, probs=0)
xmax <- apply(x,1,quantile, probs=1)
par(mfrow=c(1,2))
hist(xmin, breaks=100, border=NA, main='Min')
hist(xmax, breaks=100, border=NA, main='Max')
title('Sampling Distributions', outer=T, line=-1)
Quantiles and coverage
# Upper and Lower Quantiles
xq <- apply(x,1,quantile, probs=c(.05,.95))
bks <- seq(0,1,by=.01)
hist(xq[1,], border=NA, col=rgb(0,0,1,.5),
main='quantile estimates', xlim=c(0,1), breaks=bks)
hist(xq[2,], border=NA, col=rgb(1,0,0,.5),
add=T, breaks=bks)
# "Pointwise" Coverage
xcov <- sapply(bks, function(b){
bl <- b >= xq[1,]
bu <- b <= xq[2,]
mean( bl&bu )
})
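# Plot the fraction of simulated intervals that cover each point in [0,1]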
plot.new()
plot.window(xlim=c(0,1), ylim=c(0,1))
polygon( c(bks, rev(bks)), c(xcov, xcov*0), col=grey(.5,.5), border=NA)
mtext('Frequency each point was covered',2, line=2.5)
axis(1)
axis(2)
Try any function!
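As a minimal sketch, the same resampling approach applied to an arbitrary statistic, here the interquartile range (the choice of statistic is only an illustration):
sample_iqr <- sapply(1:1000, function(i){
  iqr <- IQR(runif(100)) # interquartile range of a fresh sample
  return(iqr)
})
hist(sample_iqr, breaks=50, border=NA,
  col=3, main='Sampling Distribution of the IQR')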
4.4 Value of More Data
Each additional data point you have provides more information, which ultimately decreases the standard error of your estimates. However, it does so at a decreasing rate (known in economics as diminishing marginal returns).
Nseq <- seq(1,100, by=1) # Sample sizes
B <- 1000 # Number of draws per sample
SE <- sapply(Nseq, function(n){
sample_statistics <- sapply(1:B, function(b){
x <- rnorm(n) # Sample of size n
quantile(x,probs=.4) # Statistic
})
sd(sample_statistics)
})
par(mfrow=c(1,2))
plot(Nseq, SE, pch=16, col=grey(0,.5),
main='Absolute Gain',
ylab='standard error', xlab='sample size')
plot(Nseq[-1], abs(diff(SE)), pch=16, col=grey(0,.5),
main='Marginal Gain',
ylab='decrease in standard error', xlab='sample size')