4 Statistics

4.1 Data Types

The most common types are

d1 <- 1:3 # cardinal data
d1
## [1] 1 2 3
d2 <- c('hello world', 'hi mom')  # character strings
d2
## [1] "hello world" "hi mom"
d3 <- factor(c('A','B','C'), ordered=T) # ordinal data
d3
## [1] A B C
## Levels: A < B < C
d4 <- factor(c('Leipzig','Los Angeles','Logan'), ordered=F) # categorical data
d4
## [1] Leipzig     Los Angeles Logan      
## Levels: Leipzig Logan Los Angeles
d5 <- list(d1, d2, list(d3, list('...inception...'))) # lists
d5
## [[1]]
## [1] 1 2 3
## 
## [[2]]
## [1] "hello world" "hi mom"     
## 
## [[3]]
## [[3]][[1]]
## [1] A B C
## Levels: A < B < C
## 
## [[3]][[2]]
## [[3]][[2]][[1]]
## [1] "...inception..."
# data.frames: your most common data type
    # like a matrix, but each column can hold a different data type
    # technically a list of equal-length vectors
d0 <- data.frame(y=d1, x=d4)
d0
##   y           x
## 1 1     Leipzig
## 2 2 Los Angeles
## 3 3       Logan
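
A few common ways to inspect a data.frame, shown as a brief sketch using base R functions:

str(d0)     # structure: one line per column
d0$y        # extract a column as a vector
d0[1, ]     # extract the first row
summary(d0) # summarize each column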

4.2 Random Variables

The different types of data can be generated randomly on your computer. In R, a random variable is represented as a vector of draws that appear to come from a probabilistic process.

# Random bernoulli (Coin Flip: Heads=1)
rbinom(1, 1, 0.5) # 1 Flip
## [1] 1
rbinom(4, 1, 0.5) # 4 Flips in a row
## [1] 1 0 1 0
x0 <- rbinom(1000, 1, 0.5)
hist(x0)

# random standard-normal
rnorm(4) 
## [1] -0.3731743  1.1680266 -1.1113642 -0.6227684
x1 <- rnorm(1000)
hist(x1)

# random uniform
runif(4)
## [1] 0.58304825 0.03237704 0.59186274 0.56172962
x2 <- runif(1000)
hist(x2)
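
Discrete or categorical data can also be generated randomly, e.g., with sample(). A minimal sketch tying back to the data types in 4.1 (the labels here are illustrative):

# random categorical data (sampling labels with replacement)
x3 <- sample(c('A','B','C'), 1000, replace=T)
table(x3)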

4.3 Functions of Data

Two definitions to remember:

  • statistic: a function of data
  • sampling distribution: how a statistic varies from sample to sample

The mean is a statistic

# compute the mean of a random sample
x <- runif(100)
hist(x, border=NA, main=NA)
m <- mean(x)
abline(v=m, col=2, lwd=2)
title(paste0('mean= ', round(m,2)))

# is m close to its true value (0+1)/2=.5?
# what about mean(runif(1000)) ?
# what about mean( rbinom(100, 1, 0.5) )?
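
These checks can be run directly. A small sketch (exact values differ from run to run):

mean( runif(1000) )          # larger sample, typically closer to .5
mean( rbinom(100, 1, 0.5) )  # coin flips are also centered at .5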

See how the mean varies from sample to sample to sample

# Three Sample Example
par(mfrow=c(1,3))
sapply(1:3, function(i){
    x <- runif(100) 
    m <-  mean(x)
    hist(x,
        breaks=seq(0,1,by=.1), #for comparability
        main=NA, border=NA)
    abline(v=m, col=2, lwd=2)
    title(paste0('mean= ', round(m,2)))
    return(m)
})

## [1] 0.4924445 0.5253882 0.5129768

Examine the sampling distribution of the mean

sample_means <- sapply(1:1000, function(i){
    m <- mean(runif(100))
    return(m)
})
hist(sample_means, breaks=50, border=NA,
    col=2, main='Sampling Distribution of the mean')
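
The true mean of a Uniform(0,1) variable is 0.5, so the sample means should center on that value. Marking it on the histogram, in the same style as before:

abline(v=0.5, lwd=2) # true mean of U(0,1)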

Examine the sampling distribution of the standard deviation

three_sds <- c(  sd(runif(100)),  sd(runif(100)),  sd(runif(100))  )
three_sds
## [1] 0.2802642 0.2832961 0.2909738
sample_sds <- sapply(1:1000, function(i){
    s <- sd(runif(100))
    return(s)
})
hist(sample_sds, breaks=50, border=NA,
    col=4, main='Sampling Distribution of the sd')
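
For reference, the true standard deviation of a Uniform(0,1) variable is 1/sqrt(12) ≈ 0.289, which the estimates above cluster around. Marking it on the histogram:

abline(v=1/sqrt(12), lwd=2) # true sd of U(0,1)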

Examine the sampling distribution of “order statistics”

# Create a 1000 x 300 matrix of random uniform variables
# (each row is treated below as a sample of 300 draws)
x <- sapply(1:300, function(i) runif(1000) )

# Median also looks normal
xmed <- apply(x,1,quantile, probs=.5)
hist(xmed, breaks=100, border=NA,
    main='Sampling Distribution of the median')

# Maximum and Minimum do not!
xmin <- apply(x,1,quantile, probs=0)
xmax <- apply(x,1,quantile, probs=1)
par(mfrow=c(1,2))
hist(xmin, breaks=100, border=NA, main='Min')
hist(xmax, breaks=100, border=NA, main='Max')
title('Sampling Distributions', outer=T, line=-1)
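
The extremes behave differently because, for n independent Uniform(0,1) draws, the minimum follows a Beta(1, n) distribution and the maximum a Beta(n, 1), both piled up near the boundary. A sketch overlaying those densities, assuming each row is a sample of n=300 draws as above:

n_draws <- 300
par(mfrow=c(1,2))
hist(xmin, breaks=100, border=NA, freq=F, main='Min')
curve(dbeta(x, 1, n_draws), add=T, col=2, lwd=2) # theoretical density of the minimum
hist(xmax, breaks=100, border=NA, freq=F, main='Max')
curve(dbeta(x, n_draws, 1), add=T, col=2, lwd=2) # theoretical density of the maximum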

Quantiles and coverage: for each point, how often do a sample's estimated 5th and 95th percentiles bracket it?

# Upper and Lower Quantiles
xq <- apply(x,1,quantile, probs=c(.05,.95))
bks <- seq(0,1,by=.01)
hist(xq[1,], border=NA, col=rgb(0,0,1,.5),
    main='quantile estimates', xlim=c(0,1), breaks=bks)
hist(xq[2,], border=NA, col=rgb(1,0,0,.5),
    add=T, breaks=bks)

# "Pointwise" Coverage
xcov <- sapply(bks, function(b){
    bl <- b >= xq[1,]
    bu <- b <= xq[2,]
    mean( bl&bu )
})
plot.new()
plot.window(xlim=c(0,1), ylim=c(0,1))
polygon( c(bks, rev(bks)), c(xcov, xcov*0), col=grey(.5,.5), border=NA)
mtext('Frequency each point was covered',2, line=2.5)
axis(1)
axis(2)

Try any function!

fun_of_rv <- function(f, n=100){
  x <- runif(n)
  y <- f(x)
  return(y)
}
fun_of_rv( f=mean )
## [1] 0.5372232
fun_of_rv( f=function(i){ diff(range(exp(i))) } )
## [1] 1.713471

4.4 Value of More Data

Each additional data point you have provides more information, which ultimately decreases the standard error of your estimates. However, it does so at a decreasing rate (known in economics as diminishing marginal returns).
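
For the sample mean this rate is explicit: the standard error is sigma/sqrt(n), so you must quadruple the sample size to halve it. A quick simulation sketch (for these sizes, 1/sqrt(n) gives roughly 0.2, 0.1, 0.05):

# standard error of the sample mean shrinks at rate 1/sqrt(n)
sapply(c(25, 100, 400), function(n){
    sample_means <- sapply(1:1000, function(b) mean(rnorm(n)) )
    sd(sample_means)
})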

Nseq <- seq(1,100, by=1) # Sample sizes
B <- 1000 # Number of draws per sample

SE <- sapply(Nseq, function(n){
    sample_statistics <- sapply(1:B, function(b){
        x <- rnorm(n) # Sample of size N
        quantile(x,probs=.4) # Statistic
    })
    sd(sample_statistics)
})

par(mfrow=c(1,2))
plot(Nseq, SE, pch=16, col=grey(0,.5),
    main='Absolute Gain',
    ylab='standard error', xlab='sample size')
plot(Nseq[-1], abs(diff(SE)), pch=16, col=grey(0,.5),
    main='Marginal Gain', 
    ylab='decrease in standard error', xlab='sample size')