4 Statistics

4.1 Data Types

The most commom types are

l1 <- 1:3 ## cardinal numbers
l1
## [1] 1 2 3
l2 <- factor(c('A','B','C'), ordered=T) ## ordinal numbers
l2
## [1] A B C
## Levels: A < B < C
l3 <- factor(c('Leipzig','Los Angeles','Logan'), ordered=F) ## categorical numbers
l3
## [1] Leipzig     Los Angeles Logan      
## Levels: Leipzig Logan Los Angeles
l4 <- c('hello world', 'hi mom')  ## character strings
l4
## [1] "hello world" "hi mom"
l5 <- list(l1, l2, list(l3, list('...inception...'))) ## lists
l5
## [[1]]
## [1] 1 2 3
## 
## [[2]]
## [1] A B C
## Levels: A < B < C
## 
## [[3]]
## [[3]][[1]]
## [1] Leipzig     Los Angeles Logan      
## Levels: Leipzig Logan Los Angeles
## 
## [[3]][[2]]
## [[3]][[2]][[1]]
## [1] "...inception..."
## data.frames: your most common data type
    ## matrix of different data-types
    ## well-ordered lists
l5 <- data.frame(x=l1, y=l2, z=l3)
l5
##   x y           z
## 1 1 A     Leipzig
## 2 2 B Los Angeles
## 3 3 C       Logan

4.2 Random Variables

The different types of data can be randomly generated on your computer. Random variables are vectors that appear to be generated from a probabilistic process.

## Random bernoulli (Coin Flip: Heads=1)
rbinom(1, 1, 0.5) ## 1 Flip
## [1] 0
rbinom(4, 1, 0.5) ## 4 Flips in row
## [1] 0 0 1 0
x0 <- rbinom(1000, 1, 0.5)
hist(x0)

## random standard-normal
rnorm(4) 
## [1] -2.4434343  1.8982638 -0.9916161  1.6988089
x1 <- rnorm(1000)
hist(x1)

## random uniform
runif(4)
## [1] 0.002416413 0.188919474 0.655073884 0.827586962
x2 <- runif(1000)
hist(x2)

4.3 Functions of Data

Two definitions to remember

  • statistic a function of data
  • sampling distribution how a statistic varies from sample to sample

The mean is a statistic

## compute the mean of a random sample
x <- runif(100)
hist(x)
m <- mean(x)
abline(v=m, col=2)

## is m close to it's true value (1-0)/2=.5?
## what about mean(runif(1000)) ?
## what about mean( rbinom(100, 1, 0.5) )?

see how the mean varies from sample to sample to sample

par(mfrow=c(1,3))
sapply(1:3, function(i){
    x <- runif(100) 
    m <-  mean(x)
    hist(x,
        main=paste0('mean= ', round(m,4)),
        breaks=seq(0,1,by=.1))
    abline(v=m, col=2)
    return(m)
})

## [1] 0.4861373 0.5096834 0.4485544

examine the sampling distribution of the mean

sample_means <- sapply(1:1000, function(i) mean(runif(100)) )
hist(sample_means, breaks=50, col=2, main='Sampling Distribution of the mean')

examine the sampling distribution of the standard deviation

three_sds <- c(  sd(runif(100)),  sd(runif(100)),  sd(runif(100))  )
three_sds
## [1] 0.2882008 0.2851244 0.2895885
sample_sds <- sapply(1:1000, function(i) sd(runif(100)) )
hist(sample_sds, breaks=50, col=4, main='Sampling Distribution of the sd')

examine the sampling distribution of “order statistics”

## Create 300 samples, each with 1000 random uniform variables
x <- sapply(1:300, function(i) runif(1000) )

## Median also looks normal
xmed <- apply(x,1,quantile, probs=.5)
hist(xmed,breaks=100)

## Maximum and Minumum do not!
xmin <- apply(x,1,quantile, probs=0)
xmax <- apply(x,1,quantile, probs=1)
par(mfrow=c(1,2))
hist(xmin,breaks=100)
hist(xmax,breaks=100)

## Upper and Lower Quantiles
xq <- apply(x,1,quantile, probs=c(.05,.95))
bks <- seq(0,1,by=.01)
hist(xq[1,], main='quantile estimates', col=rgb(0,0,1,.5), xlim=c(0,1), breaks=bks)
hist(xq[2,], col=rgb(1,0,0,.5), add=T, breaks=seq(0,1,by=.01))

## Coverage
xcov <- sapply(bks, function(b){
    bl <- b >= xq[1,]
    bu <- b <= xq[2,]
    mean( bl&bu )
})
plot.new()
plot.window(xlim=c(0,1), ylim=c(0,1))
polygon( c(bks, rev(bks)), c(xcov, xcov*0), col=grey(.5,.5), border=NA)
title('coverage frequency')
axis(1)
axis(2)

## Try any function!
fun_of_rv <- function(f, n=100){
  x <- runif(n)
  y <- f(x)
  return(y)
}
fun_of_rv( function(i){range(exp(i))})
## [1] 1.001370 2.670147

4.4 Value of More Data

Each additional data point you have provides more information, which ultimately decreases the standard error of your estimates. However, it does so at a decreasing rate (known in economics as diminishing marginal returns).

Nseq <- seq(1,100, by=1) ## Sample sizes
B <- 1000 ## Number of draws per sample

SE <- sapply(Nseq, function(n){
    sample_statistics <- sapply(1:B, function(b){
        x <- rnorm(n) ## Sample of size N
        quantile(x,probs=.4) ## Statistic
    })
    sd(sample_statistics)
})

par(mfrow=c(1,2))
plot(Nseq, SE, pch=16, col=grey(0,.5), main='Absolute Gain',
    ylab='standard error', xlab='sample size')
plot(Nseq[-1], abs(diff(SE)), pch=16, col=grey(0,.5), main='Marginal Gain', 
    ylab='decrease in standard error', xlab='sample size')