5 Data Analysis


Reading in

# Install R Data Package and Load in
install.packages('wooldridge')
library(wooldridge)
data('crime2')
data('crime4')

# Read in csv (downloaded from online)
# download source 'http://www.stern.nyu.edu/~wgreene/Text/Edition7/TableF19-3.csv'
# download destination '~/TableF19-3.csv'
read.csv('~/TableF19-3.csv')
 
# Can read in csv (directly from online)
# dat_csv <- read.csv('http://www.stern.nyu.edu/~wgreene/Text/Edition7/TableF19-3.csv')

# Read in stata data file from online
#library(haven)
#dat_stata <- read_dta('https://www.ssc.wisc.edu/~bhansen/econometrics/DS2004.dta')
#dat_stata <- as.data.frame(dat_stata)

# For More Introductory Econometrics Data, see 
# https://www.ssc.wisc.edu/~bhansen/econometrics/Econometrics%20Data.zip
# https://pages.stern.nyu.edu/~wgreene/Text/Edition7/tablelist8new.htm
# R packages: wooldridge, causaldata, Ecdat, AER, ....

Read in some historical data on crime in the US

head(USArrests)
##            Murder Assault UrbanPop Rape
## Alabama      13.2     236       58 21.2
## Alaska       10.0     263       48 44.5
## Arizona       8.1     294       80 31.0
## Arkansas      8.8     190       50 19.5
## California    9.0     276       91 40.6
## Colorado      7.9     204       78 38.7
summary(USArrests)
##      Murder          Assault         UrbanPop          Rape      
##  Min.   : 0.800   Min.   : 45.0   Min.   :32.00   Min.   : 7.30  
##  1st Qu.: 4.075   1st Qu.:109.0   1st Qu.:54.50   1st Qu.:15.07  
##  Median : 7.250   Median :159.0   Median :66.00   Median :20.10  
##  Mean   : 7.788   Mean   :170.8   Mean   :65.54   Mean   :21.23  
##  3rd Qu.:11.250   3rd Qu.:249.0   3rd Qu.:77.75   3rd Qu.:26.18  
##  Max.   :17.400   Max.   :337.0   Max.   :91.00   Max.   :46.00

5.1 Cleaning Data

Data transformation is often necessary before analysis, so remember to be careful and check your code is doing what you want. (If you have large datasets, you can always test out the code on a sample.)

# Function to Create Sample Datasets
make_noisy_data <- function(n, b=0){
    # Simple Data Generating Process
    x <- seq(1,10, length.out=n) 
    e <- rnorm(length(x), mean=0, sd=10)
    y <- b*x + e 
    # Obervations
    xy_mat <- data.frame(ID=seq(x), x=x, y=y)
    return(xy_mat)
}

# Two simulated datasets
dat1 <- make_noisy_data(6)
dat2 <- make_noisy_data(6)

# Merging data in long format
dat_merged_long <- rbind(
    cbind(dat1,DF=1),
    cbind(dat2,DF=2))

Now suppose we want to transform into wide format

# Merging data in wide format, First Attempt
dat_merged_wide <- cbind( dat1, dat2)
names(dat_merged_wide) <- c(paste0(names(dat1),'.1'), paste0(names(dat2),'.2'))

# Merging data in wide format, Second Attempt
# higher performance
dat_merged_wide2 <- merge(dat1, dat2,
    by='ID', suffixes=c('.1','.2'))
## CHECK they are the same.
identical(dat_merged_wide, dat_merged_wide2)
## [1] FALSE
# Inspect any differences

# Merging data in wide format, Third Attempt with dedicated package
# (highest performance but with new type of object)
library(data.table)
dat_merged_longDT <- as.data.table(dat_merged_long)
dat_melted <- melt(dat_merged_longDT, id.vars=c('ID', 'DF'))
dat_merged_wide3 <- dcast(dat_melted, ID~DF+variable)

## CHECK they are the same.
identical(dat_merged_wide, dat_merged_wide3)
## [1] FALSE

Often, however, we ultimately want data in long format

# Merging data in long format, Second Attempt with dedicated package 
dat_melted2 <- melt(dat_merged_wide3, measure=c("1_x","1_y","2_x","2_y"))
melt_vars <- strsplit(as.character(dat_melted2$variable),'_')
dat_melted2$DF <- sapply(melt_vars, `[[`,1)
dat_melted2$variable <- sapply(melt_vars, `[[`,2)
dat_merged_long2 <- dcast(dat_melted2, DF+ID~variable)
dat_merged_long2 <- as.data.frame(dat_merged_long2)

## CHECK they are the same.
identical( dat_merged_long2, dat_merged_long)
## [1] FALSE
# Further Inspect
dat_merged_long2 <- dat_merged_long2[,c('ID','x','y','DF')]
mapply( identical, dat_merged_long2, dat_merged_long)
##    ID     x     y    DF 
##  TRUE  TRUE  TRUE FALSE

For more tips, see https://raw.githubusercontent.com/rstudio/cheatsheets/main/data-import.pdf and https://cran.r-project.org/web/packages/data.table/vignettes/datatable-reshape.html

5.2 Static Plots

5.2.1 Distributions

Histograms summarize distributions very effectively. And it is easy to show how distributions change via data splits. You can glue them together to convey more information all at once

#First glimpse
#hist(USArrests$Murder)

# Tailored Histogram 
ylim <- c(0,8)
xbks <-  seq(min(USArrests$Murder)-1, max(USArrests$Murder)+1, by=1)
par(fig=c(0,1,0,0.5), new=F)
hist(USArrests$Murder, breaks=xbks,
    main='All Data', font.main=1,
    xlab='Murder Arrests',
    border=NA, ylim=ylim)
rug(USArrests$Murder)

# Also show more information
# Split Data by Urban Population above/below mean
pop_mean <- mean(USArrests$UrbanPop)
murder_lowpop <- USArrests[USArrests$UrbanPop< pop_mean,'Murder']
murder_highpop <- USArrests[USArrests$UrbanPop>= pop_mean,'Murder']
cols <- c(low=rgb(0,0,1,.75), high=rgb(1,0,0,.75))

par(fig=c(0,.5,0.5,1), new=TRUE)
hist(murder_lowpop,
    breaks=xbks, col=cols[1],
    main='Urban Pop >= Mean', font.main=1,
    xlab='Murder Arrests',
    border=NA, ylim=ylim)
rug(murder_lowpop, col=cols[1])

par(fig=c(0.5,1,0.5,1), new=TRUE)
hist(murder_highpop,
    breaks=xbks, col=cols[2],
    main='Urban Pop < Mean', font.main=1,
    xlab='Murder Arrests',
    border=NA, ylim=ylim)
rug(murder_highpop, col=cols[2])

For more histogram visuals, see https://r-graph-gallery.com/histogram.html. Note that sometimes it is preferable to show the empirical cumulative distribution funtion (ECDF).

par(mfrow=c(1,2))
# Full Sample Density
hist(USArrests$Murder, 
    main='Density Function Estimate', font.main=1,
    xlab='Murder Arrests',
    breaks=xbks, freq=F, border=NA)

# Split Sample Distribution Comparison
F_lowpop <- ecdf(murder_lowpop)
plot(F_lowpop, col=cols[1],
    pch=16, xlab='Murder Arrests',
    main='Distribution Function Estimates',
    font.main=1, bty='n')
F_highpop <- ecdf(murder_highpop)
plot(F_highpop, add=T, col=cols[2], pch=16)

legend('bottomright', col=cols,
    pch=16, bty='n', inset=c(0,.1),
    title='% Urban Pop.',
    legend=c('Low (<= Mean)','High (>= Mean)'))

Boxplots show median, interquartile range, and outliers. As with histograms, you can also split data into groups and glue together

layout( t(c(1,2,2)))
boxplot(USArrests$Murder, main='',
    xlab='All Data', ylab='Murder Arrests')

# K Groups with even spacing
K <- 3
USArrests$UrbanPop_Kcut <- cut(USArrests$UrbanPop,K)
Kcols <- hcl.colors(K,alpha=.5)
boxplot(Murder~UrbanPop_Kcut, USArrests,
    main='', col=Kcols,
    xlab='Urban Population', ylab='')

# 4 Groups with equal observations
#Qcuts <- c(
#    '0%'=min(USArrests$UrbanPop)-10*.Machine$double.eps,
#    quantile(USArrests$UrbanPop, probs=c(.25,.5,.75,1)))
#USArrests$UrbanPop_cut <- cut(USArrests$UrbanPop, Qcuts)

5.2.2 Joint Distributions

Scatterplots are used frequently to summarize the joint relationship between two variables. They can be enhanced in several ways. As a default, use semi-transparent points so as not to hide any points (and perhaps see if your observations are concentrated anywhere).

plot(Murder~UrbanPop, USArrests, pch=16, col=grey(0.,.5))

Fit Lines, Size/Color/Shape You can add regression lines (and confidence intervals). You can also use size, color, and shape to distinguish different subsets.

# High Assault Areas
assault_high <- USArrests$Assault > median(USArrests$Assault)
cols <- ifelse(assault_high, rgb(1,0,0,.5), rgb(0,0,1,.5))

# Scatterplot
plot(Murder~UrbanPop, USArrests, pch=16, col=cols)
# Show High Assault Areas via 'cex='
# Show High Assault Areas via 'pch='

# Add the line of best fit for pooled data
reg <- lm(Murder~UrbanPop, data=USArrests)
abline(reg, lty=2)

# Could also add regression lines y for each data split
#reg_high <- lm(Murder~UrbanPop, data=USArrests[assault_high,])
#abline(reg_high, lty=2, col=rgb(1,0,0,1))
#reg_low <- lm(Murder~UrbanPop, data=USArrests[!assault_high,])
#abline(reg_low, lty=2, col= rgb(0,0,1,1))

# Could also add Confidence Intervals
# https://rpubs.com/aaronsc32/regression-confidence-prediction-intervals

Your first plot is typically standard. For others to easily comprehend your work, you must polish the plot.

# Data Generating Process
x <- seq(1, 10, by=.0002)
e <- rnorm(length(x), mean=0, sd=1)
y <- .25*x + e 
xy_dat <- data.frame(x=x, y=y)

# Plot
par(fig=c(0,1,0,0.9), new=F)
plot(y~x, xy_dat, pch=16, col=rgb(0,0,0,.05), cex=.5,
    xlab='', ylab='') # Format Axis Labels Seperately
mtext( 'y=0.25 x + e\n e ~ standard-normal', 2, line=2.2)
mtext( expression(x%in%~'[0,10]'), 1, line=2.2)

abline( lm(y~x, data=xy_dat), lty=2)

title('Plot with good features and excessive notation',
    adj=0, font.main=1)

# Outer Legend (https://stackoverflow.com/questions/3932038/)
outer_legend <- function(...) {
  opar <- par(fig=c(0, 1, 0, 1), oma=c(0, 0, 0, 0), 
    mar=c(0, 0, 0, 0), new=TRUE)
  on.exit(par(opar))
  plot(0, 0, type='n', bty='n', xaxt='n', yaxt='n')
  legend(...)
}
outer_legend('topright', legend='single data point',
    title='do you see the normal distribution?',
    pch=16, col=rgb(0,0,0,.1), cex=1, bty='n')

Can export figure with specific dimensions

pdf( 'Figures/plot_example.pdf', height=5, width=5)
# plot goes here
dev.off()

For plotting math, see https://astrostatistics.psu.edu/su07/R/html/grDevices/html/plotmath.html https://library.virginia.edu/data/articles/mathematical-annotation-in-r

For exporting options, see ?pdf. For saving other types of files, see png("*.png"), tiff("*.tiff"), and jpeg("*.jpg")

Marginal distributions.

# https://www.r-bloggers.com/2011/06/example-8-41-scatterplot-with-marginal-histograms/

# Setup Plot
layout( matrix(c(2,0,1,3), ncol=2, byrow=TRUE),
    widths=c(9/10,1/10), heights=c(1/10,9/10))

# Scatterplot
par(mar=c(4,4,1,1))
plot(Murder~UrbanPop, USArrests, pch=16, col=rgb(0,0,0,.5))

# Add Marginals
par(mar=c(0,4,1,1))
xhist <- hist(USArrests$UrbanPop, plot=FALSE)
barplot(xhist$counts, axes=FALSE, space=0, border=NA)

par(mar=c(4,0,1,1))
yhist <- hist(USArrests$Murder, plot=FALSE)
barplot(yhist$counts, axes=FALSE, space=0, horiz=TRUE, border=NA)

For plotting marginals, see https://r-graph-gallery.com/74-margin-and-oma-cheatsheet.html and https://jtr13.github.io/cc21fall2/tutorial-for-scatter-plot-with-marginal-distribution.html.

For some things to avoid, see https://www.data-to-viz.com/caveats.html

5.3 Interactive Plots

Especially for data exploration, your plots can also be interactive via https://plotly.com/r/. For more details, see examples and then applications.

#install.packages("plotly")
library(plotly)

Histograms https://plotly.com/r/histograms/

pop_mean <- mean(USArrests$UrbanPop)
murder_lowpop <- USArrests[USArrests$UrbanPop< pop_mean,'Murder']
murder_highpop <- USArrests[USArrests$UrbanPop>= pop_mean,'Murder']


fig <- plot_ly(alpha=0.6, 
    hovertemplate="%{y}")
fig <- fig %>% add_histogram(murder_lowpop, name='Low Pop. (< Mean)')
fig <- fig %>% add_histogram(murder_highpop, name='High Pop (>= Mean)')
fig <- fig %>% layout(barmode="stack") # barmode="overlay"
fig <- fig %>% layout(
    title="Crime and Urbanization in America 1975",
    xaxis = list(title='Murders Arrests per 100,000 People'),
    yaxis = list(title='Number of States'),
    legend=list(title=list(text='<b> % Urban Pop. </b>'))
)
fig

Boxplots https://plotly.com/r/box-plots/

USArrests$ID <- rownames(USArrests)
fig <- plot_ly(USArrests,
    y=~Murder, color=~cut(UrbanPop,4),
    alpha=0.6, type="box",
    pointpos=0, boxpoints = 'all',
    hoverinfo='text',    
    text = ~paste('<b>', ID, '</b>',
        "<br>Urban  :", UrbanPop,
        "<br>Assault:", Assault,
        "<br>Murder :", Murder))    
fig <- layout(fig,
    showlegend=FALSE,
    title='Crime and Urbanization in America 1975',
    xaxis = list(title = 'Percent of People in an Urban Area'),
    yaxis = list(title = 'Murders Arrests per 100,000 People'))
fig

Scatterplots https://plotly.com/r/bubble-charts/

# Simple Scatter Plot
#plot(Assault~UrbanPop, USArrests, col=grey(0,.5), pch=16,
#    cex=USArrests$Murder/diff(range(USArrests$Murder))*2,
#    main='US Murder arrests (per 100,000)')

# Scatter Plot
USArrests$ID <- rownames(USArrests)
fig <- plot_ly(
    USArrests, x = ~UrbanPop, y = ~Assault,
    mode='markers',
    type='scatter',
    hoverinfo='text',
    text = ~paste('<b>', ID, '</b>',
        "<br>Urban  :", UrbanPop,
        "<br>Assault:", Assault,
        "<br>Murder :", Murder),
    color=~Murder,
    marker=list(
        size=~Murder,
        opacity=0.5,
        showscale=T,  
        colorbar = list(title='Murder Arrests (per 100,000)')))
fig <- layout(fig,
    showlegend=F,
    title='Crime and Urbanization in America 1975',
    xaxis = list(title = 'Percent of People in an Urban Area'),
    yaxis = list(title = 'Assault Arrests per 100,000 People'))
fig