22  Misc. Multivariate Topics


Filtering.

In some cases, we may want to smooth time series data rather than predict into the future. We can distinguish two types of smoothing, depending on whether or not future observations are incorporated. When weighting only two other observations, the difference can be expressed as estimating an average with different available data:

  • Filtering: \(\mathbb{E}[Y_{t} | X_{t-1}, X_{t-2}]\)
  • Smoothing: \(\mathbb{E}[Y_{t} | X_{t-1}, X_{t+1}]\)

One example of filtering is Exponential Filtering (sometimes confusingly referred to as “Exponential Smoothing”), which weights only previous observations using an exponential kernel.
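
As a minimal sketch of the idea (not part of the original example), an exponential filter can be written as a recursion in which the weight placed on older observations decays geometrically. The function name exp_filter, the smoothing weight a, and the toy series below are illustrative choices.

Code
## Exponential filter (illustrative sketch):
## s[t] = a*x[t] + (1-a)*s[t-1], so weights on older observations decay exponentially
exp_filter <- function(x, a=0.2){
    s <- numeric(length(x))
    s[1] <- x[1] ## initialize at the first observation
    for(t in 2:length(x)){
        s[t] <- a*x[t] + (1-a)*s[t-1]
    }
    return(s)
}

## Toy example
set.seed(2)
x_toy <- cumsum(rnorm(100)) + rnorm(100, 0, 2)
plot(x_toy, pch=16, col=grey(0,.25))
lines(exp_filter(x_toy, a=0.2), col=2, lwd=2)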

Code
##################
# Time series data
##################

set.seed(1)
## Underlying Trend
x0 <- cumsum(rnorm(500,0,1))
## Observed Datapoints
x <- x0 + runif(length(x0),-10,10)
dat <- data.frame(t=seq(x), x0=x0, x=x)

## Asymmetric Kernel
#bw <- c(2/3,1/3)
#s1 <- filter(x, bw/sum(bw), sides=1)

## Symmetric Kernel
#bw <- c(1/6,2/3,1/6)
#s2 <- filter(x,  bw/sum(bw), sides=2)

There are several cross-validation procedures for filtering time series data. One is called time series cross-validation (TSCV), which is useful for temporally dependent data.

Code
## Plot Simulated Data
x <- dat$x
x0 <- dat$x0
par(fig = c(0,1,0,1), new=F)
plot(x, pch=16, col=grey(0,.25))
lines(x0, col=1, lty=1, lwd=2)

## Work with differenced data?
#dx <- diff(x)
#n  <- length(dx)
#plot(dx[1:(n-1)], dx[2:n],
#    xlab='d x (t-1)', ylab='d x (t)',
#    col=grey(0,.5), pch=16)

## TSCV One Sided Moving Average
filter_bws <- seq(1,20,by=1)
filter_mape_bws <- sapply(filter_bws, function(h){
    bw <- c(0,rep(1/h,h)) ## Leave current one out
    s2 <- filter(x, bw, sides=1)
    pe <- s2 - x
    mape <- mean( abs(pe), na.rm=T) ## mean absolute prediction error
})
filter_mape_star <- filter_mape_bws[which.min(filter_mape_bws)]
filter_h_star <- filter_bws[which.min(filter_mape_bws)]
filter_tscv <- filter(x,  c(rep(1/filter_h_star,filter_h_star)), sides=1)
# Plot Optimization Results
#par(fig = c(0.07, 0.35, 0.07, 0.35), new=T) 
#plot(filter_bws, filter_mape_bws, type='o', ylab='mape', pch=16)
#points(filter_h_star, filter_mape_star, pch=19, col=2, cex=1.5)

## TSCV for LLLS
library(np)
llls_bws <- seq(8,28,by=1)
llls_burnin <- 10
llls_mape_bws <- sapply(llls_bws, function(h){ # cat(h,'\n')
    pe <- sapply(llls_burnin:nrow(dat), function(t_end){
        dat_t <- dat[dat$t<t_end, ]
        reg <- npreg(x~t, data=dat_t, bws=h,
            ckertype='epanechnikov',
            bandwidth.compute=F, regtype='ll')
        edat <- dat[dat$t==t_end,]
        pred <- predict(reg, newdata=edat)
        pe <- edat$x - pred
        return(pe)    
    })
    mape <- mean( abs(pe), na.rm=T) ## mean absolute prediction error
})
llls_mape_star <- llls_mape_bws[which.min(llls_mape_bws)]
llls_h_star <- llls_bws[which.min(llls_mape_bws)]
#llls_tscv <- predict( npreg(x~t, data=dat, bws=llls_h_star,
#    bandwidth.compute=F, regtype='ll', ckertype='epanechnikov'))
llls_tscv <- sapply(llls_burnin:nrow(dat), function(t_end, h=llls_h_star){
    dat_t <- dat[dat$t<t_end, ]
    reg <- npreg(x~t, data=dat_t, bws=h,
        ckertype='epanechnikov',
        bandwidth.compute=F, regtype='ll')
    edat <- dat[dat$t==t_end,]
    pred <- predict(reg, newdata=edat)
    return(pred)    
})

## Compare Fits Qualitatively
lines(filter_tscv, col=2, lty=1, lwd=1)
lines(llls_burnin:nrow(dat), llls_tscv, col=4, lty=1, lwd=1)
legend('topleft', lty=1, col=c(1,2,4), bty='n',
    c('Underlying Trend', 'MA-1sided + TSCV', 'LLLS-1sided + TSCV'))

## Compare Fits Quantitatively
cbind(
    bandwidth=c(LLLS=llls_h_star, MA=filter_h_star),
    mape=round(c(LLLS=llls_mape_star, MA=filter_mape_star),2) )

## See also https://cran.r-project.org/web/packages/smoots/smoots.pdf
#https://otexts.com/fpp3/tscv.html
#https://robjhyndman.com/hyndsight/tscvexample/

Cross Validation.

Perhaps the most common approach to selecting a bandwidth is to minimize prediction error. Leave-one-out cross-validation minimizes the average “leave-one-out” squared prediction error: \[\begin{eqnarray} \min_{\mathbf{H}} \quad \frac{1}{n} \sum_{i=1}^{n} \left[ Y_{i} - \hat{y}_{[i]}(\mathbf{X},\mathbf{H}) \right]^2, \end{eqnarray}\] where \(\hat{y}_{[i]}(\mathbf{X},\mathbf{H})\) is the model-predicted value at \(\mathbf{X}_{i}\) based on a dataset that excludes observation \(i\), and \(\mathbf{H}\) is a matrix of bandwidths. With a weighted least squares regression on three explanatory variables, for example, the bandwidth matrix is \[\begin{eqnarray} \mathbf{H}=\begin{pmatrix} h_{1} & 0 & 0 \\ 0 & h_{2} & 0 \\ 0 & 0 & h_{3} \\ \end{pmatrix}, \end{eqnarray}\] where each \(h_{k}\) is the bandwidth for variable \(X_{k}\).
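
As a concrete sketch of this criterion (not from the original text), the code below reuses the simulated dat and the npreg() setup from the filtering example; the subsample and the candidate grid h_grid are arbitrary illustrative choices.

Code
## Leave-one-out CV for a local linear bandwidth (illustrative sketch)
## Subsample to keep the repeated refitting fast
dat_s  <- dat[seq(1, nrow(dat), by=5), ]
h_grid <- seq(10, 40, by=10) ## arbitrary candidate bandwidths
loo_mse <- sapply(h_grid, function(h){
    pe <- sapply(1:nrow(dat_s), function(i){
        reg_i <- npreg(x~t, data=dat_s[-i,], bws=h,
            ckertype='epanechnikov',
            bandwidth.compute=F, regtype='ll')
        dat_s$x[i] - predict(reg_i, newdata=dat_s[i,])
    })
    mean(pe^2)
})
h_grid[which.min(loo_mse)] ## bandwidth that minimizes the leave-one-out criterion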

There are many types of cross-validation. For example, one extension is k-fold cross-validation, which splits the \(N\) datapoints into \(k=1,...,K\) groups, each of size \(B\), and predicts values for each left-out group. Some criteria also adjust for the degrees of freedom; you can refer to such extensions on a case-by-case basis.
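
Below is a minimal sketch of k-fold cross-validation, again reusing the illustrative dat and npreg() setup from above; the fold count K, the bandwidth h, and the random fold assignment are arbitrary choices.

Code
## K-fold CV sketch (K, h, and the fold assignment are illustrative choices)
set.seed(3)
K <- 5
h <- 15
folds <- sample(rep(1:K, length.out=nrow(dat))) ## random fold labels
fold_mse <- sapply(1:K, function(k){
    train <- dat[folds!=k, ] ## fit on the other K-1 folds
    test  <- dat[folds==k, ] ## predict the left-out fold
    reg_k <- npreg(x~t, data=train, bws=h,
        ckertype='epanechnikov',
        bandwidth.compute=F, regtype='ll')
    mean( (test$x - predict(reg_k, newdata=test))^2 )
})
mean(fold_mse) ## average prediction error across the K left-out folds

Note that randomly assigned folds ignore temporal dependence; for time series data, the TSCV scheme used in the filtering example above is more appropriate.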