# Computation time and features

#### 2018-09-12

This vignette shows comparisons in terms of computation time with other packages and alternative base R solutions. Specifically, we will make comparisons with the roll package and zoo package. It should be stressed though that the other solutions do additional things than this package does. E.g., there is not performed any rank test in the function in this package. We start by showing the comparisons of computation times and then we show different options. R versions of the functions are shown to make it clear what the output is. Some function definitions are shown at the end.

# Comparisons

We start by simulating the data

set.seed(32981054)
n <- 10000
p <- 6
wdth = 50
X <- matrix(rnorm(p * n), n, p)
y <- drop(X %*% runif(p)) + rnorm(n)
df <- data.frame(y, X)
frm <- eval(parse(text = paste0(
"formula(y ~ -1 + ", paste0("X", 1:p, collapse = " + "), ")")))

Then we check that the functions give the same (the function definitions are at the end of this document)

library(rollRegres)
all.equal(roll_regress_R(X, y, wdth), roll_regres.fit(X, y, wdth)$coefs, check.attributes = FALSE) #R [1] TRUE all.equal(roll_regress_R(X, y, wdth), roll_regres(frm, df, wdth)$coefs,
check.attributes = FALSE)
#R [1] TRUE
all.equal(roll_regress_R(X, y, wdth), roll_regress_zoo(X, y, wdth),
check.attributes = FALSE)
#R [1] TRUE
all.equal(roll_regress_R(X, y, wdth), roll_regress_R_for_loop(X, y, wdth),
check.attributes = FALSE)
#R [1] TRUE
all.equal(roll_regress_R(X, y, wdth), roll_lm_func(X, y, wdth),
check.attributes = FALSE)
#R [1] TRUE

and here then we compare the computation time

microbenchmark::microbenchmark(
roll_regress            = roll_regres.fit(X, y, wdth),
# this will be slower due to call to model.matrix and model.frame
roll_regress_df         = roll_regres(frm, df, wdth),
roll_regress_R          = roll_regress_R(X, y, wdth),
roll_regress_zoo        = roll_regress_zoo(X, y, wdth),
roll_regress_R_for_loop = roll_regress_R_for_loop(X, y, wdth),
roll_lm = roll_lm_func(X, y, wdth),
times = 5)
#R Unit: milliseconds
#R                     expr        min         lq       mean     median         uq
#R             roll_regress   6.531166   6.593586   7.286604   7.273093   7.656304
#R          roll_regress_df   7.813933   8.429440   8.629657   8.459860   8.845440
#R           roll_regress_R 187.399656 191.192251 197.489145 193.989290 203.531619
#R         roll_regress_zoo 827.319568 842.312173 853.858177 862.576090 868.272489
#R  roll_regress_R_for_loop 416.702749 418.746406 436.568840 436.063555 439.137533
#R                  roll_lm 158.625707 161.093265 162.873651 161.649512 162.038253
#R         max neval
#R    8.378872     5
#R    9.599614     5
#R  211.332909     5
#R  868.810564     5
#R  472.193955     5
#R  170.961519     5

# here is the formula used above
frm
#R y ~ -1 + X1 + X2 + X3 + X4 + X5 + X6

This section will cover some additional features.

## Expanding window

Here are expanding window regressions with additional output

#####
# simulate data
set.seed(65731482)
n <- 100
p <- 2
X <- matrix(rnorm(p * n), n, p)
y <- drop(X %*% runif(p)) + rnorm(n)

#####
# use package function
pck_out <- roll_regres.fit(
X, y, width = 50L, do_downdates = FALSE,
do_compute = c("sigmas", "r.squareds", "1_step_forecasts"))

#####
# assign R-version
R_func <- function(X, y, width){
n <- nrow(X)
p <- ncol(X)
out <- matrix(NA_real_, n, p)
sigmas             <- rep(NA_real_, n)
r.squared          <- rep(NA_real_, n)
one_step_forecasts <- rep(NA_real_, n)

for(i in width:n){
idx <- 1:i
fit <- lm(y[idx] ~ -1 + X[idx, , drop = FALSE])
out[i, ] <- fit$coefficients su <- summary(fit) sigmas[i] <- su$sigma

ss1 <- sum((y[idx] - mean(y[idx]))^2)
ss2 <- sum(fit$residuals^2) r.squared[i] <- 1 - ss2 / ss1 if(i < n){ next_i <- i + 1L one_step_forecasts[next_i] <- fit$coefficients %*% X[next_i, ]
}
}

list(coef = out, sigmas = sigmas, r.squared = r.squared,
one_step_forecasts = one_step_forecasts)
}

R_out <- R_func(X, y, 50L)

#####
# gives the same
stopifnot(isTRUE(all.equal(R_out$coef , pck_out$coefs)))
stopifnot(isTRUE(all.equal(R_out$sigmas , pck_out$sigmas)))
stopifnot(isTRUE(all.equal(R_out$r.squared , pck_out$r.squared)))
stopifnot(isTRUE(all.equal(R_out$one_step_forecasts, pck_out$one_step_forecasts)))

## Update in blocks

You can use the grp argument to make updates in blocks. E.g., here is an example with weekly data

#####
# simulate data
set.seed(68799379)
week <- as.integer(gl(25, 7))
#R [1] 1 1 1 1 1 1
n <- length(week)
p <- 2
X <- matrix(rnorm(p * n), n, p)
y <- drop(X %*% runif(p)) + rnorm(n)

#####
# use package function
pck_out <- roll_regres.fit(
X, y, width = 10L, grp = week,
do_compute = c("sigmas", "r.squareds", "1_step_forecasts"))

#####
# assign R-version
R_func <- function(X, y, width, grp){
grp <- grp + 1L - min(grp)
u_grp = unique(grp)
n <- nrow(X)
p <- ncol(X)
out <- matrix(NA_real_, n, p)
sigmas             <- rep(NA_real_, n)
r.squared          <- rep(NA_real_, n)
one_step_forecasts <- rep(NA_real_, n)

start_val <- min(which(u_grp >= width))
for(g in u_grp[start_val:length(u_grp)]){
idx <- which(grp %in% (g - width + 1L):g)
i <- which(grp == g)
fit <- lm(y[idx] ~ -1 + X[idx, , drop = FALSE])
out[i, ] <- sapply(fit$coefficients, rep, times = length(i)) su <- summary(fit) sigmas[i] <- su$sigma

ss1 <- sum((y[idx] - mean(y[idx]))^2)
ss2 <- sum(fit$residuals^2) r.squared[i] <- 1 - ss2 / ss1 if(g != max(grp)){ next_g <- grp[min(which(grp > g))] next_g <- which(grp == next_g) one_step_forecasts[next_g] <- X[next_g, ] %*% fit$coefficients
}
}

list(coef = out, sigmas = sigmas, r.squared = r.squared,
one_step_forecasts = one_step_forecasts)
}

R_out <- R_func(X, y, 10L, grp = week)

#####
# gives the same
stopifnot(isTRUE(all.equal(R_out$coef , pck_out$coefs)))
stopifnot(isTRUE(all.equal(R_out$sigmas , pck_out$sigmas)))
stopifnot(isTRUE(all.equal(R_out$r.squared , pck_out$r.squared)))
stopifnot(isTRUE(all.equal(R_out$one_step_forecasts, pck_out$one_step_forecasts)))

## Block with minimum required number of observation

Suppose that we are performing rolling regression on stock data over a yearly window. Further, suppose that there are som gaps in the data where we do not have data and we require at least 6 months of data. This can be done as follows

#####
# simulate data
set.seed(96235555)
n   <- 10L * 12L * 21L # x years w/ 12 months of 21 trading days
month <- (seq_len(n) - 1L) %/% 21L + 1L # group by months
set.seed(29478439)
X <- matrix(rnorm(n * 4L), ncol = 4L)
y <- rnorm(n)

# randomly drop rows
keep <- seq_along(y) %in% sample.int(nrow(X), as.integer(n * .5))
X     <- X    [keep, ]
y     <- y    [keep]
month <- month[keep]

#####
# use package function
pck_out <- roll_regres.fit(
X, y, width = 12L, grp = month, min_obs = 21L * 6L, do_downdates = TRUE,
do_compute = c("sigmas", "r.squareds"))

#####
# assign R-version
R_func <- function(X, y, width, grp, downdate, min_obs){
grp <- grp + 1L - min(grp)
u_grp = unique(grp)
n <- nrow(X)
p <- ncol(X)
out <- matrix(NA_real_, n, p)
sigmas    <- rep(NA_real_, n)
r.squared <- rep(NA_real_, n)

start_val <- min(which(u_grp >= width))
for(g in u_grp[start_val:length(u_grp)]){
idx <-
if(downdate)
which(grp %in% (g - width + 1L):g) else
which(grp %in% 1:g)
i <- which(grp == g)
if(length(idx) < min_obs)
next
fit <- lm(y[idx] ~ -1 + X[idx, , drop = FALSE])
out[i, ] <- sapply(fit$coefficients, rep, times = length(i)) su <- summary(fit) sigmas[i] <- su$sigma

ss1 <- sum((y[idx] - mean(y[idx]))^2)
ss2 <- sum(fit$residuals^2) r.squared[i] <- 1 - ss2 / ss1 } list(coef = out, sigmas = sigmas, r.squared = r.squared) } R_out <- R_func(X, y, width = 12L, downdate = TRUE, grp = month, min_obs = 21L * 6L) ##### # gives the same stopifnot(isTRUE(all.equal(R_out$coef     , pck_out$coefs))) stopifnot(isTRUE(all.equal(R_out$sigmas   , pck_out$sigmas))) stopifnot(isTRUE(all.equal(R_out$r.squared, pck_out$r.squared))) # Session info sessionInfo() #R R version 3.5.0 (2018-04-23) #R Platform: x86_64-w64-mingw32/x64 (64-bit) #R Running under: Windows 10 x64 (build 17134) #R #R Matrix products: default #R #R locale: #R [1] LC_COLLATE=C #R [2] LC_CTYPE=English_United States.1252 #R [3] LC_MONETARY=English_United States.1252 #R [4] LC_NUMERIC=C #R [5] LC_TIME=English_United States.1252 #R #R attached base packages: #R [1] compiler stats graphics grDevices utils datasets methods #R [8] base #R #R other attached packages: #R [1] rollRegres_0.1.1 roll_1.0.7 zoo_1.8-2 #R #R loaded via a namespace (and not attached): #R [1] Rcpp_0.12.18 lattice_0.20-35 digest_0.6.15 #R [4] rprojroot_1.3-2 grid_3.5.0 backports_1.1.2 #R [7] magrittr_1.5 evaluate_0.10.1 RcppParallel_4.4.0 #R [10] stringi_1.1.7 checkmate_1.8.5 rmarkdown_1.9 #R [13] tools_3.5.0 stringr_1.3.0 yaml_2.1.18 #R [16] microbenchmark_1.4-4 htmltools_0.3.6 knitr_1.20 # Function definitions Here are the function definitions ##### # Pure R version of package function roll_regress_R <- function(X, y, width){ n <- nrow(X) p <- ncol(X) out <- matrix(NA_real_, p, n) dummy_1 <- matrix(0.) dummy_2 <- numeric(p) dummy_3 <- numeric(p) is_first <- TRUE for(i in width:n){ if(is_first){ is_first <- FALSE qr. <- qr(X[1:width, ]) R <- qr.R(qr.) # Use X^T X <- t(X) XtY <- drop(tcrossprod(y[1:width], X[, 1:width])) } else { x_new <- X[, i] x_old <- X[, i - width] # update R rollRegres:::dchud_wrap( R, p, p, x_new, dummy_1, 0L, 0L, 0., 0., dummy_2, dummy_3) # downdate R rollRegres:::dchdd_wrap( R, p, p, x_old, dummy_1, 0L, 0L, 0., 0., dummy_2, dummy_3, integer(1)) # update XtY XtY <- XtY + y[i] * x_new - y[i - width] * x_old } coef. <- .Internal(backsolve(R, XtY, p, TRUE, TRUE)) coef. <- .Internal(backsolve(R, coef., p, TRUE, FALSE)) out[, i] <- coef. } t(out) } ##### # simple R version roll_regress_R_for_loop <- function(X, y, width){ n <- nrow(X) p <- ncol(X) out <- matrix(NA_real_, n, p) for(i in width:n){ idx <- (i - width + 1L):i out[i, ] <- lm.fit(X[idx, , drop = FALSE], y[idx])$coefficients
}

out
}

#####
# zoo version
.zoo_inner <- function(Z) {
fit <- lm.fit(Z[, -1, drop = FALSE], Z[, 1])
fitcoefficients } library(zoo) roll_regress_zoo <- function(x, y, width){ Z <- cbind(y, X) rollapply(Z, width, FUN = .zoo_inner, by.column = FALSE, align = "right", fill = NA_real_) } ##### # roll_lm library(roll) roll_lm_func <- function(x, y ,width) roll_lm(X, matrix(y, ncol = 1), wdth, intercept = FALSE)coefficients[, -1L]

# use one thread as other methods are easily to compute in parallel too
roll_lm_func            <- cmpfun(roll_lm_func)