Introduction to Kmisc

Kmisc introduces a grab-bag of utility functions that should be useful to various kinds of useRs. Some of the most useful functions in the package are demoed in this vignette.

set.seed(123)
library(data.table)
library(Kmisc)
library(lattice)
library(grid)
library(Rcpp)
library(knitr)
library(microbenchmark)
dat <- data.frame( x=letters[1:4], y=1:4, z=LETTERS[1:4] )
opts_chunk$set(
  results="markup"
)

without: This function is used to remove columns from a list / data.frame.

## let's remove columns 'x' and 'z' from dat.
tryCatch( dat[ -c('x', 'z') ], error=function(e) print(e$message) )
## [1] "invalid argument to unary operator"
## oh :(
dat[ !(names(dat) %in% c('x', 'z')) ]
##   y
## 1 1
## 2 2
## 3 3
## 4 4
## I always find that a bit awkward. Let's use Kmisc's without instead.
without(dat, x, z)
##   y
## 1 1
## 2 2
## 3 3
## 4 4

extract: Extract vectors from a data.frame or list. Although there is already a good subsetting syntax for lists and vectors, I wanted a complementary function for without.

extract(dat, x, y)
##   x y
## 1 a 1
## 2 b 2
## 3 c 3
## 4 d 4

re_without, re_extract: Extract variables whose names don't match / do match a regular expression pattern.

re_extract(dat, "[xy]")
##   x y
## 1 a 1
## 2 b 2
## 3 c 3
## 4 d 4
re_without(dat, "[xy]")
##   z
## 1 A
## 2 B
## 3 C
## 4 D

swap: Replace elements in a vector.

tDat <- dat ## make a temporary copy of dat

## Replace some elements in tDat$y
tDat$y <- swap( tDat$y, from=c(2, 4), to=c(20, 40) )
cbind( dat$y, tDat$y )
##      [,1] [,2]
## [1,]    1    1
## [2,]    2   20
## [3,]    3    3
## [4,]    4   40

factor_to_char, char_to_factor: A pair of functions that recurse through a list / data.frame and convert all factor elements to characters, or all character elements to factors, respectively.

bDat <- data.frame( x=rnorm(10), y=sample(letters,10), z=sample(letters,10) )
str( bDat )
## 'data.frame':    10 obs. of  3 variables:
##  $ x: num  -0.5605 -0.2302 1.5587 0.0705 0.1293 ...
##  $ y: Factor w/ 10 levels "c","f","k","l",..: 10 7 6 9 5 8 3 4 2 1
##  $ z: Factor w/ 10 levels "a","d","e","f",..: 10 9 7 8 1 5 6 3 4 2
str( factor_to_char(bDat) )
## 'data.frame':    10 obs. of  3 variables:
##  $ x: num  -0.5605 -0.2302 1.5587 0.0705 0.1293 ...
##  $ y: chr  "x" "r" "p" "w" ...
##  $ z: chr  "z" "w" "q" "s" ...

dapply: The data.frame version of the l/sapply series of functions.

Why have this function when sapply still does much the same? I always get frustrated with the fact that either an array or a list is returned by sapply, but never a data.frame.

dat <- data.frame( x = rnorm(100), y = rnorm(100), z = rnorm(100) )
dapply( dat, summary )
##               x      y       z
## Min.    -2.3100 -2.050 -2.0100
## 1st Qu. -0.6260 -0.728 -0.5740
## Median  -0.0587 -0.206  0.0535
## Mean    -0.0098 -0.024  0.1010
## 3rd Qu.  0.5620  0.572  0.7790
## Max.     2.1900  3.240  2.2900

kMerge: Left joins, aka. merge( all.x=TRUE, ... ) without any mangling of the order.

dat1 <- data.frame( id=5:1, x=c("a","a","b","b","b"), y=rnorm(5) )
dat2 <- data.frame( id=c(1, 2, 4), z=rnorm(3) )

## default merge changes id order
merge( dat1, dat2, by="id", all.x=TRUE )
##   id x       y        z
## 1  1 b -0.5229  0.68375
## 2  2 b  0.6608 -0.06082
## 3  3 b -1.3388       NA
## 4  4 a  1.2181  0.63296
## 5  5 a  0.2374       NA
## even the sort parameter can't save you
merge( dat1, dat2, by="id", all.x=TRUE, sort=TRUE )
##   id x       y        z
## 1  1 b -0.5229  0.68375
## 2  2 b  0.6608 -0.06082
## 3  3 b -1.3388       NA
## 4  4 a  1.2181  0.63296
## 5  5 a  0.2374       NA
# kMerge keeps it as is
kMerge( dat1, dat2, by="id" )
##   id x       y        z
## 1  5 a  0.2374       NA
## 2  4 a  1.2181  0.63296
## 3  3 b -1.3388       NA
## 4  2 b  0.6608 -0.06082
## 5  1 b -0.5229  0.68375

in_interval: A fast C implementation for determining which elements of a vector x lie within an interval [lo, hi).

x <- runif(10)*10; lo <- 5; hi <- 10
print( data.frame( x=x, between_5_and_10=in_interval(x, lo, hi) ) )
##         x between_5_and_10
## 1  9.0915             TRUE
## 2  0.5638            FALSE
## 3  5.0291             TRUE
## 4  3.5054            FALSE
## 5  8.4556             TRUE
## 6  8.0644             TRUE
## 7  1.1733            FALSE
## 8  7.1269             TRUE
## 9  2.3527            FALSE
## 10 0.7496            FALSE

stack_list: Use this to stack data.frames in a list. This can be useful if e.g. you've run some kind of bootstrap procedure and have all your results stored as a list of data.frames – even do.call( rbind, dfs ) can be slow. The speed difference is even more prominent when used on very large lists.

This is now partially superseded by data.table::rbindlist, which has a much faster C implementation.

dfs <- replicate(1E3, 
  data.frame(x=rnorm(10), y=sample(letters,10), z=sample(LETTERS,10)),
  simplify=FALSE
  )
str( stack_list(dfs) )
## 'data.frame':    10000 obs. of  4 variables:
##  $ x         : num  1.519 0.377 -2.052 -1.364 -0.201 ...
##  $ y         : chr  "n" "e" "l" "f" ...
##  $ z         : chr  "E" "C" "K" "I" ...
##  $ list_index: int  1 1 1 1 1 1 1 1 1 1 ...
system.time( stack_list(dfs) )
##    user  system elapsed 
##   0.005   0.001   0.005
system.time( do.call(rbind, dfs) )
##    user  system elapsed 
##   0.258   0.057   0.316
system.time( data.table::rbindlist(dfs) )
##    user  system elapsed 
##   0.002   0.000   0.002

Fast String Operations

R is missing some nice built-in 'string' functions. I've introduced a few functions for common string operations.

str_rev: Reverses each string in a character vector (i.e., a vector of strings). str_rev2 is there if you need to reverse strings that may contain Unicode (multibyte) characters.

str_rev( c("ABC", "DEF", NA, paste(LETTERS, collapse="") ) )
## [1] "CBA"                        "FED"                       
## [3] NA                           "ZYXWVUTSRQPONMLKJIHGFEDCBA"
str_rev2( c("はひふへほ", "abcdef") )
## [1] "ほへふひは" "fedcba"

str_slice: Slices each string in a vector into consecutive substrings of length n. str_slice2 exists for strings that may contain Unicode (multibyte) characters.

str_slice( c("ABCDEF", "GHIJKL", "MNOP", "QR"), 2 )
## [[1]]
## [1] "AB" "CD" "EF"
## 
## [[2]]
## [1] "GH" "IJ" "KL"
## 
## [[3]]
## [1] "MN" "OP"
## 
## [[4]]
## [1] "QR"
str_slice2( "ハッピー", 2 )
## [[1]]
## [1] "ハッ" "ピー"

str_sort: Sorts the characters within a string.

str_sort("asnoighewgypfuiweb")
## [1] "abeefgghiinopsuwwy"

str_collapse: Collapses a vector of strings into a single string using Rcpp sugar; operates like R's paste(..., collapse=""), but works much faster.

str_collapse( c("ABC", "DEF", "GHI") )
## [1] "ABCDEFGHI"

File I/O

Sometimes, you get really large data files that just aren't going to fit into RAM. You really wish you could split them up in a structured way, transform them in some way, and then put them back together. One might consider a more 'enterprise' edition of the split-apply-combine framework (Hadoop and friends), but one dirty alternative is to use C++ to munge through a text file and pull out things that we actually want.

split_file: This function splits a delimited file into multiple files, according to unique entries in a chosen column.

extract_rows_from_file: From a delimited text file, extract only the rows for which the entries in a particular column match some set of items that you wish to keep.

C++ Function Generators

Use these functions to generate C++ / Rcpp-backed functions for common R-style operations.

Rcpp_tapply_generator: Generate fast tapply style functions from C++ code through Rcpp. See the example.

dat <- data.frame( y=rnorm(100), x=sample(letters[1:5], 100, TRUE) )
tMean <- Rcpp_tapply_generator("return mean(x);")
## C++ source code will be written to /var/folders/m7/_xnnz_b53kjgggkb1drc1f8c0000gn/T//Rtmpta8yeC/file10116a9945fb.cpp .
## Compiling...
## Done!
with( dat, tMean(y, x) )
##      a      b      c      d      e 
## 0.3928 0.1372 0.1822 0.2425 0.4269
with( dat, tapply(y, x, mean) )
##      a      b      c      d      e 
## 0.3928 0.1372 0.1822 0.2425 0.4269
microbenchmark(
  Kmisc=with( dat, tMean(y, x) ),
  R=with( dat, tapply(y, x, mean) ),
  times=5
)
## Unit: microseconds
##   expr    min     lq median     uq    max neval
##  Kmisc  39.87  41.81  43.91  53.81  64.59     5
##      R 162.08 166.99 167.76 172.80 350.73     5

Rcpp_apply_generator: An apply function generator tailored to 2D matrices. However, your function definition must return a scalar value.

aMean <- Rcpp_apply_generator("return mean(x);")
## C++ source code will be written to /var/folders/m7/_xnnz_b53kjgggkb1drc1f8c0000gn/T//Rtmpta8yeC/file10116655ca52c.cpp .
## Compiling...
## Done!
mat <- matrix( rnorm(100), nrow=10 )
aMean(mat, 2)
##  [1] -0.76100 -0.85464  0.19350  0.11695 -0.40288 -0.01592  0.20021
##  [8] -0.29928 -0.78401 -0.43277
apply(mat, 2, mean)
##  [1] -0.76100 -0.85464  0.19350  0.11695 -0.40288 -0.01592  0.20021
##  [8] -0.29928 -0.78401 -0.43277
microbenchmark(
  Kmisc=aMean(mat, 2),
  R=apply(mat, 2, mean)
)
## Unit: microseconds
##   expr    min      lq  median      uq    max neval
##  Kmisc  5.476   6.152   8.085   8.714  79.91   100
##      R 95.778 100.463 102.379 104.464 162.53   100

Faster Versions of Commonly Used R Functions

tapply_: This function operates like tapply but works faster through a faster factor generating function, as well as an optimized split. Note that it is however restricted to the (common) case of your value and grouping variables being column vectors.

library(microbenchmark)
y <- rnorm(1000); x <- sample(letters[1:5], 1000, TRUE)
tapply(y, x, mean)
##         a         b         c         d         e 
##  0.092720  0.005166  0.039360 -0.033520 -0.037047
tapply_(y, x, mean)
##         a         b         c         d         e 
##  0.092720  0.005166  0.039360 -0.033520 -0.037047
microbenchmark( times=10,
  tapply(y, x, mean),
  tapply_(y, x, mean),
  tMean(y, x)
)
## Unit: microseconds
##                 expr    min     lq median    uq   max neval
##   tapply(y, x, mean) 301.32 400.87 438.35 448.5 647.9    10
##  tapply_(y, x, mean)  83.86  87.17 111.85 136.3 159.8    10
##          tMean(y, x)  66.90  68.99  88.05 112.5 584.7    10

melt_: This function operates like reshape2::melt, but works almost entirely through C and hence is much faster.

dat <- data.frame(
  id=LETTERS[1:5],
  x1=rnorm(5),
  x2=rnorm(5),
  x3=rnorm(5)
)
print(dat)
##   id       x1      x2      x3
## 1  A -0.07795 -0.7611  1.2437
## 2  B -0.53704 -2.5297 -0.4835
## 3  C -0.93115  1.1976 -1.3957
## 4  D -0.33415  0.5021 -0.1184
## 5  E -0.21133 -0.3940  0.4228
melt_(dat, id.vars="id")
##    id variable    value
## 1   A       x1 -0.07795
## 2   B       x1 -0.53704
## 3   C       x1 -0.93115
## 4   D       x1 -0.33415
## 5   E       x1 -0.21133
## 6   A       x2 -0.76112
## 7   B       x2 -2.52968
## 8   C       x2  1.19765
## 9   D       x2  0.50212
## 10  E       x2 -0.39404
## 11  A       x3  1.24369
## 12  B       x3 -0.48355
## 13  C       x3 -1.39575
## 14  D       x3 -0.11838
## 15  E       x3  0.42275

factor_: A faster, simpler implementation of factor through Rcpp. This might be useful in some rare cases where speed is essential.

lets <- sample(letters, 1E6, TRUE)
stopifnot( identical(
  factor_(lets),
  factor(lets)
) )
microbenchmark( times=5,
  factor_(lets),
  factor(lets)
)
## Unit: milliseconds
##           expr    min     lq median     uq   max neval
##  factor_(lets)  9.734  9.842   9.88  9.911 10.44     5
##   factor(lets) 34.711 37.046  39.95 41.072 46.74     5

html: Custom HTML in an R Markdown document.

html(
  table( class="table table-bordered table-striped table-condensed table-hover", ## bootstrap classes
    tr(
      td("Apples"),
      td("Bananas")
    ),
    tr(
      td("20"),
      td("30")
    )
  )
)
Apples Bananas
20 30

anatomy, anat: Like str, but much faster. It won't choke on very large data.frames.

df <- data.table(x=1, y=2)
str(df)
## Classes 'data.table' and 'data.frame':   1 obs. of  2 variables:
##  $ x: num 1
##  $ y: num 2
##  - attr(*, ".internal.selfref")=<externalptr>
anatomy(df)
## 'data.table', 'data.frame' with 1 row and 2 columns:
##   $ x: num  1
##   $ y: num  2
## - attr(*, ".internal.selfref")=<externalptr>
h