NA rm

John Mount

2018-01-20

Removing NAs with dplyr.

Setup

library('dplyr')
library('replyr')
d <- data.frame(x=c(1,2,2),
                y=c(3,5,NA),
                z=c(NA,'a','b'),
                stringsAsFactors = FALSE)
print(d)
#>   x  y    z
#> 1 1  3 <NA>
#> 2 2  5    a
#> 3 2 NA    b

if (requireNamespace("RSQLite")) {
  my_db <- dplyr::src_sqlite(":memory:", create = TRUE)
  # my_db <- sparklyr::spark_connect(version='2.0.0', master = "local")
  class(my_db)
  dRemote <- replyr::replyr_copy_to(my_db,d,'d',rowNumberColumn='rowNum')
} else {
  dRemote <- d # local stand in when we can't make remote
}
print(dRemote)
#> # Source: table<d> [?? x 4]
#> # Database: sqlite 3.19.3 [:memory:]
#>       x     y z     rowNum
#>   <dbl> <dbl> <chr>  <int>
#> 1  1.00  3.00 <NA>       1
#> 2  2.00  5.00 a          2
#> 3  2.00 NA    b          3

na.omit and complete.cases are the usual ways to detect and eliminate NAs in a local data.frame (Note: we don’t consider removal a correct solution when building predictive models, see please here for some discussion on other data cleaning strategies.) However, they do not work on remote tbl types:

complete.cases

complete.cases(d)
#> [1] FALSE  TRUE FALSE

complete.cases(dRemote)
#> Error in complete.cases(dRemote): invalid 'type' (list) of argument

You can’t rely on “stack overflow” solutions that look “dplyr-y” to work on remote data. For example: http://stackoverflow.com/questions/22353633/filter-for-complete-cases-in-data-frame-using-dplyr-case-wise-deletion

d %>% filter(complete.cases(.))
#>   x y z
#> 1 2 5 a

dRemote %>% filter(complete.cases(.))
#> Error in UseMethod("escape"): no applicable method for 'escape' applied to an object of class "c('tbl_dbi', 'tbl_sql', 'tbl_lazy', 'tbl')"

The solution is to map NA managing functions into the dplyr controlled remote data item.

We would think can compose this use “pure dplyr operations” as follows, but again we end up with differences in local and remote performance.

d$rowNum <- seq_len(nrow(d))
d %>% mutate_all(funs(is.na)) %>%
  mutate(nNAinRow=rowSums(.)-rowNum)
#>       x     y     z rowNum nNAinRow
#> 1 FALSE FALSE  TRUE  FALSE        1
#> 2 FALSE FALSE FALSE  FALSE        0
#> 3 FALSE  TRUE FALSE  FALSE        1

dRemote %>% mutate_all(funs(is.na)) %>%
  mutate(nNAinRow=rowSums(.)-rowNum)
#> Error in UseMethod("escape"): no applicable method for 'escape' applied to an object of class "c('tbl_dbi', 'tbl_sql', 'tbl_lazy', 'tbl')"

What we see is that “what is pure dplyr” depends on what operators are available on the service providing back-end. For example the mutate_if forms (equivilent to SQL UPDATE WHERE) are currently not available for remote sources:

dRemote %>% mutate_if(TRUE,is.na) # not correct code, just to trigger "local sources" msg
#> Error: length(.p) == length(vars) is not TRUE

At some point we admit we are going to have to work over the columns by hand (though we will try to keep them remote).

dRemote %>% mutate(nna=0) %>%
  mutate(nna=nna+ifelse(is.na(x),1,0)) %>% 
  mutate(nna=nna+ifelse(is.na(y),1,0)) %>% 
  mutate(nna=nna+ifelse(is.na(z),1,0))  
#> # Source: lazy query [?? x 5]
#> # Database: sqlite 3.19.3 [:memory:]
#>       x     y z     rowNum   nna
#>   <dbl> <dbl> <chr>  <int> <dbl>
#> 1  1.00  3.00 <NA>       1  1.00
#> 2  2.00  5.00 a          2  0   
#> 3  2.00 NA    b          3  1.00

Because remote data sources may not have the same value coercion rules as R we should get used to being more careful about types (hence the ifelse).

This means we have to parameterize variable names to write code like the following:

cols = setdiff(colnames(dRemote),'rowNum')
dRemote %>% mutate(nna=0) -> dTmp
for(ci in cols) {
  dTmp %>% 
    mutate_(.dots=stats::setNames(paste0('nna+ifelse(is.na(',ci,'),1,0)'),'nna')) -> 
    dTmp
}
print(dTmp)
#> # Source: lazy query [?? x 5]
#> # Database: sqlite 3.19.3 [:memory:]
#>       x     y z     rowNum   nna
#>   <dbl> <dbl> <chr>  <int> <dbl>
#> 1  1.00  3.00 <NA>       1  1.00
#> 2  2.00  5.00 a          2  0   
#> 3  2.00 NA    b          3  1.00

Or using wrapr::let:

cols = setdiff(colnames(dRemote),'rowNum')
dRemote %>% mutate(nna=0) -> dTmp
for(ci in cols) {
  let(list(TARGETCOL=ci),
      dTmp %>% mutate(nna=nna+ifelse(is.na(TARGETCOL),1,0)) -> dTmp
  )
}
print(dTmp)
#> # Source: lazy query [?? x 5]
#> # Database: sqlite 3.19.3 [:memory:]
#>       x     y z     rowNum   nna
#>   <dbl> <dbl> <chr>  <int> <dbl>
#> 1  1.00  3.00 <NA>       1  1.00
#> 2  2.00  5.00 a          2  0   
#> 3  2.00 NA    b          3  1.00

What we see is that “what is pure dplyr” depends on what operators are available on the service providing back-end.

na.omit

na.omit(d)
#>   x y z rowNum
#> 2 2 5 a      2

na.omit(dRemote)
#> # Source: table<d> [?? x 4]
#> # Database: sqlite 3.19.3 [:memory:]
#>       x     y z     rowNum
#>   <dbl> <dbl> <chr>  <int>
#> 1  1.00  3.00 <NA>       1
#> 2  2.00  5.00 a          2
#> 3  2.00 NA    b          3

It is fairly clear how we can build na.omit from complete.cases.

Cleanup

rm(list=ls())
gc()
#>           used (Mb) gc trigger (Mb) max used (Mb)
#> Ncells 1025222 54.8    1770749 94.6  1770749 94.6
#> Vcells 1606859 12.3    3434918 26.3  3330046 25.5