Advanced Queries with purrr

Emanuel Rodriguez

2019-04-10

This article outlines more advanced cases of the cdec_query() function.

Query Multiple Stations

cdec_query() accepts only a single value for each of its arguments, so a user is not allowed to call

# Not allowed: `station` is given two values, but cdec_query() takes one at a time
cdec_query(station = c("ccr", "kwk"), dur_code = "h", sensor_num = "25")

but we can implement this same query using higher-order functions.

Using purrr

library(purrr)
library(CDECRetrieve)

stations_of_interest <- c("kwk", "ccr", "bsf")

# run one hourly sensor-25 query per station; map() collects the
# results in a list with one element per station
map(stations_of_interest, function(station_id) {
  cdec_query(station = station_id, sensor_num = "25", dur_code = "h")
})
#> [[1]]
#> # A tibble: 73 x 5
#>    agency_cd location_id datetime            parameter_cd parameter_value
#>    <chr>     <chr>       <dttm>              <chr>                  <dbl>
#>  1 CDEC      KWK         2019-04-08 00:00:00 25                      48.2
#>  2 CDEC      KWK         2019-04-08 01:00:00 25                      48.2
#>  3 CDEC      KWK         2019-04-08 02:00:00 25                      48.2
#>  4 CDEC      KWK         2019-04-08 03:00:00 25                      48.2
#>  5 CDEC      KWK         2019-04-08 04:00:00 25                      48.3
#>  6 CDEC      KWK         2019-04-08 05:00:00 25                      48.3
#>  7 CDEC      KWK         2019-04-08 06:00:00 25                      48.3
#>  8 CDEC      KWK         2019-04-08 07:00:00 25                      48.3
#>  9 CDEC      KWK         2019-04-08 08:00:00 25                      48.4
#> 10 CDEC      KWK         2019-04-08 09:00:00 25                      48.4
#> # … with 63 more rows
#> 
#> [[2]]
#> # A tibble: 73 x 5
#>    agency_cd location_id datetime            parameter_cd parameter_value
#>    <chr>     <chr>       <dttm>              <chr>                  <dbl>
#>  1 CDEC      CCR         2019-04-08 00:00:00 25                      48.1
#>  2 CDEC      CCR         2019-04-08 01:00:00 25                      48.2
#>  3 CDEC      CCR         2019-04-08 02:00:00 25                      48.2
#>  4 CDEC      CCR         2019-04-08 03:00:00 25                      48.2
#>  5 CDEC      CCR         2019-04-08 04:00:00 25                      48.2
#>  6 CDEC      CCR         2019-04-08 05:00:00 25                      48.2
#>  7 CDEC      CCR         2019-04-08 06:00:00 25                      48.3
#>  8 CDEC      CCR         2019-04-08 07:00:00 25                      48.3
#>  9 CDEC      CCR         2019-04-08 08:00:00 25                      48.4
#> 10 CDEC      CCR         2019-04-08 09:00:00 25                      48.4
#> # … with 63 more rows
#> 
#> [[3]]
#> # A tibble: 73 x 5
#>    agency_cd location_id datetime            parameter_cd parameter_value
#>    <chr>     <chr>       <dttm>              <chr>                  <dbl>
#>  1 CDEC      BSF         2019-04-08 00:00:00 25                      49.4
#>  2 CDEC      BSF         2019-04-08 01:00:00 25                      49.4
#>  3 CDEC      BSF         2019-04-08 02:00:00 25                      49.4
#>  4 CDEC      BSF         2019-04-08 03:00:00 25                      49.4
#>  5 CDEC      BSF         2019-04-08 04:00:00 25                      49.4
#>  6 CDEC      BSF         2019-04-08 05:00:00 25                      49.3
#>  7 CDEC      BSF         2019-04-08 06:00:00 25                      49.3
#>  8 CDEC      BSF         2019-04-08 07:00:00 25                      49.3
#>  9 CDEC      BSF         2019-04-08 08:00:00 25                      49.4
#> 10 CDEC      BSF         2019-04-08 09:00:00 25                      49.5
#> # … with 63 more rows

Using map() calls cdec_query() once per station and returns each result as an element of a list. This is OK, but we know that every query returns a data frame with the same structure, so we can combine them into a single data frame.

This can be done by using a variant of the map function, map_df()

# map_df() runs the query for each station and binds the results
# into a single data frame
temp_data <- map_df(stations_of_interest, function(station_id) {
  cdec_query(station = station_id, sensor_num = "25", dur_code = "h")
})

head(temp_data)
#> # A tibble: 6 x 5
#>   agency_cd location_id datetime            parameter_cd parameter_value
#>   <chr>     <chr>       <dttm>              <chr>                  <dbl>
#> 1 CDEC      KWK         2019-04-08 00:00:00 25                      48.2
#> 2 CDEC      KWK         2019-04-08 01:00:00 25                      48.2
#> 3 CDEC      KWK         2019-04-08 02:00:00 25                      48.2
#> 4 CDEC      KWK         2019-04-08 03:00:00 25                      48.2
#> 5 CDEC      KWK         2019-04-08 04:00:00 25                      48.3
#> 6 CDEC      KWK         2019-04-08 05:00:00 25                      48.3

and now we can visualize these,

library(ggplot2)

# one line per station, coloured by location_id
temp_data %>%
  ggplot(aes(x = datetime, y = parameter_value, color = location_id)) +
  geom_line()

Great!

We can still improve this by using some shortcut features in purrr. Namely we can shorten the function part of map_df using the ~ shortcut.

# the leading ~ tells map_df() that this is a function; inside it, `.x`
# stands for the value being passed from `stations_of_interest`
map_df(stations_of_interest, ~ cdec_query(.x, "25", "h"))
#> # A tibble: 219 x 5
#>    agency_cd location_id datetime            parameter_cd parameter_value
#>    <chr>     <chr>       <dttm>              <chr>                  <dbl>
#>  1 CDEC      KWK         2019-04-08 00:00:00 25                      48.2
#>  2 CDEC      KWK         2019-04-08 01:00:00 25                      48.2
#>  3 CDEC      KWK         2019-04-08 02:00:00 25                      48.2
#>  4 CDEC      KWK         2019-04-08 03:00:00 25                      48.2
#>  5 CDEC      KWK         2019-04-08 04:00:00 25                      48.3
#>  6 CDEC      KWK         2019-04-08 05:00:00 25                      48.3
#>  7 CDEC      KWK         2019-04-08 06:00:00 25                      48.3
#>  8 CDEC      KWK         2019-04-08 07:00:00 25                      48.3
#>  9 CDEC      KWK         2019-04-08 08:00:00 25                      48.4
#> 10 CDEC      KWK         2019-04-08 09:00:00 25                      48.4
#> # … with 209 more rows

Combination of multiple Arguments

The above section gave us the basic recipe for working with multiple stations (and multiple values of any other argument). In this section we expand this to tackle multiple combinations of each. There are many ways to accomplish this, but I outline a recipe that has worked well for me.

First let's gather all the pieces we want to have changing,

# the values to vary across queries; stations and sensors are paired
# by position, and a single duration code applies to every query
stations_of_interest <- c("ccr", "kwk")
sensors_of_interest <- c("25", "1")
dur_code <- "h"

Now we can use yet another variant of map, pmap_df to pass in an arbitrary number of arguments,

# the list elements are iterated in parallel and reach the lambda
# as ..1, ..2 and ..3
pmap_df(
  list(stations_of_interest, sensors_of_interest, dur_code),
  ~ cdec_query(station = ..1, sensor_num = ..2, dur_code = ..3)
)
#> # A tibble: 146 x 5
#>    agency_cd location_id datetime            parameter_cd parameter_value
#>    <chr>     <chr>       <dttm>              <chr>                  <dbl>
#>  1 CDEC      CCR         2019-04-08 00:00:00 25                      48.1
#>  2 CDEC      CCR         2019-04-08 01:00:00 25                      48.2
#>  3 CDEC      CCR         2019-04-08 02:00:00 25                      48.2
#>  4 CDEC      CCR         2019-04-08 03:00:00 25                      48.2
#>  5 CDEC      CCR         2019-04-08 04:00:00 25                      48.2
#>  6 CDEC      CCR         2019-04-08 05:00:00 25                      48.2
#>  7 CDEC      CCR         2019-04-08 06:00:00 25                      48.3
#>  8 CDEC      CCR         2019-04-08 07:00:00 25                      48.3
#>  9 CDEC      CCR         2019-04-08 08:00:00 25                      48.4
#> 10 CDEC      CCR         2019-04-08 09:00:00 25                      48.4
#> # … with 136 more rows

Here we use ..1, ..2 and ..3 to refer to the first, second and third elements of the list; this extends to any number of arguments.

Making a custom function

In the above we don’t really change dur_code, but we still need to supply it to the function. We can save some time by using purrr::partial() to create a function out of cdec_query() that takes two arguments and has “dur_code” fixed to “h”. This allows us to use map2_df instead of pmap_df.

# pre-fill dur_code = "h" so that only station and sensor remain
cdec_query_hourly <- purrr::partial(cdec_query, dur_code = "h")

# two inputs are enough now: .x is the station, .y the sensor
map2_df(
  stations_of_interest,
  sensors_of_interest,
  ~ cdec_query_hourly(station = .x, sensor_num = .y)
)
#> # A tibble: 146 x 5
#>    agency_cd location_id datetime            parameter_cd parameter_value
#>    <chr>     <chr>       <dttm>              <chr>                  <dbl>
#>  1 CDEC      CCR         2019-04-08 00:00:00 25                      48.1
#>  2 CDEC      CCR         2019-04-08 01:00:00 25                      48.2
#>  3 CDEC      CCR         2019-04-08 02:00:00 25                      48.2
#>  4 CDEC      CCR         2019-04-08 03:00:00 25                      48.2
#>  5 CDEC      CCR         2019-04-08 04:00:00 25                      48.2
#>  6 CDEC      CCR         2019-04-08 05:00:00 25                      48.2
#>  7 CDEC      CCR         2019-04-08 06:00:00 25                      48.3
#>  8 CDEC      CCR         2019-04-08 07:00:00 25                      48.3
#>  9 CDEC      CCR         2019-04-08 08:00:00 25                      48.4
#> 10 CDEC      CCR         2019-04-08 09:00:00 25                      48.4
#> # … with 136 more rows

All combinations of arguments

Here is a way to query all combinations of inputs for a given set of stations and sensors.

Note: you need to confirm that these combinations make sense to query.

stations_of_interest <- c("bsf", "kwk")
sensors_of_interest <- c("25", "27")
dur_code <- "h"

# one row per station / sensor / duration combination
ins <- expand.grid(
  x = stations_of_interest,
  y = sensors_of_interest,
  z = dur_code,
  stringsAsFactors = FALSE
)

# issue one query per row of `ins` and stack the results
temp_and_turb <- pmap_df(
  list(ins[["x"]], ins[["y"]], ins[["z"]]),
  ~ cdec_query(..1, ..2, ..3)
)

head(temp_and_turb)
#> # A tibble: 6 x 5
#>   agency_cd location_id datetime            parameter_cd parameter_value
#>   <chr>     <chr>       <dttm>              <chr>                  <dbl>
#> 1 CDEC      BSF         2019-04-08 00:00:00 25                      49.4
#> 2 CDEC      BSF         2019-04-08 01:00:00 25                      49.4
#> 3 CDEC      BSF         2019-04-08 02:00:00 25                      49.4
#> 4 CDEC      BSF         2019-04-08 03:00:00 25                      49.4
#> 5 CDEC      BSF         2019-04-08 04:00:00 25                      49.4
#> 6 CDEC      BSF         2019-04-08 05:00:00 25                      49.3

and now we can visualize these

# readable panel labels, keyed by sensor code
param_names <- c("25" = "Temperature (F)", "27" = "Turbidity (NTU)")

# Facet by rows rather than columns: facet_grid() shares the y scale among
# all panels in the same row, so a single-row layout would force temperature
# and turbidity onto one y axis even with scales = "free". One row per
# parameter with scales = "free_y" gives each its own y range while the
# panels still share the datetime axis.
temp_and_turb %>%
  ggplot(aes(datetime, parameter_value, color = location_id)) +
  geom_line() +
  facet_grid(param_names[parameter_cd] ~ ., scales = "free_y")