Advanced topics in the rmongodb Package

MongoDB (www.mongodb.org) is a scalable, high-performance, document-oriented NoSQL database. The rmongodb package provides an interface from the statistical software R (www.r-project.org) to MongoDB and back using the mongodb-C library.

This vignette will provide demos for advanced topics in the rmongodb package.

Connecting R to MongoDB

First of all, we have to load the library and connect to a MongoDB instance. In this case we connect to our local MongoDB installation.

# Attach the rmongodb package.
library(rmongodb)
# Called without arguments; as described above, this connects to the local MongoDB installation.
mongo <- mongo.create()
# Check that the connection was established before going on.
mongo.is.connected(mongo)
## [1] TRUE

Inserting Big Data

We will use the “Zip Code Data Set” from MongoDB (http://docs.mongodb.org/manual/tutorial/aggregation-zip-code-data-set/) in the following examples. The data set is included in the rmongodb package and can be loaded using the command data(zips). The data set is available as JSON and contains zip code data from the US.

# load example data set from rmongodb
data(zips)
# show the first rows; the columns are city, loc, pop, state and _id
head(zips)
##      city         loc       pop   state _id    
## [1,] "ACMAR"      Numeric,2 6055  "AL"  "35004"
## [2,] "ADAMSVILLE" Numeric,2 10616 "AL"  "35005"
## [3,] "ADGER"      Numeric,2 3205  "AL"  "35006"
## [4,] "KEYSTONE"   Numeric,2 14218 "AL"  "35007"
## [5,] "NEW SITE"   Numeric,2 19942 "AL"  "35010"
## [6,] "ALPINE"     Numeric,2 3062  "AL"  "35014"
# loc of the first row: a numeric vector of length 2
zips[1,]$loc
## [1] -86.52  33.58
# Rename the _id column to orig_id. The original zips data set holds duplicate
# _id values, which would fail during the import.
colnames(zips)[5] <- "orig_id"

# Build the BSON batch: turn every row of zips into a list, then convert each
# of those lists into a BSON object.
ziplist <- apply(zips, 1, function(row) c(list(), row))
res <- lapply(ziplist, mongo.bson.from.list)

# Insert all BSON objects into the rmongodb.zips collection in one batch.
if (mongo.is.connected(mongo)) {
  mongo.insert.batch(mongo, "rmongodb.zips", res)
}
## [1] TRUE

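Let's check whether all the inserted data is available in MongoDB. Note that the document count in the recorded output below (58940) is exactly twice the number of rows in zips (29470); this most likely means the collection already contained the data from an earlier run of this example.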

# dimensions of the local zips data set, for comparison with the document count in MongoDB
dim(zips)
## [1] 29470     5
# Count the documents stored in rmongodb.zips and look at some of them.
if (mongo.is.connected(mongo)) {
  nr <- mongo.count(mongo, "rmongodb.zips")
  print(nr)
  # query with limit = 20, then display the first six documents
  res <- mongo.find.all(mongo, "rmongodb.zips", limit = 20)
  head(res)
}
## [1] 58940
## [[1]]
## [[1]]$`_id`
## [1] "545a7aa908a841baf228108a"
## 
## [[1]]$city
## [1] "ACMAR"
## 
## [[1]]$loc
## [1] -86.52  33.58
## 
## [[1]]$pop
## [1] 6055
## 
## [[1]]$state
## [1] "AL"
## 
## [[1]]$orig_id
## [1] "35004"
## 
## 
## [[2]]
## [[2]]$`_id`
## [1] "545a7aa908a841baf228108b"
## 
## [[2]]$city
## [1] "ADAMSVILLE"
## 
## [[2]]$loc
## [1] -86.96  33.59
## 
## [[2]]$pop
## [1] 10616
## 
## [[2]]$state
## [1] "AL"
## 
## [[2]]$orig_id
## [1] "35005"
## 
## 
## [[3]]
## [[3]]$`_id`
## [1] "545a7aa908a841baf228108c"
## 
## [[3]]$city
## [1] "ADGER"
## 
## [[3]]$loc
## [1] -87.17  33.43
## 
## [[3]]$pop
## [1] 3205
## 
## [[3]]$state
## [1] "AL"
## 
## [[3]]$orig_id
## [1] "35006"
## 
## 
## [[4]]
## [[4]]$`_id`
## [1] "545a7aa908a841baf228108d"
## 
## [[4]]$city
## [1] "KEYSTONE"
## 
## [[4]]$loc
## [1] -86.81  33.24
## 
## [[4]]$pop
## [1] 14218
## 
## [[4]]$state
## [1] "AL"
## 
## [[4]]$orig_id
## [1] "35007"
## 
## 
## [[5]]
## [[5]]$`_id`
## [1] "545a7aa908a841baf228108e"
## 
## [[5]]$city
## [1] "NEW SITE"
## 
## [[5]]$loc
## [1] -85.95  32.94
## 
## [[5]]$pop
## [1] 19942
## 
## [[5]]$state
## [1] "AL"
## 
## [[5]]$orig_id
## [1] "35010"
## 
## 
## [[6]]
## [[6]]$`_id`
## [1] "545a7aa908a841baf228108f"
## 
## [[6]]$city
## [1] "ALPINE"
## 
## [[6]]$loc
## [1] -86.21  33.33
## 
## [[6]]$pop
## [1] 3062
## 
## [[6]]$state
## [1] "AL"
## 
## [[6]]$orig_id
## [1] "35014"

MongoDB Aggregation Framework

The MongoDB aggregation framework is one of the top MongoDB features. Aggregation operations process data records and return computed results. They group values from multiple documents together and can perform a variety of operations on the grouped data to return a single result.

Let's compute the total population of each state and group by state:

# $group stage: group the documents by state and sum pop into totalPop
pipe_1 <- mongo.bson.from.JSON('{"$group":{"_id":"$state", "totalPop":{"$sum":"$pop"}}}')
# the pipeline is a list of BSON objects, one per stage
cmd_list <- list(pipe_1)
# print the pipeline to inspect the BSON structure
cmd_list
## [[1]]
##  $group : 3   
##      _id : 2      $state
##      totalPop : 3     
##          $sum : 2     $pop
# Run the pipeline on rmongodb.zips and show the first entries of the
# "result" field of the returned BSON object.
if (mongo.is.connected(mongo)) {
  res <- mongo.aggregation(mongo, "rmongodb.zips", cmd_list)
  head(mongo.bson.value(res, "result"))
}
## $`0`
## $`0`$`_id`
## [1] "WV"
## 
## $`0`$totalPop
## [1] 3586954
## 
## 
## $`1`
## $`1`$`_id`
## [1] "WA"
## 
## $`1`$totalPop
## [1] 9733384
## 
## 
## $`2`
## $`2`$`_id`
## [1] "VA"
## 
## $`2`$totalPop
## [1] 12374716
## 
## 
## $`3`
## $`3`$`_id`
## [1] "VT"
## 
## $`3`$totalPop
## [1] 1125516
## 
## 
## $`4`
## $`4`$`_id`
## [1] "UT"
## 
## $`4`$totalPop
## [1] 3445700
## 
## 
## $`5`
## $`5`$`_id`
## [1] "TN"
## 
## $`5`$totalPop
## [1] 9754370

OK, I am only interested in the states with a total population of at least 15 million:

# Stage 1 ($group): sum pop per state into totalPop.
pipe_1 <- mongo.bson.from.JSON('{"$group":{"_id":"$state", "totalPop":{"$sum":"$pop"}}}')
# Stage 2 ($match): keep only the groups whose totalPop is >= 15000000.
pipe_2 <- mongo.bson.from.JSON('{"$match":{"totalPop":{"$gte":15000000}}}')
cmd_list <- list(pipe_1, pipe_2)

# Run both stages in order and print the complete BSON result.
if (mongo.is.connected(mongo)) {
  res <- mongo.aggregation(mongo, "rmongodb.zips", cmd_list)
  res
}
##  result : 4   
##      0 : 3    
##          _id : 2      PA
##          totalPop : 1     23763286.000000
## 
##      1 : 3    
##          _id : 2      OH
##          totalPop : 1     21694230.000000
## 
##      2 : 3    
##          _id : 2      NY
##          totalPop : 1     35980910.000000
## 
##      3 : 3    
##          _id : 2      TX
##          totalPop : 1     33973020.000000
## 
##      4 : 3    
##          _id : 2      MI
##          totalPop : 1     18590594.000000
## 
##      5 : 3    
##          _id : 2      FL
##          totalPop : 1     25875852.000000
## 
##      6 : 3    
##          _id : 2      NJ
##          totalPop : 1     15460376.000000
## 
##      7 : 3    
##          _id : 2      IL
##          totalPop : 1     22861204.000000
## 
##      8 : 3    
##          _id : 2      CA
##          totalPop : 1     59520042.000000
## 
## 
##  ok : 1   1.000000

GridFS with rmongodb

GridFS is a specification for storing and retrieving files that exceed the BSON-document size limit of 16MB. rmongodb provides several commands for working with GridFS.

# Store the local file faust.txt in GridFS under the name "Faust", look it up
# again and query its metadata.
if (mongo.is.connected(mongo)) {
  # GridFS handle on database "rmongodb" using the collection prefix "fs"
  mgrids <- mongo.gridfs.create(mongo, "rmongodb", prefix = "fs")
  mongo.gridfs.store.file(mgrids, "faust.txt", "Faust")

  gf <- mongo.gridfs.find(mgrids, "Faust")
  # guard against a failed lookup before using the gridfile handle
  if (!is.null(gf)) {
    # the value is not displayed because it is not printed explicitly;
    # wrap the call in print() to see the length of the stored file
    mongo.gridfile.get.length(gf)
    print(mongo.gridfile.get.chunk.count(gf))
    # release the gridfile handle once we are done with it
    mongo.gridfile.destroy(gf)
  }

  # release the GridFS handle; invisible() keeps the printed output limited
  # to the chunk count
  invisible(mongo.gridfs.destroy(mgrids))
}
## [1] 1

Dropping/removing collections and databases and closing the connection to MongoDB

After you've finished your analysis it's a good idea to destroy the connection to MongoDB and clean up the collections.

# Remove everything this vignette created and disconnect.
if (mongo.is.connected(mongo)) {
  # drop the example collection first, then the whole example database
  mongo.drop(mongo, "rmongodb.zips")
  mongo.drop.database(mongo, "rmongodb")

  # close connection
  mongo.destroy(mongo)
}
## NULL

Feedback and Issues

Please do not hesitate to contact us if there are any issues using rmongodb. Issues or pull requests can be submitted via GitHub: https://github.com/mongosoup/rmongodb