MongoDB (www.mongodb.org) is a scalable, high-performance, document-oriented NoSQL database. The rmongodb package provides an interface from the statistical software R (www.r-project.org) to MongoDB and back using the mongodb-C library.
This vignette will provide demos for advanced topics in the rmongodb package.
First of all, we have to load the library and connect to a MongoDB instance. In this case we connect to our local MongoDB installation.
# load the rmongodb package
library(rmongodb)
# open a connection; no arguments are given, so the defaults of mongo.create() apply
mongo <- mongo.create()
# check whether the connection was established
mongo.is.connected(mongo)
## [1] TRUE
We will use the “Zip Code Data Set” from MongoDB (http://docs.mongodb.org/manual/tutorial/aggregation-zip-code-data-set/) in the following examples. The data set is included in the rmongodb package and can be loaded using the command data(zips). The data set is available as JSON and contains zip code data from the US.
# load example data set from rmongodb
data(zips)
# preview the first rows of the data set
head(zips)
## city loc pop state _id
## [1,] "ACMAR" Numeric,2 6055 "AL" "35004"
## [2,] "ADAMSVILLE" Numeric,2 10616 "AL" "35005"
## [3,] "ADGER" Numeric,2 3205 "AL" "35006"
## [4,] "KEYSTONE" Numeric,2 14218 "AL" "35007"
## [5,] "NEW SITE" Numeric,2 19942 "AL" "35010"
## [6,] "ALPINE" Numeric,2 3062 "AL" "35014"
# show the loc entry of the first row (each loc cell holds a numeric vector of length 2)
zips[1,]$loc
## [1] -86.52 33.58
# Rename the _id field. The original zips data set holds duplicate _id values,
# which would fail during the import. The column is located by name instead of
# by position so the rename does not depend on the column order.
colnames(zips)[colnames(zips) == "_id"] <- "orig_id"
# Create the BSON batch object: turn every row into a named list, then convert
# each list into one BSON document.
ziplist <- apply(zips, 1, function(x) as.list(x))
res <- lapply(ziplist, mongo.bson.from.list)
# Insert all documents in a single batch; the return value of
# mongo.insert.batch() is the value displayed for this block.
if (mongo.is.connected(mongo)) {
  mongo.insert.batch(mongo, "rmongodb.zips", res)
}
## [1] TRUE
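Let's check if all the inserted data is available in MongoDB. Note that mongo.count() reports every document stored in the collection, so it exceeds the number of rows in zips whenever the collection already holds documents from an earlier import (in the output below it is exactly twice as large).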
# rows and columns of the local data set, for comparison with the collection count
dim(zips)
## [1] 29470 5
if (mongo.is.connected(mongo)) {
  # number of documents currently stored in the collection
  nr <- mongo.count(mongo, "rmongodb.zips")
  print(nr)
  # fetch at most 20 documents and display the first few of them
  res <- mongo.find.all(mongo, "rmongodb.zips", limit = 20)
  head(res)
}
## [1] 58940
## [[1]]
## [[1]]$`_id`
## [1] "545a7aa908a841baf228108a"
##
## [[1]]$city
## [1] "ACMAR"
##
## [[1]]$loc
## [1] -86.52 33.58
##
## [[1]]$pop
## [1] 6055
##
## [[1]]$state
## [1] "AL"
##
## [[1]]$orig_id
## [1] "35004"
##
##
## [[2]]
## [[2]]$`_id`
## [1] "545a7aa908a841baf228108b"
##
## [[2]]$city
## [1] "ADAMSVILLE"
##
## [[2]]$loc
## [1] -86.96 33.59
##
## [[2]]$pop
## [1] 10616
##
## [[2]]$state
## [1] "AL"
##
## [[2]]$orig_id
## [1] "35005"
##
##
## [[3]]
## [[3]]$`_id`
## [1] "545a7aa908a841baf228108c"
##
## [[3]]$city
## [1] "ADGER"
##
## [[3]]$loc
## [1] -87.17 33.43
##
## [[3]]$pop
## [1] 3205
##
## [[3]]$state
## [1] "AL"
##
## [[3]]$orig_id
## [1] "35006"
##
##
## [[4]]
## [[4]]$`_id`
## [1] "545a7aa908a841baf228108d"
##
## [[4]]$city
## [1] "KEYSTONE"
##
## [[4]]$loc
## [1] -86.81 33.24
##
## [[4]]$pop
## [1] 14218
##
## [[4]]$state
## [1] "AL"
##
## [[4]]$orig_id
## [1] "35007"
##
##
## [[5]]
## [[5]]$`_id`
## [1] "545a7aa908a841baf228108e"
##
## [[5]]$city
## [1] "NEW SITE"
##
## [[5]]$loc
## [1] -85.95 32.94
##
## [[5]]$pop
## [1] 19942
##
## [[5]]$state
## [1] "AL"
##
## [[5]]$orig_id
## [1] "35010"
##
##
## [[6]]
## [[6]]$`_id`
## [1] "545a7aa908a841baf228108f"
##
## [[6]]$city
## [1] "ALPINE"
##
## [[6]]$loc
## [1] -86.21 33.33
##
## [[6]]$pop
## [1] 3062
##
## [[6]]$state
## [1] "AL"
##
## [[6]]$orig_id
## [1] "35014"
The MongoDB aggregation framework is one of the top MongoDB features. Aggregation operations process data records and return computed results. They group values from multiple documents together and can perform a variety of operations on the grouped data to return a single result.
Let's compute the total population of each state and group by state:
# $group stage: one result document per state, with the "pop" values summed into totalPop
pipe_1 <- mongo.bson.from.JSON('{"$group":{"_id":"$state", "totalPop":{"$sum":"$pop"}}}')
# the pipeline is passed as a list of BSON stages
cmd_list <- list(pipe_1)
# display the pipeline
cmd_list
## [[1]]
## $group : 3
## _id : 2 $state
## totalPop : 3
## $sum : 2 $pop
if (mongo.is.connected(mongo)) {
  # run the pipeline against the zips collection
  res <- mongo.aggregation(mongo, "rmongodb.zips", cmd_list)
  # extract the "result" field from the returned BSON and show its first entries
  head(
    mongo.bson.value(res, "result")
  )
}
## $`0`
## $`0`$`_id`
## [1] "WV"
##
## $`0`$totalPop
## [1] 3586954
##
##
## $`1`
## $`1`$`_id`
## [1] "WA"
##
## $`1`$totalPop
## [1] 9733384
##
##
## $`2`
## $`2`$`_id`
## [1] "VA"
##
## $`2`$totalPop
## [1] 12374716
##
##
## $`3`
## $`3`$`_id`
## [1] "VT"
##
## $`3`$totalPop
## [1] 1125516
##
##
## $`4`
## $`4`$`_id`
## [1] "UT"
##
## $`4`$totalPop
## [1] 3445700
##
##
## $`5`
## $`5`$`_id`
## [1] "TN"
##
## $`5`$totalPop
## [1] 9754370
Ok, I am only interested in the states with a total population of at least 15 million:
# stage 1 ($group): sum "pop" per state into totalPop
pipe_1 <- mongo.bson.from.JSON('{"$group":{"_id":"$state", "totalPop":{"$sum":"$pop"}}}')
# stage 2 ($match): keep only the groups whose totalPop is >= 15000000
pipe_2 <- mongo.bson.from.JSON('{"$match":{"totalPop":{"$gte":15000000}}}')
# stages run in list order: group first, then filter the grouped totals
cmd_list <- list(pipe_1, pipe_2)
if (mongo.is.connected(mongo)) {
  res <- mongo.aggregation(mongo, "rmongodb.zips", cmd_list)
  # display the complete BSON reply
  res
}
## result : 4
## 0 : 3
## _id : 2 PA
## totalPop : 1 23763286.000000
##
## 1 : 3
## _id : 2 OH
## totalPop : 1 21694230.000000
##
## 2 : 3
## _id : 2 NY
## totalPop : 1 35980910.000000
##
## 3 : 3
## _id : 2 TX
## totalPop : 1 33973020.000000
##
## 4 : 3
## _id : 2 MI
## totalPop : 1 18590594.000000
##
## 5 : 3
## _id : 2 FL
## totalPop : 1 25875852.000000
##
## 6 : 3
## _id : 2 NJ
## totalPop : 1 15460376.000000
##
## 7 : 3
## _id : 2 IL
## totalPop : 1 22861204.000000
##
## 8 : 3
## _id : 2 CA
## totalPop : 1 59520042.000000
##
##
## ok : 1 1.000000
GridFS is a specification for storing and retrieving files that exceed the BSON-document size limit of 16MB. There are several commands available in rmongodb to work with GridFS.
if (mongo.is.connected(mongo)) {
  # open a GridFS handle on the "rmongodb" database using the "fs" prefix
  mgrids <- mongo.gridfs.create(mongo, "rmongodb", prefix = "fs")
  # store the local file faust.txt under the remote name "Faust"
  mongo.gridfs.store.file(mgrids, "faust.txt", "Faust")
  # look the stored file up again by its remote name
  gf <- mongo.gridfs.find(mgrids, "Faust")
  chunk_count <- NULL
  # guard against a failed lookup before querying the gridfile
  if (!is.null(gf)) {
    # not auto-printed inside the block; wrap in print() to display the length
    mongo.gridfile.get.length(gf)
    chunk_count <- mongo.gridfile.get.chunk.count(gf)
    # release the gridfile handle once it is no longer needed
    mongo.gridfile.destroy(gf)
  }
  # release the GridFS handle before the connection itself is destroyed later on
  mongo.gridfs.destroy(mgrids)
  # value displayed for this block: the number of chunks of the stored file
  chunk_count
}
## [1] 1
After you've finished your analysis it's a good idea to destroy the connection to MongoDB and clean up the collections.
if (mongo.is.connected(mongo)) {
  # remove the example collection, then the whole example database
  mongo.drop(mongo, "rmongodb.zips")
  mongo.drop.database(mongo, "rmongodb")
  # close connection
  mongo.destroy(mongo)
}
## NULL
Please do not hesitate to contact us if there are any issues using rmongodb. Issues or pull requests can be submitted via github: https://github.com/mongosoup/rmongodb