MongoDB (www.mongodb.org) is a scalable, high-performance, document-oriented NoSQL database. The rmongodb package provides an interface from R (www.r-project.org) to MongoDB and back using the mongodb-C library.

Installing and Loading rmongodb Package

There is a stable CRAN version of rmongodb available:

install.packages("rmongodb")

You can also install the latest development version from GitHub:

library(devtools)
install_github("rmongodb", "mongosoup")

The installation should be very simple. No local MongoDB installation is required. Only if you install from source will the RUnit tests need a local MongoDB installation.

Afterwards you can load rmongodb as any other package:

library(rmongodb)

First Steps in rmongodb Package

Connect to MongoDB

mongo <- mongo.create()
mongo
## [1] 0
## attr(,"mongo")
## <pointer: 0x101709780>
## attr(,"class")
## [1] "mongo"
## attr(,"host")
## [1] "127.0.0.1"
## attr(,"name")
## [1] ""
## attr(,"username")
## [1] ""
## attr(,"password")
## [1] ""
## attr(,"db")
## [1] "admin"
## attr(,"timeout")
## [1] 0
mongo.is.connected(mongo)
## [1] TRUE
## [1] FALSE

Get Databases

mongo.get.databases(mongo)
## [1] "ccp"       "foo"       "grid"      "kinesis"   "rmongodb"  "rtraining"
## [7] "test"

Get Collections

db <- "rmongodb"
coll <- "rmongodb.zips"
mongo.get.database.collections(mongo, db)
## [1] "rmongodb.zips"

Get Size of Collection

mongo.count(mongo, coll)
## [1] 5351

Get Values for a Key

res <- mongo.distinct(mongo, coll, "city")
head(res)
## [1] "ACMAR"      "ADAMSVILLE" "ADGER"      "KEYSTONE"   "NEW SITE"  
## [6] "ALPINE"

Find some first data

cityone <- mongo.find.one(mongo, coll, "{\"city\":\"COLORADO CITY\"}")
cityone
##  city : 2     COLORADO CITY
##  loc : 4      
##      0 : 1    -112.952427
##      1 : 1    36.976266
## 
##  pop : 1      3065.000000
##  state : 2    AZ
##  _id : 2      86021
mongo.bson.to.list(cityone)
## $city
## [1] "COLORADO CITY"
## 
## $loc
## [1] -112.95   36.98
## 
## $pop
## [1] 3065
## 
## $state
## [1] "AZ"
## 
## $`_id`
## [1] "86021"

For a long time it was all about creating BSON objects

Since rmongodb version 1.2 you can use JSON directly.

buf <- mongo.bson.buffer.create()
mongo.bson.buffer.append(buf, "city", "COLORADO CITY")
## [1] TRUE
query <- mongo.bson.from.buffer(buf)
query
##  city : 2     COLORADO CITY
mongo.bson.from.JSON("{\"city\":\"COLORADO CITY\"}")
##  city : 2     COLORADO CITY

Find more data

pop <- mongo.distinct(mongo, coll, "pop")
hist(pop)

plot of chunk unnamed-chunk-12

boxplot(pop)

plot of chunk unnamed-chunk-12


mongo.count(mongo, coll, "{\"pop\":{\"$lte\":2}}")
## [1] 25
pops <- mongo.find.all(mongo, coll, "{\"pop\":{\"$lte\":2}}")
## Warning: This fails for most NoSQL data structures. I am working on a new
## solution
head(pops)
##     city            loc       pop state _id    
## val "ALLEN"         Numeric,2 0   "AL"  "36419"
## val "CHEVAK"        Numeric,2 0   "AK"  "99563"
## val "CROOKED CREEK" Numeric,2 1   "AK"  "99575"
## val "EMMONAK"       Numeric,2 0   "AK"  "99581"
## val "GRAYLING"      Numeric,2 0   "AK"  "99590"
## val "NAKNEK"        Numeric,2 0   "AK"  "99633"
dim(pops)
## [1] 25  5

Find more data with a more complex query

library(jsonlite)
json <- "{\"pop\":{\"$lte\":2}, \"pop\":{\"$gte\":1}}"
cat(prettify(json))
## {
##  "pop" : {
##      "$lte" : 2
##  },
##  "pop" : {
##      "$gte" : 1
##  }
## }
validate(json)
## [1] TRUE
mongo.count(mongo, coll, json)
## [1] 4
pops <- mongo.find.all(mongo, coll, json)
## Warning: This fails for most NoSQL data structures. I am working on a new
## solution
head(pops)
##     city               loc       pop state _id    
## val "CROOKED CREEK"    Numeric,2 1   "AK"  "99575"
## val "HUALAPAI"         Numeric,2 2   "AZ"  "86412"
## val "IRVINE"           Numeric,2 1   "CA"  "92718"
## val "KENNEDY SPACE CE" Numeric,2 1   "FL"  "32815"
dim(pops)
## [1] 4 5

# still inefficient!
mongo.cursor.to.data.frame
## function (cursor, nullToNA = TRUE, ...) 
## {
##     warning("This fails for most NoSQL data structures. I am working on a new solution")
##     res <- data.frame()
##     while (mongo.cursor.next(cursor)) {
##         val <- mongo.bson.to.list(mongo.cursor.value(cursor))
##         if (nullToNA == TRUE) 
##             val[sapply(val, is.null)] <- NA
##         val <- val[sapply(val, class) != "mongo.oid"]
##         res <- rbind(res, as.data.frame(val, ...))
##     }
##     as.data.frame(res)
## }
## <environment: namespace:rmongodb>

Insert some data to MongoDB

# insert data
icoll <- paste(db, "test", sep = ".")
a <- mongo.bson.from.JSON("{\"ident\":\"a\", \"name\":\"Markus\", \"age\":33}")
b <- mongo.bson.from.JSON("{\"ident\":\"b\", \"name\":\"MongoSoup\", \"age\":1}")
c <- mongo.bson.from.JSON("{\"ident\":\"c\", \"name\":\"UseR\", \"age\":18}")
mongo.insert.batch(mongo, icoll, list(a, b, c))
## [1] TRUE

mongo.get.database.collections(mongo, db)
## [1] "rmongodb.zips" "rmongodb.test"
mongo.find.all(mongo, icoll)
## Warning: This fails for most NoSQL data structures. I am working on a new
## solution
##     _id ident name        age
## val 0   "a"   "Markus"    33 
## val 1   "b"   "MongoSoup" 1  
## val 0   "c"   "UseR"      18

Update documents in MongoDB

mongo.update(mongo, icoll, "{\"ident\":\"b\"}", "{\"$inc\":{\"age\":3}}")
## [1] TRUE

mongo.find.all(mongo, icoll)
## Warning: This fails for most NoSQL data structures. I am working on a new
## solution
##     _id ident name        age
## val 1   "a"   "Markus"    33 
## val 1   "b"   "MongoSoup" 4  
## val 1   "c"   "UseR"      18

Create indices for efficient queries

mongo.index.create(mongo, icoll, "{\"ident\":1}")
## NULL
# check mongoshell!

Drop / Remove collections and databases and Close Connection to MongoDB

mongo.drop(mongo, icoll)
## [1] TRUE
mongo.drop.database(mongo, db)
## [1] TRUE
mongo.get.database.collections(mongo, db)
## character(0)

# close connection
mongo.destroy(mongo)
## NULL

Advanced Steps in rmongodb Package

mongo <- mongo.create()

Insert Big Data

data(zips)
head(zips)
##      city         loc       pop   state _id    
## [1,] "ACMAR"      Numeric,2 6055  "AL"  "35004"
## [2,] "ADAMSVILLE" Numeric,2 10616 "AL"  "35005"
## [3,] "ADGER"      Numeric,2 3205  "AL"  "35006"
## [4,] "KEYSTONE"   Numeric,2 14218 "AL"  "35007"
## [5,] "NEW SITE"   Numeric,2 19942 "AL"  "35010"
## [6,] "ALPINE"     Numeric,2 3062  "AL"  "35014"
zips[1, ]$loc
## [1] -86.52  33.58

res <- list(length(dim(zips)[1]))
for (i in 1:dim(zips)[1]) {
    tmp <- zips[i, ]
    res[[i]] <- mongo.bson.from.list(tmp)
}
mongo.insert.batch(mongo, "rmongodb.zips", res)
## [1] FALSE

mongo.count(mongo, icoll)
## [1] 0
mongo.find.all(mongo, icoll)
## Warning: This fails for most NoSQL data structures. I am working on a new
## solution
## list()

MongoDB Aggregation Framework

JSON to BSON conversion is not yet working for this scenario

buf <- mongo.bson.buffer.create()
mongo.bson.buffer.start.object(buf, "$group")
## [1] TRUE
mongo.bson.buffer.append(buf, "_id", "$state")
## [1] TRUE
mongo.bson.buffer.start.object(buf, "totalPop")
## [1] TRUE
mongo.bson.buffer.append(buf, "$sum", "$pop")
## [1] TRUE
mongo.bson.buffer.finish.object(buf)
## [1] TRUE
mongo.bson.buffer.finish.object(buf)
## [1] TRUE
bson <- mongo.bson.from.buffer(buf)

bufall <- mongo.bson.buffer.create()
mongo.bson.buffer.append(bufall, "aggregate", "zips")
## [1] TRUE
mongo.bson.buffer.start.array(bufall, "pipeline")
## [1] TRUE
mongo.bson.buffer.append(bufall, "0", bson)
## [1] TRUE
mongo.bson.buffer.finish.object(bufall)
## [1] TRUE
cmd <- mongo.bson.from.buffer(bufall)
cmd
##  aggregate : 2    zips
##  pipeline : 4     
##      0 : 3    
##          $group : 3   
##              _id : 2      $state
##              totalPop : 3     
##                  $sum : 2     $pop
res <- mongo.command(mongo, db, cmd)
res
##  result : 4   
##      0 : 3    
##          _id : 2      FL
##          totalPop : 1     12937926.000000
## 
##      1 : 3    
##          _id : 2      AL
##          totalPop : 1     4040587.000000
## 
##      2 : 3    
##          _id : 2      AZ
##          totalPop : 1     3665228.000000
## 
##      3 : 3    
##          _id : 2      DC
##          totalPop : 1     606900.000000
## 
##      4 : 3    
##          _id : 2      CO
##          totalPop : 1     3294394.000000
## 
##      5 : 3    
##          _id : 2      AR
##          totalPop : 1     2350725.000000
## 
##      6 : 3    
##          _id : 2      DE
##          totalPop : 1     666168.000000
## 
##      7 : 3    
##          _id : 2      GA
##          totalPop : 1     6478216.000000
## 
##      8 : 3    
##          _id : 2      CT
##          totalPop : 1     3287116.000000
## 
##      9 : 3    
##          _id : 2      CA
##          totalPop : 1     29760021.000000
## 
##      10 : 3   
##          _id : 2      AK
##          totalPop : 1     550043.000000
## 
## 
##  ok : 1   1.000000

GridFS with rmongodb

GridFS is a specification for storing and retrieving files that exceed the BSON-document size limit of 16MB.

mgrids <- mongo.gridfs.create(mongo, db, prefix = "fs")
mongo.gridfs.store.file(mgrids, "faust.txt", "Faust")
## [1] TRUE
gf <- mongo.gridfs.find(mgrids, "Faust")
mongo.gridfile.get.length(gf)
## [1] 229649
mongo.gridfile.get.chunk.count(gf)
## [1] 1



# close connection
mongo.drop.database(mongo, db)
## [1] TRUE
mongo.destroy(mongo)
## NULL