MongoDB (www.mongodb.org) is a scalable, high-performance, document-oriented NoSQL database. The rmongodb package provides an interface from R (www.r-project.org) to MongoDB and back using the mongodb-C library.
There is a stable CRAN version of rmongodb available:
# Install the released rmongodb package from CRAN.
install.packages("rmongodb")
You can also install the latest development version from GitHub:
library(devtools)
# install_github() takes the repository as one "owner/repo" string. The older
# (repo, username) call form is deprecated in devtools, where a second
# positional argument is now interpreted as the git ref instead of the owner.
install_github("mongosoup/rmongodb")
The installation should be very simple. No local MongoDB installation is required. Only if you install from source will the RUnit tests need a local MongoDB installation.
Afterwards you can load rmongodb like any other package:
# Attach the package, then open a connection using mongo.create()'s defaults.
library(rmongodb)
mongo <- mongo.create()
# Printing the handle shows its attributes (host, db, timeout, ...) below.
mongo
## [1] 0
## attr(,"mongo")
## <pointer: 0x101709780>
## attr(,"class")
## [1] "mongo"
## attr(,"host")
## [1] "127.0.0.1"
## attr(,"name")
## [1] ""
## attr(,"username")
## [1] ""
## attr(,"password")
## [1] ""
## attr(,"db")
## [1] "admin"
## attr(,"timeout")
## [1] 0
# Check the state of the connection handle before issuing queries.
mongo.is.connected(mongo)
## [1] TRUE
## [1] FALSE
# NOTE(review): two results are recorded above for a single call; presumably
# a second statement was lost from this transcript. Confirm against the
# original document.
# List the database names visible through this connection.
mongo.get.databases(mongo)
## [1] "ccp" "foo" "grid" "kinesis" "rmongodb" "rtraining"
## [7] "test"
# Database name and fully qualified collection name ("<db>.<collection>")
# used by the queries below.
db <- "rmongodb"
coll <- "rmongodb.zips"
# List the collections in `db`.
mongo.get.database.collections(mongo, db)
## [1] "rmongodb.zips"
# Number of documents in the collection.
mongo.count(mongo, coll)
## [1] 5351
# Distinct values of the "city" field; only the first few are shown.
res <- mongo.distinct(mongo, coll, "city")
head(res)
## [1] "ACMAR" "ADAMSVILLE" "ADGER" "KEYSTONE" "NEW SITE"
## [6] "ALPINE"
# Fetch a single document matching the JSON query and print it as BSON.
cityone <- mongo.find.one(mongo, coll, "{\"city\":\"COLORADO CITY\"}")
cityone
## city : 2 COLORADO CITY
## loc : 4
## 0 : 1 -112.952427
## 1 : 1 36.976266
##
## pop : 1 3065.000000
## state : 2 AZ
## _id : 2 86021
# Convert the BSON document into a plain R list (printed below).
mongo.bson.to.list(cityone)
## $city
## [1] "COLORADO CITY"
##
## $loc
## [1] -112.95 36.98
##
## $pop
## [1] 3065
##
## $state
## [1] "AZ"
##
## $`_id`
## [1] "86021"
Since rmongodb version 1.2 you can use JSON directly instead of assembling a query with a BSON buffer. Both approaches below produce the same query:
# Build the query {"city": "COLORADO CITY"} by hand with a BSON buffer ...
buf <- mongo.bson.buffer.create()
mongo.bson.buffer.append(buf, "city", "COLORADO CITY")
## [1] TRUE
query <- mongo.bson.from.buffer(buf)
query
## city : 2 COLORADO CITY
# ... or parse the same query straight from a JSON string.
mongo.bson.from.JSON("{\"city\":\"COLORADO CITY\"}")
## city : 2 COLORADO CITY
# Pull the distinct "pop" values and plot their distribution.
pop <- mongo.distinct(mongo, coll, "pop")
hist(pop)
boxplot(pop)
# Count, then fetch, the documents whose "pop" is at most 2.
mongo.count(mongo, coll, "{\"pop\":{\"$lte\":2}}")
## [1] 25
pops <- mongo.find.all(mongo, coll, "{\"pop\":{\"$lte\":2}}")
## Warning: This fails for most NoSQL data structures. I am working on a new
## solution
head(pops)
## city loc pop state _id
## val "ALLEN" Numeric,2 0 "AL" "36419"
## val "CHEVAK" Numeric,2 0 "AK" "99563"
## val "CROOKED CREEK" Numeric,2 1 "AK" "99575"
## val "EMMONAK" Numeric,2 0 "AK" "99581"
## val "GRAYLING" Numeric,2 0 "AK" "99590"
## val "NAKNEK" Numeric,2 0 "AK" "99633"
# One row per matching document, one column per field.
dim(pops)
## [1] 25 5
# jsonlite supplies prettify() and validate() for inspecting the query string.
library(jsonlite)
# NOTE(review): the key "pop" appears twice in this JSON object. A single
# {"pop": {"$lte": 2, "$gte": 1}} would not rely on how duplicate keys are
# handled; confirm the intended query.
json <- "{\"pop\":{\"$lte\":2}, \"pop\":{\"$gte\":1}}"
cat(prettify(json))
## {
## "pop" : {
## "$lte" : 2
## },
## "pop" : {
## "$gte" : 1
## }
## }
validate(json)
## [1] TRUE
# Count, then fetch, the documents matching the query.
mongo.count(mongo, coll, json)
## [1] 4
pops <- mongo.find.all(mongo, coll, json)
## Warning: This fails for most NoSQL data structures. I am working on a new
## solution
head(pops)
## city loc pop state _id
## val "CROOKED CREEK" Numeric,2 1 "AK" "99575"
## val "HUALAPAI" Numeric,2 2 "AZ" "86412"
## val "IRVINE" Numeric,2 1 "CA" "92718"
## val "KENNEDY SPACE CE" Numeric,2 1 "FL" "32815"
dim(pops)
## [1] 4 5
# still inefficient!
# Typing the function name without parentheses prints its source (shown
# below): it grows `res` with rbind() once per cursor step.
mongo.cursor.to.data.frame
## function (cursor, nullToNA = TRUE, ...)
## {
## warning("This fails for most NoSQL data structures. I am working on a new solution")
## res <- data.frame()
## while (mongo.cursor.next(cursor)) {
## val <- mongo.bson.to.list(mongo.cursor.value(cursor))
## if (nullToNA == TRUE)
## val[sapply(val, is.null)] <- NA
## val <- val[sapply(val, class) != "mongo.oid"]
## res <- rbind(res, as.data.frame(val, ...))
## }
## as.data.frame(res)
## }
## <environment: namespace:rmongodb>
# insert data
# Target collection name: "<db>.test".
icoll <- paste(db, "test", sep = ".")
# Three documents parsed from JSON strings.
a <- mongo.bson.from.JSON("{\"ident\":\"a\", \"name\":\"Markus\", \"age\":33}")
b <- mongo.bson.from.JSON("{\"ident\":\"b\", \"name\":\"MongoSoup\", \"age\":1}")
c <- mongo.bson.from.JSON("{\"ident\":\"c\", \"name\":\"UseR\", \"age\":18}")
# Insert all three documents in one batch.
mongo.insert.batch(mongo, icoll, list(a, b, c))
## [1] TRUE
# The new collection is now listed next to rmongodb.zips.
mongo.get.database.collections(mongo, db)
## [1] "rmongodb.zips" "rmongodb.test"
mongo.find.all(mongo, icoll)
## Warning: This fails for most NoSQL data structures. I am working on a new
## solution
## _id ident name age
## val 0 "a" "Markus" 33
## val 1 "b" "MongoSoup" 1
## val 0 "c" "UseR" 18
# Increment "age" by 3 on the document with ident "b" (1 -> 4 below).
mongo.update(mongo, icoll, "{\"ident\":\"b\"}", "{\"$inc\":{\"age\":3}}")
## [1] TRUE
mongo.find.all(mongo, icoll)
## Warning: This fails for most NoSQL data structures. I am working on a new
## solution
## _id ident name age
## val 1 "a" "Markus" 33
## val 1 "b" "MongoSoup" 4
## val 1 "c" "UseR" 18
# Create an index on the "ident" field.
mongo.index.create(mongo, icoll, "{\"ident\":1}")
## NULL
# check mongoshell!
# Clean up: drop the test collection, then the whole database.
mongo.drop(mongo, icoll)
## [1] TRUE
mongo.drop.database(mongo, db)
## [1] TRUE
# No collections remain in `db`.
mongo.get.database.collections(mongo, db)
## character(0)
# close connection
mongo.destroy(mongo)
## NULL
# Open a fresh connection and load the `zips` dataset.
mongo <- mongo.create()
data(zips)
head(zips)
## city loc pop state _id
## [1,] "ACMAR" Numeric,2 6055 "AL" "35004"
## [2,] "ADAMSVILLE" Numeric,2 10616 "AL" "35005"
## [3,] "ADGER" Numeric,2 3205 "AL" "35006"
## [4,] "KEYSTONE" Numeric,2 14218 "AL" "35007"
## [5,] "NEW SITE" Numeric,2 19942 "AL" "35010"
## [6,] "ALPINE" Numeric,2 3062 "AL" "35014"
# Each row's "loc" cell holds a length-2 numeric vector.
zips[1, ]$loc
## [1] -86.52 33.58
# Convert every row of `zips` into a BSON document for the batch insert.
# vector("list", n) preallocates n empty slots. The previous
# list(length(dim(zips)[1])) only created a one-element list holding the
# number 1, so the list was silently grown on every iteration.
# seq_len() yields an empty sequence for zero rows, whereas 1:0 would
# iterate over c(1, 0).
res <- vector("list", nrow(zips))
for (i in seq_len(nrow(zips))) {
  tmp <- zips[i, ]
  res[[i]] <- mongo.bson.from.list(tmp)
}
# Insert the list of BSON documents in `res` in one batch.
mongo.insert.batch(mongo, "rmongodb.zips", res)
## [1] FALSE
# NOTE(review): the insert above reports FALSE, and the checks below query
# `icoll` ("rmongodb.test") rather than "rmongodb.zips". Confirm which
# collection was meant to be verified.
mongo.count(mongo, icoll)
## [1] 0
mongo.find.all(mongo, icoll)
## Warning: This fails for most NoSQL data structures. I am working on a new
## solution
## list()
JSON to BSON conversion is not yet working for this scenario, so the aggregation command is assembled with BSON buffers:
# Build the pipeline stage
# {"$group": {"_id": "$state", "totalPop": {"$sum": "$pop"}}}.
buf <- mongo.bson.buffer.create()
mongo.bson.buffer.start.object(buf, "$group")
## [1] TRUE
mongo.bson.buffer.append(buf, "_id", "$state")
## [1] TRUE
mongo.bson.buffer.start.object(buf, "totalPop")
## [1] TRUE
mongo.bson.buffer.append(buf, "$sum", "$pop")
## [1] TRUE
# Close "totalPop" first, then the enclosing "$group" object.
mongo.bson.buffer.finish.object(buf)
## [1] TRUE
mongo.bson.buffer.finish.object(buf)
## [1] TRUE
bson <- mongo.bson.from.buffer(buf)
# Wrap the stage in the command {"aggregate": "zips", "pipeline": [<stage>]}.
bufall <- mongo.bson.buffer.create()
mongo.bson.buffer.append(bufall, "aggregate", "zips")
## [1] TRUE
mongo.bson.buffer.start.array(bufall, "pipeline")
## [1] TRUE
# "0" is the key of the first (and only) element of the pipeline array.
mongo.bson.buffer.append(bufall, "0", bson)
## [1] TRUE
# finish.object() is used here to close the array opened by start.array().
mongo.bson.buffer.finish.object(bufall)
## [1] TRUE
cmd <- mongo.bson.from.buffer(bufall)
# Print the assembled command to check its structure.
cmd
## aggregate : 2 zips
## pipeline : 4
## 0 : 3
## $group : 3
## _id : 2 $state
## totalPop : 3
## $sum : 2 $pop
# Run the command against `db`. The reply printed below holds one
# {_id, totalPop} entry per state plus an "ok" status field.
res <- mongo.command(mongo, db, cmd)
res
## result : 4
## 0 : 3
## _id : 2 FL
## totalPop : 1 12937926.000000
##
## 1 : 3
## _id : 2 AL
## totalPop : 1 4040587.000000
##
## 2 : 3
## _id : 2 AZ
## totalPop : 1 3665228.000000
##
## 3 : 3
## _id : 2 DC
## totalPop : 1 606900.000000
##
## 4 : 3
## _id : 2 CO
## totalPop : 1 3294394.000000
##
## 5 : 3
## _id : 2 AR
## totalPop : 1 2350725.000000
##
## 6 : 3
## _id : 2 DE
## totalPop : 1 666168.000000
##
## 7 : 3
## _id : 2 GA
## totalPop : 1 6478216.000000
##
## 8 : 3
## _id : 2 CT
## totalPop : 1 3287116.000000
##
## 9 : 3
## _id : 2 CA
## totalPop : 1 29760021.000000
##
## 10 : 3
## _id : 2 AK
## totalPop : 1 550043.000000
##
##
## ok : 1 1.000000
GridFS is a specification for storing and retrieving files that exceed the BSON-document size limit of 16MB.
# Open a GridFS handle on `db` using the "fs" prefix.
mgrids <- mongo.gridfs.create(mongo, db, prefix = "fs")
# Store faust.txt in GridFS; it is looked up as "Faust" below.
mongo.gridfs.store.file(mgrids, "faust.txt", "Faust")
## [1] TRUE
# Find the stored file again and inspect its length and chunk count.
gf <- mongo.gridfs.find(mgrids, "Faust")
mongo.gridfile.get.length(gf)
## [1] 229649
mongo.gridfile.get.chunk.count(gf)
## [1] 1
# close connection
# Drop the demo database first, then release the connection handle.
mongo.drop.database(mongo, db)
## [1] TRUE
mongo.destroy(mongo)
## NULL