On most platforms, starting the MongoDB server is as easy as:
mongod
If this is the first time you are running MongoDB, you first need to create the
directory /data/db and make it writable (use C:/data/db
on Windows).
# Open a handle to the "diamonds" collection on the local mongod
library(mongolite)
m <- mongo(collection = "diamonds")

# Load the diamonds data frame from ggplot2 and insert it as test data
data(diamonds, package = "ggplot2")
m$insert(diamonds)

# The stored record count is shown next to the row count for comparison
m$count()
nrow(diamonds)

# Query: documents with cut "Premium" and price below 1000
premium_query <- '{"cut" : "Premium", "price" : { "$lt" : 1000 } }'
out <- m$find(premium_query)

# Compare the query result with the equivalent in-memory subset
nrow(out)
nrow(subset(diamonds, cut == "Premium" & price < 1000))

# Cross-table of cut by color, computed server-side with map-reduce
emit_cut_color <- "function(){emit({cut:this.cut, color:this.color}, 1)}"
sum_counts <- "function(id, counts){return Array.sum(counts)}"
tbl <- m$mapreduce(map = emit_cut_color, reduce = sum_counts)

# The in-memory equivalent of the cross-table above
data.frame(with(diamonds, table(cut, color)))

# Export the collection as JSON lines through a file connection
tmp <- tempfile()
m$export(file(tmp))

# Stream the exported file back into R
library(jsonlite)
mydata <- stream_in(file(tmp))

# Alternatively, import the same file into a second collection;
# the count is printed before and after the import
m2 <- mongo(collection = "diamonds2")
m2$count()
m2$import(file(tmp))
m2$count()

# Clean up: drop both collections
m$drop()
m2$drop()
Some example queries from the dplyr tutorials.
# Load the flights data from nycflights13 and insert it into "nycflights"
data(flights, package = "nycflights13")
m <- mongo(collection = "nycflights")
m$insert(flights)

# Basic filter: count, then fetch, the documents with month 1 and day 1
m$count('{"month":1, "day":1}')
jan1 <- m$find('{"month":1, "day":1}')

# Same filter, sorted by distance in descending order
jan1 <- m$find(
  '{"month":1,"day":1}',
  sort = '{"distance":-1}'
)
head(jan1)

# Sorting a large result set requires an index on the sort field
m$index(add = "distance")
allflights <- m$find(sort = '{"distance":-1}')

# Column selection: keep distance and carrier, exclude _id
jan1 <- m$find(
  '{"month":1,"day":1}',
  fields = '{"_id":0, "distance":1, "carrier":1}'
)

# Unique carriers: overall, then restricted to distance greater than 3000
m$distinct("carrier")
m$distinct("carrier", '{"distance":{"$gt":3000}}')

# Tabulate: per-carrier document count and average distance
carrier_summary <- '[{"$group":{"_id":"$carrier", "count": {"$sum":1}, "average":{"$avg":"$distance"}}}]'
m$aggregate(carrier_summary)

# Map-reduce binning: count documents per 100-wide distance bin
emit_distance_bin <- "function(){emit(Math.floor(this.distance/100)*100, 1)}"
sum_counts <- "function(id, counts){return Array.sum(counts)}"
hist <- m$mapreduce(map = emit_distance_bin, reduce = sum_counts)

# Dump the collection to a temporary file in BSON format
dump <- tempfile()
m$export(file(dump), bson = TRUE)

# Drop the collection
m$drop()

# Restore from the BSON dump; the count is printed before and after
m$count()
m$import(file(dump), bson = TRUE)
m$count()
Example data with zipcodes from the mongolite tutorial. This dataset has an _id
column, so you cannot insert it more than once.
library(jsonlite)
library(mongolite)

# Stream the JSON from the URL and insert each streamed page into "zips"
m <- mongo(collection = "zips", verbose = FALSE)
insert_zips_page <- function(df) {
  m$insert(df)
}
stream_in(
  url("http://media.mongodb.org/zips.json"),
  handler = insert_zips_page
)

# Number of documents now stored in the collection
m$count()

# Read everything back into R.
# Note: the 'location' column is actually an array!
zips <- m$find()

# Drop the collection
m$drop()
Stream a large bulk sample with deeply nested data from OpenWeatherMap (this takes a while).
# Stream the gzipped bulk sample and insert it into "weather",
# 50 records per page
m <- mongo(collection = "weather", verbose = FALSE)
insert_weather_page <- function(df) {
  m$insert(df)
}
stream_in(
  gzcon(url("http://bulk.openweathermap.org/sample/daily_14.json.gz")),
  handler = insert_weather_page,
  pagesize = 50
)

# Query on the nested field city.name and print the nested data column
berlin <- m$find('{"city.name" : "Berlin"}')
print(berlin$data)