library("quanteda")
## Package version: 3.2.5
## Unicode version: 14.0
## ICU version: 71.1
## Parallel computing: 10 of 10 threads used.
## See https://quanteda.io for tutorials and examples.
library("quanteda.textmodels")
quanteda.textmodels implements fast methods for fitting and predicting Naive Bayes textmodels built especially for sparse document-feature matrices from textual data. It implements two models: multinomial and Bernoulli. (See Manning, Raghavan, and Schütze 2008, Chapter 13.)
Here, we compare the performance of the two models, and then compare both to equivalent implementations from two other packages.
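As a minimal sketch of the interface (a toy example with made-up labels, not part of the benchmarks below, relying on the packages loaded above), both distributions are fitted through the same textmodel_nb() call:
# toy illustration with made-up data: fit both Naive Bayes variants
txt <- c("good great fun", "excellent plot", "bad boring", "awful acting")
dfmat_toy <- dfm(tokens(txt))
lab <- factor(c("pos", "pos", "neg", "neg"))
tmod_toy_multi <- textmodel_nb(dfmat_toy, lab, distribution = "multinomial")
tmod_toy_bern <- textmodel_nb(dfmat_toy, lab, distribution = "Bernoulli")
predict(tmod_toy_multi, newdata = dfmat_toy)  # predicted classes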
For these tests, we use the dataset of 50,000 movie reviews from Maas et al. (2011), along with their partition into training and test sets for fitting and evaluating the models.
# large movie review database of 50,000 movie reviews
load(url("https://quanteda.org/data/data_corpus_LMRD.rda"))
dfmat <- tokens(data_corpus_LMRD) %>%
  dfm()
dfmat_train <- dfm_subset(dfmat, set == "train")
dfmat_test <- dfm_subset(dfmat, set == "test")
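As a quick sanity check (not part of the original write-up), the partition should yield 25,000 training and 25,000 test reviews:
# confirm the train/test split sizes
ndoc(dfmat_train)
ndoc(dfmat_test)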
Comparing the performance of fitting the two models:
library("microbenchmark")
microbenchmark(
multi = textmodel_nb(dfmat_train, dfmat_train$polarity, distribution = "multinomial"),
bern = textmodel_nb(dfmat_train, dfmat_train$polarity, distribution = "Bernoulli"),
times = 20
)
## Warning in microbenchmark(multi = textmodel_nb(dfmat_train,
## dfmat_train$polarity, : less accurate nanosecond times to avoid potential
## integer overflows
## Unit: milliseconds
## expr min lq mean median uq max neval
## multi 57.51496 59.61240 60.94069 60.41127 61.40837 71.28518 20
## bern 65.50394 67.49475 74.66298 75.15694 80.91059 86.06970 20
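Beyond timing, the fitted object can be inspected directly; a brief aside, assuming the summary() and predict(type = "probability") methods documented for textmodel_nb objects:
# fit once and inspect the fitted model (an aside, not benchmarked)
tmod_multi <- textmodel_nb(dfmat_train, dfmat_train$polarity, distribution = "multinomial")
summary(tmod_multi)
# posterior class probabilities for the first few test documents
head(predict(tmod_multi, newdata = dfmat_test, type = "probability"))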
And for prediction:
microbenchmark(
multi = predict(textmodel_nb(dfmat_train, dfmat_train$polarity, distribution = "multinomial"),
newdata = dfmat_test),
bern = predict(textmodel_nb(dfmat_train, dfmat_train$polarity, distribution = "Bernoulli"),
newdata = dfmat_test),
times = 20
)
## Unit: milliseconds
## expr min lq mean median uq max neval
## multi 69.19012 70.62748 74.89261 72.12195 82.20672 84.92847 20
## bern 98.69926 103.15457 113.37922 109.13323 114.31378 209.02530 20
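Speed is only half the story, so as an aside (not part of the original benchmark), holdout accuracy can be tabulated against the true labels:
# holdout accuracy for the multinomial model (aside, not benchmarked)
pred_multi <- predict(textmodel_nb(dfmat_train, dfmat_train$polarity,
                                   distribution = "multinomial"),
                      newdata = dfmat_test)
table(predicted = pred_multi, actual = dfmat_test$polarity)
mean(pred_multi == dfmat_test$polarity)  # overall accuracy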
Now let’s see how textmodel_nb() compares to equivalent functions from other packages. Multinomial:
library("fastNaiveBayes")
library("naivebayes")
## naivebayes 0.9.7 loaded
microbenchmark(
  textmodels = {
    tmod <- textmodel_nb(dfmat_train, dfmat_train$polarity, smooth = 1, distribution = "multinomial")
    pred <- predict(tmod, newdata = dfmat_test)
  },
  fastNaiveBayes = {
    tmod <- fnb.multinomial(as(dfmat_train, "dgCMatrix"), y = dfmat_train$polarity, laplace = 1, sparse = TRUE)
    pred <- predict(tmod, newdata = as(dfmat_test, "dgCMatrix"))
  },
  naivebayes = {
    tmod <- multinomial_naive_bayes(as(dfmat_train, "dgCMatrix"), dfmat_train$polarity, laplace = 1)
    pred <- predict(tmod, newdata = as(dfmat_test, "dgCMatrix"))
  },
  times = 20
)
## Unit: milliseconds
## expr min lq mean median uq max neval
## textmodels 67.24242 68.50483 73.75045 69.56937 77.97316 89.5873 20
## fastNaiveBayes 92.94659 94.14844 98.15999 96.64456 99.96038 107.6829 20
## naivebayes 79.81048 81.43014 95.43330 85.33449 87.74892 278.7189 20
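As a consistency sketch (not in the original comparison, and assuming fastNaiveBayes's default class-label predictions), the class predictions of the implementations can be compared directly:
# check agreement between two of the implementations (assumed check)
pred_qtd <- predict(textmodel_nb(dfmat_train, dfmat_train$polarity,
                                 smooth = 1, distribution = "multinomial"),
                    newdata = dfmat_test)
pred_fnb <- predict(fnb.multinomial(as(dfmat_train, "dgCMatrix"),
                                    y = dfmat_train$polarity, laplace = 1, sparse = TRUE),
                    newdata = as(dfmat_test, "dgCMatrix"))
mean(as.character(pred_qtd) == as.character(pred_fnb))  # share of identical labels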
And Bernoulli. Note here that while we are supplying the Boolean matrix to textmodel_nb(), this re-weighting from the count matrix would have been performed automatically within the function had we not done so in advance; it is done here just for comparison.
dfmat_train_bern <- dfm_weight(dfmat_train, scheme = "boolean")
dfmat_test_bern <- dfm_weight(dfmat_test, scheme = "boolean")
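A small check (not in the original vignette) based on the claim above: pre-weighting and letting textmodel_nb() convert the counts itself should give identical predictions.
# predictions should match whether we pre-weight or let textmodel_nb()
# convert counts to Boolean values itself
tmod_counts <- textmodel_nb(dfmat_train, dfmat_train$polarity, distribution = "Bernoulli")
tmod_bool <- textmodel_nb(dfmat_train_bern, dfmat_train$polarity, distribution = "Bernoulli")
identical(predict(tmod_counts, newdata = dfmat_test),
          predict(tmod_bool, newdata = dfmat_test))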
microbenchmark(
  textmodels = {
    tmod <- textmodel_nb(dfmat_train_bern, dfmat_train$polarity, smooth = 1, distribution = "Bernoulli")
    pred <- predict(tmod, newdata = dfmat_test)
  },
  fastNaiveBayes = {
    tmod <- fnb.bernoulli(as(dfmat_train_bern, "dgCMatrix"), y = dfmat_train$polarity, laplace = 1, sparse = TRUE)
    pred <- predict(tmod, newdata = as(dfmat_test_bern, "dgCMatrix"))
  },
  naivebayes = {
    tmod <- bernoulli_naive_bayes(as(dfmat_train_bern, "dgCMatrix"), dfmat_train$polarity, laplace = 1)
    pred <- predict(tmod, newdata = as(dfmat_test_bern, "dgCMatrix"))
  },
  times = 20
)
## Unit: milliseconds
## expr min lq mean median uq max neval
## textmodels 97.20604 100.37224 107.3485 107.9191 111.5670 124.8818 20
## fastNaiveBayes 105.96520 107.88027 113.5174 115.6564 117.6728 120.8316 20
## naivebayes 92.22064 94.37407 103.9120 100.9323 105.4600 185.5323 20
Maas, Andrew L., Raymond E. Daly, Peter T. Pham, Dan Huang, Andrew Y. Ng, and Christopher Potts (2011). “Learning Word Vectors for Sentiment Analysis”. The 49th Annual Meeting of the Association for Computational Linguistics (ACL 2011).
Majka, M. (2020). naivebayes: High Performance Implementation of the Naive Bayes Algorithm in R. R package version 0.9.7. https://CRAN.R-project.org/package=naivebayes.
Manning, Christopher D., Prabhakar Raghavan, and Hinrich Schütze (2008). Introduction to Information Retrieval. Cambridge University Press.
Skogholt, M. (2020). fastNaiveBayes: Extremely Fast Implementation of a Naive Bayes Classifier. R package version 2.2.1. https://github.com/mskogholt/fastNaiveBayes.