textmodel Performance Comparisons

Kenneth Benoit

library("quanteda.textmodels")
library("quanteda")
## Package version: 2.9.9000
## Unicode version: 10.0
## ICU version: 61.1
## Parallel computing: 12 of 12 threads used.
## See https://quanteda.io for tutorials and examples.
## 
## Attaching package: 'quanteda'
## The following object is masked from 'package:quanteda.textmodels':
## 
##     data_dfm_lbgexample

Naive Bayes

quanteda.textmodels implements fast methods for fitting and predicting Naive Bayes textmodels built especially for sparse document-feature matrices from textual data. It implements two models: multinomial and Bernoulli. (See Manning, Raghavan, and Schütze 2008, Chapter 13.)

Here, we compare performance for the two models, and then to the performance from two other packages for fitting these models.

For these tests, we will choose the dataset of 50,000 movie reviews from Maas et. al. (2011). We will use their partition into test and training sets for training and fitting our models.

# large movie review database of 50,000 movie reviews
load(url("https://www.dropbox.com/s/sjdfmx8ggwfda5o/data_corpus_LMRD.rda?dl=1"))

dfmat <- dfm(data_corpus_LMRD)
## Warning: 'dfm.corpus()' is deprecated. Use 'tokens()' first.
dfmat_train <- dfm_subset(dfmat, set == "train")
dfmat_test <- dfm_subset(dfmat, set == "test")

Comparing the performance of fitting the model:

library("microbenchmark")
microbenchmark(
    multi = textmodel_nb(dfmat_train, dfmat_train$polarity, distribution = "multinomial"),
    bern = textmodel_nb(dfmat_train, dfmat_train$polarity, distribution = "Bernoulli"),
    times = 50
)
## Unit: milliseconds
##   expr      min       lq     mean   median       uq      max neval
##  multi 120.8378 126.7613 148.6513 137.5227 156.2886 222.3262    50
##   bern 136.0139 146.3253 176.0036 158.7573 200.1413 302.9387    50

And for prediction:

microbenchmark(
    multi = predict(textmodel_nb(dfmat_train, dfmat_train$polarity, distribution = "multinomial"),
                    newdata = dfmat_test),
    bern = predict(textmodel_nb(dfmat_train, dfmat_train$polarity, distribution = "Bernoulli"),
                   newdata = dfmat_test),
    times = 50
)
## Unit: milliseconds
##   expr      min       lq     mean   median       uq      max neval
##  multi 138.6582 144.2464 160.8712 148.4776 159.7530 243.6032    50
##   bern 186.8728 195.2588 223.5854 203.6466 248.1028 434.6949    50

Now let’s see how textmodel_nb() compares to equivalent functions from other packages. Multinomial:

library("fastNaiveBayes")
library("naivebayes")
## naivebayes 0.9.7 loaded

microbenchmark(
    textmodels = {
      tmod <-  textmodel_nb(dfmat_train, dfmat_train$polarity, smooth = 1, distribution = "multinomial")
      pred <- predict(tmod, newdata = dfmat_test)
    },
    fastNaiveBayes = { 
      tmod <- fnb.multinomial(as(dfmat_train, "dgCMatrix"), y = dfmat_train$polarity, laplace = 1, sparse = TRUE)
      pred <- predict(tmod, newdata = as(dfmat_test, "dgCMatrix"))
    },
    naivebayes = {
      tmod = multinomial_naive_bayes(as(dfmat_train, "dgCMatrix"), dfmat_train$polarity, laplace = 1)
      pred <- predict(tmod, newdata = as(dfmat_test, "dgCMatrix"))
    },
    times = 50
)
## Unit: milliseconds
##            expr      min       lq     mean   median       uq      max neval
##      textmodels 127.9517 147.1456 165.0356 155.7075 191.6396 248.1058    50
##  fastNaiveBayes 171.2638 204.6710 226.5297 223.9563 239.2497 307.8638    50
##      naivebayes 134.0913 166.7573 187.9579 178.8508 195.6350 413.2840    50

And Bernoulli. Note here that while we are supplying the boolean matrix to textmodel_nb(), this re-weighting from the count matrix would have been performed automatically within the function had we not done so in advance - it’s done here just for comparison.

dfmat_train_bern <- dfm_weight(dfmat_train, scheme = "boolean")
dfmat_test_bern <- dfm_weight(dfmat_test, scheme = "boolean")

microbenchmark(
    textmodels = {
      tmod <-  textmodel_nb(dfmat_train_bern, dfmat_train$polarity, smooth = 1, distribution = "Bernoulli")
      pred <- predict(tmod, newdata = dfmat_test)
    },
    fastNaiveBayes = { 
      tmod <- fnb.bernoulli(as(dfmat_train_bern, "dgCMatrix"), y = dfmat_train$polarity, laplace = 1, sparse = TRUE)
      pred <- predict(tmod, newdata = as(dfmat_test_bern, "dgCMatrix"))
    },
    naivebayes = {
      tmod = bernoulli_naive_bayes(as(dfmat_train_bern, "dgCMatrix"), dfmat_train$polarity, laplace = 1)
      pred <- predict(tmod, newdata = as(dfmat_test_bern, "dgCMatrix"))
    },
    times = 50
)
## Unit: milliseconds
##            expr      min       lq     mean   median       uq      max neval
##      textmodels 189.9112 205.2109 245.0538 233.4364 267.6617 423.6066    50
##  fastNaiveBayes 235.8041 245.0874 274.9936 277.9706 298.1427 347.1536    50
##      naivebayes 181.5758 190.1133 207.9998 196.4659 218.2553 283.0401    50

😎

References

Maas, Andrew L., Raymond E. Daly, Peter T. Pham, Dan Huang, Andrew Y. Ng, and Christopher Potts (2011). “Learning Word Vectors for Sentiment Analysis”. The 49th Annual Meeting of the Association for Computational Linguistics (ACL 2011).

Majka M (2020). naivebayes: High Performance Implementation of the Naive Bayes Algorithm in R. R package version 0.9.7, <URL: https://CRAN.R-project.org/package=naivebayes>. Date: 2020-03-08.

Manning, Christopher D., Prabhakar Raghavan, and Hinrich Schütze (2008). Introduction to Information Retrieval. Cambridge University Press.

Skogholt, Martin (2020). fastNaiveBayes: Extremely Fast Implementation of a Naive Bayes Classifier. R package version 2.2.0. https://github.com/mskogholt/fastNaiveBayes. Date: 2020-02-23.