Overview

Package provides pipe-style interface for data.table package. It preserves all data.table features without significant impact on performance. let and take functions are simplified interfaces for most common data manipulation tasks.

     mtcars %>%
        let(mpg_hp = mpg/hp) %>%
        take(mean(mpg_hp), by = am)
      mtcars %>%
         let(new_var = 42,
             new_var2 = new_var*hp) %>%
         head()
    iris %>%
      let_all(
          scaled = (.x - mean(.x))/sd(.x),
          by = Species) %>%
       head()
    iris %>%
      take_all(
          mean = if(startsWith(.name, "Sepal")) mean(.x),
          median = if(startsWith(.name, "Petal")) median(.x),
          by = Species
      )
    new_var = "my_var"
    old_var = "mpg"
    mtcars %>%
        let((new_var) := get(old_var)*2) %>%
        head()
     
    # or,  
    expr = quote(mean(cyl))
    mtcars %>% 
        let((new_var) := eval(expr)) %>% 
        head()
    
    # the same with `take` 
    by_var = "vs,am"
    take(mtcars, (new_var) := eval(expr), by = by_var)

query_if function translates its arguments one-to-one to [.data.table method. Additionally there are some conveniences such as automatic data.frame conversion to data.table.

vlookup & xlookup

Let’s make datasets for lookups:

workers = fread("
    name company
    Nick Acme
    John Ajax
    Daniela Ajax
")

positions = fread("
    name position
    John designer
    Daniela engineer
    Cathie manager
")

# xlookup
workers = let(workers,
  position = xlookup(name, positions$name, positions$position)
)

# vlookup
# by default we search in the first column and return values from second column
workers = let(workers,
  position = vlookup(name, positions, no_match = "Not found")
)

# the same 
workers = let(workers,
  position = vlookup(name, positions, 
                     result_column = "position", 
                     no_match = "Not found") # or, result_column = 2 
)

head(workers)
##       name company  position
## 1:    Nick    Acme Not found
## 2:    John    Ajax  designer
## 3: Daniela    Ajax  engineer

More examples

We will use for demonstartion well-known mtcars dataset and some examples from dplyr package.

library(maditr)
data(mtcars)

# Newly created variables are available immediately
mtcars %>%
    let(
        cyl2 = cyl * 2,
        cyl4 = cyl2 * 2
    ) %>% head()
##     mpg cyl disp  hp drat    wt  qsec vs am gear carb cyl2 cyl4
## 1: 21.0   6  160 110 3.90 2.620 16.46  0  1    4    4   12   24
## 2: 21.0   6  160 110 3.90 2.875 17.02  0  1    4    4   12   24
## 3: 22.8   4  108  93 3.85 2.320 18.61  1  1    4    1    8   16
## 4: 21.4   6  258 110 3.08 3.215 19.44  1  0    3    1   12   24
## 5: 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2   16   32
## 6: 18.1   6  225 105 2.76 3.460 20.22  1  0    3    1   12   24
# You can also use let() to remove variables and
# modify existing variables
mtcars %>%
    let(
        mpg = NULL,
        disp = disp * 0.0163871 # convert to litres
    ) %>% head()
##    cyl     disp  hp drat    wt  qsec vs am gear carb
## 1:   6 2.621936 110 3.90 2.620 16.46  0  1    4    4
## 2:   6 2.621936 110 3.90 2.875 17.02  0  1    4    4
## 3:   4 1.769807  93 3.85 2.320 18.61  1  1    4    1
## 4:   6 4.227872 110 3.08 3.215 19.44  1  0    3    1
## 5:   8 5.899356 175 3.15 3.440 17.02  0  0    3    2
## 6:   6 3.687098 105 2.76 3.460 20.22  1  0    3    1
# window functions are useful for grouped computations
mtcars %>%
    let(rank = rank(-mpg, ties.method = "min"),
        by = cyl) %>%
    head()
##     mpg cyl disp  hp drat    wt  qsec vs am gear carb rank
## 1: 21.0   6  160 110 3.90 2.620 16.46  0  1    4    4    2
## 2: 21.0   6  160 110 3.90 2.875 17.02  0  1    4    4    2
## 3: 22.8   4  108  93 3.85 2.320 18.61  1  1    4    1    8
## 4: 21.4   6  258 110 3.08 3.215 19.44  1  0    3    1    1
## 5: 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2    2
## 6: 18.1   6  225 105 2.76 3.460 20.22  1  0    3    1    6
# You can drop variables by setting them to NULL
mtcars %>%
    let(cyl = NULL) %>%
    head()
##     mpg disp  hp drat    wt  qsec vs am gear carb
## 1: 21.0  160 110 3.90 2.620 16.46  0  1    4    4
## 2: 21.0  160 110 3.90 2.875 17.02  0  1    4    4
## 3: 22.8  108  93 3.85 2.320 18.61  1  1    4    1
## 4: 21.4  258 110 3.08 3.215 19.44  1  0    3    1
## 5: 18.7  360 175 3.15 3.440 17.02  0  0    3    2
## 6: 18.1  225 105 2.76 3.460 20.22  1  0    3    1
# keeps all existing variables
mtcars %>%
    let(displ_l = disp / 61.0237) %>%
    head()
##     mpg cyl disp  hp drat    wt  qsec vs am gear carb  displ_l
## 1: 21.0   6  160 110 3.90 2.620 16.46  0  1    4    4 2.621932
## 2: 21.0   6  160 110 3.90 2.875 17.02  0  1    4    4 2.621932
## 3: 22.8   4  108  93 3.85 2.320 18.61  1  1    4    1 1.769804
## 4: 21.4   6  258 110 3.08 3.215 19.44  1  0    3    1 4.227866
## 5: 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2 5.899347
## 6: 18.1   6  225 105 2.76 3.460 20.22  1  0    3    1 3.687092
# keeps only the variables you create
mtcars %>%
    take(displ_l = disp / 61.0237) %>% 
    head()
##     displ_l
## 1: 2.621932
## 2: 2.621932
## 3: 1.769804
## 4: 4.227866
## 5: 5.899347
## 6: 3.687092
# can refer to both contextual variables and variable names:
var = 100
mtcars %>%
    let(cyl = cyl * var) %>%
    head()
##     mpg cyl disp  hp drat    wt  qsec vs am gear carb
## 1: 21.0 600  160 110 3.90 2.620 16.46  0  1    4    4
## 2: 21.0 600  160 110 3.90 2.875 17.02  0  1    4    4
## 3: 22.8 400  108  93 3.85 2.320 18.61  1  1    4    1
## 4: 21.4 600  258 110 3.08 3.215 19.44  1  0    3    1
## 5: 18.7 800  360 175 3.15 3.440 17.02  0  0    3    2
## 6: 18.1 600  225 105 2.76 3.460 20.22  1  0    3    1
# select rows
mtcars %>%
    rows(am==0) %>% 
    head()
##     mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## 1: 21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
## 2: 18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2
## 3: 18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1
## 4: 14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4
## 5: 24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
## 6: 22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
# select rows with compound condition
mtcars %>%
    rows(am==0 & mpg>mean(mpg))
##     mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## 1: 21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
## 2: 24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
## 3: 22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
## 4: 21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1
# select columns
mtcars %>% 
    columns(vs:carb, cyl)
##                     vs am gear carb cyl
## Mazda RX4            0  1    4    4   6
## Mazda RX4 Wag        0  1    4    4   6
## Datsun 710           1  1    4    1   4
## Hornet 4 Drive       1  0    3    1   6
## Hornet Sportabout    0  0    3    2   8
## Valiant              1  0    3    1   6
## Duster 360           0  0    3    4   8
## Merc 240D            1  0    4    2   4
## Merc 230             1  0    4    2   4
## Merc 280             1  0    4    4   6
## Merc 280C            1  0    4    4   6
## Merc 450SE           0  0    3    3   8
## Merc 450SL           0  0    3    3   8
## Merc 450SLC          0  0    3    3   8
## Cadillac Fleetwood   0  0    3    4   8
## Lincoln Continental  0  0    3    4   8
## Chrysler Imperial    0  0    3    4   8
## Fiat 128             1  1    4    1   4
## Honda Civic          1  1    4    2   4
## Toyota Corolla       1  1    4    1   4
## Toyota Corona        1  0    3    1   4
## Dodge Challenger     0  0    3    2   8
## AMC Javelin          0  0    3    2   8
## Camaro Z28           0  0    3    4   8
## Pontiac Firebird     0  0    3    2   8
## Fiat X1-9            1  1    4    1   4
## Porsche 914-2        0  1    5    2   4
## Lotus Europa         1  1    5    2   4
## Ford Pantera L       0  1    5    4   8
## Ferrari Dino         0  1    5    6   6
## Maserati Bora        0  1    5    8   8
## Volvo 142E           1  1    4    2   4
mtcars %>% 
    columns(-am, -cyl)    
##                      mpg  disp  hp drat    wt  qsec vs gear carb
## Mazda RX4           21.0 160.0 110 3.90 2.620 16.46  0    4    4
## Mazda RX4 Wag       21.0 160.0 110 3.90 2.875 17.02  0    4    4
## Datsun 710          22.8 108.0  93 3.85 2.320 18.61  1    4    1
## Hornet 4 Drive      21.4 258.0 110 3.08 3.215 19.44  1    3    1
## Hornet Sportabout   18.7 360.0 175 3.15 3.440 17.02  0    3    2
## Valiant             18.1 225.0 105 2.76 3.460 20.22  1    3    1
## Duster 360          14.3 360.0 245 3.21 3.570 15.84  0    3    4
## Merc 240D           24.4 146.7  62 3.69 3.190 20.00  1    4    2
## Merc 230            22.8 140.8  95 3.92 3.150 22.90  1    4    2
## Merc 280            19.2 167.6 123 3.92 3.440 18.30  1    4    4
## Merc 280C           17.8 167.6 123 3.92 3.440 18.90  1    4    4
## Merc 450SE          16.4 275.8 180 3.07 4.070 17.40  0    3    3
## Merc 450SL          17.3 275.8 180 3.07 3.730 17.60  0    3    3
## Merc 450SLC         15.2 275.8 180 3.07 3.780 18.00  0    3    3
## Cadillac Fleetwood  10.4 472.0 205 2.93 5.250 17.98  0    3    4
## Lincoln Continental 10.4 460.0 215 3.00 5.424 17.82  0    3    4
## Chrysler Imperial   14.7 440.0 230 3.23 5.345 17.42  0    3    4
## Fiat 128            32.4  78.7  66 4.08 2.200 19.47  1    4    1
## Honda Civic         30.4  75.7  52 4.93 1.615 18.52  1    4    2
## Toyota Corolla      33.9  71.1  65 4.22 1.835 19.90  1    4    1
## Toyota Corona       21.5 120.1  97 3.70 2.465 20.01  1    3    1
## Dodge Challenger    15.5 318.0 150 2.76 3.520 16.87  0    3    2
## AMC Javelin         15.2 304.0 150 3.15 3.435 17.30  0    3    2
## Camaro Z28          13.3 350.0 245 3.73 3.840 15.41  0    3    4
## Pontiac Firebird    19.2 400.0 175 3.08 3.845 17.05  0    3    2
## Fiat X1-9           27.3  79.0  66 4.08 1.935 18.90  1    4    1
## Porsche 914-2       26.0 120.3  91 4.43 2.140 16.70  0    5    2
## Lotus Europa        30.4  95.1 113 3.77 1.513 16.90  1    5    2
## Ford Pantera L      15.8 351.0 264 4.22 3.170 14.50  0    5    4
## Ferrari Dino        19.7 145.0 175 3.62 2.770 15.50  0    5    6
## Maserati Bora       15.0 301.0 335 3.54 3.570 14.60  0    5    8
## Volvo 142E          21.4 121.0 109 4.11 2.780 18.60  1    4    2
# regular expression pattern
columns(iris, "^Petal") %>% head() # variables which start from 'Petal'
##   Petal.Length Petal.Width
## 1          1.4         0.2
## 2          1.4         0.2
## 3          1.3         0.2
## 4          1.5         0.2
## 5          1.4         0.2
## 6          1.7         0.4
columns(iris, "Width$") %>% head() # variables which end with 'Width'
##   Sepal.Width Petal.Width
## 1         3.5         0.2
## 2         3.0         0.2
## 3         3.2         0.2
## 4         3.1         0.2
## 5         3.6         0.2
## 6         3.9         0.4
# move Species variable to the front
# pattern "^." matches all variables
columns(iris, Species, "^.") %>% head()
##   Species Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1  setosa          5.1         3.5          1.4         0.2
## 2  setosa          4.9         3.0          1.4         0.2
## 3  setosa          4.7         3.2          1.3         0.2
## 4  setosa          4.6         3.1          1.5         0.2
## 5  setosa          5.0         3.6          1.4         0.2
## 6  setosa          5.4         3.9          1.7         0.4
# pattern "^.*al" means "contains 'al'"
columns(iris, "^.*al") %>% head()
##   Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1          5.1         3.5          1.4         0.2
## 2          4.9         3.0          1.4         0.2
## 3          4.7         3.2          1.3         0.2
## 4          4.6         3.1          1.5         0.2
## 5          5.0         3.6          1.4         0.2
## 6          5.4         3.9          1.7         0.4
# numeric indexing - all variables except Species
columns(iris, 1:4) %>% head()
##   Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1          5.1         3.5          1.4         0.2
## 2          4.9         3.0          1.4         0.2
## 3          4.7         3.2          1.3         0.2
## 4          4.6         3.1          1.5         0.2
## 5          5.0         3.6          1.4         0.2
## 6          5.4         3.9          1.7         0.4
# A 'take' with summary functions applied without 'by' argument returns an aggregated data
mtcars %>%
    take(mean = mean(disp), n = .N)
##        mean  n
## 1: 230.7219 32
# Usually, you'll want to group first
mtcars %>%
    take(mean = mean(disp), n = .N, by = am)
##    am     mean  n
## 1:  1 143.5308 13
## 2:  0 290.3789 19
# grouping by multiple variables
mtcars %>%
    take(mean = mean(disp), n = .N, by = list(am, vs))
##    am vs     mean  n
## 1:  1  0 206.2167  6
## 2:  1  1  89.8000  7
## 3:  0  1 175.1143  7
## 4:  0  0 357.6167 12
# You can group by expressions:
mtcars %>%
    take_all(
        mean,
        by = list(vsam = vs + am)
    )
##    vsam      mpg      cyl     disp        hp     drat       wt     qsec
## 1:    1 20.28462 5.692308 189.4692 138.46154 3.738462 3.038846 18.04231
## 2:    2 28.37143 4.000000  89.8000  80.57143 4.148571 2.028286 18.70000
## 3:    0 15.05000 8.000000 357.6167 194.16667 3.120833 4.104083 17.14250
##        gear     carb
## 1: 4.076923 3.307692
## 2: 4.142857 1.428571
## 3: 3.000000 3.083333
# modify all non-grouping variables in-place
mtcars %>%
    let_all((.x - mean(.x))/sd(.x), by = am) %>%
    head()
##           mpg        cyl       disp         hp       drat         wt       qsec
## 1: -0.5501185  0.5945745  0.1888587 -0.2004008 -0.4120299  0.3387459 -0.5021316
## 2: -0.5501185  0.5945745  0.1888587 -0.2004008 -0.4120299  0.7520483 -0.1896942
## 3: -0.2582189 -0.6936702 -0.4074443 -0.4026317 -0.5493732 -0.1474922  0.6974050
## 4:  1.1091990 -0.6133196 -0.2938955 -0.9323843 -0.5259081 -0.7124963  0.7176592
## 5:  0.4049674  0.6814663  0.6319326  0.2733692 -0.3474750 -0.4230701 -0.6641654
## 6:  0.2484716 -0.6133196 -0.5934281 -1.0251346 -1.3416023 -0.3973433  1.1630407
##            vs am       gear       carb
## 1: -1.0377490  1 -0.7595545  0.4944600
## 2: -1.0377490  1 -0.7595545  0.4944600
## 3:  0.8894992  1 -0.7595545 -0.8829642
## 4:  1.2743862  0 -0.5026247 -1.5141438
## 5: -0.7433919  0 -0.5026247 -0.6423641
## 6:  1.2743862  0 -0.5026247 -1.5141438
# modify all non-grouping variables to new variables
mtcars %>%
    let_all(scaled = (.x - mean(.x))/sd(.x), by = am) %>%
    head()
##     mpg cyl disp  hp drat    wt  qsec vs am gear carb mpg_scaled cyl_scaled
## 1: 21.0   6  160 110 3.90 2.620 16.46  0  1    4    4 -0.5501185  0.5945745
## 2: 21.0   6  160 110 3.90 2.875 17.02  0  1    4    4 -0.5501185  0.5945745
## 3: 22.8   4  108  93 3.85 2.320 18.61  1  1    4    1 -0.2582189 -0.6936702
## 4: 21.4   6  258 110 3.08 3.215 19.44  1  0    3    1  1.1091990 -0.6133196
## 5: 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2  0.4049674  0.6814663
## 6: 18.1   6  225 105 2.76 3.460 20.22  1  0    3    1  0.2484716 -0.6133196
##    disp_scaled  hp_scaled drat_scaled  wt_scaled qsec_scaled  vs_scaled
## 1:   0.1888587 -0.2004008  -0.4120299  0.3387459  -0.5021316 -1.0377490
## 2:   0.1888587 -0.2004008  -0.4120299  0.7520483  -0.1896942 -1.0377490
## 3:  -0.4074443 -0.4026317  -0.5493732 -0.1474922   0.6974050  0.8894992
## 4:  -0.2938955 -0.9323843  -0.5259081 -0.7124963   0.7176592  1.2743862
## 5:   0.6319326  0.2733692  -0.3474750 -0.4230701  -0.6641654 -0.7433919
## 6:  -0.5934281 -1.0251346  -1.3416023 -0.3973433   1.1630407  1.2743862
##    gear_scaled carb_scaled
## 1:  -0.7595545   0.4944600
## 2:  -0.7595545   0.4944600
## 3:  -0.7595545  -0.8829642
## 4:  -0.5026247  -1.5141438
## 5:  -0.5026247  -0.6423641
## 6:  -0.5026247  -1.5141438
# conditionally modify all variables
iris %>%
    let_all(mean = if(is.numeric(.x)) mean(.x)) %>%
    head()
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species Sepal.Length_mean
## 1:          5.1         3.5          1.4         0.2  setosa          5.843333
## 2:          4.9         3.0          1.4         0.2  setosa          5.843333
## 3:          4.7         3.2          1.3         0.2  setosa          5.843333
## 4:          4.6         3.1          1.5         0.2  setosa          5.843333
## 5:          5.0         3.6          1.4         0.2  setosa          5.843333
## 6:          5.4         3.9          1.7         0.4  setosa          5.843333
##    Sepal.Width_mean Petal.Length_mean Petal.Width_mean
## 1:         3.057333             3.758         1.199333
## 2:         3.057333             3.758         1.199333
## 3:         3.057333             3.758         1.199333
## 4:         3.057333             3.758         1.199333
## 5:         3.057333             3.758         1.199333
## 6:         3.057333             3.758         1.199333
# modify all variables conditionally on name
iris %>%
    let_all(
        mean = if(startsWith(.name, "Sepal")) mean(.x),
        median = if(startsWith(.name, "Petal")) median(.x),
        by = Species
    ) %>%
    head()
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species Sepal.Length_mean
## 1:          5.1         3.5          1.4         0.2  setosa             5.006
## 2:          4.9         3.0          1.4         0.2  setosa             5.006
## 3:          4.7         3.2          1.3         0.2  setosa             5.006
## 4:          4.6         3.1          1.5         0.2  setosa             5.006
## 5:          5.0         3.6          1.4         0.2  setosa             5.006
## 6:          5.4         3.9          1.7         0.4  setosa             5.006
##    Sepal.Width_mean Petal.Length_median Petal.Width_median
## 1:            3.428                 1.5                0.2
## 2:            3.428                 1.5                0.2
## 3:            3.428                 1.5                0.2
## 4:            3.428                 1.5                0.2
## 5:            3.428                 1.5                0.2
## 6:            3.428                 1.5                0.2
# aggregation with 'take_all'
mtcars %>%
    take_all(mean = mean(.x), sd = sd(.x), n = .N, by = am)
##    am mpg_mean cyl_mean disp_mean  hp_mean drat_mean  wt_mean qsec_mean
## 1:  1 24.39231 5.076923  143.5308 126.8462  4.050000 2.411000  17.36000
## 2:  0 17.14737 6.947368  290.3789 160.2632  3.286316 3.768895  18.18316
##      vs_mean gear_mean carb_mean   mpg_sd   cyl_sd   disp_sd    hp_sd   drat_sd
## 1: 0.5384615  4.384615  2.923077 6.166504 1.552500  87.20399 84.06232 0.3640513
## 2: 0.3684211  3.210526  2.736842 3.833966 1.544657 110.17165 53.90820 0.3923039
##        wt_sd  qsec_sd     vs_sd   gear_sd  carb_sd mpg_n cyl_n disp_n hp_n
## 1: 0.6169816 1.792359 0.5188745 0.5063697 2.177978    13    13     13   13
## 2: 0.7774001 1.751308 0.4955946 0.4188539 1.147079    19    19     19   19
##    drat_n wt_n qsec_n vs_n gear_n carb_n
## 1:     13   13     13   13     13     13
## 2:     19   19     19   19     19     19
# conditionally aggregate all variables
iris %>%
    take_all(mean = if(is.numeric(.x)) mean(.x))
##    Sepal.Length_mean Sepal.Width_mean Petal.Length_mean Petal.Width_mean
## 1:          5.843333         3.057333             3.758         1.199333
# aggregate all variables conditionally on name
iris %>%
    take_all(
        mean = if(startsWith(.name, "Sepal")) mean(.x),
        median = if(startsWith(.name, "Petal")) median(.x),
        by = Species
    )
##       Species Sepal.Length_mean Sepal.Width_mean Petal.Length_median
## 1:     setosa             5.006            3.428                1.50
## 2: versicolor             5.936            2.770                4.35
## 3:  virginica             6.588            2.974                5.55
##    Petal.Width_median
## 1:                0.2
## 2:                1.3
## 3:                2.0
# parametric evaluation:
var = quote(mean(cyl))
mtcars %>% 
    let(mean_cyl = eval(var)) %>% 
    head()
##     mpg cyl disp  hp drat    wt  qsec vs am gear carb mean_cyl
## 1: 21.0   6  160 110 3.90 2.620 16.46  0  1    4    4   6.1875
## 2: 21.0   6  160 110 3.90 2.875 17.02  0  1    4    4   6.1875
## 3: 22.8   4  108  93 3.85 2.320 18.61  1  1    4    1   6.1875
## 4: 21.4   6  258 110 3.08 3.215 19.44  1  0    3    1   6.1875
## 5: 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2   6.1875
## 6: 18.1   6  225 105 2.76 3.460 20.22  1  0    3    1   6.1875
take(mtcars, eval(var))
##    eval(var)
## 1:    6.1875
# all together
new_var = "mean_cyl"
mtcars %>% 
    let((new_var) := eval(var)) %>% 
    head()
##     mpg cyl disp  hp drat    wt  qsec vs am gear carb mean_cyl
## 1: 21.0   6  160 110 3.90 2.620 16.46  0  1    4    4   6.1875
## 2: 21.0   6  160 110 3.90 2.875 17.02  0  1    4    4   6.1875
## 3: 22.8   4  108  93 3.85 2.320 18.61  1  1    4    1   6.1875
## 4: 21.4   6  258 110 3.08 3.215 19.44  1  0    3    1   6.1875
## 5: 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2   6.1875
## 6: 18.1   6  225 105 2.76 3.460 20.22  1  0    3    1   6.1875
take(mtcars, (new_var) := eval(var))
##    mean_cyl
## 1:   6.1875

Variable selection in the expressions

You can use ‘columns’ inside expression in the ‘take’/‘let’. ‘columns’ will be replaced with data.table with selected columns. In ‘let’ in the expressions with ‘:=’, ‘cols’ or ‘%to%’ can be placed in the left part of the expression. It is usefull for multiple assignment. There are four ways of column selection:

  1. Simply by column names
  2. By variable ranges, e. g. vs:carb. Alternatively, you can use ‘%to%’ instead of colon: ‘vs %to% carb’.
  3. With regular expressions. Characters which start with ‘^’ or end with $ considered as Perl-style regular expression patterns. For example, ‘^Petal’ returns all variables started with ‘Petal’. ‘Width$’ returns all variables which end with ‘Width’. Pattern ‘^.’ matches all variables and pattern ’^.*my_str’ is equivalent to contains “my_str”’.
  4. By character variables with interpolated parts. Expression in the curly brackets inside characters will be evaluated in the parent frame with ‘text_expand’ function. For example, a{1:3} will be transformed to the names ‘a1’, ‘a2’, ‘a3’. ‘cols’ is just a shortcut for ‘columns’.
# range selection
iris %>% 
    let(
        avg = rowMeans(Sepal.Length %to% Petal.Width)
    ) %>% 
    head()
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species   avg
## 1:          5.1         3.5          1.4         0.2  setosa 2.550
## 2:          4.9         3.0          1.4         0.2  setosa 2.375
## 3:          4.7         3.2          1.3         0.2  setosa 2.350
## 4:          4.6         3.1          1.5         0.2  setosa 2.350
## 5:          5.0         3.6          1.4         0.2  setosa 2.550
## 6:          5.4         3.9          1.7         0.4  setosa 2.850
# multiassignment
iris %>% 
    let(
        # starts with Sepal or Petal
        multipled1 %to% multipled4 := cols("^(Sepal|Petal)")*2
    ) %>% 
    head()
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species multipled1
## 1:          5.1         3.5          1.4         0.2  setosa       10.2
## 2:          4.9         3.0          1.4         0.2  setosa        9.8
## 3:          4.7         3.2          1.3         0.2  setosa        9.4
## 4:          4.6         3.1          1.5         0.2  setosa        9.2
## 5:          5.0         3.6          1.4         0.2  setosa       10.0
## 6:          5.4         3.9          1.7         0.4  setosa       10.8
##    multipled2 multipled3 multipled4
## 1:        7.0        2.8        0.4
## 2:        6.0        2.8        0.4
## 3:        6.4        2.6        0.4
## 4:        6.2        3.0        0.4
## 5:        7.2        2.8        0.4
## 6:        7.8        3.4        0.8
mtcars %>% 
    let(
        # text expansion
        cols("scaled_{names(mtcars)}") := lapply(cols("{names(mtcars)}"), scale)
    ) %>% 
    head()
##     mpg cyl disp  hp drat    wt  qsec vs am gear carb scaled_mpg scaled_cyl
## 1: 21.0   6  160 110 3.90 2.620 16.46  0  1    4    4  0.1508848 -0.1049878
## 2: 21.0   6  160 110 3.90 2.875 17.02  0  1    4    4  0.1508848 -0.1049878
## 3: 22.8   4  108  93 3.85 2.320 18.61  1  1    4    1  0.4495434 -1.2248578
## 4: 21.4   6  258 110 3.08 3.215 19.44  1  0    3    1  0.2172534 -0.1049878
## 5: 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2 -0.2307345  1.0148821
## 6: 18.1   6  225 105 2.76 3.460 20.22  1  0    3    1 -0.3302874 -0.1049878
##    scaled_disp  scaled_hp scaled_drat    scaled_wt scaled_qsec  scaled_vs
## 1: -0.57061982 -0.5350928   0.5675137 -0.610399567  -0.7771651 -0.8680278
## 2: -0.57061982 -0.5350928   0.5675137 -0.349785269  -0.4637808 -0.8680278
## 3: -0.99018209 -0.7830405   0.4739996 -0.917004624   0.4260068  1.1160357
## 4:  0.22009369 -0.5350928  -0.9661175 -0.002299538   0.8904872  1.1160357
## 5:  1.04308123  0.4129422  -0.8351978  0.227654255  -0.4637808 -0.8680278
## 6: -0.04616698 -0.6080186  -1.5646078  0.248094592   1.3269868  1.1160357
##     scaled_am scaled_gear scaled_carb
## 1:  1.1899014   0.4235542   0.7352031
## 2:  1.1899014   0.4235542   0.7352031
## 3:  1.1899014   0.4235542  -1.1221521
## 4: -0.8141431  -0.9318192  -1.1221521
## 5: -0.8141431  -0.9318192  -0.5030337
## 6: -0.8141431  -0.9318192  -1.1221521
# range selection in 'by'
# selection of range + additional column
mtcars %>% 
    take(
        res = sum(cols(mpg, disp %to% drat)),
        by = vs %to% gear
    )
##    vs am gear     res
## 1:  0  1    4  589.80
## 2:  1  1    4 1177.97
## 3:  1  0    3  985.64
## 4:  0  0    3 6839.45
## 5:  1  0    4 1125.35
## 6:  0  1    5 1874.61
## 7:  1  1    5  242.27

Joins

Here we use the same datasets as with lookups:

workers = fread("
    name company
    Nick Acme
    John Ajax
    Daniela Ajax
")

positions = fread("
    name position
    John designer
    Daniela engineer
    Cathie manager
")

workers
##       name company
## 1:    Nick    Acme
## 2:    John    Ajax
## 3: Daniela    Ajax
positions
##       name position
## 1:    John designer
## 2: Daniela engineer
## 3:  Cathie  manager

Different kinds of joins:

workers %>% dt_inner_join(positions)
## dt_inner_join: joining, by = "name"
##       name company position
## 1:    John    Ajax designer
## 2: Daniela    Ajax engineer
workers %>% dt_left_join(positions)
## dt_left_join: joining, by = "name"
##       name company position
## 1:    Nick    Acme     <NA>
## 2:    John    Ajax designer
## 3: Daniela    Ajax engineer
workers %>% dt_right_join(positions)
## dt_right_join: joining, by = "name"
##       name company position
## 1:    John    Ajax designer
## 2: Daniela    Ajax engineer
## 3:  Cathie    <NA>  manager
workers %>% dt_full_join(positions)
## dt_full_join: joining, by = "name"
##       name company position
## 1:    Nick    Acme     <NA>
## 2:    John    Ajax designer
## 3: Daniela    Ajax engineer
## 4:  Cathie    <NA>  manager
# filtering joins
workers %>% dt_anti_join(positions)
## dt_anti_join: joining, by = "name"
##    name company
## 1: Nick    Acme
workers %>% dt_semi_join(positions)
## dt_semi_join: joining, by = "name"
##       name company
## 1:    John    Ajax
## 2: Daniela    Ajax

To suppress the message, supply by argument:

workers %>% dt_left_join(positions, by = "name")

Use a named by if the join variables have different names:

positions2 = setNames(positions, c("worker", "position")) # rename first column in 'positions'
workers %>% dt_inner_join(positions2, by = c("name" = "worker"))

‘dplyr’-like interface for data.table.

There are a small subset of ‘dplyr’ verbs to work with data.table. Note that there is no group_by verb - use by or keyby argument when needed.

The same examples with ‘dplyr’-verbs:

# examples from 'dplyr'
# newly created variables are available immediately
mtcars  %>%
    dt_mutate(
        cyl2 = cyl * 2,
        cyl4 = cyl2 * 2
    ) %>%
    head()
##     mpg cyl disp  hp drat    wt  qsec vs am gear carb cyl2 cyl4
## 1: 21.0   6  160 110 3.90 2.620 16.46  0  1    4    4   12   24
## 2: 21.0   6  160 110 3.90 2.875 17.02  0  1    4    4   12   24
## 3: 22.8   4  108  93 3.85 2.320 18.61  1  1    4    1    8   16
## 4: 21.4   6  258 110 3.08 3.215 19.44  1  0    3    1   12   24
## 5: 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2   16   32
## 6: 18.1   6  225 105 2.76 3.460 20.22  1  0    3    1   12   24
# you can also use dt_mutate() to remove variables and
# modify existing variables
mtcars %>%
    dt_mutate(
        mpg = NULL,
        disp = disp * 0.0163871 # convert to litres
    ) %>%
    head()
##    cyl     disp  hp drat    wt  qsec vs am gear carb
## 1:   6 2.621936 110 3.90 2.620 16.46  0  1    4    4
## 2:   6 2.621936 110 3.90 2.875 17.02  0  1    4    4
## 3:   4 1.769807  93 3.85 2.320 18.61  1  1    4    1
## 4:   6 4.227872 110 3.08 3.215 19.44  1  0    3    1
## 5:   8 5.899356 175 3.15 3.440 17.02  0  0    3    2
## 6:   6 3.687098 105 2.76 3.460 20.22  1  0    3    1
# window functions are useful for grouped mutates
mtcars %>%
    dt_mutate(
        rank = rank(-mpg, ties.method = "min"),
        keyby = cyl) %>%
    print()
##      mpg cyl  disp  hp drat    wt  qsec vs am gear carb rank
##  1: 22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1    8
##  2: 24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2    7
##  3: 22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2    8
##  4: 32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1    2
##  5: 30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2    3
##  6: 33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1    1
##  7: 21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1   10
##  8: 27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1    5
##  9: 26.0   4 120.3  91 4.43 2.140 16.70  0  1    5    2    6
## 10: 30.4   4  95.1 113 3.77 1.513 16.90  1  1    5    2    3
## 11: 21.4   4 121.0 109 4.11 2.780 18.60  1  1    4    2   11
## 12: 21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4    2
## 13: 21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4    2
## 14: 21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1    1
## 15: 18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1    6
## 16: 19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4    5
## 17: 17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4    7
## 18: 19.7   6 145.0 175 3.62 2.770 15.50  0  1    5    6    4
## 19: 18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2    2
## 20: 14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4   11
## 21: 16.4   8 275.8 180 3.07 4.070 17.40  0  0    3    3    4
## 22: 17.3   8 275.8 180 3.07 3.730 17.60  0  0    3    3    3
## 23: 15.2   8 275.8 180 3.07 3.780 18.00  0  0    3    3    7
## 24: 10.4   8 472.0 205 2.93 5.250 17.98  0  0    3    4   13
## 25: 10.4   8 460.0 215 3.00 5.424 17.82  0  0    3    4   13
## 26: 14.7   8 440.0 230 3.23 5.345 17.42  0  0    3    4   10
## 27: 15.5   8 318.0 150 2.76 3.520 16.87  0  0    3    2    6
## 28: 15.2   8 304.0 150 3.15 3.435 17.30  0  0    3    2    7
## 29: 13.3   8 350.0 245 3.73 3.840 15.41  0  0    3    4   12
## 30: 19.2   8 400.0 175 3.08 3.845 17.05  0  0    3    2    1
## 31: 15.8   8 351.0 264 4.22 3.170 14.50  0  1    5    4    5
## 32: 15.0   8 301.0 335 3.54 3.570 14.60  0  1    5    8    9
##      mpg cyl  disp  hp drat    wt  qsec vs am gear carb rank
# You can drop variables by setting them to NULL
mtcars %>% dt_mutate(cyl = NULL) %>% head()
##     mpg disp  hp drat    wt  qsec vs am gear carb
## 1: 21.0  160 110 3.90 2.620 16.46  0  1    4    4
## 2: 21.0  160 110 3.90 2.875 17.02  0  1    4    4
## 3: 22.8  108  93 3.85 2.320 18.61  1  1    4    1
## 4: 21.4  258 110 3.08 3.215 19.44  1  0    3    1
## 5: 18.7  360 175 3.15 3.440 17.02  0  0    3    2
## 6: 18.1  225 105 2.76 3.460 20.22  1  0    3    1
# A summary applied without by returns a single row
mtcars %>%
    dt_summarise(mean = mean(disp), n = .N)
##        mean  n
## 1: 230.7219 32
# Usually, you'll want to group first
mtcars %>%
    dt_summarise(mean = mean(disp), n = .N, by = cyl)
##    cyl     mean  n
## 1:   6 183.3143  7
## 2:   4 105.1364 11
## 3:   8 353.1000 14
# Multiple 'by' - variables
mtcars %>%
    dt_summarise(cyl_n = .N, by = list(cyl, vs))
##    cyl vs cyl_n
## 1:   6  0     3
## 2:   4  1    10
## 3:   6  1     4
## 4:   8  0    14
## 5:   4  0     1
# Newly created summaries immediately
# doesn't overwrite existing variables
mtcars %>%
    dt_summarise(disp = mean(disp),
                  sd = sd(disp),
                  by = cyl)
##    cyl     disp       sd
## 1:   6 183.3143 41.56246
## 2:   4 105.1364 26.87159
## 3:   8 353.1000 67.77132
# You can group by expressions:
mtcars %>%
    dt_summarise_all(mean, by = list(vsam = vs + am))
##    vsam      mpg      cyl     disp        hp     drat       wt     qsec
## 1:    1 20.28462 5.692308 189.4692 138.46154 3.738462 3.038846 18.04231
## 2:    2 28.37143 4.000000  89.8000  80.57143 4.148571 2.028286 18.70000
## 3:    0 15.05000 8.000000 357.6167 194.16667 3.120833 4.104083 17.14250
##        gear     carb
## 1: 4.076923 3.307692
## 2: 4.142857 1.428571
## 3: 3.000000 3.083333
# filter by condition
mtcars %>%
    dt_filter(am==0)
##      mpg cyl  disp  hp drat    wt  qsec vs am gear carb
##  1: 21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
##  2: 18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2
##  3: 18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1
##  4: 14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4
##  5: 24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
##  6: 22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
##  7: 19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4
##  8: 17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4
##  9: 16.4   8 275.8 180 3.07 4.070 17.40  0  0    3    3
## 10: 17.3   8 275.8 180 3.07 3.730 17.60  0  0    3    3
## 11: 15.2   8 275.8 180 3.07 3.780 18.00  0  0    3    3
## 12: 10.4   8 472.0 205 2.93 5.250 17.98  0  0    3    4
## 13: 10.4   8 460.0 215 3.00 5.424 17.82  0  0    3    4
## 14: 14.7   8 440.0 230 3.23 5.345 17.42  0  0    3    4
## 15: 21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1
## 16: 15.5   8 318.0 150 2.76 3.520 16.87  0  0    3    2
## 17: 15.2   8 304.0 150 3.15 3.435 17.30  0  0    3    2
## 18: 13.3   8 350.0 245 3.73 3.840 15.41  0  0    3    4
## 19: 19.2   8 400.0 175 3.08 3.845 17.05  0  0    3    2
# filter by compound condition
mtcars %>%
    dt_filter(am==0,  mpg>mean(mpg))
##     mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## 1: 21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
## 2: 24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
## 3: 22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
## 4: 21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1
# select
mtcars %>% 
  dt_select(vs:carb, cyl) %>% 
  head()
##                   vs am gear carb cyl
## Mazda RX4          0  1    4    4   6
## Mazda RX4 Wag      0  1    4    4   6
## Datsun 710         1  1    4    1   4
## Hornet 4 Drive     1  0    3    1   6
## Hornet Sportabout  0  0    3    2   8
## Valiant            1  0    3    1   6
mtcars %>% 
  dt_select(-am, -cyl) %>% 
  head()
##                    mpg disp  hp drat    wt  qsec vs gear carb
## Mazda RX4         21.0  160 110 3.90 2.620 16.46  0    4    4
## Mazda RX4 Wag     21.0  160 110 3.90 2.875 17.02  0    4    4
## Datsun 710        22.8  108  93 3.85 2.320 18.61  1    4    1
## Hornet 4 Drive    21.4  258 110 3.08 3.215 19.44  1    3    1
## Hornet Sportabout 18.7  360 175 3.15 3.440 17.02  0    3    2
## Valiant           18.1  225 105 2.76 3.460 20.22  1    3    1
# regular expression pattern
dt_select(iris, "^Petal") %>% head() # variables which start from 'Petal'
##   Petal.Length Petal.Width
## 1          1.4         0.2
## 2          1.4         0.2
## 3          1.3         0.2
## 4          1.5         0.2
## 5          1.4         0.2
## 6          1.7         0.4
dt_select(iris, "Width$") %>% head()  # variables which end with 'Width'
##   Sepal.Width Petal.Width
## 1         3.5         0.2
## 2         3.0         0.2
## 3         3.2         0.2
## 4         3.1         0.2
## 5         3.6         0.2
## 6         3.9         0.4
# move Species variable to the front
# pattern "^." matches all variables
dt_select(iris, Species, "^.") %>% head() 
##   Species Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1  setosa          5.1         3.5          1.4         0.2
## 2  setosa          4.9         3.0          1.4         0.2
## 3  setosa          4.7         3.2          1.3         0.2
## 4  setosa          4.6         3.1          1.5         0.2
## 5  setosa          5.0         3.6          1.4         0.2
## 6  setosa          5.4         3.9          1.7         0.4
# pattern "^.*al" means "contains 'al'"
dt_select(iris, "^.*al") %>% head() 
##   Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1          5.1         3.5          1.4         0.2
## 2          4.9         3.0          1.4         0.2
## 3          4.7         3.2          1.3         0.2
## 4          4.6         3.1          1.5         0.2
## 5          5.0         3.6          1.4         0.2
## 6          5.4         3.9          1.7         0.4
dt_select(iris, 1:4) %>% head()  # numeric indexing - all variables except Species
##   Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1          5.1         3.5          1.4         0.2
## 2          4.9         3.0          1.4         0.2
## 3          4.7         3.2          1.3         0.2
## 4          4.6         3.1          1.5         0.2
## 5          5.0         3.6          1.4         0.2
## 6          5.4         3.9          1.7         0.4
# sorting
dt_arrange(mtcars, cyl, disp)
dt_arrange(mtcars, -disp)