Tips & tricks

Roland Krasser

2024-02-10

library(dplyr)
library(explore)

Count with percent

A classic count() from {dplyr} returns the number of observations in each group.

data <- use_data_penguins()
data %>% count(island)
#> # A tibble: 3 × 2
#>   island        n
#>   <fct>     <int>
#> 1 Biscoe      168
#> 2 Dream       124
#> 3 Torgersen    52

To add percent values, simply use count_pct() from {explore}.

data %>% count_pct(island)
#> # A tibble: 3 × 4
#>   island        n total   pct
#>   <fct>     <int> <int> <dbl>
#> 1 Biscoe      168   344  48.8
#> 2 Dream       124   344  36.0
#> 3 Torgersen    52   344  15.1

Add id

data %>% glimpse()
#> Rows: 344
#> Columns: 8
#> $ species           <fct> Adelie, Adelie, Adelie, Adelie, Adelie, Adelie, Adel…
#> $ island            <fct> Torgersen, Torgersen, Torgersen, Torgersen, Torgerse…
#> $ bill_length_mm    <dbl> 39.1, 39.5, 40.3, NA, 36.7, 39.3, 38.9, 39.2, 34.1, …
#> $ bill_depth_mm     <dbl> 18.7, 17.4, 18.0, NA, 19.3, 20.6, 17.8, 19.6, 18.1, …
#> $ flipper_length_mm <int> 181, 186, 195, NA, 193, 190, 181, 195, 193, 190, 186…
#> $ body_mass_g       <int> 3750, 3800, 3250, NA, 3450, 3650, 3625, 4675, 3475, …
#> $ sex               <fct> male, female, female, NA, female, male, female, male…
#> $ year              <int> 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007…

To add an id variable, simply use add_var_id() from {explore}.

data %>% add_var_id() %>% glimpse()
#> Rows: 344
#> Columns: 9
#> $ id                <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1…
#> $ species           <fct> Adelie, Adelie, Adelie, Adelie, Adelie, Adelie, Adel…
#> $ island            <fct> Torgersen, Torgersen, Torgersen, Torgersen, Torgerse…
#> $ bill_length_mm    <dbl> 39.1, 39.5, 40.3, NA, 36.7, 39.3, 38.9, 39.2, 34.1, …
#> $ bill_depth_mm     <dbl> 18.7, 17.4, 18.0, NA, 19.3, 20.6, 17.8, 19.6, 18.1, …
#> $ flipper_length_mm <int> 181, 186, 195, NA, 193, 190, 181, 195, 193, 190, 186…
#> $ body_mass_g       <int> 3750, 3800, 3250, NA, 3450, 3650, 3625, 4675, 3475, …
#> $ sex               <fct> male, female, female, NA, female, male, female, male…
#> $ year              <int> 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007…

User defined report

Create a user-defined report (an RMarkdown template) to explore your own data.

create_notebook_explore(
  output_dir = tempdir(),
  output_file = "notebook-explore.Rmd")

Data Dictionary

Create a data dictionary of a data set. By default it is written as a Markdown file named data_dict.md.

iris  %>%  data_dict_md(output_dir = tempdir())

Add a title and detailed variable descriptions, and change the default filename.

description <- data.frame(
                  variable = c("Species"), 
                  description = c("Species of Iris flower"))
data_dict_md(iris, 
             title = "iris flower data set", 
             description =  description, 
             output_file = "data_dict_iris.md",
             output_dir = tempdir())

Basic data cleaning

Rename variable

data <- use_data_titanic(count = FALSE)
glimpse(data)
#> Rows: 2,201
#> Columns: 4
#> $ Class    <chr> "3rd", "3rd", "3rd", "3rd", "3rd", "3rd", "3rd", "3rd", "3rd"…
#> $ Sex      <chr> "Male", "Male", "Male", "Male", "Male", "Male", "Male", "Male…
#> $ Age      <chr> "Child", "Child", "Child", "Child", "Child", "Child", "Child"…
#> $ Survived <chr> "No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "…
data <- data %>% clean_var(Age, name = "age")
glimpse(data)
#> Rows: 2,201
#> Columns: 4
#> $ Class    <chr> "3rd", "3rd", "3rd", "3rd", "3rd", "3rd", "3rd", "3rd", "3rd"…
#> $ Sex      <chr> "Male", "Male", "Male", "Male", "Male", "Male", "Male", "Male…
#> $ age      <chr> "Child", "Child", "Child", "Child", "Child", "Child", "Child"…
#> $ Survived <chr> "No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "…

Replace NA values

data <- use_data_beer()
data %>% describe(energy_kcal_100ml)
#> variable = energy_kcal_100ml
#> type     = double
#> na       = 11 of 161 (6.8%)
#> unique   = 34
#> min|max  = 20 | 62
#> q05|q95  = 24 | 56.65
#> q25|q75  = 37 | 44
#> median   = 42
#> mean     = 39.89333
data <- data %>% clean_var(energy_kcal_100ml, na = 42)
data %>% describe(energy_kcal_100ml)
#> variable = energy_kcal_100ml
#> type     = double
#> na       = 0 of 161 (0%)
#> unique   = 33
#> min|max  = 20 | 62
#> q05|q95  = 24 | 55
#> q25|q75  = 38 | 44
#> median   = 42
#> mean     = 40.03727

Set min max values

data <- create_data_person()
data %>% describe(age)
#> variable = age
#> type     = integer
#> na       = 0 of 1 000 (0%)
#> unique   = 80
#> min|max  = 16 | 95
#> q05|q95  = 21 | 92
#> q25|q75  = 37 | 76
#> median   = 55
#> mean     = 55.845
data <- data %>% clean_var(age, min_val = 20, max_val = 80)
data %>% describe(age)
#> variable = age
#> type     = integer
#> na       = 0 of 1 000 (0%)
#> unique   = 61
#> min|max  = 20 | 80
#> q05|q95  = 21 | 80
#> q25|q75  = 37 | 76
#> median   = 55
#> mean     = 54.276

Rescale 0 to 1

data %>% describe(income)
#> variable = income
#> type     = double
#> na       = 0 of 1 000 (0%)
#> unique   = 228
#> min|max  = 0 | 150
#> q05|q95  = 6 | 123.025
#> q25|q75  = 35 | 88.625
#> median   = 62
#> mean     = 61.5875
data <- data %>% clean_var(income, rescale01 = TRUE)
data %>% describe(income)
#> variable = income
#> type     = double
#> na       = 0 of 1 000 (0%)
#> unique   = 228
#> min|max  = 0 | 1
#> q05|q95  = 0.04 | 0.820167
#> q25|q75  = 0.233333 | 0.590833
#> median   = 0.4
#> mean     = 0.410583

Cleaning text

data[1, "handset"] <- " android "
data[2, "handset"] <- "ANDROID"
data %>% describe(handset)
#> variable = handset
#> type     = character
#> na       = 0 of 1 000 (0%)
#> unique   = 5
#>   android  = 1 (0.1%)
#>  ANDROID   = 1 (0.1%)
#>  Android   = 471 (47.1%)
#>  Apple     = 430 (43%)
#>  Other     = 97 (9.7%)
data <- data %>% clean_var(handset, simplify_text = TRUE)
data %>% describe(handset)
#> variable = handset
#> type     = character
#> na       = 0 of 1 000 (0%)
#> unique   = 3
#>  ANDROID = 473 (47.3%)
#>  APPLE   = 430 (43%)
#>  OTHER   = 97 (9.7%)