Using estimateIncidence()

estimateIncidence() is the function we use to estimate incidence rates. To demonstrate its use, let's load the IncidencePrevalence package (along with a couple of packages to help with subsequent plots) and generate 1,000 example patients using the mockIncidencePrevalenceRef() function, from whom we'll create a denominator population without adding any restrictions other than a study period. In this example we'll use permanent tables (rather than temporary tables which would be used by default).

library(IncidencePrevalence)
library(dplyr)
library(tidyr)

# Build the mock CDM reference used throughout this example, with a sample
# size of 1000.
# NOTE(review): outPre presumably sets the proportion of people with the
# outcome -- confirm against the mockIncidencePrevalenceRef() documentation.
cdm <- mockIncidencePrevalenceRef(
  sampleSize = 1000,
  outPre = 0.5
)

# Create the "denominator" cohort table: study period 2008-01-01 to
# 2012-01-01, a single age group (0 to 150), both sexes, and no required
# days of prior observation.
cdm <- generateDenominatorCohortSet(
  cdm = cdm, name = "denominator",
  cohortDateRange = c(as.Date("2008-01-01"), as.Date("2012-01-01")),
  ageGroup = list(c(0, 150)),
  sex = "Both",
  daysPriorObservation = 0
)
#> ℹ Creating denominator cohorts
#> ✔ Cohorts created in 0 min and 3 sec

cdm$denominator %>%
  glimpse()
#> Rows: ??
#> Columns: 4
#> Database: DuckDB v0.9.2 [eburn@Windows 10 x64:R 4.2.1/:memory:]
#> $ cohort_definition_id <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
#> $ subject_id           <chr> "2", "3", "4", "6", "7", "8", "10", "11", "12", "…
#> $ cohort_start_date    <date> 2008-01-01, 2008-01-01, 2008-01-01, 2008-01-01, …
#> $ cohort_end_date      <date> 2009-01-30, 2012-01-01, 2012-01-01, 2012-01-01, …

Let's first calculate incidence rates on a yearly basis, without allowing repetitive events.

# Yearly incidence with no washout (outcomeWashout = 0) and without
# repeated events (repeatedEvents = FALSE).
inc <- estimateIncidence(
  cdm = cdm,
  denominatorTable = "denominator",
  outcomeTable = "outcome",
  interval = "years",
  outcomeWashout = 0,
  repeatedEvents = FALSE
)

inc %>%
  glimpse()
#> Rows: 4
#> Columns: 29
#> $ analysis_id                             <chr> "1", "1", "1", "1"
#> $ n_persons                               <int> 691, 668, 651, 641
#> $ person_days                             <dbl> 249072, 240050, 235229, 232626
#> $ n_events                                <int> 6, 8, NA, NA
#> $ incidence_start_date                    <date> 2008-01-01, 2009-01-01, 2010-0…
#> $ incidence_end_date                      <date> 2008-12-31, 2009-12-31, 2010-1…
#> $ person_years                            <dbl> 681.9220, 657.2211, 644.0219, …
#> $ incidence_100000_pys                    <dbl> 879.8661, 1217.2464, NA, NA
#> $ incidence_100000_pys_95CI_lower         <dbl> 322.8953, 525.5206, NA, NA
#> $ incidence_100000_pys_95CI_upper         <dbl> 1915.098, 2398.461, NA, NA
#> $ cohort_obscured                         <chr> "FALSE", "FALSE", "FALSE", "FA…
#> $ result_obscured                         <chr> "FALSE", "FALSE", "TRUE", "TR…
#> $ outcome_cohort_id                       <int> 1, 1, 1, 1
#> $ outcome_cohort_name                     <chr> "cohort_1", "cohort_1", "coho…
#> $ analysis_outcome_washout                <chr> "0", "0", "0", "0"
#> $ analysis_repeated_events                <lgl> FALSE, FALSE, FALSE, FALSE
#> $ analysis_interval                       <chr> "years", "years", "years", "ye…
#> $ analysis_complete_database_intervals    <lgl> TRUE, TRUE, TRUE, TRUE
#> $ denominator_cohort_id                   <int> 1, 1, 1, 1
#> $ analysis_min_cell_count                 <dbl> 5, 5, 5, 5
#> $ denominator_cohort_name                 <chr> "denominator_cohort_1", "denom…
#> $ denominator_age_group                   <chr> "0 to 150", "0 to 150", "0 to …
#> $ denominator_sex                         <chr> "Both", "Both", "Both", "Both"
#> $ denominator_days_prior_observation      <dbl> 0, 0, 0, 0
#> $ denominator_start_date                  <date> 2008-01-01, 2008-01-01, 2008-0…
#> $ denominator_end_date                    <date> 2012-01-01, 2012-01-01, 2012-0…
#> $ denominator_target_cohort_definition_id <int> NA, NA, NA, NA
#> $ denominator_target_cohort_name          <lgl> NA, NA, NA, NA
#> $ cdm_name                                <chr> "mock", "mock", "mock", "mock"

plotIncidence(inc)

Now with a washout of all prior history while still not allowing repetitive events. Here we use Inf to specify that we will use a washout of all prior history for an individual.

# Yearly incidence where outcomeWashout = Inf requests a washout of all of an
# individual's prior history; repeated events are still not allowed.
inc <- estimateIncidence(
  cdm = cdm,
  denominatorTable = "denominator",
  outcomeTable = "outcome",
  interval = "years",
  outcomeWashout = Inf,
  repeatedEvents = FALSE
)

inc %>%
  glimpse()
#> Rows: 4
#> Columns: 29
#> $ analysis_id                             <chr> "1", "1", "1", "1"
#> $ n_persons                               <int> 484, 471, 460, 454
#> $ person_days                             <dbl> 174903, 169371, 166289, 164782
#> $ n_events                                <int> 6, 8, NA, NA
#> $ incidence_start_date                    <date> 2008-01-01, 2009-01-01, 2010-0…
#> $ incidence_end_date                      <date> 2008-12-31, 2009-12-31, 2010-1…
#> $ person_years                            <dbl> 478.8583, 463.7125, 455.2745, …
#> $ incidence_100000_pys                    <dbl> 1252.980, 1725.207, NA, NA
#> $ incidence_100000_pys_95CI_lower         <dbl> 459.8217, 744.8218, NA, NA
#> $ incidence_100000_pys_95CI_upper         <dbl> 2727.210, 3399.345, NA, NA
#> $ cohort_obscured                         <chr> "FALSE", "FALSE", "FALSE", "FA…
#> $ result_obscured                         <chr> "FALSE", "FALSE", "TRUE", "TR…
#> $ outcome_cohort_id                       <int> 1, 1, 1, 1
#> $ outcome_cohort_name                     <chr> "cohort_1", "cohort_1", "coho…
#> $ analysis_outcome_washout                <chr> "inf", "inf", "inf", "inf"
#> $ analysis_repeated_events                <lgl> FALSE, FALSE, FALSE, FALSE
#> $ analysis_interval                       <chr> "years", "years", "years", "ye…
#> $ analysis_complete_database_intervals    <lgl> TRUE, TRUE, TRUE, TRUE
#> $ denominator_cohort_id                   <int> 1, 1, 1, 1
#> $ analysis_min_cell_count                 <dbl> 5, 5, 5, 5
#> $ denominator_cohort_name                 <chr> "denominator_cohort_1", "denom…
#> $ denominator_age_group                   <chr> "0 to 150", "0 to 150", "0 to …
#> $ denominator_sex                         <chr> "Both", "Both", "Both", "Both"
#> $ denominator_days_prior_observation      <dbl> 0, 0, 0, 0
#> $ denominator_start_date                  <date> 2008-01-01, 2008-01-01, 2008-0…
#> $ denominator_end_date                    <date> 2012-01-01, 2012-01-01, 2012-0…
#> $ denominator_target_cohort_definition_id <int> NA, NA, NA, NA
#> $ denominator_target_cohort_name          <lgl> NA, NA, NA, NA
#> $ cdm_name                                <chr> "mock", "mock", "mock", "mock"

plotIncidence(inc)

Now we'll set the washout to 180 days while still not allowing repetitive events.

# Yearly incidence with a 180-day washout and without repeated events.
inc <- estimateIncidence(
  cdm = cdm,
  denominatorTable = "denominator",
  outcomeTable = "outcome",
  interval = "years",
  outcomeWashout = 180,
  repeatedEvents = FALSE
)

inc %>%
  glimpse()
#> Rows: 4
#> Columns: 29
#> $ analysis_id                             <chr> "1", "1", "1", "1"
#> $ n_persons                               <int> 691, 668, 651, 641
#> $ person_days                             <dbl> 248694, 240050, 235229, 232626
#> $ n_events                                <int> 6, 8, NA, NA
#> $ incidence_start_date                    <date> 2008-01-01, 2009-01-01, 2010-0…
#> $ incidence_end_date                      <date> 2008-12-31, 2009-12-31, 2010-1…
#> $ person_years                            <dbl> 680.8871, 657.2211, 644.0219, …
#> $ incidence_100000_pys                    <dbl> 881.2034, 1217.2464, NA, NA
#> $ incidence_100000_pys_95CI_lower         <dbl> 323.3861, 525.5206, NA, NA
#> $ incidence_100000_pys_95CI_upper         <dbl> 1918.009, 2398.461, NA, NA
#> $ cohort_obscured                         <chr> "FALSE", "FALSE", "FALSE", "FA…
#> $ result_obscured                         <chr> "FALSE", "FALSE", "TRUE", "TR…
#> $ outcome_cohort_id                       <int> 1, 1, 1, 1
#> $ outcome_cohort_name                     <chr> "cohort_1", "cohort_1", "coho…
#> $ analysis_outcome_washout                <chr> "180", "180", "180", "180"
#> $ analysis_repeated_events                <lgl> FALSE, FALSE, FALSE, FALSE
#> $ analysis_interval                       <chr> "years", "years", "years", "ye…
#> $ analysis_complete_database_intervals    <lgl> TRUE, TRUE, TRUE, TRUE
#> $ denominator_cohort_id                   <int> 1, 1, 1, 1
#> $ analysis_min_cell_count                 <dbl> 5, 5, 5, 5
#> $ denominator_cohort_name                 <chr> "denominator_cohort_1", "denom…
#> $ denominator_age_group                   <chr> "0 to 150", "0 to 150", "0 to …
#> $ denominator_sex                         <chr> "Both", "Both", "Both", "Both"
#> $ denominator_days_prior_observation      <dbl> 0, 0, 0, 0
#> $ denominator_start_date                  <date> 2008-01-01, 2008-01-01, 2008-0…
#> $ denominator_end_date                    <date> 2012-01-01, 2012-01-01, 2012-0…
#> $ denominator_target_cohort_definition_id <int> NA, NA, NA, NA
#> $ denominator_target_cohort_name          <lgl> NA, NA, NA, NA
#> $ cdm_name                                <chr> "mock", "mock", "mock", "mock"

plotIncidence(inc)

And finally we'll set the washout to 180 days and allow repetitive events.

# Yearly incidence with a 180-day washout, this time allowing repeated events
# (repeatedEvents = TRUE).
inc <- estimateIncidence(
  cdm = cdm,
  denominatorTable = "denominator",
  outcomeTable = "outcome",
  interval = "years",
  outcomeWashout = 180,
  repeatedEvents = TRUE
)

inc %>%
  glimpse()
#> Rows: 4
#> Columns: 29
#> $ analysis_id                             <chr> "1", "1", "1", "1"
#> $ n_persons                               <int> 691, 674, 665, 657
#> $ person_days                             <dbl> 248981, 242477, 240455, 238398
#> $ n_events                                <int> 6, 8, NA, NA
#> $ incidence_start_date                    <date> 2008-01-01, 2009-01-01, 2010-0…
#> $ incidence_end_date                      <date> 2008-12-31, 2009-12-31, 2010-1…
#> $ person_years                            <dbl> 681.6728, 663.8658, 658.3299, …
#> $ incidence_100000_pys                    <dbl> 880.1876, 1205.0627, NA, NA
#> $ incidence_100000_pys_95CI_lower         <dbl> 323.0134, 520.2606, NA, NA
#> $ incidence_100000_pys_95CI_upper         <dbl> 1915.798, 2374.454, NA, NA
#> $ cohort_obscured                         <chr> "FALSE", "FALSE", "FALSE", "FA…
#> $ result_obscured                         <chr> "FALSE", "FALSE", "TRUE", "TR…
#> $ outcome_cohort_id                       <int> 1, 1, 1, 1
#> $ outcome_cohort_name                     <chr> "cohort_1", "cohort_1", "coho…
#> $ analysis_outcome_washout                <chr> "180", "180", "180", "180"
#> $ analysis_repeated_events                <lgl> TRUE, TRUE, TRUE, TRUE
#> $ analysis_interval                       <chr> "years", "years", "years", "ye…
#> $ analysis_complete_database_intervals    <lgl> TRUE, TRUE, TRUE, TRUE
#> $ denominator_cohort_id                   <int> 1, 1, 1, 1
#> $ analysis_min_cell_count                 <dbl> 5, 5, 5, 5
#> $ denominator_cohort_name                 <chr> "denominator_cohort_1", "denom…
#> $ denominator_age_group                   <chr> "0 to 150", "0 to 150", "0 to …
#> $ denominator_sex                         <chr> "Both", "Both", "Both", "Both"
#> $ denominator_days_prior_observation      <dbl> 0, 0, 0, 0
#> $ denominator_start_date                  <date> 2008-01-01, 2008-01-01, 2008-0…
#> $ denominator_end_date                    <date> 2012-01-01, 2012-01-01, 2012-0…
#> $ denominator_target_cohort_definition_id <int> NA, NA, NA, NA
#> $ denominator_target_cohort_name          <lgl> NA, NA, NA, NA
#> $ cdm_name                                <chr> "mock", "mock", "mock", "mock"

plotIncidence(inc)

Stratified analyses

As with prevalence, if we specify multiple denominator populations, results will be returned for each. Here, for example, we define three age groups for denominator populations and get three sets of estimates back when estimating incidence.

# Create one denominator cohort per age group. The three bands are contiguous
# and non-overlapping (0-39, 40-65, 66-150), so every age from 0 to 150 falls
# in exactly one cohort; the previous lower bound of 41 left people aged 40
# out of all three cohorts.
cdm <- generateDenominatorCohortSet(
  cdm = cdm, name = "denominator_age_sex",
  cohortDateRange = c(as.Date("2008-01-01"), as.Date("2012-01-01")),
  ageGroup = list(
    c(0, 39),
    c(40, 65),
    c(66, 150)
  ),
  sex = "Both",
  daysPriorObservation = 0
)
#> ℹ Creating denominator cohorts
#> ✔ Cohorts created in 0 min and 5 sec
# Estimate yearly incidence against the age-stratified denominator table; the
# progress output below reports one analysis per denominator cohort (3).
inc <- estimateIncidence(
  cdm = cdm,
  denominatorTable = "denominator_age_sex",
  outcomeTable = "outcome",
  interval = "years",
  outcomeWashout = 180,
  repeatedEvents = TRUE
)
#> Getting incidence for analysis 1 of 3
#> Getting incidence for analysis 2 of 3
#> Getting incidence for analysis 3 of 3
#> Overall time taken: 0 mins and 2 secs

plotIncidence(inc, facet = "denominator_age_group")
#> Warning: Removed 10 rows containing missing values (`geom_point()`).

And again, as with prevalence, while we specify time-varying stratifications when defining our denominator populations, if we have time-invariant stratifications we can include these at the estimation stage.

# Add a stratification column to the denominator table: "first" for
# subject ids below 500, "second" otherwise.
cdm$denominator <- cdm$denominator %>% 
  mutate(group = if_else(as.numeric(subject_id)  < 500, "first", "second")) 

# Pass the new column through `strata` so estimates are also returned by
# group. No interval is given here, so the function's default is used.
inc <- estimateIncidence(
  cdm = cdm,
  denominatorTable = "denominator",
  outcomeTable = "outcome",
  strata = list("group"),
  outcomeWashout = 180,
  repeatedEvents = TRUE
)
#> Getting incidence for analysis 1 of 1
#> Overall time taken: 0 mins and 1 secs

plotIncidence(inc, 
               facet = "strata_level")
#> Warning: Removed 8 rows containing missing values (`geom_point()`).


# Add two stratification columns: group_1 splits on subject id, group_2 on
# whether the cohort start date falls before 2010-01-01.
# NOTE(review): the mock population is created with sampleSize = 1000; if
# subject ids run from 1 to 1000, every row satisfies `< 1500` and lands in
# "first", leaving group_1 with a single level -- confirm the threshold.
cdm$denominator <- cdm$denominator %>% 
  mutate(group_1 = if_else(as.numeric(subject_id)  < 1500, "first", "second"))  %>% 
  mutate(group_2 = if_else(cohort_start_date  < as.Date("2010-01-01"), 
                           "pre", "post"))

# Estimate incidence for three stratifications of the denominator: group_1
# alone, group_2 alone, and the combination of group_1 and group_2.
# outcomeWashout is given as its own argument; it had previously been folded
# into the trailing comment of the strata list and so was never passed.
inc <- estimateIncidence(
  cdm = cdm,
  denominatorTable = "denominator",
  outcomeTable = "outcome",
  strata = list(
    c("group_1"), # for just group_1
    c("group_2"), # for just group_2
    c("group_1", "group_2") # for group_1 and group_2
  ),
  outcomeWashout = 180,
  repeatedEvents = TRUE
)
#> Getting incidence for analysis 1 of 1
#> Overall time taken: 0 mins and 1 secs

plotIncidence(inc, 
               facet = "strata_level")
#> Warning: Removed 8 rows containing missing values (`geom_point()`).

Other parameters

In the examples above, we have calculated incidence rates by year, but they can also be calculated by weeks, months, quarters, or for the entire study time period. In addition, we can decide whether to include time intervals that are not fully captured in the database (e.g., having data up to June for the last study year when computing yearly incidence rates). By default, incidence will only be estimated for those intervals where the denominator cohort captures all the interval (completeDatabaseIntervals=TRUE).

Given that we can set estimateIncidence() to exclude individuals based on other parameters (e.g., outcomeWashout), it is important to note that the denominator population used to compute incidence rates might differ from the one calculated with generateDenominatorCohortSet().

The user can also set the minimum number of events to be reported, below which results will be obscured. By default, results with <5 occurrences are blinded, but if minCellCount=0, all results will be reported. 95 % confidence intervals are calculated using the exact method. We can set verbose=TRUE to report progress as code is running. By default, no progress is reported (verbose=FALSE).

# Weekly incidence, also keeping intervals that are not fully captured
# (completeDatabaseIntervals = FALSE) and reporting all results without
# obscuring small counts (minCellCount = 0).
inc <- estimateIncidence(
  cdm = cdm,
  denominatorTable = "denominator",
  outcomeTable = "outcome",
  interval = c("weeks"),
  completeDatabaseIntervals = FALSE,
  outcomeWashout = 180,
  repeatedEvents = TRUE,
  minCellCount = 0
)
#> Getting incidence for analysis 1 of 1
#> Overall time taken: 0 mins and 3 secs

Output

estimateIncidence() will generate a table with incidence rates for each of the time intervals studied and for each combination of the parameters set. Similar to the output obtained by generateDenominatorCohortSet(), the table generated will also be associated with attributes such as settings and attrition.

# Yearly incidence for two washout settings (0 and 180 days), keeping the
# participants of each analysis (returnParticipants = TRUE) so they can be
# retrieved afterwards. The interval is written in lower case ("years") to
# match the spelling used in every other call in this document.
inc <- estimateIncidence(
  cdm = cdm,
  denominatorTable = "denominator",
  outcomeTable = "outcome",
  interval = "years",
  outcomeWashout = c(0, 180),
  repeatedEvents = TRUE,
  returnParticipants = TRUE
)
# Attrition associated with the incidence results.
incidenceAttrition(inc)
#> # A tibble: 22 × 24
#>    analysis_id number_records number_subjects reason_id reason  excluded_records
#>    <chr>       <chr>          <chr>               <int> <chr>   <chr>           
#>  1 1           1000           1000                    1 Starti… <NA>            
#>  2 1           1000           1000                    2 Missin… 0               
#>  3 1           1000           1000                    3 Missin… 0               
#>  4 1           1000           1000                    4 Cannot… 0               
#>  5 1           691            691                     5 No obs… 309             
#>  6 1           691            691                     6 Doesn'… 0               
#>  7 1           691            691                     7 Prior … 0               
#>  8 1           691            691                    10 No obs… 0               
#>  9 1           918            691                    11 Starti… <NA>            
#> 10 1           711            691                    12 Exclud… 207             
#> # ℹ 12 more rows
#> # ℹ 18 more variables: excluded_subjects <chr>, outcome_cohort_id <int>,
#> #   outcome_cohort_name <chr>, analysis_outcome_washout <chr>,
#> #   analysis_repeated_events <lgl>, analysis_interval <chr>,
#> #   analysis_complete_database_intervals <lgl>, denominator_cohort_id <int>,
#> #   analysis_min_cell_count <dbl>, denominator_cohort_name <chr>,
#> #   denominator_age_group <chr>, denominator_sex <chr>, …

As with prevalence, if we set returnParticipants as TRUE, we can identify the individuals who contributed to the analysis by using participants(). For example, we can identify those people contributing to analysis 1 by running

participants(inc, analysisId = 1) %>%
  glimpse()
#> Rows: ??
#> Columns: 4
#> Database: DuckDB v0.9.2 [eburn@Windows 10 x64:R 4.2.1/:memory:]
#> $ subject_id         <chr> "3", "4", "6", "7", "8", "10", "14", "20", "23", "2…
#> $ cohort_start_date  <date> 2008-01-01, 2008-01-01, 2008-01-01, 2008-01-01, 20…
#> $ cohort_end_date    <date> 2012-01-01, 2012-01-01, 2012-01-01, 2012-01-01, 20…
#> $ outcome_start_date <date> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…

As we've used permanent tables for this example, we can drop these after running our analysis.

CDMConnector::listTables(attr(attr(cdm, "cdm_source"), "dbcon"))
#>  [1] "denominator"                   "denominator_age_sex"          
#>  [3] "denominator_age_sex_attrition" "denominator_age_sex_codelist" 
#>  [5] "denominator_age_sex_set"       "denominator_attrition"        
#>  [7] "denominator_codelist"          "denominator_set"              
#>  [9] "inc_participants_1"            "observation_period"           
#> [11] "outcome"                       "outcome_attrition"            
#> [13] "outcome_codelist"              "outcome_set"                  
#> [15] "person"                        "target"                       
#> [17] "target_attrition"              "target_codelist"              
#> [19] "target_set"
# Drop the permanent tables created in this example: every table whose name
# starts with "denominator", then the participant tables ("inc_participants_").
CDMConnector::dropTable(cdm = cdm, name = starts_with("denominator"))
CDMConnector::dropTable(cdm = cdm, name = starts_with("inc_participants_"))
CDMConnector::listTables(attr(attr(cdm, "cdm_source"), "dbcon"))
#>  [1] "observation_period" "outcome"            "outcome_attrition" 
#>  [4] "outcome_codelist"   "outcome_set"        "person"            
#>  [7] "target"             "target_attrition"   "target_codelist"   
#> [10] "target_set"

Calculating incidence

Introduction

No washout, no repetitive events

Washout all history, no repetitive events

Some washout, no repetitive events

Some washout, repetitive events

Outcome definition

Using estimateIncidence()

Stratified analyses

Other parameters

Output