Tabulate counts and proportions

tab_linelist(
  x,
  ...,
  strata = NULL,
  keep = TRUE,
  drop = NULL,
  na.rm = TRUE,
  prop_total = FALSE,
  row_total = FALSE,
  col_total = FALSE,
  wide = TRUE,
  transpose = NULL,
  digits = 1,
  pretty = TRUE
)

tab_survey(
  x,
  ...,
  strata = NULL,
  keep = TRUE,
  drop = NULL,
  na.rm = TRUE,
  prop_total = FALSE,
  row_total = FALSE,
  col_total = FALSE,
  wide = TRUE,
  transpose = NULL,
  digits = 1,
  method = "logit",
  deff = FALSE,
  pretty = TRUE
)

Arguments

x

a data.frame() or tbl_svy object

...

categorical variables to tabulate

strata

a stratifier to split the data

keep

a character vector specifying which values to retain in the tabulation. Defaults to TRUE, which keeps all the values.

drop

a character vector specifying which values to drop in the tabulation. Defaults to NULL, which keeps all values.

na.rm

When TRUE (default), missing (NA) values present in the variables being tabulated will be removed from the data set with a warning, causing a change in denominator for the tabulations. Setting this to FALSE creates an explicit missing value called "(Missing)".

prop_total

if TRUE and strata is not NULL, then the totals of the rows will be reported as proportions of the total data set, otherwise, they will be proportions within the stratum (default).

row_total

create a new column with the total counts for each row of stratified data.

col_total

create a new row with the total counts for each column of stratified data.

wide

if TRUE (default) and strata is defined, then the results are presented in a wide table with each stratum's counts and estimates in separate columns. If FALSE, then the data will be presented in a long format where the counts and estimates are presented in single columns. This has no effect if strata is not defined.

transpose

if wide = TRUE, then this will transpose the columns to the rows, which is useful when you stratify by age group. Default is NULL, which will not transpose anything. You have three options for transpose:

  • transpose = "variable": uses the variable column, (dropping values if strata exists). Use this if you know that your values are all identical or at least identifiable by the variable name.

  • transpose = "value" : uses the value column, (dropping variables if strata exists). Use this if your values are important and the variable names are generic placeholders.

  • transpose = "both" : combines the variable and value columns. Use this if both the variables and values are important.

digits

(survey only) if pretty = FALSE, this indicates the number of digits used for proportion and CI

pretty

(survey only) if TRUE (default), the proportion and CI are merged into a single column

method

(survey only) a method from survey::svyciprop() to calculate the confidence interval. Defaults to "logit".

deff

(survey only) a logical indicating if the design effect should be reported. Defaults to FALSE.

Value

a tibble::tibble() with a column for variables, a column for values, and counts and proportions. If strata is not NULL and wide = TRUE, then there will be separate columns for each stratum for the counts and proportions. Survey data will report confidence intervals.

Examples

have_packages <- require("matchmaker") & require("epidict")
#> Loading required package: matchmaker
#> Loading required package: epidict

if (have_packages) {
  withAutoprint({

    # Simulating linelist data

    linelist <- epidict::gen_data("Measles", numcases = 1000, org = "MSF")
    measles_dict <- epidict::msf_dict("Measles", compact = FALSE)

    # Cleaning linelist data
    linelist_clean <- matchmaker::match_df(
      x = linelist,
      dictionary = measles_dict,
      from = "option_code",
      to = "option_name",
      by = "data_element_shortname",
      order = "option_order_in_set"
    )

    # get a descriptive table by sex
    tab_linelist(linelist_clean, sex)

    # describe pregnancy statistics, but remove missing data from the tally
    tab_linelist(linelist_clean, trimester, na.rm = TRUE)

    # describe by symptom

    tab_linelist(linelist_clean,
      cough, nasal_discharge, severe_oral_lesions,
      transpose = "value"
    )
    # describe pregnancy statistics, stratifying by vitamin A prescription
    tab_linelist(linelist_clean, trimester, sex,
      strata = prescribed_vitamin_a,
      na.rm = TRUE, row_total = TRUE
    )
  })
}
#> > linelist <- epidict::gen_data("Measles", numcases = 1000, org = "MSF")
#> > measles_dict <- epidict::msf_dict("Measles", compact = FALSE)
#> > linelist_clean <- matchmaker::match_df(x = linelist, dictionary = measles_dict, 
#> +     from = "option_code", to = "option_name", by = "data_element_shortname", order = "option_order_in_set")
#> > tab_linelist(linelist_clean, sex)
#> # A tibble: 3 × 4
#>   variable value                   n proportion
#>   <chr>    <chr>               <int>      <dbl>
#> 1 sex      Male                  318       31.8
#> 2 sex      Female                354       35.4
#> 3 sex      Unknown/unspecified   328       32.8
#> > tab_linelist(linelist_clean, trimester, na.rm = TRUE)
#> Warning: Removing 911 missing values
#> # A tibble: 3 × 4
#>   variable  value             n proportion
#>   <chr>     <chr>         <int>      <dbl>
#> 1 trimester 1st trimester    35       39.3
#> 2 trimester 2nd trimester    20       22.5
#> 3 trimester 3rd trimester    34       38.2
#> > tab_linelist(linelist_clean, cough, nasal_discharge, severe_oral_lesions, 
#> +     transpose = "value")
#> # A tibble: 3 × 5
#>   variable            `Yes n` `Yes proportion` `No n` `No proportion`
#>   <fct>                 <dbl>            <dbl>  <dbl>           <dbl>
#> 1 cough                   545             54.5    455            45.5
#> 2 nasal_discharge         507             50.7    493            49.3
#> 3 severe_oral_lesions     483             48.3    517            51.7
#> > tab_linelist(linelist_clean, trimester, sex, strata = prescribed_vitamin_a, 
#> +     na.rm = TRUE, row_total = TRUE)
#> Warning: Removing 911 missing values
#> # A tibble: 6 × 7
#>   variable  value               `Yes n` `Yes proportion` `No n` No propo…¹ Total
#>   <chr>     <chr>                 <dbl>            <dbl>  <dbl>      <dbl> <dbl>
#> 1 trimester 1st trimester            22             44       13       33.3    35
#> 2 trimester 2nd trimester            11             22        9       23.1    20
#> 3 trimester 3rd trimester            17             34       17       43.6    34
#> 4 sex       Male                    138             28.8    180       34.5   318
#> 5 sex       Female                  174             36.3    180       34.5   354
#> 6 sex       Unknown/unspecified     167             34.9    161       30.9   328
#> # … with abbreviated variable name ¹​`No proportion`

have_survey_packages <- require("survey") && require("srvyr")
#> Loading required package: survey
#> Loading required package: grid
#> Loading required package: Matrix
#> Loading required package: survival
#> 
#> Attaching package: ‘survey’
#> The following object is masked from ‘package:graphics’:
#> 
#>     dotchart
#> Loading required package: srvyr
#> 
#> Attaching package: ‘srvyr’
#> The following object is masked from ‘package:stats’:
#> 
#>     filter
if (have_survey_packages) {
  withAutoprint({
    data(api)

    # stratified sample
    surv <- apistrat %>%
      as_survey_design(strata = stype, weights = pw)

    s <- surv %>%
      tab_survey(awards, strata = stype, col_total = TRUE, row_total = TRUE, deff = TRUE)
    s

    # making things pretty
    s %>%
      # wrap all "n" variables in parentheses (note space before n).
      epikit::augment_redundant(" (n)" = " n") %>%
      # relabel all columns containing "ci" to "% (95% CI)"
      epikit::rename_redundant(
        "% (95% CI)" = ci,
        "Design Effect" = deff
      )

    # long data
    surv %>%
      tab_survey(awards, strata = stype, wide = FALSE)

    # tabulate binary variables
    surv %>%
      tab_survey(yr.rnd, sch.wide, awards, keep = "Yes")

    # stratify the binary variables
    surv %>%
      tab_survey(yr.rnd, sch.wide, awards,
        strata    = stype,
        keep      = "Yes"
      )

    # invert the tabulation
    surv %>%
      tab_survey(yr.rnd, sch.wide, awards,
        strata = stype,
        drop = "Yes",
        deff = TRUE,
        row_total = TRUE
      )
  })
}
#> > data(api)
#> > surv <- apistrat %>% as_survey_design(strata = stype, weights = pw)
#> > s <- surv %>% tab_survey(awards, strata = stype, col_total = TRUE, row_total = TRUE, 
#> +     deff = TRUE)
#> > s
#> # A tibble: 3 × 12
#>   variable value `E n` `E ci`  E def…¹ `H n` `H ci` H def…² `M n` `M ci` M def…³
#>   <chr>    <chr> <dbl> <chr>     <dbl> <dbl> <chr>    <dbl> <dbl> <chr>    <dbl>
#> 1 awards   No    1194. 27.0% …   0.768  513. 68.0%…   0.388  529. 52.0%…   0.534
#> 2 awards   Yes   3227. 73.0% …   0.319  242. 32.0%…   0.308  489. 48.0%…   0.319
#> 3 awards   Total 4421. NA (NA…  NA      755. NA (N…  NA     1018. NA (N…  NA    
#> # … with 1 more variable: `Total n` <dbl>, and abbreviated variable names
#> #   ¹​`E deff`, ²​`H deff`, ³​`M deff`
#> > s %>% epikit::augment_redundant(` (n)` = " n") %>% epikit::rename_redundant(`% (95% CI)` = ci, 
#> +     `Design Effect` = deff)
#> # A tibble: 3 × 12
#>   variable value `E (n)` % (95…¹ Desig…² `H (n)` % (95…³ Desig…⁴ `M (n)` % (95…⁵
#>   <chr>    <chr>   <dbl> <chr>     <dbl>   <dbl> <chr>     <dbl>   <dbl> <chr>  
#> 1 awards   No      1194. 27.0% …   0.768    513. 68.0% …   0.388    529. 52.0% …
#> 2 awards   Yes     3227. 73.0% …   0.319    242. 32.0% …   0.308    489. 48.0% …
#> 3 awards   Total   4421. NA (NA…  NA        755. NA (NA…  NA       1018. NA (NA…
#> # … with 2 more variables: `Design Effect` <dbl>, `Total (n)` <dbl>, and
#> #   abbreviated variable names ¹​`% (95% CI)`, ²​`Design Effect`, ³​`% (95% CI)`,
#> #   ⁴​`Design Effect`, ⁵​`% (95% CI)`
#> > surv %>% tab_survey(awards, strata = stype, wide = FALSE)
#> # A tibble: 6 × 5
#> # Groups:   value [2]
#>   variable value stype     n ci                
#>   <chr>    <chr> <fct> <dbl> <chr>             
#> 1 awards   No    E     1194. 27.0% (19.1--36.7)
#> 2 awards   No    H      513. 68.0% (53.5--79.7)
#> 3 awards   No    M      529. 52.0% (37.9--65.8)
#> 4 awards   Yes   E     3227. 73.0% (63.3--80.9)
#> 5 awards   Yes   H      242. 32.0% (20.3--46.5)
#> 6 awards   Yes   M      489. 48.0% (34.2--62.1)
#> > surv %>% tab_survey(yr.rnd, sch.wide, awards, keep = "Yes")
#> # A tibble: 3 × 4
#>   variable value     n ci                
#>   <chr>    <chr> <dbl> <chr>             
#> 1 yr.rnd   Yes    852. 13.7% (9.1--20.3) 
#> 2 sch.wide Yes   5128. 82.8% (77.4--87.1)
#> 3 awards   Yes   3958. 63.9% (56.8--70.5)
#> > surv %>% tab_survey(yr.rnd, sch.wide, awards, strata = stype, keep = "Yes")
#> # A tibble: 3 × 8
#>   variable value `E n` `E ci`             `H n` `H ci`             `M n` `M ci` 
#>   <chr>    <chr> <dbl> <chr>              <dbl> <chr>              <dbl> <chr>  
#> 1 yr.rnd   Yes    796. 18.0% (11.6--26.9)  15.1 2.0% (0.3--13.7)    40.7 4.0% (…
#> 2 sch.wide Yes   4023. 91.0% (83.4--95.3) 393.  52.0% (37.9--65.8) 713.  70.0% …
#> 3 awards   Yes   3227. 73.0% (63.3--80.9) 242.  32.0% (20.3--46.5) 489.  48.0% …
#> > surv %>% tab_survey(yr.rnd, sch.wide, awards, strata = stype, drop = "Yes", 
#> +     deff = TRUE, row_total = TRUE)
#> # A tibble: 3 × 12
#>   variable value `E n` `E ci`  E def…¹ `H n` `H ci` H def…² `M n` `M ci` M def…³
#>   <chr>    <chr> <dbl> <chr>     <dbl> <dbl> <chr>    <dbl> <dbl> <chr>    <dbl>
#> 1 yr.rnd   No    3625. 82.0% …   0.103  740. 98.0%…  0.0402  977. 96.0%…  0.0658
#> 2 sch.wide No     398. 9.0% (…   1.37   362. 48.0%…  0.704   305. 30.0%…  0.811 
#> 3 awards   No    1194. 27.0% …   0.768  513. 68.0%…  0.388   529. 52.0%…  0.534 
#> # … with 1 more variable: `Total n` <dbl>, and abbreviated variable names
#> #   ¹​`E deff`, ²​`H deff`, ³​`M deff`