This function was written a few days after the retraction of a paper in JAMA caused by an error in recoding the treatment variable (https://jamanetwork.com/journals/jama/fullarticle/2752474). It takes a data frame or tibble, fuzzy-matches variable names, and produces crosstables of all matched variables. A visual inspection should reveal any miscoding.

check_recode(.data, dependent = NULL, explanatory = NULL,
  include_numerics = TRUE, ...)

Arguments

.data

Data frame or tibble.

dependent

Optional character vector: name(s) of dependent variable(s).

explanatory

Optional character vector: name(s) of explanatory variable(s).

include_numerics

Logical. Whether to include numeric variables when matching.

...

Pass other arguments to agrep.

Value

List of length two. The first element is an index of variable combinations. The second is a nested list of crosstables as tibbles.

Examples

library(dplyr)
data(colon_s)
colon_s_small = colon_s %>%
  select(-id, -rx, -rx.factor) %>%
  mutate(
    age.factor2 = forcats::fct_collapse(age.factor,
      "<60 years" = c("<40 years", "40-59 years")),
    sex.factor2 = forcats::fct_recode(sex.factor,
      # Intentional miscode
      "F" = "Male",
      "M" = "Female")
  )

# Check
colon_s_small %>%
  check_recode(include_numerics = FALSE)
#> $index #> # A tibble: 3 x 2 #> var1 var2 #> <chr> <chr> #> 1 sex.factor sex.factor2 #> 2 age.factor age.factor2 #> 3 sex.factor2 age.factor2 #> #> $counts #> $counts[[1]] #> # A tibble: 2 x 3 #> sex.factor sex.factor2 n #> <fct> <fct> <int> #> 1 Female M 445 #> 2 Male F 484 #> #> $counts[[2]] #> # A tibble: 3 x 3 #> age.factor age.factor2 n #> <fct> <fct> <int> #> 1 <40 years <60 years 70 #> 2 40-59 years <60 years 344 #> 3 60+ years 60+ years 515 #> #> $counts[[3]] #> # A tibble: 4 x 3 #> sex.factor2 age.factor2 n #> <fct> <fct> <int> #> 1 M <60 years 204 #> 2 M 60+ years 241 #> 3 F <60 years 210 #> 4 F 60+ years 274 #> #>
out = colon_s_small %>% select(-extent, -extent.factor,-time, -time.years) %>% check_recode()
#> Warning: Factor `obstruct.factor` contains implicit NA, consider using `forcats::fct_explicit_na`
#> Warning: Factor `differ.factor` contains implicit NA, consider using `forcats::fct_explicit_na`
#> Warning: Factor `surg.factor` contains implicit NA, consider using `forcats::fct_explicit_na`
#> Warning: Factor `loccomp.factor` contains implicit NA, consider using `forcats::fct_explicit_na`
#> Warning: Factor `mort_5yr` contains implicit NA, consider using `forcats::fct_explicit_na`
out
#> $index #> # A tibble: 19 x 2 #> var1 var2 #> <chr> <chr> #> 1 sex sex.factor #> 2 sex sex.factor2 #> 3 age age.factor #> 4 age age.10 #> 5 age age.factor2 #> 6 obstruct obstruct.factor #> 7 perfor perfor.factor #> 8 adhere adhere.factor #> 9 nodes node4 #> 10 nodes node4.factor #> 11 status status.factor #> 12 differ differ.factor #> 13 surg surg.factor #> 14 node4 node4.factor #> 15 sex.factor sex.factor2 #> 16 age.factor age.factor2 #> 17 loccomp loccomp.factor #> 18 mort_5yr mort_5yr.num #> 19 sex.factor2 age.factor2 #> #> $counts #> $counts[[1]] #> # A tibble: 2 x 3 #> sex sex.factor n #> <dbl> <fct> <int> #> 1 0 Female 445 #> 2 1 Male 484 #> #> $counts[[2]] #> # A tibble: 2 x 3 #> sex sex.factor2 n #> <dbl> <fct> <int> #> 1 0 M 445 #> 2 1 F 484 #> #> $counts[[3]] #> # A tibble: 62 x 3 #> age age.factor n #> <dbl> <fct> <int> #> 1 18 <40 years 1 #> 2 22 <40 years 1 #> 3 25 <40 years 1 #> 4 26 <40 years 1 #> 5 27 <40 years 3 #> 6 28 <40 years 1 #> 7 29 <40 years 1 #> 8 30 <40 years 5 #> 9 31 <40 years 2 #> 10 32 <40 years 5 #> # … with 52 more rows #> #> $counts[[4]] #> # A tibble: 62 x 3 #> age age.10 n #> <dbl> <dbl> <int> #> 1 18 1.8 1 #> 2 22 2.2 1 #> 3 25 2.5 1 #> 4 26 2.6 1 #> 5 27 2.7 3 #> 6 28 2.8 1 #> 7 29 2.9 1 #> 8 30 3 5 #> 9 31 3.1 2 #> 10 32 3.2 5 #> # … with 52 more rows #> #> $counts[[5]] #> # A tibble: 62 x 3 #> age age.factor2 n #> <dbl> <fct> <int> #> 1 18 <60 years 1 #> 2 22 <60 years 1 #> 3 25 <60 years 1 #> 4 26 <60 years 1 #> 5 27 <60 years 3 #> 6 28 <60 years 1 #> 7 29 <60 years 1 #> 8 30 <60 years 5 #> 9 31 <60 years 2 #> 10 32 <60 years 5 #> # … with 52 more rows #> #> $counts[[6]] #> # A tibble: 3 x 3 #> obstruct obstruct.factor n #> <dbl> <fct> <int> #> 1 0 No 732 #> 2 1 Yes 176 #> 3 NA NA 21 #> #> $counts[[7]] #> # A tibble: 2 x 3 #> perfor perfor.factor n #> <dbl> <fct> <int> #> 1 0 No 902 #> 2 1 Yes 27 #> #> $counts[[8]] #> # A tibble: 2 x 3 #> adhere adhere.factor n #> <dbl> <fct> <int> #> 1 0 No 794 #> 2 1 Yes 135 #> #> 
$counts[[9]] #> # A tibble: 32 x 3 #> nodes node4 n #> <dbl> <dbl> <int> #> 1 0 0 2 #> 2 1 0 269 #> 3 1 1 5 #> 4 2 0 194 #> 5 3 0 124 #> 6 3 1 1 #> 7 4 0 81 #> 8 4 1 3 #> 9 5 0 1 #> 10 5 1 45 #> # … with 22 more rows #> #> $counts[[10]] #> # A tibble: 32 x 3 #> nodes node4.factor n #> <dbl> <fct> <int> #> 1 0 No 2 #> 2 1 No 269 #> 3 1 Yes 5 #> 4 2 No 194 #> 5 3 No 124 #> 6 3 Yes 1 #> 7 4 No 81 #> 8 4 Yes 3 #> 9 5 No 1 #> 10 5 Yes 45 #> # … with 22 more rows #> #> $counts[[11]] #> # A tibble: 2 x 3 #> status status.factor n #> <dbl> <fct> <int> #> 1 0 Alive 477 #> 2 1 Died 452 #> #> $counts[[12]] #> # A tibble: 4 x 3 #> differ differ.factor n #> <dbl> <fct> <int> #> 1 1 Well 93 #> 2 2 Moderate 663 #> 3 3 Poor 150 #> 4 NA NA 23 #> #> $counts[[13]] #> # A tibble: 3 x 3 #> surg surg.factor n #> <dbl> <fct> <int> #> 1 0 Short 668 #> 2 1 Long 244 #> 3 NA NA 17 #> #> $counts[[14]] #> # A tibble: 2 x 3 #> node4 node4.factor n #> <dbl> <fct> <int> #> 1 0 No 674 #> 2 1 Yes 255 #> #> $counts[[15]] #> # A tibble: 2 x 3 #> sex.factor sex.factor2 n #> <fct> <fct> <int> #> 1 Female M 445 #> 2 Male F 484 #> #> $counts[[16]] #> # A tibble: 3 x 3 #> age.factor age.factor2 n #> <fct> <fct> <int> #> 1 <40 years <60 years 70 #> 2 40-59 years <60 years 344 #> 3 60+ years 60+ years 515 #> #> $counts[[17]] #> # A tibble: 3 x 3 #> loccomp loccomp.factor n #> <dbl> <fct> <int> #> 1 0 No 616 #> 2 1 Yes 293 #> 3 NA NA 20 #> #> $counts[[18]] #> # A tibble: 3 x 3 #> mort_5yr mort_5yr.num n #> <fct> <dbl> <int> #> 1 Alive 1 511 #> 2 Died 2 404 #> 3 NA NA 14 #> #> $counts[[19]] #> # A tibble: 4 x 3 #> sex.factor2 age.factor2 n #> <fct> <fct> <int> #> 1 M <60 years 204 #> 2 M 60+ years 241 #> 3 F <60 years 210 #> 4 F 60+ years 274 #> #>
# Select a tibble and expand
out$counts[[9]] %>%
  print(n = Inf)
#> # A tibble: 32 x 3 #> nodes node4 n #> <dbl> <dbl> <int> #> 1 0 0 2 #> 2 1 0 269 #> 3 1 1 5 #> 4 2 0 194 #> 5 3 0 124 #> 6 3 1 1 #> 7 4 0 81 #> 8 4 1 3 #> 9 5 0 1 #> 10 5 1 45 #> 11 6 1 43 #> 12 7 1 38 #> 13 8 0 1 #> 14 8 1 22 #> 15 9 0 1 #> 16 9 1 19 #> 17 10 1 13 #> 18 11 1 10 #> 19 12 1 11 #> 20 13 1 7 #> 21 14 1 4 #> 22 15 1 6 #> 23 16 1 1 #> 24 17 1 2 #> 25 19 1 2 #> 26 20 1 2 #> 27 22 1 1 #> 28 24 1 1 #> 29 27 1 1 #> 30 33 1 1 #> 31 NA 0 1 #> 32 NA 1 17
# Note this variable (node4) appears miscoded in original dataset survival::colon.

# Choose to only include variables that you actually use.
# This uses standard Finalfit grammar.
dependent = "mort_5yr"
explanatory = c("age.factor2", "sex.factor2")
colon_s_small %>%
  check_recode(dependent, explanatory)
#> Warning: Factor `mort_5yr` contains implicit NA, consider using `forcats::fct_explicit_na`
#> $index #> # A tibble: 4 x 2 #> var1 var2 #> <chr> <chr> #> 1 mort_5yr mort_5yr.num #> 2 age.factor2 age.factor #> 3 sex.factor2 sex.factor #> 4 sex.factor2 age.factor2 #> #> $counts #> $counts[[1]] #> # A tibble: 3 x 3 #> mort_5yr mort_5yr.num n #> <fct> <dbl> <int> #> 1 Alive 1 511 #> 2 Died 2 404 #> 3 NA NA 14 #> #> $counts[[2]] #> # A tibble: 3 x 3 #> age.factor2 age.factor n #> <fct> <fct> <int> #> 1 <60 years <40 years 70 #> 2 <60 years 40-59 years 344 #> 3 60+ years 60+ years 515 #> #> $counts[[3]] #> # A tibble: 2 x 3 #> sex.factor2 sex.factor n #> <fct> <fct> <int> #> 1 M Female 445 #> 2 F Male 484 #> #> $counts[[4]] #> # A tibble: 4 x 3 #> sex.factor2 age.factor2 n #> <fct> <fct> <int> #> 1 M <60 years 204 #> 2 M 60+ years 241 #> 3 F <60 years 210 #> 4 F 60+ years 274 #> #>