This function was written a few days after the retraction of a paper in JAMA due to an error in recoding the treatment variable (https://jamanetwork.com/journals/jama/fullarticle/2752474). It takes a data frame or tibble, fuzzy matches variable names, and produces crosstables of all matched variables. A visual inspection should reveal any miscoding.

check_recode(
  .data,
  dependent = NULL,
  explanatory = NULL,
  include_numerics = TRUE,
  ...
)

Arguments

.data

Data frame or tibble.

dependent

Optional character vector: name(s) of dependent variable(s).

explanatory

Optional character vector: name(s) of explanatory variable(s).

include_numerics

Logical. Whether to include numeric variables when matching and cross-tabulating.

...

Pass other arguments to agrep.

Value

List of length two. The first element is an index of variable combinations. The second element is a nested list of crosstables as tibbles.

Examples

library(dplyr)
data(colon_s)
colon_s_small = colon_s %>%
  select(-id, -rx, -rx.factor) %>%
  mutate(
    age.factor2 = forcats::fct_collapse(age.factor,
      "<60 years" = c("<40 years", "40-59 years")),
    sex.factor2 = forcats::fct_recode(sex.factor,
    # Intentional miscode
      "F" = "Male",
      "M" = "Female")
  )

# Check
colon_s_small %>%
  check_recode(include_numerics = FALSE)
#> $index
#> # A tibble: 3 × 2
#>   var1        var2       
#>   <chr>       <chr>      
#> 1 sex.factor  sex.factor2
#> 2 age.factor  age.factor2
#> 3 sex.factor2 age.factor2
#> 
#> $counts
#> $counts[[1]]
#>   sex.factor sex.factor2   n
#> 1     Female           M 445
#> 2       Male           F 484
#> 
#> $counts[[2]]
#>    age.factor age.factor2   n
#> 1   <40 years   <60 years  70
#> 2 40-59 years   <60 years 344
#> 3   60+ years   60+ years 515
#> 
#> $counts[[3]]
#>   sex.factor2 age.factor2   n
#> 1           M   <60 years 204
#> 2           M   60+ years 241
#> 3           F   <60 years 210
#> 4           F   60+ years 274
#> 
#> 

out = colon_s_small %>%
  select(-extent, -extent.factor,-time, -time.years) %>%
  check_recode()
out
#> $index
#> # A tibble: 19 × 2
#>    var1        var2           
#>    <chr>       <chr>          
#>  1 sex         sex.factor     
#>  2 sex         sex.factor2    
#>  3 age         age.factor     
#>  4 age         age.10         
#>  5 age         age.factor2    
#>  6 obstruct    obstruct.factor
#>  7 perfor      perfor.factor  
#>  8 adhere      adhere.factor  
#>  9 nodes       node4          
#> 10 nodes       node4.factor   
#> 11 status      status.factor  
#> 12 differ      differ.factor  
#> 13 surg        surg.factor    
#> 14 node4       node4.factor   
#> 15 sex.factor  sex.factor2    
#> 16 age.factor  age.factor2    
#> 17 loccomp     loccomp.factor 
#> 18 mort_5yr    mort_5yr.num   
#> 19 sex.factor2 age.factor2    
#> 
#> $counts
#> $counts[[1]]
#>   sex sex.factor   n
#> 1   0     Female 445
#> 2   1       Male 484
#> 
#> $counts[[2]]
#>   sex sex.factor2   n
#> 1   0           M 445
#> 2   1           F 484
#> 
#> $counts[[3]]
#>    age  age.factor  n
#> 1   18   <40 years  1
#> 2   22   <40 years  1
#> 3   25   <40 years  1
#> 4   26   <40 years  1
#> 5   27   <40 years  3
#> 6   28   <40 years  1
#> 7   29   <40 years  1
#> 8   30   <40 years  5
#> 9   31   <40 years  2
#> 10  32   <40 years  5
#> 11  33   <40 years  7
#> 12  34   <40 years  4
#> 13  35   <40 years  2
#> 14  36   <40 years 10
#> 15  37   <40 years  2
#> 16  38   <40 years 10
#> 17  39   <40 years 14
#> 18  40 40-59 years  8
#> 19  41 40-59 years  7
#> 20  42 40-59 years  7
#> 21  43 40-59 years 11
#> 22  44 40-59 years  8
#> 23  45 40-59 years 13
#> 24  46 40-59 years 19
#> 25  47 40-59 years 12
#> 26  48 40-59 years 15
#> 27  49 40-59 years 13
#> 28  50 40-59 years 14
#> 29  51 40-59 years 10
#> 30  52 40-59 years 20
#> 31  53 40-59 years 22
#> 32  54 40-59 years 16
#> 33  55 40-59 years 27
#> 34  56 40-59 years 31
#> 35  57 40-59 years 31
#> 36  58 40-59 years 29
#> 37  59 40-59 years 31
#> 38  60   60+ years 31
#> 39  61   60+ years 36
#> 40  62   60+ years 21
#> 41  63   60+ years 29
#> 42  64   60+ years 36
#> 43  65   60+ years 28
#> 44  66   60+ years 35
#> 45  67   60+ years 24
#> 46  68   60+ years 38
#> 47  69   60+ years 20
#> 48  70   60+ years 36
#> 49  71   60+ years 24
#> 50  72   60+ years 25
#> 51  73   60+ years 20
#> 52  74   60+ years 34
#> 53  75   60+ years 17
#> 54  76   60+ years 21
#> 55  77   60+ years 11
#> 56  78   60+ years  5
#> 57  79   60+ years  7
#> 58  80   60+ years  8
#> 59  81   60+ years  5
#> 60  82   60+ years  2
#> 61  83   60+ years  1
#> 62  85   60+ years  1
#> 
#> $counts[[4]]
#>    age age.10  n
#> 1   18    1.8  1
#> 2   22    2.2  1
#> 3   25    2.5  1
#> 4   26    2.6  1
#> 5   27    2.7  3
#> 6   28    2.8  1
#> 7   29    2.9  1
#> 8   30    3.0  5
#> 9   31    3.1  2
#> 10  32    3.2  5
#> 11  33    3.3  7
#> 12  34    3.4  4
#> 13  35    3.5  2
#> 14  36    3.6 10
#> 15  37    3.7  2
#> 16  38    3.8 10
#> 17  39    3.9 14
#> 18  40    4.0  8
#> 19  41    4.1  7
#> 20  42    4.2  7
#> 21  43    4.3 11
#> 22  44    4.4  8
#> 23  45    4.5 13
#> 24  46    4.6 19
#> 25  47    4.7 12
#> 26  48    4.8 15
#> 27  49    4.9 13
#> 28  50    5.0 14
#> 29  51    5.1 10
#> 30  52    5.2 20
#> 31  53    5.3 22
#> 32  54    5.4 16
#> 33  55    5.5 27
#> 34  56    5.6 31
#> 35  57    5.7 31
#> 36  58    5.8 29
#> 37  59    5.9 31
#> 38  60    6.0 31
#> 39  61    6.1 36
#> 40  62    6.2 21
#> 41  63    6.3 29
#> 42  64    6.4 36
#> 43  65    6.5 28
#> 44  66    6.6 35
#> 45  67    6.7 24
#> 46  68    6.8 38
#> 47  69    6.9 20
#> 48  70    7.0 36
#> 49  71    7.1 24
#> 50  72    7.2 25
#> 51  73    7.3 20
#> 52  74    7.4 34
#> 53  75    7.5 17
#> 54  76    7.6 21
#> 55  77    7.7 11
#> 56  78    7.8  5
#> 57  79    7.9  7
#> 58  80    8.0  8
#> 59  81    8.1  5
#> 60  82    8.2  2
#> 61  83    8.3  1
#> 62  85    8.5  1
#> 
#> $counts[[5]]
#>    age age.factor2  n
#> 1   18   <60 years  1
#> 2   22   <60 years  1
#> 3   25   <60 years  1
#> 4   26   <60 years  1
#> 5   27   <60 years  3
#> 6   28   <60 years  1
#> 7   29   <60 years  1
#> 8   30   <60 years  5
#> 9   31   <60 years  2
#> 10  32   <60 years  5
#> 11  33   <60 years  7
#> 12  34   <60 years  4
#> 13  35   <60 years  2
#> 14  36   <60 years 10
#> 15  37   <60 years  2
#> 16  38   <60 years 10
#> 17  39   <60 years 14
#> 18  40   <60 years  8
#> 19  41   <60 years  7
#> 20  42   <60 years  7
#> 21  43   <60 years 11
#> 22  44   <60 years  8
#> 23  45   <60 years 13
#> 24  46   <60 years 19
#> 25  47   <60 years 12
#> 26  48   <60 years 15
#> 27  49   <60 years 13
#> 28  50   <60 years 14
#> 29  51   <60 years 10
#> 30  52   <60 years 20
#> 31  53   <60 years 22
#> 32  54   <60 years 16
#> 33  55   <60 years 27
#> 34  56   <60 years 31
#> 35  57   <60 years 31
#> 36  58   <60 years 29
#> 37  59   <60 years 31
#> 38  60   60+ years 31
#> 39  61   60+ years 36
#> 40  62   60+ years 21
#> 41  63   60+ years 29
#> 42  64   60+ years 36
#> 43  65   60+ years 28
#> 44  66   60+ years 35
#> 45  67   60+ years 24
#> 46  68   60+ years 38
#> 47  69   60+ years 20
#> 48  70   60+ years 36
#> 49  71   60+ years 24
#> 50  72   60+ years 25
#> 51  73   60+ years 20
#> 52  74   60+ years 34
#> 53  75   60+ years 17
#> 54  76   60+ years 21
#> 55  77   60+ years 11
#> 56  78   60+ years  5
#> 57  79   60+ years  7
#> 58  80   60+ years  8
#> 59  81   60+ years  5
#> 60  82   60+ years  2
#> 61  83   60+ years  1
#> 62  85   60+ years  1
#> 
#> $counts[[6]]
#>   obstruct obstruct.factor   n
#> 1        0              No 732
#> 2        1             Yes 176
#> 3       NA            <NA>  21
#> 
#> $counts[[7]]
#>   perfor perfor.factor   n
#> 1      0            No 902
#> 2      1           Yes  27
#> 
#> $counts[[8]]
#>   adhere adhere.factor   n
#> 1      0            No 794
#> 2      1           Yes 135
#> 
#> $counts[[9]]
#>    nodes node4   n
#> 1      0     0   2
#> 2      1     0 269
#> 3      1     1   5
#> 4      2     0 194
#> 5      3     0 124
#> 6      3     1   1
#> 7      4     0  81
#> 8      4     1   3
#> 9      5     0   1
#> 10     5     1  45
#> 11     6     1  43
#> 12     7     1  38
#> 13     8     0   1
#> 14     8     1  22
#> 15     9     0   1
#> 16     9     1  19
#> 17    10     1  13
#> 18    11     1  10
#> 19    12     1  11
#> 20    13     1   7
#> 21    14     1   4
#> 22    15     1   6
#> 23    16     1   1
#> 24    17     1   2
#> 25    19     1   2
#> 26    20     1   2
#> 27    22     1   1
#> 28    24     1   1
#> 29    27     1   1
#> 30    33     1   1
#> 31    NA     0   1
#> 32    NA     1  17
#> 
#> $counts[[10]]
#>    nodes node4.factor   n
#> 1      0           No   2
#> 2      1           No 269
#> 3      1          Yes   5
#> 4      2           No 194
#> 5      3           No 124
#> 6      3          Yes   1
#> 7      4           No  81
#> 8      4          Yes   3
#> 9      5           No   1
#> 10     5          Yes  45
#> 11     6          Yes  43
#> 12     7          Yes  38
#> 13     8           No   1
#> 14     8          Yes  22
#> 15     9           No   1
#> 16     9          Yes  19
#> 17    10          Yes  13
#> 18    11          Yes  10
#> 19    12          Yes  11
#> 20    13          Yes   7
#> 21    14          Yes   4
#> 22    15          Yes   6
#> 23    16          Yes   1
#> 24    17          Yes   2
#> 25    19          Yes   2
#> 26    20          Yes   2
#> 27    22          Yes   1
#> 28    24          Yes   1
#> 29    27          Yes   1
#> 30    33          Yes   1
#> 31    NA           No   1
#> 32    NA          Yes  17
#> 
#> $counts[[11]]
#>   status status.factor   n
#> 1      0         Alive 477
#> 2      1          Died 452
#> 
#> $counts[[12]]
#>   differ differ.factor   n
#> 1      1          Well  93
#> 2      2      Moderate 663
#> 3      3          Poor 150
#> 4     NA          <NA>  23
#> 
#> $counts[[13]]
#>   surg surg.factor   n
#> 1    0       Short 668
#> 2    1        Long 244
#> 3   NA        <NA>  17
#> 
#> $counts[[14]]
#>   node4 node4.factor   n
#> 1     0           No 674
#> 2     1          Yes 255
#> 
#> $counts[[15]]
#>   sex.factor sex.factor2   n
#> 1     Female           M 445
#> 2       Male           F 484
#> 
#> $counts[[16]]
#>    age.factor age.factor2   n
#> 1   <40 years   <60 years  70
#> 2 40-59 years   <60 years 344
#> 3   60+ years   60+ years 515
#> 
#> $counts[[17]]
#>   loccomp loccomp.factor   n
#> 1       0             No 616
#> 2       1            Yes 293
#> 3      NA           <NA>  20
#> 
#> $counts[[18]]
#>   mort_5yr mort_5yr.num   n
#> 1    Alive            1 511
#> 2     Died            2 404
#> 3     <NA>           NA  14
#> 
#> $counts[[19]]
#>   sex.factor2 age.factor2   n
#> 1           M   <60 years 204
#> 2           M   60+ years 241
#> 3           F   <60 years 210
#> 4           F   60+ years 274
#> 
#> 

# Select a tibble and expand
out$counts[[9]]
#>    nodes node4   n
#> 1      0     0   2
#> 2      1     0 269
#> 3      1     1   5
#> 4      2     0 194
#> 5      3     0 124
#> 6      3     1   1
#> 7      4     0  81
#> 8      4     1   3
#> 9      5     0   1
#> 10     5     1  45
#> 11     6     1  43
#> 12     7     1  38
#> 13     8     0   1
#> 14     8     1  22
#> 15     9     0   1
#> 16     9     1  19
#> 17    10     1  13
#> 18    11     1  10
#> 19    12     1  11
#> 20    13     1   7
#> 21    14     1   4
#> 22    15     1   6
#> 23    16     1   1
#> 24    17     1   2
#> 25    19     1   2
#> 26    20     1   2
#> 27    22     1   1
#> 28    24     1   1
#> 29    27     1   1
#> 30    33     1   1
#> 31    NA     0   1
#> 32    NA     1  17
# Note that this variable (node4) appears to be miscoded in the original dataset survival::colon.

# Choose to only include variables that you actually use. 
# This uses standard Finalfit grammar. 
dependent = "mort_5yr"
explanatory = c("age.factor2", "sex.factor2")
colon_s_small %>% 
  check_recode(dependent, explanatory)
#> $index
#> # A tibble: 4 × 2
#>   var1        var2        
#>   <chr>       <chr>       
#> 1 mort_5yr    mort_5yr.num
#> 2 age.factor2 age.factor  
#> 3 sex.factor2 sex.factor  
#> 4 sex.factor2 age.factor2 
#> 
#> $counts
#> $counts[[1]]
#>   mort_5yr mort_5yr.num   n
#> 1    Alive            1 511
#> 2     Died            2 404
#> 3     <NA>           NA  14
#> 
#> $counts[[2]]
#>   age.factor2  age.factor   n
#> 1   <60 years   <40 years  70
#> 2   <60 years 40-59 years 344
#> 3   60+ years   60+ years 515
#> 
#> $counts[[3]]
#>   sex.factor2 sex.factor   n
#> 1           M     Female 445
#> 2           F       Male 484
#> 
#> $counts[[4]]
#>   sex.factor2 age.factor2   n
#> 1           M   <60 years 204
#> 2           M   60+ years 241
#> 3           F   <60 years 210
#> 4           F   60+ years 274
#> 
#>