Skip to contents

The function flags surveys based on their percentage of missing values. The missing values column can be created with add_percentage_missing, and its values are flagged with check_outliers.

Usage

check_percentage_missing(
  dataset,
  uuid_column = "uuid",
  column_to_check = "percentage_missing",
  strongness_factor = 2,
  log_name = "percentage_missing_log"
)

Arguments

dataset

A dataset to be checked, either as a dataframe or as a list with the dataframe stored as "checked_dataset".

uuid_column

uuid column in the dataset. Default is "uuid".

column_to_check

Character string with the name of the column to check. Default is "percentage_missing".

strongness_factor

The strongness factor defines how strong your outliers will be. The default is 2.

log_name

Name of the log of flagged values. Default is "percentage_missing_log".

Value

Returns a list with the checked dataset stored as checked_dataset and a dataframe with the log of flagged values, stored under the name given by log_name.

Examples

# Adding the percentage missing first
data_example <- data.frame(
  uuid = letters[1:3],
  col_1 = c(1:3),
  col_2 = c(NA, NA, "expenditures"),
  col_3 = c("with need", NA, "with need"),
  col_4 = c("food health school", NA, "food"),
  col_4.food = c(1, NA, 1),
  col_4.health = c(1, NA, 0),
  col_4.school = c(1, NA, 0)
)
data_example <- data_example %>%
  add_percentage_missing()
data_example %>% check_percentage_missing()
#> [1] "checking_percentage_missing"
#> $checked_dataset
#>   uuid col_1        col_2     col_3              col_4 col_4.food col_4.health
#> 1    a     1         <NA> with need food health school          1            1
#> 2    b     2         <NA>      <NA>               <NA>         NA           NA
#> 3    c     3 expenditures with need               food          1            0
#>   col_4.school percentage_missing
#> 1            1              0.125
#> 2           NA              0.750
#> 3            0              0.000
#> 
#> $percentage_missing_log
#> # A tibble: 0 × 4
#> # ℹ 4 variables: uuid <chr>, issue <chr>, question <chr>, old_value <chr>
#> 

# With a dataset that already has a percentage missing
data_example2 <- data.frame(
  uuid = letters,
  any_cols = LETTERS,
  any_number = 1:26,
  percentage_missing = c(rep(.05, 25), .99)
)
data_example2 %>% check_percentage_missing()
#> [1] "checking_percentage_missing"
#> $checked_dataset
#>    uuid any_cols any_number percentage_missing
#> 1     a        A          1               0.05
#> 2     b        B          2               0.05
#> 3     c        C          3               0.05
#> 4     d        D          4               0.05
#> 5     e        E          5               0.05
#> 6     f        F          6               0.05
#> 7     g        G          7               0.05
#> 8     h        H          8               0.05
#> 9     i        I          9               0.05
#> 10    j        J         10               0.05
#> 11    k        K         11               0.05
#> 12    l        L         12               0.05
#> 13    m        M         13               0.05
#> 14    n        N         14               0.05
#> 15    o        O         15               0.05
#> 16    p        P         16               0.05
#> 17    q        Q         17               0.05
#> 18    r        R         18               0.05
#> 19    s        S         19               0.05
#> 20    t        T         20               0.05
#> 21    u        U         21               0.05
#> 22    v        V         22               0.05
#> 23    w        W         23               0.05
#> 24    x        X         24               0.05
#> 25    y        Y         25               0.05
#> 26    z        Z         26               0.99
#> 
#> $percentage_missing_log
#> # A tibble: 1 × 4
#>   uuid  issue                                                 question old_value
#>   <chr> <chr>                                                 <chr>    <chr>    
#> 1 z     Percentages of missing values from this survey is di… percent… 0.99     
#>