Skip to contents

Check for outliers in the dataset

Usage

check_outliers(
  dataset,
  uuid_column = "uuid",
  element_name = "checked_dataset",
  kobo_survey = NULL,
  kobo_choices = NULL,
  cols_to_add_cleaning_log = NULL,
  strongness_factor = 3,
  minimum_unique_value_of_variable = NULL,
  remove_choice_multiple = TRUE,
  sm_separator = ".",
  columns_not_to_check = NULL
)

Arguments

dataset

Dataset to be checked, either as a dataframe or as a list with the dataframe stored as "checked_dataset".

uuid_column

Name of the UUID column. Default is "uuid".

element_name

Name of the dataset element in the list. Default is "checked_dataset".

kobo_survey

Kobo survey sheet. Default is NULL.

kobo_choices

Kobo choices sheet. Default is NULL.

cols_to_add_cleaning_log

Variables that must be included in the output.

strongness_factor

The strongness factor defines how strong your outliers will be. The default is 3.

minimum_unique_value_of_variable

Default is NULL, meaning this parameter won't be considered. For example, 10 means that any variable with fewer than 10 unique values won't be considered for outlier checking.

remove_choice_multiple

TRUE (default) will remove choice multiple questions from the output.

sm_separator

Separator for choice multiple questions. The default is "."

columns_not_to_check

Columns to exclude from the checks even if they are numeric values.

Value

Returns a list with the checked dataset stored as checked_dataset and a dataframe with the outliers log stored as potential_outliers.

Examples

dataset_outlier <- data.frame(
  uuid = paste0("uuid_", 1:100),
  one_value = c(round(runif(90, min = 45, max = 55)), round(runif(5)), round(runif(5, 99, 100))),
  expense = c(sample(200:500, replace = TRUE, size = 95), c(600, 100, 80, 1020, 1050)),
  income = c(c(60, 0, 80, 1020, 1050), sample(20000:50000, replace = TRUE, size = 95)),
  yy = c(rep(100, 99), 10)
)

check_outliers(dataset = dataset_outlier, uuid_column = "uuid")
#> [1] "checking_one_value"
#> [1] "checking_expense"
#> [1] "checking_income"
#> [1] "checking_yy"
#> $checked_dataset
#>         uuid one_value expense income  yy
#> 1     uuid_1        49     472     60 100
#> 2     uuid_2        47     212      0 100
#> 3     uuid_3        49     296     80 100
#> 4     uuid_4        46     276   1020 100
#> 5     uuid_5        49     487   1050 100
#> 6     uuid_6        55     396  21758 100
#> 7     uuid_7        48     210  37267 100
#> 8     uuid_8        52     491  48533 100
#> 9     uuid_9        52     462  23438 100
#> 10   uuid_10        47     350  33371 100
#> 11   uuid_11        55     233  22033 100
#> 12   uuid_12        52     365  31705 100
#> 13   uuid_13        46     436  38238 100
#> 14   uuid_14        50     224  45277 100
#> 15   uuid_15        52     362  42049 100
#> 16   uuid_16        52     244  26905 100
#> 17   uuid_17        45     409  46665 100
#> 18   uuid_18        47     370  22353 100
#> 19   uuid_19        48     203  28612 100
#> 20   uuid_20        51     496  26162 100
#> 21   uuid_21        50     281  32883 100
#> 22   uuid_22        49     293  46589 100
#> 23   uuid_23        52     403  24301 100
#> 24   uuid_24        54     344  28289 100
#> 25   uuid_25        47     401  48660 100
#> 26   uuid_26        47     435  34180 100
#> 27   uuid_27        52     468  30885 100
#> 28   uuid_28        50     494  33362 100
#> 29   uuid_29        51     412  23184 100
#> 30   uuid_30        52     392  41463 100
#> 31   uuid_31        46     202  28359 100
#> 32   uuid_32        53     462  37117 100
#> 33   uuid_33        53     376  38901 100
#> 34   uuid_34        55     397  20967 100
#> 35   uuid_35        55     467  43181 100
#> 36   uuid_36        49     494  47135 100
#> 37   uuid_37        50     241  36450 100
#> 38   uuid_38        48     239  34353 100
#> 39   uuid_39        47     445  21006 100
#> 40   uuid_40        50     312  43643 100
#> 41   uuid_41        50     443  31167 100
#> 42   uuid_42        53     214  38555 100
#> 43   uuid_43        47     499  22026 100
#> 44   uuid_44        52     284  38967 100
#> 45   uuid_45        46     228  30918 100
#> 46   uuid_46        49     301  21881 100
#> 47   uuid_47        53     341  38740 100
#> 48   uuid_48        48     236  41319 100
#> 49   uuid_49        51     331  44814 100
#> 50   uuid_50        48     255  31428 100
#> 51   uuid_51        51     384  26799 100
#> 52   uuid_52        47     344  40018 100
#> 53   uuid_53        54     230  22403 100
#> 54   uuid_54        50     379  35264 100
#> 55   uuid_55        50     223  34108 100
#> 56   uuid_56        48     455  21048 100
#> 57   uuid_57        49     288  28428 100
#> 58   uuid_58        49     209  32193 100
#> 59   uuid_59        45     410  29302 100
#> 60   uuid_60        50     378  41452 100
#> 61   uuid_61        49     228  45394 100
#> 62   uuid_62        45     248  33598 100
#> 63   uuid_63        49     278  32953 100
#> 64   uuid_64        51     390  47895 100
#> 65   uuid_65        54     478  26288 100
#> 66   uuid_66        49     230  30814 100
#> 67   uuid_67        50     239  24384 100
#> 68   uuid_68        51     334  46088 100
#> 69   uuid_69        48     378  26191 100
#> 70   uuid_70        48     291  41737 100
#> 71   uuid_71        50     303  26121 100
#> 72   uuid_72        54     467  38282 100
#> 73   uuid_73        49     458  46068 100
#> 74   uuid_74        47     271  42929 100
#> 75   uuid_75        52     447  35849 100
#> 76   uuid_76        46     270  40826 100
#> 77   uuid_77        55     409  27319 100
#> 78   uuid_78        46     225  26625 100
#> 79   uuid_79        50     316  39667 100
#> 80   uuid_80        53     417  44900 100
#> 81   uuid_81        52     256  23453 100
#> 82   uuid_82        47     242  26827 100
#> 83   uuid_83        50     488  29750 100
#> 84   uuid_84        53     215  22540 100
#> 85   uuid_85        45     286  28243 100
#> 86   uuid_86        50     342  23075 100
#> 87   uuid_87        53     231  30578 100
#> 88   uuid_88        53     411  48605 100
#> 89   uuid_89        49     418  33780 100
#> 90   uuid_90        47     264  45237 100
#> 91   uuid_91         0     259  22287 100
#> 92   uuid_92         1     480  48754 100
#> 93   uuid_93         1     324  29467 100
#> 94   uuid_94         1     337  40621 100
#> 95   uuid_95         1     242  43323 100
#> 96   uuid_96       100     600  22643 100
#> 97   uuid_97       100     100  44472 100
#> 98   uuid_98       100      80  21021 100
#> 99   uuid_99        99    1020  44212 100
#> 100 uuid_100       100    1050  43365  10
#> 
#> $potential_outliers
#> # A tibble: 18 × 4
#>    uuid     issue                         question  old_value
#>    <chr>    <chr>                         <chr>         <dbl>
#>  1 uuid_91  outlier (normal distribution) one_value         0
#>  2 uuid_92  outlier (normal distribution) one_value         1
#>  3 uuid_93  outlier (normal distribution) one_value         1
#>  4 uuid_94  outlier (normal distribution) one_value         1
#>  5 uuid_95  outlier (normal distribution) one_value         1
#>  6 uuid_96  outlier (normal distribution) one_value       100
#>  7 uuid_97  outlier (normal distribution) one_value       100
#>  8 uuid_98  outlier (normal distribution) one_value       100
#>  9 uuid_99  outlier (normal distribution) one_value        99
#> 10 uuid_100 outlier (normal distribution) one_value       100
#> 11 uuid_99  outlier (normal distribution) expense        1020
#> 12 uuid_100 outlier (normal distribution) expense        1050
#> 13 uuid_97  outlier (log distribution)    expense         100
#> 14 uuid_98  outlier (log distribution)    expense          80
#> 15 uuid_1   outlier (log distribution)    income           60
#> 16 uuid_2   outlier (log distribution)    income            0
#> 17 uuid_3   outlier (log distribution)    income           80
#> 18 uuid_100 outlier (normal distribution) yy               10
#>