check outliers over the dataset
check_outliers.Rd
Check for outliers across the dataset.
Usage
check_outliers(
dataset,
uuid_column = "uuid",
element_name = "checked_dataset",
kobo_survey = NULL,
kobo_choices = NULL,
cols_to_add_cleaning_log = NULL,
strongness_factor = 3,
minimum_unique_value_of_variable = NULL,
remove_choice_multiple = TRUE,
sm_separator = ".",
columns_not_to_check = NULL
)
Arguments
- dataset
Dataset to be checked, either as a dataframe or as a list with the dataframe stored as "checked_dataset".
- uuid_column
Name of the UUID column. Default is "uuid".
- element_name
Name of the dataset element in the list. Default is "checked_dataset".
- kobo_survey
Kobo survey sheet. Default is NULL.
- kobo_choices
Kobo choices sheet. Default is NULL.
- cols_to_add_cleaning_log
Variables that must be included in the output. Default is NULL.
- strongness_factor
The strongness factor defines how strong your outliers will be. The default is 3.
- minimum_unique_value_of_variable
Default is NULL, meaning this parameter won't be considered. For example, 10 means that any variable with fewer than 10 unique values won't be considered for outlier checking.
- remove_choice_multiple
TRUE (default) will remove choice multiple questions from the output.
- sm_separator
Separator for choice multiple questions. The default is "."
- columns_not_to_check
Columns to exclude from the checks even if they are numeric values.
Value
Returns a list with the checked dataset stored as checked_dataset and a dataframe with the outliers log stored as potential_outliers.
Examples
dataset_outlier <- data.frame(
uuid = paste0("uuid_", 1:100),
one_value = c(round(runif(90, min = 45, max = 55)), round(runif(5)), round(runif(5, 99, 100))),
expense = c(sample(200:500, replace = TRUE, size = 95), c(600, 100, 80, 1020, 1050)),
income = c(c(60, 0, 80, 1020, 1050), sample(20000:50000, replace = TRUE, size = 95)),
yy = c(rep(100, 99), 10)
)
check_outliers(dataset = dataset_outlier, uuid_column = "uuid")
#> [1] "checking_one_value"
#> [1] "checking_expense"
#> [1] "checking_income"
#> [1] "checking_yy"
#> $checked_dataset
#> uuid one_value expense income yy
#> 1 uuid_1 49 472 60 100
#> 2 uuid_2 47 212 0 100
#> 3 uuid_3 49 296 80 100
#> 4 uuid_4 46 276 1020 100
#> 5 uuid_5 49 487 1050 100
#> 6 uuid_6 55 396 21758 100
#> 7 uuid_7 48 210 37267 100
#> 8 uuid_8 52 491 48533 100
#> 9 uuid_9 52 462 23438 100
#> 10 uuid_10 47 350 33371 100
#> 11 uuid_11 55 233 22033 100
#> 12 uuid_12 52 365 31705 100
#> 13 uuid_13 46 436 38238 100
#> 14 uuid_14 50 224 45277 100
#> 15 uuid_15 52 362 42049 100
#> 16 uuid_16 52 244 26905 100
#> 17 uuid_17 45 409 46665 100
#> 18 uuid_18 47 370 22353 100
#> 19 uuid_19 48 203 28612 100
#> 20 uuid_20 51 496 26162 100
#> 21 uuid_21 50 281 32883 100
#> 22 uuid_22 49 293 46589 100
#> 23 uuid_23 52 403 24301 100
#> 24 uuid_24 54 344 28289 100
#> 25 uuid_25 47 401 48660 100
#> 26 uuid_26 47 435 34180 100
#> 27 uuid_27 52 468 30885 100
#> 28 uuid_28 50 494 33362 100
#> 29 uuid_29 51 412 23184 100
#> 30 uuid_30 52 392 41463 100
#> 31 uuid_31 46 202 28359 100
#> 32 uuid_32 53 462 37117 100
#> 33 uuid_33 53 376 38901 100
#> 34 uuid_34 55 397 20967 100
#> 35 uuid_35 55 467 43181 100
#> 36 uuid_36 49 494 47135 100
#> 37 uuid_37 50 241 36450 100
#> 38 uuid_38 48 239 34353 100
#> 39 uuid_39 47 445 21006 100
#> 40 uuid_40 50 312 43643 100
#> 41 uuid_41 50 443 31167 100
#> 42 uuid_42 53 214 38555 100
#> 43 uuid_43 47 499 22026 100
#> 44 uuid_44 52 284 38967 100
#> 45 uuid_45 46 228 30918 100
#> 46 uuid_46 49 301 21881 100
#> 47 uuid_47 53 341 38740 100
#> 48 uuid_48 48 236 41319 100
#> 49 uuid_49 51 331 44814 100
#> 50 uuid_50 48 255 31428 100
#> 51 uuid_51 51 384 26799 100
#> 52 uuid_52 47 344 40018 100
#> 53 uuid_53 54 230 22403 100
#> 54 uuid_54 50 379 35264 100
#> 55 uuid_55 50 223 34108 100
#> 56 uuid_56 48 455 21048 100
#> 57 uuid_57 49 288 28428 100
#> 58 uuid_58 49 209 32193 100
#> 59 uuid_59 45 410 29302 100
#> 60 uuid_60 50 378 41452 100
#> 61 uuid_61 49 228 45394 100
#> 62 uuid_62 45 248 33598 100
#> 63 uuid_63 49 278 32953 100
#> 64 uuid_64 51 390 47895 100
#> 65 uuid_65 54 478 26288 100
#> 66 uuid_66 49 230 30814 100
#> 67 uuid_67 50 239 24384 100
#> 68 uuid_68 51 334 46088 100
#> 69 uuid_69 48 378 26191 100
#> 70 uuid_70 48 291 41737 100
#> 71 uuid_71 50 303 26121 100
#> 72 uuid_72 54 467 38282 100
#> 73 uuid_73 49 458 46068 100
#> 74 uuid_74 47 271 42929 100
#> 75 uuid_75 52 447 35849 100
#> 76 uuid_76 46 270 40826 100
#> 77 uuid_77 55 409 27319 100
#> 78 uuid_78 46 225 26625 100
#> 79 uuid_79 50 316 39667 100
#> 80 uuid_80 53 417 44900 100
#> 81 uuid_81 52 256 23453 100
#> 82 uuid_82 47 242 26827 100
#> 83 uuid_83 50 488 29750 100
#> 84 uuid_84 53 215 22540 100
#> 85 uuid_85 45 286 28243 100
#> 86 uuid_86 50 342 23075 100
#> 87 uuid_87 53 231 30578 100
#> 88 uuid_88 53 411 48605 100
#> 89 uuid_89 49 418 33780 100
#> 90 uuid_90 47 264 45237 100
#> 91 uuid_91 0 259 22287 100
#> 92 uuid_92 1 480 48754 100
#> 93 uuid_93 1 324 29467 100
#> 94 uuid_94 1 337 40621 100
#> 95 uuid_95 1 242 43323 100
#> 96 uuid_96 100 600 22643 100
#> 97 uuid_97 100 100 44472 100
#> 98 uuid_98 100 80 21021 100
#> 99 uuid_99 99 1020 44212 100
#> 100 uuid_100 100 1050 43365 10
#>
#> $potential_outliers
#> # A tibble: 18 × 4
#> uuid issue question old_value
#> <chr> <chr> <chr> <dbl>
#> 1 uuid_91 outlier (normal distribution) one_value 0
#> 2 uuid_92 outlier (normal distribution) one_value 1
#> 3 uuid_93 outlier (normal distribution) one_value 1
#> 4 uuid_94 outlier (normal distribution) one_value 1
#> 5 uuid_95 outlier (normal distribution) one_value 1
#> 6 uuid_96 outlier (normal distribution) one_value 100
#> 7 uuid_97 outlier (normal distribution) one_value 100
#> 8 uuid_98 outlier (normal distribution) one_value 100
#> 9 uuid_99 outlier (normal distribution) one_value 99
#> 10 uuid_100 outlier (normal distribution) one_value 100
#> 11 uuid_99 outlier (normal distribution) expense 1020
#> 12 uuid_100 outlier (normal distribution) expense 1050
#> 13 uuid_97 outlier (log distribution) expense 100
#> 14 uuid_98 outlier (log distribution) expense 80
#> 15 uuid_1 outlier (log distribution) income 60
#> 16 uuid_2 outlier (log distribution) income 0
#> 17 uuid_3 outlier (log distribution) income 80
#> 18 uuid_100 outlier (normal distribution) yy 10
#>