Checks for duplicated values in columns
check_duplicate.Rd
Checks for duplicated values in columns
Usage
check_duplicate(
dataset,
uuid_column = "uuid",
columns_to_check = NULL,
log_name = "duplicate_log"
)
Arguments
- dataset
dataset to be checked, either as a dataframe or as a list with the dataframe stored as "checked_dataset".
- uuid_column
name of the uuid column in the dataset. Default is "uuid".
- columns_to_check
character vector with the names of the columns to check. If NULL (default), it will check the uuid_column. When several columns are given, a row is flagged when the combination of their values is duplicated.
- log_name
name of the log of flagged values. Default is "duplicate_log".
Value
Returns a list with the checked dataset stored as checked_dataset and a dataframe containing the duplicate log.
Examples
testdata <- data.frame(
uuid = c(letters[1:4], "a", "b", "c"),
col_a = runif(7),
col_b = runif(7)
)
check_duplicate(testdata)
#> $checked_dataset
#> uuid col_a col_b
#> 1 a 0.080750138 0.28976724
#> 2 b 0.834333037 0.73288199
#> 3 c 0.600760886 0.77252151
#> 4 d 0.157208442 0.87460066
#> 5 a 0.007399441 0.17494063
#> 6 b 0.466393497 0.03424133
#> 7 c 0.497777389 0.32038573
#>
#> $duplicate_log
#> uuid old_value question issue
#> 1 a a uuid duplicated uuid
#> 2 b b uuid duplicated uuid
#> 3 c c uuid duplicated uuid
#>
testdata2 <- data.frame(
uuid = letters[c(1:7)],
village = paste("village", c(1:3, 1:3, 4)),
ki_identifier = paste0("xx_", c(1:5, 3, 4))
)
check_duplicate(testdata2, columns_to_check = "village")
#> $checked_dataset
#> uuid village ki_identifier
#> 1 a village 1 xx_1
#> 2 b village 2 xx_2
#> 3 c village 3 xx_3
#> 4 d village 1 xx_4
#> 5 e village 2 xx_5
#> 6 f village 3 xx_3
#> 7 g village 4 xx_4
#>
#> $duplicate_log
#> # A tibble: 3 × 4
#> uuid question old_value issue
#> <chr> <chr> <chr> <glue>
#> 1 d village village 1 duplicated village
#> 2 e village village 2 duplicated village
#> 3 f village village 3 duplicated village
#>
check_duplicate(testdata2, columns_to_check = c("village", "ki_identifier"), uuid = "uuid")
#> $checked_dataset
#> uuid village ki_identifier
#> 1 a village 1 xx_1
#> 2 b village 2 xx_2
#> 3 c village 3 xx_3
#> 4 d village 1 xx_4
#> 5 e village 2 xx_5
#> 6 f village 3 xx_3
#> 7 g village 4 xx_4
#>
#> $duplicate_log
#> # A tibble: 2 × 4
#> uuid question old_value issue
#> <chr> <chr> <chr> <glue>
#> 1 f village village 3 duplicated village ~/~ ki_identifier
#> 2 f ki_identifier xx_3 duplicated village ~/~ ki_identifier
#>