Skip to contents

Checks for duplicated values in columns

Usage

check_duplicate(
  dataset,
  uuid_column = "uuid",
  columns_to_check = NULL,
  log_name = "duplicate_log"
)

Arguments

dataset

Dataset to be checked, either as a dataframe or as a list with the dataframe stored as "checked_dataset".

uuid_column

Name of the uuid column in the dataset. Default is "uuid".

columns_to_check

Character vector with the names of the columns to check for duplicates. If NULL (default), the uuid_column is checked.

log_name

Name of the log of flagged values.

Value

Returns a list with the checked dataset stored as checked_dataset and a dataframe with the duplicate log.

Examples

testdata <- data.frame(
  uuid = c(letters[1:4], "a", "b", "c"),
  col_a = runif(7),
  col_b = runif(7)
)

check_duplicate(testdata)
#> $checked_dataset
#>   uuid       col_a      col_b
#> 1    a 0.080750138 0.28976724
#> 2    b 0.834333037 0.73288199
#> 3    c 0.600760886 0.77252151
#> 4    d 0.157208442 0.87460066
#> 5    a 0.007399441 0.17494063
#> 6    b 0.466393497 0.03424133
#> 7    c 0.497777389 0.32038573
#> 
#> $duplicate_log
#>   uuid old_value question           issue
#> 1    a         a     uuid duplicated uuid
#> 2    b         b     uuid duplicated uuid
#> 3    c         c     uuid duplicated uuid
#> 

testdata2 <- data.frame(
  uuid = letters[c(1:7)],
  village = paste("village", c(1:3, 1:3, 4)),
  ki_identifier = paste0("xx_", c(1:5, 3, 4))
)

check_duplicate(testdata2, columns_to_check = "village")
#> $checked_dataset
#>   uuid   village ki_identifier
#> 1    a village 1          xx_1
#> 2    b village 2          xx_2
#> 3    c village 3          xx_3
#> 4    d village 1          xx_4
#> 5    e village 2          xx_5
#> 6    f village 3          xx_3
#> 7    g village 4          xx_4
#> 
#> $duplicate_log
#> # A tibble: 3 × 4
#>   uuid  question old_value issue             
#>   <chr> <chr>    <chr>     <glue>            
#> 1 d     village  village 1 duplicated village
#> 2 e     village  village 2 duplicated village
#> 3 f     village  village 3 duplicated village
#> 

check_duplicate(testdata2, columns_to_check = c("village", "ki_identifier"), uuid = "uuid")
#> $checked_dataset
#>   uuid   village ki_identifier
#> 1    a village 1          xx_1
#> 2    b village 2          xx_2
#> 3    c village 3          xx_3
#> 4    d village 1          xx_4
#> 5    e village 2          xx_5
#> 6    f village 3          xx_3
#> 7    g village 4          xx_4
#> 
#> $duplicate_log
#> # A tibble: 2 × 4
#>   uuid  question      old_value issue                               
#>   <chr> <chr>         <chr>     <glue>                              
#> 1 f     village       village 3 duplicated village ~/~ ki_identifier
#> 2 f     ki_identifier xx_3      duplicated village ~/~ ki_identifier
#>