Skip to contents

Apply a cleaning log to a raw dataset.

Usage

create_clean_data(
  raw_dataset,
  raw_data_uuid_column = "uuid",
  cleaning_log,
  cleaning_log_uuid_column = "uuid",
  cleaning_log_question_column,
  cleaning_log_new_value_column,
  cleaning_log_change_type_column,
  change_response_value = "change_response",
  NA_response_value = "blank_response",
  no_change_value = "no_action",
  remove_survey_value = "remove_survey"
)

Arguments

raw_dataset

Raw dataset

raw_data_uuid_column

UUID column in the raw dataset. Default is "uuid".

cleaning_log

Cleaning and deletion log (data.frame).

cleaning_log_uuid_column

uuid column in the cleaning log. Default is "uuid".

cleaning_log_question_column

Column in the cleaning log that specifies which dataset column to change.

cleaning_log_new_value_column

Column in the cleaning log that specifies the new, corrected value.

cleaning_log_change_type_column

Column in the cleaning log that specifies the type of change to be made.

change_response_value

Value(s) in the change type column marking entries whose response should be changed to the new value. Default is "change_response".

NA_response_value

Value(s) in the change type column marking entries whose response should be set to blank (NA). Default is "blank_response".

no_change_value

Value(s) in the change type column marking entries whose response should NOT be changed. Default is "no_action".

remove_survey_value

Value(s) in the change type column marking surveys that should be removed from the data. Default is "remove_survey".

Value

The cleaned dataset, with the changes from the cleaning log applied.

Examples


create_clean_data(
raw_dataset = cleaningtools_raw_data,
raw_data_uuid_column = "X_uuid",
cleaning_log = cleaningtools_cleaning_log,
cleaning_log_uuid_column = "X_uuid",
cleaning_log_question_column = "questions",
cleaning_log_new_value_column = "new_value",
cleaning_log_change_type_column = "change_type"
)
#> [1] "air_coolers_nb"
#> [1] "water_tank_litres_nb"
#> [1] "air_coolers_nb"
#> [1] "air_coolers_nb"
#> [1] "num_hh_hosting"
#> [1] "number_pumps"
#> [1] "pay_water_charges_amount"
#> [1] "water_sources.borehole"
#> [1] "water_sources.borehole"
#> [1] "connection_fees_amount"
#> [1] "connection_fees_amount"
#> [1] "connection_fees_amount"
#> [1] "connection_fees_amount"
#> [1] "tot_expenses"
#> [1] "return_date"
#> [1] "return_date"
#> [1] "return_date"
#> [1] "return_date"
#> [1] "return_date"
#> [1] "return_date"
#> [1] "return_date"
#> [1] "return_date"
#> [1] "return_date"
#> [1] "return_date"
#> [1] "return_date"
#> [1] "return_date"
#> [1] "return_date"
#> [1] "return_date"
#> [1] "return_date"
#> [1] "return_date"
#> [1] "return_date"
#> [1] "return_date"
#> [1] "return_date"
#> [1] "return_date"
#> [1] "return_date"
#> [1] "return_date"
#> [1] "return_date"
#> [1] "return_date"
#> [1] "treat_drink_water"
#> [1] "treat_drink_water"
#> [1] "treat_drink_water_how.filter"
#> [1] "treat_drink_water_how.expose_sunlight"
#> [1] "return_date"
#> [1] "return_date"
#> [1] "return_date"
#> [1] "return_date"
#> [1] "return_date"
#> [1] "return_date"
#> [1] "return_date"
#> [1] "return_date"
#> [1] "return_date"
#> # A tibble: 578 × 348
#>    X.U.FEFF.start    end   date_assessment deviceid enumerator_num neighbourhood
#>    <chr>             <chr> <chr>           <chr>             <int> <chr>        
#>  1 2021-07-06T11:19… 2021… 2021-07-06      collect…              5 frgdf        
#>  2 2021-07-06T12:15… 2021… 2021-07-06      collect…              5 frgdf        
#>  3 2021-07-06T11:35… 2021… 2021-07-06      collect…             16 Bccccccc_1   
#>  4 2021-07-06T13:30… 2021… 2021-07-06      collect…              5 frgdf        
#>  5 2021-07-06T12:29… 2021… 2021-07-06      collect…             16 Bccccccc_1   
#>  6 2021-07-06T14:27… 2021… 2021-07-06      collect…              5 frgdf        
#>  7 2021-07-06T13:15… 2021… 2021-07-06      collect…             16 Bx1          
#>  8 2021-07-06T14:26… 2021… 2021-07-06      collect…             16 Bx1          
#>  9 2021-07-05T23:39… 2021… 2021-07-05      collect…             15 Bccccccc_1   
#> 10 2021-07-06T00:30… 2021… 2021-07-06      collect…             15 Bccccccc_1   
#> # ℹ 568 more rows
#> # ℹ 342 more variables: consent_remote <chr>, hhh_r <chr>,
#> #   will_to_response_r <chr>, age_respondent_r <int>, age_hoh <int>,
#> #   gender_hoh <chr>, displace_status <chr>, displace_status_returnee <chr>,
#> #   return_date <chr>, num_hh_member <int>, hh_hosting <chr>,
#> #   num_hh_hosting <int>, hh_hosted <chr>, shelter_occupation <chr>,
#> #   shelter_occupation_other <chr>, property_title <chr>, …

cleaning_log_test <- data.frame(
  uuid = paste0("uuid", 1:4),
  question = c("age", "gender", "pop_group", "strata"),
  change_type = c("blank_response", "no_change", "Delete", "change_res"),
  new_value = c(NA_character_, NA_character_, NA_character_, "st-a")
)
test_data <- data.frame(
  uuid = paste0("uuid", 1:4),
  age = c(180, 23, 45, 67),
  gender = c("male", "female", "male", "female"),
  pop_group = c("idp", "refugee", "host", "idp"),
  strata = c("a", "b", "c", "d")
)

review_cleaning_log(
  raw_dataset = test_data,
  raw_data_uuid_column = "uuid",
  cleaning_log = cleaning_log_test,
  cleaning_log_change_type_column = "change_type",
  change_response_value = "change_res",
  cleaning_log_question_column = "question",
  cleaning_log_uuid_column = "uuid",
  cleaning_log_new_value_column = "new_value"
)
#> [1] "no issues in cleaning log found"

create_clean_data(
  raw_dataset = test_data, raw_data_uuid_column = "uuid", cleaning_log = cleaning_log_test,
  cleaning_log_change_type_column = "change_type",
  change_response_value = "change_res",
  NA_response_value = "blank_response",
  no_change_value = "no_change",
  remove_survey_value = "Delete",
  cleaning_log_question_column = "question",
  cleaning_log_uuid_column = "uuid",
  cleaning_log_new_value_column = "new_value"
)
#> [1] "age"
#> [1] "strata"
#>    uuid age gender pop_group strata
#> 1 uuid1  NA   male       idp      a
#> 2 uuid2  23 female   refugee      b
#> 3 uuid4  67 female       idp   st-a