Implement cleaning log on raw dataset.
create_clean_data.Rd
Applies the changes recorded in a cleaning log to the raw dataset and returns the resulting clean dataset.
Usage
create_clean_data(
raw_dataset,
raw_data_uuid_column = "uuid",
cleaning_log,
cleaning_log_uuid_column = "uuid",
cleaning_log_question_column,
cleaning_log_new_value_column,
cleaning_log_change_type_column,
change_response_value = "change_response",
NA_response_value = "blank_response",
no_change_value = "no_action",
remove_survey_value = "remove_survey"
)
Arguments
- raw_dataset
Raw dataset
- raw_data_uuid_column
uuid column in the raw dataset. Default is "uuid".
- cleaning_log
cleaning + deletion log (data.frame).
- cleaning_log_uuid_column
uuid column in the cleaning log. Default is "uuid".
- cleaning_log_question_column
column in the cleaning log which specifies the dataset column to change.
- cleaning_log_new_value_column
cleaning log column specifying the new correct value
- cleaning_log_change_type_column
column in cleaning log which specifies change type to be made
- change_response_value
value(s) in the change type column marking entries whose response should be replaced with the new value. Default is "change_response".
- NA_response_value
value(s) in the change type column marking entries whose response should be set to blank (NA). Default is "blank_response".
- no_change_value
value(s) in the change type column marking entries which should NOT be changed. Default is "no_action".
- remove_survey_value
value(s) in the change type column marking surveys which should be removed from the data. Default is "remove_survey".
Examples
create_clean_data(
raw_dataset = cleaningtools_raw_data,
raw_data_uuid_column = "X_uuid",
cleaning_log = cleaningtools_cleaning_log,
cleaning_log_uuid_column = "X_uuid",
cleaning_log_question_column = "questions",
cleaning_log_new_value_column = "new_value",
cleaning_log_change_type_column = "change_type"
)
#> [1] "air_coolers_nb"
#> [1] "water_tank_litres_nb"
#> [1] "air_coolers_nb"
#> [1] "air_coolers_nb"
#> [1] "num_hh_hosting"
#> [1] "number_pumps"
#> [1] "pay_water_charges_amount"
#> [1] "water_sources.borehole"
#> [1] "water_sources.borehole"
#> [1] "connection_fees_amount"
#> [1] "connection_fees_amount"
#> [1] "connection_fees_amount"
#> [1] "connection_fees_amount"
#> [1] "tot_expenses"
#> [1] "return_date"
#> [1] "return_date"
#> [1] "return_date"
#> [1] "return_date"
#> [1] "return_date"
#> [1] "return_date"
#> [1] "return_date"
#> [1] "return_date"
#> [1] "return_date"
#> [1] "return_date"
#> [1] "return_date"
#> [1] "return_date"
#> [1] "return_date"
#> [1] "return_date"
#> [1] "return_date"
#> [1] "return_date"
#> [1] "return_date"
#> [1] "return_date"
#> [1] "return_date"
#> [1] "return_date"
#> [1] "return_date"
#> [1] "return_date"
#> [1] "return_date"
#> [1] "return_date"
#> [1] "treat_drink_water"
#> [1] "treat_drink_water"
#> [1] "treat_drink_water_how.filter"
#> [1] "treat_drink_water_how.expose_sunlight"
#> [1] "return_date"
#> [1] "return_date"
#> [1] "return_date"
#> [1] "return_date"
#> [1] "return_date"
#> [1] "return_date"
#> [1] "return_date"
#> [1] "return_date"
#> [1] "return_date"
#> # A tibble: 578 × 348
#> X.U.FEFF.start end date_assessment deviceid enumerator_num neighbourhood
#> <chr> <chr> <chr> <chr> <int> <chr>
#> 1 2021-07-06T11:19… 2021… 2021-07-06 collect… 5 frgdf
#> 2 2021-07-06T12:15… 2021… 2021-07-06 collect… 5 frgdf
#> 3 2021-07-06T11:35… 2021… 2021-07-06 collect… 16 Bccccccc_1
#> 4 2021-07-06T13:30… 2021… 2021-07-06 collect… 5 frgdf
#> 5 2021-07-06T12:29… 2021… 2021-07-06 collect… 16 Bccccccc_1
#> 6 2021-07-06T14:27… 2021… 2021-07-06 collect… 5 frgdf
#> 7 2021-07-06T13:15… 2021… 2021-07-06 collect… 16 Bx1
#> 8 2021-07-06T14:26… 2021… 2021-07-06 collect… 16 Bx1
#> 9 2021-07-05T23:39… 2021… 2021-07-05 collect… 15 Bccccccc_1
#> 10 2021-07-06T00:30… 2021… 2021-07-06 collect… 15 Bccccccc_1
#> # ℹ 568 more rows
#> # ℹ 342 more variables: consent_remote <chr>, hhh_r <chr>,
#> # will_to_response_r <chr>, age_respondent_r <int>, age_hoh <int>,
#> # gender_hoh <chr>, displace_status <chr>, displace_status_returnee <chr>,
#> # return_date <chr>, num_hh_member <int>, hh_hosting <chr>,
#> # num_hh_hosting <int>, hh_hosted <chr>, shelter_occupation <chr>,
#> # shelter_occupation_other <chr>, property_title <chr>, …
cleaning_log_test <- data.frame(
uuid = paste0("uuid", 1:4),
question = c("age", "gender", "pop_group", "strata"),
change_type = c("blank_response", "no_change", "Delete", "change_res"),
new_value = c(NA_character_, NA_character_, NA_character_, "st-a")
)
test_data <- data.frame(
uuid = paste0("uuid", 1:4),
age = c(180, 23, 45, 67),
gender = c("male", "female", "male", "female"),
pop_group = c("idp", "refugee", "host", "idp"),
strata = c("a", "b", "c", "d")
)
review_cleaning_log(
raw_dataset = test_data,
raw_data_uuid_column = "uuid",
cleaning_log = cleaning_log_test,
cleaning_log_change_type_column = "change_type",
change_response_value = "change_res",
cleaning_log_question_column = "question",
cleaning_log_uuid_column = "uuid",
cleaning_log_new_value_column = "new_value"
)
#> [1] "no issues in cleaning log found"
create_clean_data(
raw_dataset = test_data, raw_data_uuid_column = "uuid", cleaning_log = cleaning_log_test,
cleaning_log_change_type_column = "change_type",
change_response_value = "change_res",
NA_response_value = "blank_response",
no_change_value = "no_change",
remove_survey_value = "Delete",
cleaning_log_question_column = "question",
cleaning_log_uuid_column = "uuid",
cleaning_log_new_value_column = "new_value"
)
#> [1] "age"
#> [1] "strata"
#> uuid age gender pop_group strata
#> 1 uuid1 NA male idp a
#> 2 uuid2 23 female refugee b
#> 3 uuid4 67 female idp st-a