diff --git a/data-raw/simulate-data.R b/data-raw/simulate-data.R index 00d83fb..5235416 100644 --- a/data-raw/simulate-data.R +++ b/data-raw/simulate-data.R @@ -29,12 +29,33 @@ library(rvest) # Simulation functions ----------------------------------------------------- +#' Zero pad an integer to a specific length +#' +#' @param x An integer or vector of integers. +#' @param width An integer describing the final width of the zero-padded integer. +#' @keywords internal +#' +#' @return A character vector of integers. +#' +#' @examples +#' pad_integers(x = 1, width = 5) +#' pad_integers(x = c(1, 2, 3), width = 10) pad_integers <- function(x, width) { x |> stringr::str_trunc(width = width, side = "left", ellipsis = "") |> stringr::str_pad(width = width, side = "left", pad = "0") } +#' Create a vector with random ICD-8 or -10 diagnoses +#' +#' @param n The number of ICD-8 or -10 diagnoses to generate. +#' @param date A date determining whether the diagnoses should be ICD-8 or ICD-10. If null, a random date will be sampled to determine which ICD revision the diagnosis should be from. In the Danish registers, ICD-10 is used after 1994. +#' +#' @return A character vector of ICD-10 diagnoses. +#' +#' @examples +#' create_fake_icd(10) +#' create_fake_icd(5, "1995-04-19") create_fake_icd <- function(n, date = NULL) { if (is.null(date)) { date <- sample(c("1993-01-01", "1995-01-01"), 1) @@ -47,6 +68,17 @@ create_fake_icd <- function(n, date = NULL) { } } +#' Create a vector of random ICD-8 diagnoses + +#' @description +#' ICD-8 is the 8th revision of the International Classification of Diseases. +#' +#' @param n The number of ICD-8 diagnoses to generate. +#' +#' @return A character vector of ICD-8 diagnoses. +#' +#' @examples +#' create_fake_icd8(1) create_fake_icd8 <- function(n) { here("data-raw/icd8-codes.csv") |> read_csv() |> @@ -54,6 +86,17 @@ create_fake_icd8 <- function(n) { sample(size = n, replace = TRUE) } +#' Create a vector of random ICD-10 diagnoses. +#' +#' @description +#' ICD-10 is the 10th revision of the International Classification of Diseases. +#' +#' @param n An integer determining how many diagnoses to create. +#' +#' @return A character vector of ICD-10 diagnoses. +#' +#' @examples +#' create_fake_icd10(3) create_fake_icd10 <- function(n) { # from: https://medinfo.dk/sks/brows.php?s_nod=6308 here("data-raw/icd10-codes.csv") |> @@ -62,6 +105,18 @@ create_fake_icd10 <- function(n) { sample(size = n, replace = TRUE) } +#' Create a vector with random ATC codes +#' +#' @description +#' Anatomical Therapeutic Chemical (ATC) codes are unique medicine codes +#' based on on what organ or system it works on and how it works. +#' +#' @param n The number of random ATC codes to generate. +#' +#' @return A character vector of ATC codes. +#' +#' @examples +#' create_fake_atc(10) create_fake_atc <- function(n) { codeCollection::ATCKoodit |> tibble::as_tibble() |> @@ -70,17 +125,49 @@ create_fake_atc <- function(n) { sample(n, replace = TRUE) } +#' Create fake dates +#' +#' @param n The number of dates to generate. +#' @param from A date determining the first date in the interval to sample from. +#' @param to A date determining the last date in the interval to sample from. +#' +#' @return A vector of dates. +#' +#' @examples +#' create_fake_date(20) +#' create_fake_date(20, "1995-04-19", "2024-04-19") create_fake_date <- function(n, from = "1977-01-01", to = lubridate::today()) { seq(as_date(from), as_date(to), by = "day") |> sample(n, replace = TRUE) } +#' Create a vector of random zero-padded integers. +#' +#' @param n The number of integers to generate. +#' @param length An integer determining the length of the padded integer. +#' +#' @return A character vector of integers. +#' +#' @examples +#' create_padded_integer(5, 10) create_padded_integer <- function(n, length) { purrr::map(1:length, \(ignore) sample(0:9, n, replace = TRUE)) |> purrr::reduce(\(integer1, integer2) paste(integer1, integer2, sep = "")) |> pad_integers(width = length) } +#' Create a vector of random NPU codes +#' +#' @description +#' Nomenclature for Properties and Units (NPUs) are codes that identifies +#' laboratory results. +#' +#' @param n The number of NPUs to create. +#' +#' @return A character vector. +#' +#' @examples +#' create_fake_npu(4) create_fake_npu <- function(n) { stringr::str_c( "NPU", @@ -88,6 +175,14 @@ create_fake_npu <- function(n) { ) } +#' Create a vector of random department specialties +#' +#' @param n The number of department specialties to create. +#' +#' @return A character vector. +#' +#' @examples +#' create_fake_hovedspeciale_ans(1000) create_fake_hovedspeciale_ans <- function(n) { "https://www.dst.dk/da/Statistik/dokumentation/Times/forebyggelsesregistret/spec" |> read_html() |> @@ -97,6 +192,15 @@ create_fake_hovedspeciale_ans <- function(n) { sample(n, replace = TRUE) } +#' Create a vector of drug names based on a vector of ATC codes +#' +#' @param atc A character describing an ATC code. +#' +#' @return A character vector with the drug name of the given ATC code. +#' +#' @examples +#' create_fake_drug_name("A03FA05") +#' create_fake_drug_name(c("A03FA05", "A02BA04")) create_fake_drug_name <- function(atc) { codeCollection::ATCKoodit |> tibble::as_tibble() |> @@ -105,20 +209,56 @@ create_fake_drug_name <- function(atc) { sample(length(atc), replace = TRUE) } +#' Transform date(s) to the format wwyy +#' +#' @param x A date or a vector of dates. +#' +#' @return A vector of dates in the format wwyy. +#' +#' @examples +#' to_wwyy("2020-12-01") +#' to_wwyy(c("2020-01-12", "1995-04-19")) to_wwyy <- function(x) { paste0(lubridate::isoweek(lubridate::as_date(x)), stringr::str_sub(lubridate::isoyear(lubridate::as_date(x)), -2)) } +#' Transform date(s) to the format yyyymmdd +#' +#' @param x A date or a vector of dates. +#' +#' @return A vector of dates in the format yyyymmdd. +#' +#' @examples +#' to_yyyymmdd("2020-12-01") +#' to_yyyymmdd(c("2020-01-12", "1995-04-19")) to_yyyymmdd <- function(x) { format(lubridate::as_date(x), format = "%Y%m%d") } # Insert extra values to overrepresent certain values ------------------------------------------------------ +#' Generate logic based on a probability +#' +#' @param proportion A double between 0 and 1. +#' +#' @return A logic vector. TRUE if the random number is less than the proportion, +#' otherwise FALSE. +#' +#' @examples +#' insertion_rate(0.3) insertion_rate <- function(proportion) { runif(1) < proportion } +#' Insert specific ATC codes based on a proportion +#' +#' @param data A tibble. +#' @param proportion Proportion to be resampled. Defaults to 0.3. +#' +#' @return A tibble with a proportion of resampled ATC codes for columns +#' named 'atc' +#' +#' @examples insert_specific_atc <- function(data, proportion = 0.3) { glucose_lowering_drugs <- c( metformin = "A10AB02", @@ -149,42 +289,67 @@ insert_specific_atc <- function(data, proportion = 0.3) { ) } -# Insert a few cases where purchases of metformin are used for other purposes -# than diabetes. +#' Insert cases where metformin is used for other purposes than diabetes +#' +#' @description +#' This function uses the variable 'indo' which is the code for the underlying +#' condition treated by the prescribed medication. +#' +#' @param data A tibble. +#' @param proportion Proportion to resample. Defaults to 0.05. +#' +#' @return A tibble. If all column names in the tibble is either 'atc' or +#' 'name', a proportion of observations is resampled as metmorfin. insert_false_metformin <- function(data, proportion = 0.05) { - if (!all(colnames(data) %in% c("atc", "name"))) { - return(data) - } - data |> - dplyr::mutate( - atc = dplyr::if_else( - indo %in% c("0000092", "0000276", "0000781") & insertion_rate(proportion), - "A10BA02", - atc - ), - name = dplyr::if_else( - indo %in% c("0000092", "0000276", "0000781") & insertion_rate(proportion), - "metformin", - name + if (all(c("atc", "name", "indo") %in% colnames(data))) { + data |> + dplyr::mutate( + atc = dplyr::if_else( + indo %in% c("0000092", "0000276", "0000781") & insertion_rate(proportion), + "A10BA02", + atc + ), + name = dplyr::if_else( + indo %in% c("0000092", "0000276", "0000781") & insertion_rate(proportion), + "metformin", + name + ) ) - ) + } else { + data + } } -# Insert some false positives for Wegovy and Saxenda. +# Insert false positives for Wegovy and Saxenda +#' +#' @param data A tibble. +#' @param proportion Proportion to resample. Defaults to 0.05. +#' +#' @return A tibble. If all column names in the tibble is either 'atc' or 'name' +#' and the atc is a A10BJ06 or A10BJ02, a proportion of observations is resampled +#' to have the name Wegovy Flextouch or Saxenda. insert_false_drug_names <- function(data, proportion = 0.05) { - if (!all(colnames(data) %in% c("atc", "name"))) { - return(data) - } - data |> - mutate( - name = case_when( - atc == "A10BJ06" & insertion_rate(proportion) ~ "Wegovy Flextouch", - atc == "A10BJ02" & insertion_rate(proportion) ~ "Saxenda", - TRUE ~ name + if (all(c("atc", "name") %in% colnames(data))) { + data |> + mutate( + name = case_when( + atc == "A10BJ06" & insertion_rate(proportion) ~ "Wegovy Flextouch", + atc == "A10BJ02" & insertion_rate(proportion) ~ "Saxenda", + TRUE ~ name + ) ) - ) + } else { + data + } } +#' Insert additional analysis codes for HbA1c +#' +#' @param data A tibble. +#' @param proportion Proportion to resample. Defaults to 0.3. +#' +#' @return A tibble. If a column is named "analysiscode", a proportion of the +#' values are replaced by codes for HbA1c. insert_analysiscode <- function(data, proportion = 0.3) { # NPU27300: New units for HbA1c # NPU03835: Old units for HbA1c @@ -201,11 +366,32 @@ insert_analysiscode <- function(data, proportion = 0.3) { ) } +#' Add drug names (from ATC codes) +#' +#' @param data A tibble. +#' +#' @return A tibble. For columns named "name", a fake drug name (atc) will be +#' added. +add_fake_drug_name <- function(data) { + data |> + mutate( + across( + matches("^name$"), + \(x) create_fake_drug_name(atc = atc) + ) + ) +} + # TODO: Need a function to reuse recnum and dw_ek_kontakt in LPR data # Simulate data ----------------------------------------------------------- -# use the simulation definition data to simulate some data +#' Simulate data based on simulation definitions +#' +#' @param data A tibble with simulation definitions. +#' @param n Number of observations to simulate. +#' +#' @return A tibble with simulated data. simulate_data <- function(data, n) { # N needs to be capitalized for fabricatr, and but to be consistent # with other functions and their use of `n`, I kept it lowercase for @@ -222,16 +408,6 @@ simulate_data <- function(data, n) { list_cbind() } -add_fake_drug_name <- function(data) { - data |> - mutate( - across( - matches("^name$"), - \(x) create_fake_drug_name(atc = atc) - ) - ) -} - set.seed(123) simulation_definitions <- here("data-raw/simulation-definitions.csv") |> read_csv(show_col_types = FALSE) |>