Skip to content

Commit

Permalink
Merged origin/main into build/add-targets-pipeline
Browse files Browse the repository at this point in the history
  • Loading branch information
lwjohnst86 committed Jun 25, 2024
2 parents d511c24 + 8a2f966 commit 09810da
Showing 1 changed file with 215 additions and 39 deletions.
254 changes: 215 additions & 39 deletions data-raw/simulate-data.R
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,33 @@ library(rvest)

# Simulation functions -----------------------------------------------------

#' Zero-pad values to a fixed width
#'
#' Values longer than `width` are cut from the left, keeping their rightmost
#' `width` characters. Shorter values are left-padded with "0".
#'
#' @param x An integer or vector of integers.
#' @param width An integer describing the final width of the zero-padded
#'   integer.
#' @keywords internal
#'
#' @return A character vector of zero-padded integers.
#'
#' @examples
#' pad_integers(x = 1, width = 5)
#' pad_integers(x = c(1, 2, 3), width = 10)
pad_integers <- function(x, width) {
  # An empty ellipsis means nothing is inserted where characters are cut.
  trimmed <- stringr::str_trunc(x, width = width, side = "left", ellipsis = "")
  stringr::str_pad(trimmed, width = width, side = "left", pad = "0")
}

#' Create a vector with random ICD-8 or -10 diagnoses
#'
#' @param n The number of ICD-8 or -10 diagnoses to generate.
#' @param date A date determining whether the diagnoses should be ICD-8 or ICD-10. If null, a random date will be sampled to determine which ICD revision the diagnosis should be from. In the Danish registers, ICD-10 is used after 1994.
#'
#' @return A character vector of ICD-8 or ICD-10 diagnoses.
#'
#' @examples
#' create_fake_icd(10)
#' create_fake_icd(5, "1995-04-19")
create_fake_icd <- function(n, date = NULL) {
if (is.null(date)) {
date <- sample(c("1993-01-01", "1995-01-01"), 1)
Expand All @@ -47,13 +68,35 @@ create_fake_icd <- function(n, date = NULL) {
}
}

#' Create a vector of random ICD-8 diagnoses
#'
#' @description
#' ICD-8 is the 8th revision of the International Classification of Diseases.
#' Diagnoses are sampled with replacement from the `icd8` column of
#' `data-raw/icd8-codes.csv`.
#'
#' @param n The number of ICD-8 diagnoses to generate.
#'
#' @return A vector of ICD-8 diagnoses.
#'
#' @examples
#' create_fake_icd8(1)
create_fake_icd8 <- function(n) {
  here("data-raw/icd8-codes.csv") |>
    # Silence the column specification message, as is done when the
    # simulation definitions are read.
    read_csv(show_col_types = FALSE) |>
    pull(icd8) |>
    sample(size = n, replace = TRUE)
}

#' Create a vector of random ICD-10 diagnoses.
#'
#' @description
#' ICD-10 is the 10th revision of the International Classification of Diseases.
#'
#' @param n An integer determining how many diagnoses to create.
#'
#' @return A character vector of ICD-10 diagnoses.
#'
#' @examples
#' create_fake_icd10(3)
create_fake_icd10 <- function(n) {
# from: https://medinfo.dk/sks/brows.php?s_nod=6308
here("data-raw/icd10-codes.csv") |>
Expand All @@ -62,6 +105,18 @@ create_fake_icd10 <- function(n) {
sample(size = n, replace = TRUE)
}

#' Create a vector with random ATC codes
#'
#' @description
#' Anatomical Therapeutic Chemical (ATC) codes are unique medicine codes
#' based on what organ or system it works on and how it works.
#'
#' @param n The number of random ATC codes to generate.
#'
#' @return A character vector of ATC codes.
#'
#' @examples
#' create_fake_atc(10)
create_fake_atc <- function(n) {
codeCollection::ATCKoodit |>
tibble::as_tibble() |>
Expand All @@ -70,24 +125,64 @@ create_fake_atc <- function(n) {
sample(n, replace = TRUE)
}

#' Sample random dates from an interval
#'
#' Dates are drawn with replacement from the daily sequence running from
#' `from` to `to`.
#'
#' @param n The number of dates to generate.
#' @param from A date determining the first date in the interval to sample
#'   from. Defaults to "1977-01-01".
#' @param to A date determining the last date in the interval to sample from.
#'   Defaults to today's date.
#'
#' @return A vector of dates.
#'
#' @examples
#' create_fake_date(20)
#' create_fake_date(20, "1995-04-19", "2024-04-19")
create_fake_date <- function(n, from = "1977-01-01", to = lubridate::today()) {
  first_day <- as_date(from)
  last_day <- as_date(to)
  candidate_days <- seq(first_day, last_day, by = "day")
  sample(candidate_days, n, replace = TRUE)
}

#' Create a vector of random zero-padded integers
#'
#' Each value is built by drawing `length` random digits between 0 and 9 and
#' pasting them together, so values can start with zeros.
#'
#' @param n The number of integers to generate.
#' @param length An integer determining the length of the padded integer.
#'
#' @return A character vector of integers.
#'
#' @examples
#' create_padded_integer(5, 10)
create_padded_integer <- function(n, length) {
  # `1:length` counts down (1, 0) when `length` is zero, which would draw two
  # digits instead of none, so that case is handled explicitly.
  if (length < 1) {
    return(rep("", n))
  }
  seq_len(length) |>
    purrr::map(\(position) sample(0:9, n, replace = TRUE)) |>
    purrr::reduce(\(digits, next_digit) paste0(digits, next_digit)) |>
    pad_integers(width = length)
}

#' Create a vector of random NPU codes
#'
#' @description
#' Nomenclature for Properties and Units (NPU) codes identify laboratory
#' results. Each generated code is the prefix "NPU" followed by five random
#' digits.
#'
#' @param n The number of NPUs to create.
#'
#' @return A character vector.
#'
#' @examples
#' create_fake_npu(4)
create_fake_npu <- function(n) {
  npu_numbers <- create_padded_integer(n, 5)
  stringr::str_c("NPU", npu_numbers)
}

#' Create a vector of random department specialties
#'
#' @param n The number of department specialties to create.
#'
#' @return A character vector.
#'
#' @examples
#' create_fake_hovedspeciale_ans(1000)
create_fake_hovedspeciale_ans <- function(n) {
"https://www.dst.dk/da/Statistik/dokumentation/Times/forebyggelsesregistret/spec" |>
read_html() |>
Expand All @@ -97,6 +192,15 @@ create_fake_hovedspeciale_ans <- function(n) {
sample(n, replace = TRUE)
}

#' Create a vector of drug names based on a vector of ATC codes
#'
#' @param atc A character describing an ATC code.
#'
#' @return A character vector with the drug name of the given ATC code.
#'
#' @examples
#' create_fake_drug_name("A03FA05")
#' create_fake_drug_name(c("A03FA05", "A02BA04"))
create_fake_drug_name <- function(atc) {
codeCollection::ATCKoodit |>
tibble::as_tibble() |>
Expand All @@ -105,20 +209,56 @@ create_fake_drug_name <- function(atc) {
sample(length(atc), replace = TRUE)
}

#' Transform date(s) to the format wwyy
#'
#' The week is the two-digit ISO week and the year is the last two digits of
#' the ISO year that the week belongs to.
#'
#' @param x A date or a vector of dates.
#'
#' @return A character vector of dates in the format wwyy.
#'
#' @examples
#' to_wwyy("2020-12-01")
#' to_wwyy(c("2020-01-12", "1995-04-19"))
to_wwyy <- function(x) {
  dates <- lubridate::as_date(x)
  # Pad single-digit weeks so that every value has four characters; without
  # this, week 2 of 2020 would be "220" instead of "0220".
  week <- stringr::str_pad(
    lubridate::isoweek(dates),
    width = 2,
    side = "left",
    pad = "0"
  )
  year <- stringr::str_sub(lubridate::isoyear(dates), -2)
  paste0(week, year)
}

#' Transform date(s) to the format yyyymmdd
#'
#' @param x A date or a vector of dates.
#'
#' @return A character vector of dates in the format yyyymmdd, for example
#'   "20201201".
#'
#' @examples
#' to_yyyymmdd("2020-12-01")
#' to_yyyymmdd(c("2020-01-12", "1995-04-19"))
to_yyyymmdd <- function(x) {
  x |>
    lubridate::as_date() |>
    format(format = "%Y%m%d")
}

# Insert extra values to overrepresent certain values ------------------------------------------------------

#' Generate logical values based on a probability
#'
#' Draws `n` uniform random numbers and compares them with `proportion`.
#' With the default `n = 1`, a single draw is made, so using the result on a
#' whole column applies the same decision to every row. Pass the number of
#' rows as `n` to decide separately for each row.
#'
#' @param proportion A double between 0 and 1.
#' @param n The number of random numbers to draw. Defaults to 1.
#'
#' @return A logical vector. TRUE where the random number is less than the
#'   proportion, otherwise FALSE.
#'
#' @examples
#' insertion_rate(0.3)
#' insertion_rate(0.3, n = 5)
insertion_rate <- function(proportion, n = 1) {
  stats::runif(n) < proportion
}

#' Insert specific ATC codes based on a proportion
#'
#' @param data A tibble.
#' @param proportion Proportion to be resampled. Defaults to 0.3.
#'
#' @return A tibble with a proportion of resampled ATC codes for columns
#' named 'atc'
#'
#' @examples
insert_specific_atc <- function(data, proportion = 0.3) {
glucose_lowering_drugs <- c(
metformin = "A10AB02",
Expand Down Expand Up @@ -149,42 +289,67 @@ insert_specific_atc <- function(data, proportion = 0.3) {
)
}

#' Insert cases where metformin is used for other purposes than diabetes
#'
#' @description
#' This function uses the variable 'indo', which is the code for the
#' underlying condition treated by the prescribed medication. Rows whose
#' 'indo' is one of the selected indication codes are candidates for being
#' turned into a metformin purchase.
#'
#' @param data A tibble.
#' @param proportion Proportion to resample. Defaults to 0.05.
#'
#' @return A tibble. If `data` has the columns 'atc', 'name', and 'indo',
#'   about `proportion` of the candidate rows get the ATC code and name of
#'   metformin. Otherwise `data` is returned unchanged.
insert_false_metformin <- function(data, proportion = 0.05) {
  required_columns <- c("atc", "name", "indo")
  if (!all(required_columns %in% colnames(data))) {
    return(data)
  }

  other_indications <- c("0000092", "0000276", "0000781")
  # Draw once per row and reuse the result for both columns. A single shared
  # draw would change either every candidate row or none, and separate draws
  # for 'atc' and 'name' could leave the two columns disagreeing.
  is_replaced <- data$indo %in% other_indications &
    stats::runif(nrow(data)) < proportion

  data$atc[is_replaced] <- "A10BA02"
  data$name[is_replaced] <- "metformin"
  data
}

#' Insert false positives for Wegovy and Saxenda
#'
#' @param data A tibble.
#' @param proportion Proportion to resample. Defaults to 0.05.
#'
#' @return A tibble. If `data` has the columns 'atc' and 'name', about
#'   `proportion` of the rows with the ATC code A10BJ06 get the name
#'   "Wegovy Flextouch" and about `proportion` of the rows with the ATC code
#'   A10BJ02 get the name "Saxenda". Otherwise `data` is returned unchanged.
insert_false_drug_names <- function(data, proportion = 0.05) {
  if (!all(c("atc", "name") %in% colnames(data))) {
    return(data)
  }

  # Draw once per row. A single shared draw would rename either every
  # matching row or none of them.
  is_replaced <- stats::runif(nrow(data)) < proportion

  # `%in%` treats a missing ATC code as "no match", so such rows keep
  # their name.
  data$name[is_replaced & data$atc %in% "A10BJ06"] <- "Wegovy Flextouch"
  data$name[is_replaced & data$atc %in% "A10BJ02"] <- "Saxenda"
  data
}

#' Insert additional analysis codes for HbA1c
#'
#' @param data A tibble.
#' @param proportion Proportion to resample. Defaults to 0.3.
#'
#' @return A tibble. If a column is named "analysiscode", a proportion of the
#' values are replaced by codes for HbA1c.
insert_analysiscode <- function(data, proportion = 0.3) {
# NPU27300: New units for HbA1c
# NPU03835: Old units for HbA1c
Expand All @@ -201,11 +366,32 @@ insert_analysiscode <- function(data, proportion = 0.3) {
)
}

#' Add drug names (from ATC codes)
#'
#' Every column whose name matches "^name$" is overwritten with the output of
#' `create_fake_drug_name()`, called with the `atc` column of `data`.
#'
#' @param data A tibble. It needs an `atc` column whenever it has a column
#'   matching "^name$".
#'
#' @return A tibble where columns matching "^name$" hold fake drug names.
add_fake_drug_name <- function(data) {
  mutate(
    data,
    across(
      matches("^name$"),
      # `atc` is not an argument of this function. It refers to the `atc`
      # column of `data`, which is why the lambda is written inline here.
      # The current values of the matched column are ignored.
      \(current_name) create_fake_drug_name(atc = atc)
    )
  )
}

# TODO: Need a function to reuse recnum and dw_ek_kontakt in LPR data

# Simulate data -----------------------------------------------------------

# use the simulation definition data to simulate some data
#' Simulate data based on simulation definitions
#'
#' @param data A tibble with simulation definitions.
#' @param n Number of observations to simulate.
#'
#' @return A tibble with simulated data.
simulate_data <- function(data, n) {
# N needs to be capitalized for fabricatr, but to be consistent
# with other functions and their use of `n`, I kept it lowercase for
Expand All @@ -222,16 +408,6 @@ simulate_data <- function(data, n) {
list_cbind()
}

# Overwrite every column matching "^name$" with the output of
# create_fake_drug_name(). This has the same body as the documented
# add_fake_drug_name() definition earlier in this file.
add_fake_drug_name <- function(data) {
data |>
mutate(
across(
matches("^name$"),
# `x` (the current column values) is ignored; `atc` is expected to resolve
# to the `atc` column of `data`.
\(x) create_fake_drug_name(atc = atc)
)
)
}
set.seed(123)
simulation_definitions <- here("data-raw/simulation-definitions.csv") |>
read_csv(show_col_types = FALSE) |>
Expand Down

0 comments on commit 09810da

Please sign in to comment.