From 652aaca2232b6b366b227efff142ac96bc6d5cb1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Signe=20Kirk=20Br=C3=B8db=C3=A6k?= Date: Wed, 6 Dec 2023 20:17:00 +0100 Subject: [PATCH 01/18] feat: add functions script with create_test_lab_df() --- R/functions.R | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 R/functions.R diff --git a/R/functions.R b/R/functions.R new file mode 100644 index 0000000..a16be6d --- /dev/null +++ b/R/functions.R @@ -0,0 +1,25 @@ +#' Create synthetic lab data +#' +#' @param num_samples Number of samples to create (1 row per individual) +#' +#' @return Data.frame with columns pnr, SAMPLINGDATE, ANALYSISCODE, and VALUE +create_test_lab_df <- function(num_samples) { + data.frame( + # patient ID + pnr = sprintf("%03d", seq_len(num_samples)), + # date of sample + SAMPLINGDATE = sample( + seq(as.Date("1995-01-01"), as.Date("2015-12-31"), by = "day"), + num_samples, replace = TRUE), + # npu code of analysis type (50% is either NPU27300 or NPU03835) + ANALYSISCODE = ifelse( + # repeat 1 and 2 num_samples times and randomise them + sample(rep(c(1, 0), length.out = num_samples)), + # sample 'NPU27300' and 'NPU03835' for all 1's + sample(c('NPU27300', 'NPU03835'), num_samples, replace = TRUE), + # sample NPU + random digit between 10000:99999 for all 0's + paste0('NPU', sample(10000:99999, num_samples, replace = TRUE))), + # numerical result of test + VALUE = runif(num_samples, 0.1, 99.9) + ) +} From e6baf5be7bc791acb857f24551b8e1bbdafbdcc7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Signe=20Kirk=20Br=C3=B8db=C3=A6k?= Date: Wed, 6 Dec 2023 20:19:54 +0100 Subject: [PATCH 02/18] feat: create test lab_df using create_test_lab_df() --- R/create_test_data.R | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/R/create_test_data.R b/R/create_test_data.R index e3cd8bc..252e6c9 100644 --- a/R/create_test_data.R +++ b/R/create_test_data.R @@ -3,8 +3,11 @@ # Load required libraries library(stringr) 
library(data.table) +library(tidyverse) +library(here) - +# Load functions +source(here::here("R/functions.R")) # MEDICATION DATA --------------------------------------------------------- @@ -182,4 +185,5 @@ med_df[pnr %in% c(sprintf("%03d", 195:200)), `:=` (ATC = "A10BJ06", # Laboratory data --------------------------------------------------------- - +# create test lab df with 100 rows (one row per individual) +lab_df <- create_test_lab_df(num_samples = 100) From d930fc36d8381bff4bac2b755a76dce4716b3f0e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Signe=20Kirk=20Br=C3=B8db=C3=A6k?= Date: Fri, 8 Dec 2023 11:33:35 +0100 Subject: [PATCH 03/18] fix: change pnr to only include 001-100 independent of num_samples --- R/functions.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/functions.R b/R/functions.R index a16be6d..e19d8ba 100644 --- a/R/functions.R +++ b/R/functions.R @@ -1,12 +1,12 @@ #' Create synthetic lab data #' -#' @param num_samples Number of samples to create (1 row per individual) +#' @param num_samples Number of samples to create #' #' @return Data.frame with columns pnr, SAMPLINGDATE, ANALYSISCODE, and VALUE create_test_lab_df <- function(num_samples) { data.frame( # patient ID - pnr = sprintf("%03d", seq_len(num_samples)), + pnr = sprintf("%03d", sample(1:100, num_samples, replace = TRUE)), # date of sample SAMPLINGDATE = sample( seq(as.Date("1995-01-01"), as.Date("2015-12-31"), by = "day"), From 88f0b40c349125ff23579c048491d6a2303716e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Signe=20Kirk=20Br=C3=B8db=C3=A6k?= Date: Fri, 8 Dec 2023 13:11:05 +0100 Subject: [PATCH 04/18] style: edit pnr comment to clarify it's only 001-100 even if num_samples > 100 --- R/functions.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/functions.R b/R/functions.R index e19d8ba..f924781 100644 --- a/R/functions.R +++ b/R/functions.R @@ -5,7 +5,7 @@ #' @return Data.frame with columns pnr, SAMPLINGDATE, ANALYSISCODE, and VALUE 
create_test_lab_df <- function(num_samples) { data.frame( - # patient ID + # patient ID (will only include 001-100 even if num_samples > 100) pnr = sprintf("%03d", sample(1:100, num_samples, replace = TRUE)), # date of sample SAMPLINGDATE = sample( From 3c04bd831873e76e9444f51034364393c77fe4f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Signe=20Kirk=20Br=C3=B8db=C3=A6k?= Date: Fri, 8 Dec 2023 13:54:06 +0100 Subject: [PATCH 05/18] style: update comments in create_test_lab_df --- R/functions.R | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/R/functions.R b/R/functions.R index f924781..7806934 100644 --- a/R/functions.R +++ b/R/functions.R @@ -5,21 +5,32 @@ #' @return Data.frame with columns pnr, SAMPLINGDATE, ANALYSISCODE, and VALUE create_test_lab_df <- function(num_samples) { data.frame( - # patient ID (will only include 001-100 even if num_samples > 100) + # pnr: patient ID (chr) + # random ID's from 001-100 (even if num_samples > 100) pnr = sprintf("%03d", sample(1:100, num_samples, replace = TRUE)), - # date of sample + + # SAMPLINGDATE: date of sample (date) + # random dates between 1995-01-01 and 2015-12-31 SAMPLINGDATE = sample( seq(as.Date("1995-01-01"), as.Date("2015-12-31"), by = "day"), - num_samples, replace = TRUE), - # npu code of analysis type (50% is either NPU27300 or NPU03835) + num_samples, + replace = TRUE + ), + + # ANALYSISCODE: npu code of analysis type (chr) + # 50% is either NPU27300 or NPU03835 + # other 50% is 'NPU'+random sample from 10000:99999 ANALYSISCODE = ifelse( - # repeat 1 and 2 num_samples times and randomise them - sample(rep(c(1, 0), length.out = num_samples)), + # repeat 0 and 1 num_samples times and randomise them + sample(rep(c(0, 1), length.out = num_samples)), # sample 'NPU27300' and 'NPU03835' for all 1's - sample(c('NPU27300', 'NPU03835'), num_samples, replace = TRUE), + sample(c("NPU27300", "NPU03835"), num_samples, replace = TRUE), # sample NPU + random digit between 
10000:99999 for all 0's - paste0('NPU', sample(10000:99999, num_samples, replace = TRUE))), - # numerical result of test + paste0("NPU", sample(10000:99999, num_samples, replace = TRUE)) + ), + + # VALUE: numerical result of test (num) + # random decimal number between 0.1-99.9 VALUE = runif(num_samples, 0.1, 99.9) ) } From a6e07a8fc960c29f1b590539316b0891d32df4f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Signe=20Kirk=20Br=C3=B8db=C3=A6k?= Date: Fri, 8 Dec 2023 13:58:21 +0100 Subject: [PATCH 06/18] feat: add create_test_hi_df() --- R/functions.R | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/R/functions.R b/R/functions.R index 7806934..fb51f88 100644 --- a/R/functions.R +++ b/R/functions.R @@ -34,3 +34,42 @@ create_test_lab_df <- function(num_samples) { VALUE = runif(num_samples, 0.1, 99.9) ) } + +#' Create synthetic health insurance data +#' +#' @param num_samples Number of samples to create +#' +#' @return Data.frame with columns pnr, BARNMAK, SPECIALE, and HONUGE +create_test_hi_df <- function(num_samples) { + data.frame( + # pnr: patientID (chr) + # random values from 001-100 + pnr = sprintf("%03d", sample(1:100, num_samples, replace = TRUE)), + + # BARNMAK: service performed on patient' child or not (binary) + # 1 = child, 0 = not, 5% are 1's + BARNMAK = sample(c(0, 1), num_samples, replace = TRUE, + prob = c(0.95, 0.05)), + + # SPECIALE: service code (6-digit int) + # 50% random samples between 100000 and 600000 + # 50% random samples from 540000 to 549999 + SPECIALE = ifelse( + # repeat 0 and 1 num_samples times and randomise them + sample(rep(c(0, 1), length.out = num_samples)), + # sample 100000:600000 for all 1's + sample(100000:600000, num_samples, replace = TRUE), + # sample 540000:549999 for all 0's + sample(540000:549999, num_samples, replace = TRUE) + ), + + # HONUGE: year/week of the service being billed (4-digit chr) + # first and second digits are random numbers between 01-52 + # third and fourth digits 
are random numbers between 00-99 + HONUGE = sprintf( + "%02d%02d", + sample(1:52, num_samples, replace = TRUE), + sample(0:99, num_samples, replace = TRUE) + ) + ) +} From bbb2910da3113c07854af9fdab7094aea1f54efa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Signe=20Kirk=20Br=C3=B8db=C3=A6k?= Date: Fri, 8 Dec 2023 13:59:17 +0100 Subject: [PATCH 07/18] feat: create test health insurance df using create_test_hi_df() --- R/create_test_data.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/create_test_data.R b/R/create_test_data.R index 252e6c9..b3adff5 100644 --- a/R/create_test_data.R +++ b/R/create_test_data.R @@ -180,8 +180,8 @@ med_df[pnr %in% c(sprintf("%03d", 195:200)), `:=` (ATC = "A10BJ06", # Health Service data ----------------------------------------------------- - - +# create test health insurance df with 100 rows +health_insurance_df <- create_test_hi_df(num_samples = 100) # Laboratory data --------------------------------------------------------- From 17e40d26edbb9f74acb22bdf34ef6ba4f1f5e106 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Signe=20Kirk=20Br=C3=B8db=C3=A6k?= Date: Fri, 8 Dec 2023 13:59:51 +0100 Subject: [PATCH 08/18] style: remove old parenthesis from comment --- R/create_test_data.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/create_test_data.R b/R/create_test_data.R index b3adff5..804ec7c 100644 --- a/R/create_test_data.R +++ b/R/create_test_data.R @@ -185,5 +185,5 @@ health_insurance_df <- create_test_hi_df(num_samples = 100) # Laboratory data --------------------------------------------------------- -# create test lab df with 100 rows (one row per individual) +# create test lab df with 100 rows lab_df <- create_test_lab_df(num_samples = 100) From e09b902a7c24eea0d22c45d470fcb19b28e7b67f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Signe=20Kirk=20Br=C3=B8db=C3=A6k?= Date: Fri, 8 Dec 2023 14:01:05 +0100 Subject: [PATCH 09/18] fix: move set.seed up this way it's clear that we set it for all test datasets 
and not only medication data --- R/create_test_data.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/create_test_data.R b/R/create_test_data.R index 804ec7c..db7402b 100644 --- a/R/create_test_data.R +++ b/R/create_test_data.R @@ -9,15 +9,15 @@ library(here) # Load functions source(here::here("R/functions.R")) +# Set seed for reproducibility +set.seed(123) + # MEDICATION DATA --------------------------------------------------------- # Pseudo-lmdb: #### Non-diabetes data: -# Set seed for reproducibility -set.seed(123) - # Create a dataframe with 1000 rows from 200 individuals med_df <- data.frame( pnr = sprintf("%03d", 1:200), From ea8e089ffeab39b7250942367646077caa589894 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Signe=20Kirk=20Br=C3=B8db=C3=A6k?= Date: Sun, 10 Dec 2023 13:21:16 +0100 Subject: [PATCH 10/18] refactor: move functions to create test lab and hi data into create_test_data.R and delete empty functions.R --- R/create_test_data.R | 120 +++++++++++++++++++++++++++++++++++++------ R/functions.R | 75 --------------------------- 2 files changed, 105 insertions(+), 90 deletions(-) delete mode 100644 R/functions.R diff --git a/R/create_test_data.R b/R/create_test_data.R index db7402b..79f6cd4 100644 --- a/R/create_test_data.R +++ b/R/create_test_data.R @@ -6,9 +6,6 @@ library(data.table) library(tidyverse) library(here) -# Load functions -source(here::here("R/functions.R")) - # Set seed for reproducibility set.seed(123) @@ -23,7 +20,8 @@ med_df <- data.frame( pnr = sprintf("%03d", 1:200), # ID variable eksd = as.Date(sample( - seq(as.Date("1995-01-01"), as.Date("2014-12-31"), by = "day"), 1000, replace = TRUE + seq(as.Date("1995-01-01"), as.Date("2014-12-31"), by = "day"), 1000, + replace = TRUE )), # Date of purchase apk = sample(1:3, 1000, replace = TRUE), @@ -63,7 +61,8 @@ med_a10_df <- data.frame( pnr = sprintf("%03d", 1:50), # ID variable eksd = as.Date(sample( - seq(as.Date("1995-01-01"), as.Date("2014-12-31"), by = "day"), 1000, 
replace = TRUE + seq(as.Date("1995-01-01"), as.Date("2014-12-31"), by = "day"), 1000, + replace = TRUE )), # Date of purchase apk = sample(1:3, 1000, replace = TRUE), @@ -100,10 +99,11 @@ med_a10_df <- data.frame( # Hardcode half of purchases to be metformin, Liraglutide or semaglutide: -med_a10_df[sample(nrow(med_a10_df), nrow(med_a10_df) / 2),]$ATC <- +med_a10_df[sample(nrow(med_a10_df), nrow(med_a10_df) / 2), ]$ATC <- sample(c("A10BA02", "A10BJ02", "A10BJ06"), - nrow(med_a10_df) / 2, - replace = TRUE) + nrow(med_a10_df) / 2, + replace = TRUE + ) generateDrugName <- function(atc) { # You can implement your own logic to generate drug names based on ATC codes @@ -123,8 +123,10 @@ replaceDrugNames <- function(data) { } # Define replacement mappings for ATC codes - replacement_mappings <- list("A10BJ02" = "Saxenda", - "A10BJ06" = "Wegovy Flextouch") + replacement_mappings <- list( + "A10BJ02" = "Saxenda", + "A10BJ06" = "Wegovy Flextouch" + ) # Iterate through rows and make replacements for (i in 1:nrow(data)) { @@ -149,19 +151,23 @@ med_df <- rbind(med_df, med_a10_df) setDT(med_df) # Handcode a few false-positive cases with purchases of metformin: -med_df[pnr %in% c(sprintf("%03d", 180:190)), `:=` ( +med_df[pnr %in% c(sprintf("%03d", 180:190)), `:=`( indo = sample(c("0000092", "0000276", "0000781"), 55, replace = TRUE), ATC = "A10BA02", drugname = "Metformin" )] # Handcode a few false-positive cases with purchases of Saxenda: -med_df[pnr %in% c(sprintf("%03d", 190:195)), `:=` (ATC = "A10BJ02", - drugname = "Saxenda")] +med_df[pnr %in% c(sprintf("%03d", 190:195)), `:=`( + ATC = "A10BJ02", + drugname = "Saxenda" +)] # Handcode a few false-positive cases with purchases of Wegovy: -med_df[pnr %in% c(sprintf("%03d", 195:200)), `:=` (ATC = "A10BJ06", - drugname = "Wegovy Flextouch")] +med_df[pnr %in% c(sprintf("%03d", 195:200)), `:=`( + ATC = "A10BJ06", + drugname = "Wegovy Flextouch" +)] @@ -180,10 +186,94 @@ med_df[pnr %in% c(sprintf("%03d", 195:200)), `:=` (ATC = 
"A10BJ06", # Health Service data ----------------------------------------------------- +#' Create synthetic health insurance data +#' +#' @param num_samples Number of samples to create +#' +#' @return Data.frame with columns pnr, BARNMAK, SPECIALE, and HONUGE +#' +#' @examples +#' create_test_hi_df(num_samples = 100) +create_test_hi_df <- function(num_samples) { + data.frame( + # pnr: patientID (chr) + # random values from 001-100 + pnr = sprintf("%03d", sample(1:100, num_samples, replace = TRUE)), + + # BARNMAK: service performed on patient' child or not (binary) + # 1 = child, 0 = not, 5% are 1's + BARNMAK = sample(c(0, 1), num_samples, + replace = TRUE, + prob = c(0.95, 0.05) + ), + + # SPECIALE: service code (6-digit int) + # 50% random samples between 100000 and 600000 + # 50% random samples from 540000 to 549999 + SPECIALE = ifelse( + # repeat 0 and 1 num_samples times and randomise them + sample(rep(c(0, 1), length.out = num_samples)), + # sample 100000:600000 for all 1's + sample(100000:600000, num_samples, replace = TRUE), + # sample 540000:549999 for all 0's + sample(540000:549999, num_samples, replace = TRUE) + ), + + # HONUGE: year/week of the service being billed (4-digit chr) + # first and second digits are random numbers between 01-52 + # third and fourth digits are random numbers between 00-99 + HONUGE = sprintf( + "%02d%02d", + sample(1:52, num_samples, replace = TRUE), + sample(0:99, num_samples, replace = TRUE) + ) + ) +} + # create test health insurance df with 100 rows health_insurance_df <- create_test_hi_df(num_samples = 100) # Laboratory data --------------------------------------------------------- +#' Create synthetic lab data +#' +#' @param num_samples Number of samples to create +#' +#' @return Data.frame with columns pnr, SAMPLINGDATE, ANALYSISCODE, and VALUE +#' +#' @examples +#' create_test_lab_df(num_samples = 100) +create_test_lab_df <- function(num_samples) { + data.frame( + # pnr: patient ID (chr) + # random ID's from 001-100 
(even if num_samples > 100) + pnr = sprintf("%03d", sample(1:100, num_samples, replace = TRUE)), + + # SAMPLINGDATE: date of sample (date) + # random dates between 1995-01-01 and 2015-12-31 + SAMPLINGDATE = sample( + seq(as.Date("1995-01-01"), as.Date("2015-12-31"), by = "day"), + num_samples, + replace = TRUE + ), + + # ANALYSISCODE: npu code of analysis type (chr) + # 50% is either NPU27300 or NPU03835 + # other 50% is 'NPU'+random sample from 10000:99999 + ANALYSISCODE = ifelse( + # repeat 0 and 1 num_samples times and randomise them + sample(rep(c(0, 1), length.out = num_samples)), + # sample 'NPU27300' and 'NPU03835' for all 1's + sample(c("NPU27300", "NPU03835"), num_samples, replace = TRUE), + # sample NPU + random digit between 10000:99999 for all 0's + paste0("NPU", sample(10000:99999, num_samples, replace = TRUE)) + ), + + # VALUE: numerical result of test (num) + # random decimal number between 0.1-99.9 + VALUE = runif(num_samples, 0.1, 99.9) + ) +} + # create test lab df with 100 rows lab_df <- create_test_lab_df(num_samples = 100) diff --git a/R/functions.R b/R/functions.R deleted file mode 100644 index fb51f88..0000000 --- a/R/functions.R +++ /dev/null @@ -1,75 +0,0 @@ -#' Create synthetic lab data -#' -#' @param num_samples Number of samples to create -#' -#' @return Data.frame with columns pnr, SAMPLINGDATE, ANALYSISCODE, and VALUE -create_test_lab_df <- function(num_samples) { - data.frame( - # pnr: patient ID (chr) - # random ID's from 001-100 (even if num_samples > 100) - pnr = sprintf("%03d", sample(1:100, num_samples, replace = TRUE)), - - # SAMPLINGDATE: date of sample (date) - # random dates between 1995-01-01 and 2015-12-31 - SAMPLINGDATE = sample( - seq(as.Date("1995-01-01"), as.Date("2015-12-31"), by = "day"), - num_samples, - replace = TRUE - ), - - # ANALYSISCODE: npu code of analysis type (chr) - # 50% is either NPU27300 or NPU03835 - # other 50% is 'NPU'+random sample from 10000:99999 - ANALYSISCODE = ifelse( - # repeat 0 and 1 
num_samples times and randomise them - sample(rep(c(0, 1), length.out = num_samples)), - # sample 'NPU27300' and 'NPU03835' for all 1's - sample(c("NPU27300", "NPU03835"), num_samples, replace = TRUE), - # sample NPU + random digit between 10000:99999 for all 0's - paste0("NPU", sample(10000:99999, num_samples, replace = TRUE)) - ), - - # VALUE: numerical result of test (num) - # random decimal number between 0.1-99.9 - VALUE = runif(num_samples, 0.1, 99.9) - ) -} - -#' Create synthetic health insurance data -#' -#' @param num_samples Number of samples to create -#' -#' @return Data.frame with columns pnr, BARNMAK, SPECIALE, and HONUGE -create_test_hi_df <- function(num_samples) { - data.frame( - # pnr: patientID (chr) - # random values from 001-100 - pnr = sprintf("%03d", sample(1:100, num_samples, replace = TRUE)), - - # BARNMAK: service performed on patient' child or not (binary) - # 1 = child, 0 = not, 5% are 1's - BARNMAK = sample(c(0, 1), num_samples, replace = TRUE, - prob = c(0.95, 0.05)), - - # SPECIALE: service code (6-digit int) - # 50% random samples between 100000 and 600000 - # 50% random samples from 540000 to 549999 - SPECIALE = ifelse( - # repeat 0 and 1 num_samples times and randomise them - sample(rep(c(0, 1), length.out = num_samples)), - # sample 100000:600000 for all 1's - sample(100000:600000, num_samples, replace = TRUE), - # sample 540000:549999 for all 0's - sample(540000:549999, num_samples, replace = TRUE) - ), - - # HONUGE: year/week of the service being billed (4-digit chr) - # first and second digits are random numbers between 01-52 - # third and fourth digits are random numbers between 00-99 - HONUGE = sprintf( - "%02d%02d", - sample(1:52, num_samples, replace = TRUE), - sample(0:99, num_samples, replace = TRUE) - ) - ) -} From 2fc565591451d8476e6c5de60ed18d62c57b515c Mon Sep 17 00:00:00 2001 From: "Luke W. 
Johnston" Date: Tue, 12 Dec 2023 19:15:23 +0100 Subject: [PATCH 11/18] chore: add setup for making fake data using usethis::use_data_raw(). --- .Rbuildignore | 1 + data-raw/testdata.R | 3 +++ 2 files changed, 4 insertions(+) create mode 100644 data-raw/testdata.R diff --git a/.Rbuildignore b/.Rbuildignore index e13c405..732aa56 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -1,3 +1,4 @@ ^.*\.Rproj$ ^\.Rproj\.user$ ^LICENSE\.md$ +^data-raw$ diff --git a/data-raw/testdata.R b/data-raw/testdata.R new file mode 100644 index 0000000..aaea9d1 --- /dev/null +++ b/data-raw/testdata.R @@ -0,0 +1,3 @@ +## code to prepare `testdata` dataset goes here + +usethis::use_data(testdata, overwrite = TRUE) From 17dfea6a1a447eaf6e702c894c03acf80025d6bb Mon Sep 17 00:00:00 2001 From: "Luke W. Johnston" Date: Tue, 12 Dec 2023 19:17:04 +0100 Subject: [PATCH 12/18] chore: Moved code over into data-raw folder --- R/create_test_data.R | 279 ------------------------------------------ data-raw/testdata.R | 282 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 280 insertions(+), 281 deletions(-) delete mode 100644 R/create_test_data.R diff --git a/R/create_test_data.R b/R/create_test_data.R deleted file mode 100644 index 79f6cd4..0000000 --- a/R/create_test_data.R +++ /dev/null @@ -1,279 +0,0 @@ -# Script to generate synthetic data for tests - -# Load required libraries -library(stringr) -library(data.table) -library(tidyverse) -library(here) - -# Set seed for reproducibility -set.seed(123) - -# MEDICATION DATA --------------------------------------------------------- - -# Pseudo-lmdb: - -#### Non-diabetes data: - -# Create a dataframe with 1000 rows from 200 individuals -med_df <- data.frame( - pnr = sprintf("%03d", 1:200), - # ID variable - eksd = as.Date(sample( - seq(as.Date("1995-01-01"), as.Date("2014-12-31"), by = "day"), 1000, - replace = TRUE - )), - # Date of purchase - apk = sample(1:3, 1000, replace = TRUE), - # Number of packages - indo = ifelse(runif(1000) <= 0.05, 
"", sprintf( - "%07d", sample(1:9999999, 50, replace = TRUE) - )), - # Indication for treatment - ATC = paste( - sample(LETTERS, 1000, replace = TRUE), - sample(0:9, 1000, replace = TRUE), - sample(0:9, 1000, replace = TRUE), - sample(LETTERS, 1000, replace = TRUE), - sample(LETTERS, 1000, replace = TRUE), - sample(0:9, 1000, replace = TRUE), - sample(0:9, 1000, replace = TRUE), - sep = "" - ), - # ATC code - volume = sample(20:100, 1000, replace = TRUE) # Volume -) - -# Create a function to generate drug names based on ATC codes (replace this with your own drug name generation logic) -generateDrugName <- function(atc) { - # You can implement your own logic to generate drug names based on ATC codes - # Here, we are using a placeholder logic that simply returns the atc code. - return(atc) -} - -# Apply the function to create drug names -med_df$drugname <- sapply(med_df$ATC, generateDrugName) - -#### Diabetes data: - -# Create a dataframe with 1000 rows from 50 individuals -med_a10_df <- data.frame( - pnr = sprintf("%03d", 1:50), - # ID variable - eksd = as.Date(sample( - seq(as.Date("1995-01-01"), as.Date("2014-12-31"), by = "day"), 1000, - replace = TRUE - )), - # Date of purchase - apk = sample(1:3, 1000, replace = TRUE), - # Number of packages - indo = ifelse(runif(1000) <= 0.05, "", sprintf( - "%07d", sample(1:9999999, 50, replace = TRUE) - )), - # Indication for treatment - ATC = paste( - rep( - c( - "A10AB", - "A10AC", - "A10AD", - "A10AE", - "A10BA", - "A10BB", - "A10BD", - "A10BG", - "A10BH", - "A10BJ", - "A10BK", - "A10BX" - ), - 80 - ), - sample(0:9, 1000, replace = TRUE), - sample(0:9, 1000, replace = TRUE), - sep = "" - ), - # ATC code - volume = sample(20:100, 1000, replace = TRUE) # Volume -) - -# Hardcode half of purchases to be metformin, Liraglutide or semaglutide: - -med_a10_df[sample(nrow(med_a10_df), nrow(med_a10_df) / 2), ]$ATC <- - sample(c("A10BA02", "A10BJ02", "A10BJ06"), - nrow(med_a10_df) / 2, - replace = TRUE - ) - -generateDrugName <- 
function(atc) { - # You can implement your own logic to generate drug names based on ATC codes - # Here, we are using a placeholder logic that simply returns the atc code. - return(atc) -} - -# Apply the function to create drug names -med_a10_df$drugname <- sapply(med_a10_df$ATC, generateDrugName) - - -replaceDrugNames <- function(data) { - # Check if the data frame contains the necessary columns - if (!all(c("ATC", "drugname") %in% colnames(data))) { - cat("Required columns not found in the data frame.\n") - return(NULL) - } - - # Define replacement mappings for ATC codes - replacement_mappings <- list( - "A10BJ02" = "Saxenda", - "A10BJ06" = "Wegovy Flextouch" - ) - - # Iterate through rows and make replacements - for (i in 1:nrow(data)) { - atc_code <- data$ATC[i] - if (atc_code %in% names(replacement_mappings)) { - # Check if the ATC code is in the mappings - if (runif(1) < 0.5) { - # Replace with the corresponding drug name with 50% probability - data$drugname[i] <- replacement_mappings[[atc_code]] - } - } - } - - return(data) -} - -# Apply the function to create drug names -med_a10_df <- replaceDrugNames(med_a10_df) - -med_df <- rbind(med_df, med_a10_df) - -setDT(med_df) - -# Handcode a few false-positive cases with purchases of metformin: -med_df[pnr %in% c(sprintf("%03d", 180:190)), `:=`( - indo = sample(c("0000092", "0000276", "0000781"), 55, replace = TRUE), - ATC = "A10BA02", - drugname = "Metformin" -)] - -# Handcode a few false-positive cases with purchases of Saxenda: -med_df[pnr %in% c(sprintf("%03d", 190:195)), `:=`( - ATC = "A10BJ02", - drugname = "Saxenda" -)] - -# Handcode a few false-positive cases with purchases of Wegovy: -med_df[pnr %in% c(sprintf("%03d", 195:200)), `:=`( - ATC = "A10BJ06", - drugname = "Wegovy Flextouch" -)] - - - -# Hospital diagnoses ------------------------------------------------------ - - - -# lpr_adm ----------------------------------------------------------------- - - - - -# lpr_diag 
---------------------------------------------------------------- - - - -# Health Service data ----------------------------------------------------- - -#' Create synthetic health insurance data -#' -#' @param num_samples Number of samples to create -#' -#' @return Data.frame with columns pnr, BARNMAK, SPECIALE, and HONUGE -#' -#' @examples -#' create_test_hi_df(num_samples = 100) -create_test_hi_df <- function(num_samples) { - data.frame( - # pnr: patientID (chr) - # random values from 001-100 - pnr = sprintf("%03d", sample(1:100, num_samples, replace = TRUE)), - - # BARNMAK: service performed on patient' child or not (binary) - # 1 = child, 0 = not, 5% are 1's - BARNMAK = sample(c(0, 1), num_samples, - replace = TRUE, - prob = c(0.95, 0.05) - ), - - # SPECIALE: service code (6-digit int) - # 50% random samples between 100000 and 600000 - # 50% random samples from 540000 to 549999 - SPECIALE = ifelse( - # repeat 0 and 1 num_samples times and randomise them - sample(rep(c(0, 1), length.out = num_samples)), - # sample 100000:600000 for all 1's - sample(100000:600000, num_samples, replace = TRUE), - # sample 540000:549999 for all 0's - sample(540000:549999, num_samples, replace = TRUE) - ), - - # HONUGE: year/week of the service being billed (4-digit chr) - # first and second digits are random numbers between 01-52 - # third and fourth digits are random numbers between 00-99 - HONUGE = sprintf( - "%02d%02d", - sample(1:52, num_samples, replace = TRUE), - sample(0:99, num_samples, replace = TRUE) - ) - ) -} - -# create test health insurance df with 100 rows -health_insurance_df <- create_test_hi_df(num_samples = 100) - -# Laboratory data --------------------------------------------------------- - -#' Create synthetic lab data -#' -#' @param num_samples Number of samples to create -#' -#' @return Data.frame with columns pnr, SAMPLINGDATE, ANALYSISCODE, and VALUE -#' -#' @examples -#' create_test_lab_df(num_samples = 100) -create_test_lab_df <- function(num_samples) { - 
data.frame( - # pnr: patient ID (chr) - # random ID's from 001-100 (even if num_samples > 100) - pnr = sprintf("%03d", sample(1:100, num_samples, replace = TRUE)), - - # SAMPLINGDATE: date of sample (date) - # random dates between 1995-01-01 and 2015-12-31 - SAMPLINGDATE = sample( - seq(as.Date("1995-01-01"), as.Date("2015-12-31"), by = "day"), - num_samples, - replace = TRUE - ), - - # ANALYSISCODE: npu code of analysis type (chr) - # 50% is either NPU27300 or NPU03835 - # other 50% is 'NPU'+random sample from 10000:99999 - ANALYSISCODE = ifelse( - # repeat 0 and 1 num_samples times and randomise them - sample(rep(c(0, 1), length.out = num_samples)), - # sample 'NPU27300' and 'NPU03835' for all 1's - sample(c("NPU27300", "NPU03835"), num_samples, replace = TRUE), - # sample NPU + random digit between 10000:99999 for all 0's - paste0("NPU", sample(10000:99999, num_samples, replace = TRUE)) - ), - - # VALUE: numerical result of test (num) - # random decimal number between 0.1-99.9 - VALUE = runif(num_samples, 0.1, 99.9) - ) -} - -# create test lab df with 100 rows -lab_df <- create_test_lab_df(num_samples = 100) diff --git a/data-raw/testdata.R b/data-raw/testdata.R index aaea9d1..855cde7 100644 --- a/data-raw/testdata.R +++ b/data-raw/testdata.R @@ -1,3 +1,281 @@ -## code to prepare `testdata` dataset goes here +# Script to generate synthetic data for tests -usethis::use_data(testdata, overwrite = TRUE) +# Load required libraries +library(stringr) +library(data.table) +library(tidyverse) +library(here) + +# Set seed for reproducibility +set.seed(123) + +# MEDICATION DATA --------------------------------------------------------- + +# Pseudo-lmdb: + +#### Non-diabetes data: + +# Create a dataframe with 1000 rows from 200 individuals +med_df <- data.frame( + pnr = sprintf("%03d", 1:200), + # ID variable + eksd = as.Date(sample( + seq(as.Date("1995-01-01"), as.Date("2014-12-31"), by = "day"), 1000, + replace = TRUE + )), + # Date of purchase + apk = sample(1:3, 1000, 
replace = TRUE), + # Number of packages + indo = ifelse(runif(1000) <= 0.05, "", sprintf( + "%07d", sample(1:9999999, 50, replace = TRUE) + )), + # Indication for treatment + ATC = paste( + sample(LETTERS, 1000, replace = TRUE), + sample(0:9, 1000, replace = TRUE), + sample(0:9, 1000, replace = TRUE), + sample(LETTERS, 1000, replace = TRUE), + sample(LETTERS, 1000, replace = TRUE), + sample(0:9, 1000, replace = TRUE), + sample(0:9, 1000, replace = TRUE), + sep = "" + ), + # ATC code + volume = sample(20:100, 1000, replace = TRUE) # Volume +) + +# Create a function to generate drug names based on ATC codes (replace this with your own drug name generation logic) +generateDrugName <- function(atc) { + # You can implement your own logic to generate drug names based on ATC codes + # Here, we are using a placeholder logic that simply returns the atc code. + return(atc) +} + +# Apply the function to create drug names +med_df$drugname <- sapply(med_df$ATC, generateDrugName) + +#### Diabetes data: + +# Create a dataframe with 1000 rows from 50 individuals +med_a10_df <- data.frame( + pnr = sprintf("%03d", 1:50), + # ID variable + eksd = as.Date(sample( + seq(as.Date("1995-01-01"), as.Date("2014-12-31"), by = "day"), 1000, + replace = TRUE + )), + # Date of purchase + apk = sample(1:3, 1000, replace = TRUE), + # Number of packages + indo = ifelse(runif(1000) <= 0.05, "", sprintf( + "%07d", sample(1:9999999, 50, replace = TRUE) + )), + # Indication for treatment + ATC = paste( + rep( + c( + "A10AB", + "A10AC", + "A10AD", + "A10AE", + "A10BA", + "A10BB", + "A10BD", + "A10BG", + "A10BH", + "A10BJ", + "A10BK", + "A10BX" + ), + 80 + ), + sample(0:9, 1000, replace = TRUE), + sample(0:9, 1000, replace = TRUE), + sep = "" + ), + # ATC code + volume = sample(20:100, 1000, replace = TRUE) # Volume +) + +# Hardcode half of purchases to be metformin, Liraglutide or semaglutide: + +med_a10_df[sample(nrow(med_a10_df), nrow(med_a10_df) / 2), ]$ATC <- + sample(c("A10BA02", "A10BJ02", 
"A10BJ06"), + nrow(med_a10_df) / 2, + replace = TRUE + ) + +generateDrugName <- function(atc) { + # You can implement your own logic to generate drug names based on ATC codes + # Here, we are using a placeholder logic that simply returns the atc code. + return(atc) +} + +# Apply the function to create drug names +med_a10_df$drugname <- sapply(med_a10_df$ATC, generateDrugName) + + +replaceDrugNames <- function(data) { + # Check if the data frame contains the necessary columns + if (!all(c("ATC", "drugname") %in% colnames(data))) { + cat("Required columns not found in the data frame.\n") + return(NULL) + } + + # Define replacement mappings for ATC codes + replacement_mappings <- list( + "A10BJ02" = "Saxenda", + "A10BJ06" = "Wegovy Flextouch" + ) + + # Iterate through rows and make replacements + for (i in 1:nrow(data)) { + atc_code <- data$ATC[i] + if (atc_code %in% names(replacement_mappings)) { + # Check if the ATC code is in the mappings + if (runif(1) < 0.5) { + # Replace with the corresponding drug name with 50% probability + data$drugname[i] <- replacement_mappings[[atc_code]] + } + } + } + + return(data) +} + +# Apply the function to create drug names +med_a10_df <- replaceDrugNames(med_a10_df) + +med_df <- rbind(med_df, med_a10_df) + +setDT(med_df) + +# Handcode a few false-positive cases with purchases of metformin: +med_df[pnr %in% c(sprintf("%03d", 180:190)), `:=`( + indo = sample(c("0000092", "0000276", "0000781"), 55, replace = TRUE), + ATC = "A10BA02", + drugname = "Metformin" +)] + +# Handcode a few false-positive cases with purchases of Saxenda: +med_df[pnr %in% c(sprintf("%03d", 190:195)), `:=`( + ATC = "A10BJ02", + drugname = "Saxenda" +)] + +# Handcode a few false-positive cases with purchases of Wegovy: +med_df[pnr %in% c(sprintf("%03d", 195:200)), `:=`( + ATC = "A10BJ06", + drugname = "Wegovy Flextouch" +)] + + + +# Hospital diagnoses ------------------------------------------------------ + + + +# lpr_adm 
----------------------------------------------------------------- + + + + +# lpr_diag ---------------------------------------------------------------- + + + +# Health Service data ----------------------------------------------------- + +#' Create synthetic health insurance data +#' +#' @param num_samples Number of samples to create +#' +#' @return Data.frame with columns pnr, BARNMAK, SPECIALE, and HONUGE +#' +#' @examples +#' create_test_hi_df(num_samples = 100) +create_test_hi_df <- function(num_samples) { + data.frame( + # pnr: patientID (chr) + # random values from 001-100 + pnr = sprintf("%03d", sample(1:100, num_samples, replace = TRUE)), + + # BARNMAK: service performed on patient' child or not (binary) + # 1 = child, 0 = not, 5% are 1's + BARNMAK = sample(c(0, 1), num_samples, + replace = TRUE, + prob = c(0.95, 0.05) + ), + + # SPECIALE: service code (6-digit int) + # 50% random samples between 100000 and 600000 + # 50% random samples from 540000 to 549999 + SPECIALE = ifelse( + # repeat 0 and 1 num_samples times and randomise them + sample(rep(c(0, 1), length.out = num_samples)), + # sample 100000:600000 for all 1's + sample(100000:600000, num_samples, replace = TRUE), + # sample 540000:549999 for all 0's + sample(540000:549999, num_samples, replace = TRUE) + ), + + # HONUGE: year/week of the service being billed (4-digit chr) + # first and second digits are random numbers between 01-52 + # third and fourth digits are random numbers between 00-99 + HONUGE = sprintf( + "%02d%02d", + sample(1:52, num_samples, replace = TRUE), + sample(0:99, num_samples, replace = TRUE) + ) + ) +} + +# create test health insurance df with 100 rows +health_insurance_df <- create_test_hi_df(num_samples = 100) + +# Laboratory data --------------------------------------------------------- + +#' Create synthetic lab data +#' +#' @param num_samples Number of samples to create +#' +#' @return Data.frame with columns pnr, SAMPLINGDATE, ANALYSISCODE, and VALUE +#' +#' @examples +#' 
create_test_lab_df(num_samples = 100) +create_test_lab_df <- function(num_samples) { + data.frame( + # pnr: patient ID (chr) + # random ID's from 001-100 (even if num_samples > 100) + pnr = sprintf("%03d", sample(1:100, num_samples, replace = TRUE)), + + # SAMPLINGDATE: date of sample (date) + # random dates between 1995-01-01 and 2015-12-31 + SAMPLINGDATE = sample( + seq(as.Date("1995-01-01"), as.Date("2015-12-31"), by = "day"), + num_samples, + replace = TRUE + ), + + # ANALYSISCODE: npu code of analysis type (chr) + # 50% is either NPU27300 or NPU03835 + # other 50% is 'NPU'+random sample from 10000:99999 + ANALYSISCODE = ifelse( + # repeat 0 and 1 num_samples times and randomise them + sample(rep(c(0, 1), length.out = num_samples)), + # sample 'NPU27300' and 'NPU03835' for all 1's + sample(c("NPU27300", "NPU03835"), num_samples, replace = TRUE), + # sample NPU + random digit between 10000:99999 for all 0's + paste0("NPU", sample(10000:99999, num_samples, replace = TRUE)) + ), + + # VALUE: numerical result of test (num) + # random decimal number between 0.1-99.9 + VALUE = runif(num_samples, 0.1, 99.9) + ) +} + +# create test lab df with 100 rows +fake_data <- create_test_lab_df(num_samples = 100) + +usethis::use_data(fake_data, overwrite = TRUE, internal = TRUE) From 4eb894714ef4a27bd61bf00940a5158e682d0984 Mon Sep 17 00:00:00 2001 From: "Luke W. Johnston" Date: Tue, 12 Dec 2023 20:00:16 +0100 Subject: [PATCH 13/18] refactor: Started refactoring but not sure what output should be. 
--- DESCRIPTION | 1 + data-raw/testdata.R | 102 ++++++++++++++++++++------------------------ 2 files changed, 47 insertions(+), 56 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 83edf41..8975652 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -22,4 +22,5 @@ Imports: here, lubridate Suggests: + codeCollection, DiagrammeR diff --git a/data-raw/testdata.R b/data-raw/testdata.R index 855cde7..69060b3 100644 --- a/data-raw/testdata.R +++ b/data-raw/testdata.R @@ -1,60 +1,68 @@ # Script to generate synthetic data for tests # Load required libraries -library(stringr) -library(data.table) library(tidyverse) +library(data.table) library(here) +library(lubridate) # Set seed for reproducibility set.seed(123) -# MEDICATION DATA --------------------------------------------------------- +# Set number of rows to create for fake data +number_rows <- 1000 + + +# Functions --------------------------------------------------------------- + +generate_fake_atc <- function(number_rows) { + codeCollection::ATCKoodit |> + tibble::as_tibble() |> + dplyr::filter(stringr::str_length(Koodi) == 7) |> + dplyr::pull(Koodi) |> + sample(number_rows, replace = TRUE) +} + +generate_fake_indication <- function(number_rows) { + sample(1:9e8, number_rows, replace = TRUE) |> + stringr::str_trunc(width = 7, ellipsis = "") |> + stringr::str_pad(width = 7, pad = "0") +} + +assign_drugname_from_atc <- function(data) { + codeCollection::ATCKoodit |> + tibble::as_tibble() |> + dplyr::select(ATC = Koodi, drugname = en) |> + dplyr::right_join(data, by = "ATC", relationship = "many-to-many") +} + +# Medication data --------------------------------------------------------- # Pseudo-lmdb: -#### Non-diabetes data: +## Non-diabetes data: -# Create a dataframe with 1000 rows from 200 individuals -med_df <- data.frame( - pnr = sprintf("%03d", 1:200), +# Create a tibble with 1000 rows from 200 individuals +med_df <- tibble( # ID variable - eksd = as.Date(sample( - seq(as.Date("1995-01-01"), 
as.Date("2014-12-31"), by = "day"), 1000, + pnr = sample(sprintf("%03d", 1:200), number_rows, replace = TRUE), + # Date of purchase + eksd = as_date(sample( + seq(as_date("1995-01-01"), as_date("2014-12-31"), by = "day"), 1000, replace = TRUE )), - # Date of purchase - apk = sample(1:3, 1000, replace = TRUE), # Number of packages - indo = ifelse(runif(1000) <= 0.05, "", sprintf( - "%07d", sample(1:9999999, 50, replace = TRUE) - )), + apk = sample(1:3, 1000, replace = TRUE), # Indication for treatment - ATC = paste( - sample(LETTERS, 1000, replace = TRUE), - sample(0:9, 1000, replace = TRUE), - sample(0:9, 1000, replace = TRUE), - sample(LETTERS, 1000, replace = TRUE), - sample(LETTERS, 1000, replace = TRUE), - sample(0:9, 1000, replace = TRUE), - sample(0:9, 1000, replace = TRUE), - sep = "" - ), + indo = generate_fake_indication(number_rows), # ATC code - volume = sample(20:100, 1000, replace = TRUE) # Volume -) + ATC = generate_fake_atc(number_rows), + # Volume + volume = sample(20:100, 1000, replace = TRUE) +) |> + assign_drugname_from_atc() -# Create a function to generate drug names based on ATC codes (replace this with your own drug name generation logic) -generateDrugName <- function(atc) { - # You can implement your own logic to generate drug names based on ATC codes - # Here, we are using a placeholder logic that simply returns the atc code. 
- return(atc) -} - -# Apply the function to create drug names -med_df$drugname <- sapply(med_df$ATC, generateDrugName) - -#### Diabetes data: +## Diabetes data: # Create a dataframe with 1000 rows from 50 individuals med_a10_df <- data.frame( @@ -67,9 +75,7 @@ med_a10_df <- data.frame( # Date of purchase apk = sample(1:3, 1000, replace = TRUE), # Number of packages - indo = ifelse(runif(1000) <= 0.05, "", sprintf( - "%07d", sample(1:9999999, 50, replace = TRUE) - )), + indo = generate_fake_indication(number_rows), # Indication for treatment ATC = paste( rep( @@ -105,16 +111,6 @@ med_a10_df[sample(nrow(med_a10_df), nrow(med_a10_df) / 2), ]$ATC <- replace = TRUE ) -generateDrugName <- function(atc) { - # You can implement your own logic to generate drug names based on ATC codes - # Here, we are using a placeholder logic that simply returns the atc code. - return(atc) -} - -# Apply the function to create drug names -med_a10_df$drugname <- sapply(med_a10_df$ATC, generateDrugName) - - replaceDrugNames <- function(data) { # Check if the data frame contains the necessary columns if (!all(c("ATC", "drugname") %in% colnames(data))) { @@ -169,12 +165,8 @@ med_df[pnr %in% c(sprintf("%03d", 195:200)), `:=`( drugname = "Wegovy Flextouch" )] - - # Hospital diagnoses ------------------------------------------------------ - - # lpr_adm ----------------------------------------------------------------- @@ -182,8 +174,6 @@ med_df[pnr %in% c(sprintf("%03d", 195:200)), `:=`( # lpr_diag ---------------------------------------------------------------- - - # Health Service data ----------------------------------------------------- #' Create synthetic health insurance data From d35dc94cdcec24515d7b95f7a8c74ec02fa7128a Mon Sep 17 00:00:00 2001 From: "Luke W. 
Johnston" Date: Tue, 12 Dec 2023 20:29:34 +0100 Subject: [PATCH 14/18] refactor: create function to make pnr, plus other small edits --- data-raw/testdata.R | 75 ++++++++++++++++++++++++++++----------------- 1 file changed, 47 insertions(+), 28 deletions(-) diff --git a/data-raw/testdata.R b/data-raw/testdata.R index 69060b3..fe6f60d 100644 --- a/data-raw/testdata.R +++ b/data-raw/testdata.R @@ -15,15 +15,42 @@ number_rows <- 1000 # Functions --------------------------------------------------------------- -generate_fake_atc <- function(number_rows) { +#' Create a vector of fake PNR (person ID number). +#' +#' These numbers range from 001 to 100. +#' +#' @param n The number (length) of items to output. +#' @param number_subjects Number of patients to create +#' +#' @return A vector. +#' +#' @examples +#' create_fake_pnr(10) +create_fake_pnr <- function(n, number_subjects = 200) { + 1:number_subjects |> + # Pad the string to match the number values, so that if 1000 `number_subjects` + # is given, the string length of the output is 4 characters wide (e.g. "0001"). + stringr::str_pad(width = stringr::str_length(number_subjects), pad = "0") |> + sample(number_rows, replace = TRUE) +} + +#' Create a vector of fake ATC Codes. +#' +#' @inheritParams create_fake_pnr +#' +#' @return A vector. 
+#' +#' @examples +#' create_fake_atc(10) +create_fake_atc <- function(n) { codeCollection::ATCKoodit |> tibble::as_tibble() |> dplyr::filter(stringr::str_length(Koodi) == 7) |> dplyr::pull(Koodi) |> - sample(number_rows, replace = TRUE) + sample(n, replace = TRUE) } -generate_fake_indication <- function(number_rows) { +create_fake_indication <- function(number_rows) { sample(1:9e8, number_rows, replace = TRUE) |> stringr::str_trunc(width = 7, ellipsis = "") |> stringr::str_pad(width = 7, pad = "0") @@ -45,7 +72,7 @@ assign_drugname_from_atc <- function(data) { # Create a tibble with 1000 rows from 200 individuals med_df <- tibble( # ID variable - pnr = sample(sprintf("%03d", 1:200), number_rows, replace = TRUE), + pnr = create_fake_pnr(number_rows), # Date of purchase eksd = as_date(sample( seq(as_date("1995-01-01"), as_date("2014-12-31"), by = "day"), 1000, @@ -54,9 +81,9 @@ med_df <- tibble( # Number of packages apk = sample(1:3, 1000, replace = TRUE), # Indication for treatment - indo = generate_fake_indication(number_rows), + indo = create_fake_indication(number_rows), # ATC code - ATC = generate_fake_atc(number_rows), + ATC = create_fake_atc(number_rows), # Volume volume = sample(20:100, 1000, replace = TRUE) ) |> @@ -66,7 +93,7 @@ med_df <- tibble( # Create a dataframe with 1000 rows from 50 individuals med_a10_df <- data.frame( - pnr = sprintf("%03d", 1:50), + pnr = create_fake_pnr(number_rows, number_subjects = 50), # ID variable eksd = as.Date(sample( seq(as.Date("1995-01-01"), as.Date("2014-12-31"), by = "day"), 1000, @@ -75,7 +102,7 @@ med_a10_df <- data.frame( # Date of purchase apk = sample(1:3, 1000, replace = TRUE), # Number of packages - indo = generate_fake_indication(number_rows), + indo = create_fake_indication(number_rows), # Indication for treatment ATC = paste( rep( @@ -180,15 +207,13 @@ med_df[pnr %in% c(sprintf("%03d", 195:200)), `:=`( #' #' @param num_samples Number of samples to create #' -#' @return Data.frame with columns pnr, BARNMAK, 
SPECIALE, and HONUGE +#' @return A [tibble::tibble()] with columns `pnr`, `BARNMAK`, `SPECIALE`, and `HONUGE`. #' #' @examples #' create_test_hi_df(num_samples = 100) create_test_hi_df <- function(num_samples) { - data.frame( - # pnr: patientID (chr) - # random values from 001-100 - pnr = sprintf("%03d", sample(1:100, num_samples, replace = TRUE)), + tibble::tibble( + pnr = create_fake_pnr(num_samples), # BARNMAK: service performed on patient' child or not (binary) # 1 = child, 0 = not, 5% are 1's @@ -229,20 +254,18 @@ health_insurance_df <- create_test_hi_df(num_samples = 100) #' #' @param num_samples Number of samples to create #' -#' @return Data.frame with columns pnr, SAMPLINGDATE, ANALYSISCODE, and VALUE +#' @return A [tibble::tibble()] with columns `pnr`, `SAMPLINGDATE`, `ANALYSISCODE`, and `VALUE`. #' #' @examples #' create_test_lab_df(num_samples = 100) create_test_lab_df <- function(num_samples) { - data.frame( - # pnr: patient ID (chr) - # random ID's from 001-100 (even if num_samples > 100) - pnr = sprintf("%03d", sample(1:100, num_samples, replace = TRUE)), + tibble::tibble( + pnr = create_fake_pnr(num_samples), # SAMPLINGDATE: date of sample (date) # random dates between 1995-01-01 and 2015-12-31 SAMPLINGDATE = sample( - seq(as.Date("1995-01-01"), as.Date("2015-12-31"), by = "day"), + seq(lubridate::as_date("1995-01-01"), lubridate::as_date("2015-12-31"), by = "day"), num_samples, replace = TRUE ), @@ -250,13 +273,9 @@ create_test_lab_df <- function(num_samples) { # ANALYSISCODE: npu code of analysis type (chr) # 50% is either NPU27300 or NPU03835 # other 50% is 'NPU'+random sample from 10000:99999 - ANALYSISCODE = ifelse( - # repeat 0 and 1 num_samples times and randomise them - sample(rep(c(0, 1), length.out = num_samples)), - # sample 'NPU27300' and 'NPU03835' for all 1's - sample(c("NPU27300", "NPU03835"), num_samples, replace = TRUE), - # sample NPU + random digit between 10000:99999 for all 0's - paste0("NPU", sample(10000:99999, num_samples, 
replace = TRUE)) + ANALYSISCODE = sample( + c(sample(c("NPU27300", "NPU03835"), num_samples / 2, replace = TRUE), + paste0("NPU", sample(10000:99999, num_samples / 2, replace = TRUE))) ), # VALUE: numerical result of test (num) @@ -266,6 +285,6 @@ create_test_lab_df <- function(num_samples) { } # create test lab df with 100 rows -fake_data <- create_test_lab_df(num_samples = 100) +test_lab_df <- create_test_lab_df(num_samples = 100) -usethis::use_data(fake_data, overwrite = TRUE, internal = TRUE) +usethis::use_data(test_lab_df, overwrite = TRUE, internal = TRUE) From f4e242e06f557a6c7ed3a6170285e8f3b9606700 Mon Sep 17 00:00:00 2001 From: Anders Aasted Isaksen <67263135+Aastedet@users.noreply.github.com> Date: Tue, 13 Feb 2024 12:02:34 +0100 Subject: [PATCH 15/18] Update testdata.R added assign_drugname_from_atc() to med_a10_df --- data-raw/testdata.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/data-raw/testdata.R b/data-raw/testdata.R index fe6f60d..2f17d65 100644 --- a/data-raw/testdata.R +++ b/data-raw/testdata.R @@ -128,7 +128,8 @@ med_a10_df <- data.frame( ), # ATC code volume = sample(20:100, 1000, replace = TRUE) # Volume -) +) |> + assign_drugname_from_atc() # Hardcode half of purchases to be metformin, Liraglutide or semaglutide: From cb9100fdc5f6f3a48e2de10231f8bdef4e17f3c4 Mon Sep 17 00:00:00 2001 From: Anders Aasted Isaksen <67263135+Aastedet@users.noreply.github.com> Date: Tue, 13 Feb 2024 12:05:06 +0100 Subject: [PATCH 16/18] Update testdata.R Fix to previous commit to assign drugnames to med_a10_df --- data-raw/testdata.R | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/data-raw/testdata.R b/data-raw/testdata.R index 2f17d65..c4ef4c6 100644 --- a/data-raw/testdata.R +++ b/data-raw/testdata.R @@ -128,8 +128,7 @@ med_a10_df <- data.frame( ), # ATC code volume = sample(20:100, 1000, replace = TRUE) # Volume -) |> - assign_drugname_from_atc() +) # Hardcode half of purchases to be metformin, Liraglutide or 
semaglutide: @@ -139,6 +138,10 @@ med_a10_df[sample(nrow(med_a10_df), nrow(med_a10_df) / 2), ]$ATC <- replace = TRUE ) +# Assign drugnames: +med_a10_df |> + assign_drugname_from_atc() + replaceDrugNames <- function(data) { # Check if the data frame contains the necessary columns if (!all(c("ATC", "drugname") %in% colnames(data))) { From f5395c60a39e438a884eeeac3cc1ca3e2134879b Mon Sep 17 00:00:00 2001 From: Anders Aasted Isaksen <67263135+Aastedet@users.noreply.github.com> Date: Tue, 13 Feb 2024 12:06:53 +0100 Subject: [PATCH 17/18] Update testdata.R forgot to actually assign drug names to med_a10_df --- data-raw/testdata.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data-raw/testdata.R b/data-raw/testdata.R index c4ef4c6..58d7933 100644 --- a/data-raw/testdata.R +++ b/data-raw/testdata.R @@ -171,7 +171,7 @@ replaceDrugNames <- function(data) { } # Apply the function to create drug names -med_a10_df <- replaceDrugNames(med_a10_df) +med_a10_df <- med_a10_df <- replaceDrugNames(med_a10_df) med_df <- rbind(med_df, med_a10_df) From 3222af8346bfd4c310d46cc3d916a3a5df4f9578 Mon Sep 17 00:00:00 2001 From: Anders Aasted Isaksen Date: Sat, 17 Feb 2024 23:30:32 +0100 Subject: [PATCH 18/18] testdata.R: - Added offset to pnr number generation to have more control when generating data for false-positive diabetes cases (for medication: 1-200: non-cases, 201-250: true cases). - Increased number of samples in health insurance/lab data and changed years covered by health insurance to match real world setting. 
--- data-raw/testdata.R | 36 ++++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/data-raw/testdata.R b/data-raw/testdata.R index 58d7933..351a7e7 100644 --- a/data-raw/testdata.R +++ b/data-raw/testdata.R @@ -26,8 +26,8 @@ number_rows <- 1000 #' #' @examples #' create_fake_pnr(10) -create_fake_pnr <- function(n, number_subjects = 200) { - 1:number_subjects |> +create_fake_pnr <- function(n, number_subjects = 200, offset = 0) { + (1 + offset):(number_subjects + offset) |> # Pad the string to match the number values, so that if 1000 `number_subjects` # is given, the string length of the output is 4 characters wide (e.g. "0001"). stringr::str_pad(width = stringr::str_length(number_subjects), pad = "0") |> @@ -67,7 +67,7 @@ assign_drugname_from_atc <- function(data) { # Pseudo-lmdb: -## Non-diabetes data: +## Non-diabetes data (ID 1-200): # Create a tibble with 1000 rows from 200 individuals med_df <- tibble( @@ -89,11 +89,11 @@ med_df <- tibble( ) |> assign_drugname_from_atc() -## Diabetes data: +## Diabetes data (ID 201-250): # Create a dataframe with 1000 rows from 50 individuals med_a10_df <- data.frame( - pnr = create_fake_pnr(number_rows, number_subjects = 50), + pnr = create_fake_pnr(number_rows, number_subjects = 50, offset = 200), # ID variable eksd = as.Date(sample( seq(as.Date("1995-01-01"), as.Date("2014-12-31"), by = "day"), 1000, @@ -130,7 +130,7 @@ med_a10_df <- data.frame( volume = sample(20:100, 1000, replace = TRUE) # Volume ) -# Hardcode half of purchases to be metformin, Liraglutide or semaglutide: +# Hardcode half of purchases to be metformin, liraglutid or semaglutid: med_a10_df[sample(nrow(med_a10_df), nrow(med_a10_df) / 2), ]$ATC <- sample(c("A10BA02", "A10BJ02", "A10BJ06"), @@ -139,7 +139,7 @@ med_a10_df[sample(nrow(med_a10_df), nrow(med_a10_df) / 2), ]$ATC <- ) # Assign drugnames: -med_a10_df |> +med_a10_df <- med_a10_df |> assign_drugname_from_atc() replaceDrugNames <- function(data) { @@ 
-171,15 +171,14 @@ replaceDrugNames <- function(data) { } # Apply the function to create drug names -med_a10_df <- med_a10_df <- replaceDrugNames(med_a10_df) - -med_df <- rbind(med_df, med_a10_df) +med_a10_df <- replaceDrugNames(med_a10_df) +setDT(med_a10_df) setDT(med_df) # Handcode a few false-positive cases with purchases of metformin: med_df[pnr %in% c(sprintf("%03d", 180:190)), `:=`( - indo = sample(c("0000092", "0000276", "0000781"), 55, replace = TRUE), + indo = sample(c("0000092", "0000276", "0000781"), nrow(med_df[pnr %in% c(sprintf("%03d", 180:190))]), replace = TRUE), ATC = "A10BA02", drugname = "Metformin" )] @@ -196,6 +195,11 @@ med_df[pnr %in% c(sprintf("%03d", 195:200)), `:=`( drugname = "Wegovy Flextouch" )] +# Combine the two: +med_df <- rbind(med_df, med_a10_df) + +setDT(med_df) + # Hospital diagnoses ------------------------------------------------------ # lpr_adm ----------------------------------------------------------------- @@ -240,17 +244,17 @@ create_test_hi_df <- function(num_samples) { # HONUGE: year/week of the service being billed (4-digit chr) # first and second digits are random numbers between 01-52 - # third and fourth digits are random numbers between 00-99 + # third and fourth digits are random numbers corresponding to 1990 onward HONUGE = sprintf( "%02d%02d", sample(1:52, num_samples, replace = TRUE), - sample(0:99, num_samples, replace = TRUE) + sample(c(90:99, 01:22), num_samples, replace = TRUE) ) ) } # create test health insurance df with 100 rows -health_insurance_df <- create_test_hi_df(num_samples = 100) +health_insurance_df <- create_test_hi_df(num_samples = 1000) # Laboratory data --------------------------------------------------------- @@ -289,6 +293,6 @@ create_test_lab_df <- function(num_samples) { } # create test lab df with 100 rows -test_lab_df <- create_test_lab_df(num_samples = 100) +test_lab_df <- create_test_lab_df(num_samples = 1000) -usethis::use_data(test_lab_df, overwrite = TRUE, internal = TRUE) +# 
usethis::use_data(test_lab_df, overwrite = TRUE, internal = TRUE)