From 652aaca2232b6b366b227efff142ac96bc6d5cb1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Signe=20Kirk=20Br=C3=B8db=C3=A6k?= <signekb@clin.au.dk>
Date: Wed, 6 Dec 2023 20:17:00 +0100
Subject: [PATCH 01/18] feat: add functions script with create_test_lab_df()

---
 R/functions.R | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)
 create mode 100644 R/functions.R

diff --git a/R/functions.R b/R/functions.R
new file mode 100644
index 0000000..a16be6d
--- /dev/null
+++ b/R/functions.R
@@ -0,0 +1,25 @@
+#' Create synthetic lab data
+#'
+#' @param num_samples Number of samples to create (1 row per individual)
+#'
+#' @return Data.frame with columns pnr, SAMPLINGDATE, ANALYSISCODE, and VALUE
+create_test_lab_df <- function(num_samples) {
+  data.frame(
+    # patient ID
+    pnr = sprintf("%03d", seq_len(num_samples)),
+    # date of sample
+    SAMPLINGDATE = sample(
+      seq(as.Date("1995-01-01"), as.Date("2015-12-31"), by = "day"),
+      num_samples, replace = TRUE),
+    # npu code of analysis type (50% is either NPU27300 or NPU03835)
+    ANALYSISCODE = ifelse(
+      # repeat 1 and 2 num_samples times and randomise them
+      sample(rep(c(1, 0), length.out = num_samples)),
+      # sample 'NPU27300' and 'NPU03835' for all 1's
+      sample(c('NPU27300', 'NPU03835'), num_samples, replace = TRUE),
+      # sample NPU + random digit between 10000:99999 for all 0's
+      paste0('NPU', sample(10000:99999, num_samples, replace = TRUE))),
+    # numerical result of test
+    VALUE = runif(num_samples, 0.1, 99.9)
+  )
+}

From e6baf5be7bc791acb857f24551b8e1bbdafbdcc7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Signe=20Kirk=20Br=C3=B8db=C3=A6k?= <signekb@clin.au.dk>
Date: Wed, 6 Dec 2023 20:19:54 +0100
Subject: [PATCH 02/18] feat: create test lab_df using create_test_lab_df()

---
 R/create_test_data.R | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/R/create_test_data.R b/R/create_test_data.R
index e3cd8bc..252e6c9 100644
--- a/R/create_test_data.R
+++ b/R/create_test_data.R
@@ -3,8 +3,11 @@
 # Load required libraries
 library(stringr)
 library(data.table)
+library(tidyverse)
+library(here)
 
-
+# Load functions
+source(here::here("R/functions.R"))
 
 # MEDICATION DATA ---------------------------------------------------------
 
@@ -182,4 +185,5 @@ med_df[pnr %in% c(sprintf("%03d", 195:200)), `:=` (ATC = "A10BJ06",
 
 # Laboratory data ---------------------------------------------------------
 
-
+# create test lab df with 100 rows (one row per individual)
+lab_df <- create_test_lab_df(num_samples = 100)

From d930fc36d8381bff4bac2b755a76dce4716b3f0e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Signe=20Kirk=20Br=C3=B8db=C3=A6k?= <signekb@clin.au.dk>
Date: Fri, 8 Dec 2023 11:33:35 +0100
Subject: [PATCH 03/18] fix: change pnr to only include 001-100 independent of
 num_samples

---
 R/functions.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/functions.R b/R/functions.R
index a16be6d..e19d8ba 100644
--- a/R/functions.R
+++ b/R/functions.R
@@ -1,12 +1,12 @@
 #' Create synthetic lab data
 #'
-#' @param num_samples Number of samples to create (1 row per individual)
+#' @param num_samples Number of samples to create
 #'
 #' @return Data.frame with columns pnr, SAMPLINGDATE, ANALYSISCODE, and VALUE
 create_test_lab_df <- function(num_samples) {
   data.frame(
     # patient ID
-    pnr = sprintf("%03d", seq_len(num_samples)),
+    pnr = sprintf("%03d", sample(1:100, num_samples, replace = TRUE)),
     # date of sample
     SAMPLINGDATE = sample(
       seq(as.Date("1995-01-01"), as.Date("2015-12-31"), by = "day"),

From 88f0b40c349125ff23579c048491d6a2303716e7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Signe=20Kirk=20Br=C3=B8db=C3=A6k?= <signekb@clin.au.dk>
Date: Fri, 8 Dec 2023 13:11:05 +0100
Subject: [PATCH 04/18] style: edit pnr comment to clarify it's only 001-100
 even if num_samples > 100

---
 R/functions.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/functions.R b/R/functions.R
index e19d8ba..f924781 100644
--- a/R/functions.R
+++ b/R/functions.R
@@ -5,7 +5,7 @@
 #' @return Data.frame with columns pnr, SAMPLINGDATE, ANALYSISCODE, and VALUE
 create_test_lab_df <- function(num_samples) {
   data.frame(
-    # patient ID
+    # patient ID (will only include 001-100 even if num_samples > 100)
     pnr = sprintf("%03d", sample(1:100, num_samples, replace = TRUE)),
     # date of sample
     SAMPLINGDATE = sample(

From 3c04bd831873e76e9444f51034364393c77fe4f3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Signe=20Kirk=20Br=C3=B8db=C3=A6k?= <signekb@clin.au.dk>
Date: Fri, 8 Dec 2023 13:54:06 +0100
Subject: [PATCH 05/18] style: update comments in create_test_lab_df

---
 R/functions.R | 29 ++++++++++++++++++++---------
 1 file changed, 20 insertions(+), 9 deletions(-)

diff --git a/R/functions.R b/R/functions.R
index f924781..7806934 100644
--- a/R/functions.R
+++ b/R/functions.R
@@ -5,21 +5,32 @@
 #' @return Data.frame with columns pnr, SAMPLINGDATE, ANALYSISCODE, and VALUE
 create_test_lab_df <- function(num_samples) {
   data.frame(
-    # patient ID (will only include 001-100 even if num_samples > 100)
+    # pnr: patient ID (chr)
+    # random ID's from 001-100 (even if num_samples > 100)
     pnr = sprintf("%03d", sample(1:100, num_samples, replace = TRUE)),
-    # date of sample
+
+    # SAMPLINGDATE: date of sample (date)
+    # random dates between 1995-01-01 and 2015-12-31
     SAMPLINGDATE = sample(
       seq(as.Date("1995-01-01"), as.Date("2015-12-31"), by = "day"),
-      num_samples, replace = TRUE),
-    # npu code of analysis type (50% is either NPU27300 or NPU03835)
+      num_samples,
+      replace = TRUE
+    ),
+
+    # ANALYSISCODE: npu code of analysis type (chr)
+    # 50% is either NPU27300 or NPU03835
+    # other 50% is 'NPU'+random sample from 10000:99999
     ANALYSISCODE = ifelse(
-      # repeat 1 and 2 num_samples times and randomise them
-      sample(rep(c(1, 0), length.out = num_samples)),
+      # repeat 0 and 1 num_samples times and randomise them
+      sample(rep(c(0, 1), length.out = num_samples)),
       # sample 'NPU27300' and 'NPU03835' for all 1's
-      sample(c('NPU27300', 'NPU03835'), num_samples, replace = TRUE),
+      sample(c("NPU27300", "NPU03835"), num_samples, replace = TRUE),
       # sample NPU + random digit between 10000:99999 for all 0's
-      paste0('NPU', sample(10000:99999, num_samples, replace = TRUE))),
-    # numerical result of test
+      paste0("NPU", sample(10000:99999, num_samples, replace = TRUE))
+    ),
+
+    # VALUE: numerical result of test (num)
+    # random decimal number between 0.1-99.9
     VALUE = runif(num_samples, 0.1, 99.9)
   )
 }

From a6e07a8fc960c29f1b590539316b0891d32df4f3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Signe=20Kirk=20Br=C3=B8db=C3=A6k?= <signekb@clin.au.dk>
Date: Fri, 8 Dec 2023 13:58:21 +0100
Subject: [PATCH 06/18] feat: add create_test_hi_df()

---
 R/functions.R | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/R/functions.R b/R/functions.R
index 7806934..fb51f88 100644
--- a/R/functions.R
+++ b/R/functions.R
@@ -34,3 +34,42 @@ create_test_lab_df <- function(num_samples) {
     VALUE = runif(num_samples, 0.1, 99.9)
   )
 }
+
+#' Create synthetic health insurance data
+#'
+#' @param num_samples Number of samples to create
+#'
+#' @return Data.frame with columns pnr, BARNMAK, SPECIALE, and HONUGE
+create_test_hi_df <- function(num_samples) {
+  data.frame(
+    # pnr: patientID (chr)
+    # random values from 001-100
+    pnr = sprintf("%03d", sample(1:100, num_samples, replace = TRUE)),
+
+    # BARNMAK: service performed on patient' child or not (binary)
+    # 1 = child, 0 = not, 5% are 1's
+    BARNMAK = sample(c(0, 1), num_samples, replace = TRUE,
+                     prob = c(0.95, 0.05)),
+
+    # SPECIALE: service code (6-digit int)
+    # 50% random samples between 100000 and 600000
+    # 50% random samples from 540000 to 549999
+    SPECIALE = ifelse(
+      # repeat 0 and 1 num_samples times and randomise them
+      sample(rep(c(0, 1), length.out = num_samples)),
+      # sample 100000:600000 for all 1's
+      sample(100000:600000, num_samples, replace = TRUE),
+      # sample 540000:549999 for all 0's
+      sample(540000:549999, num_samples, replace = TRUE)
+    ),
+
+    # HONUGE: year/week of the service being billed (4-digit chr)
+    # first and second digits are random numbers between 01-52
+    # third and fourth digits are random numbers between 00-99
+    HONUGE = sprintf(
+      "%02d%02d",
+      sample(1:52, num_samples, replace = TRUE),
+      sample(0:99, num_samples, replace = TRUE)
+    )
+  )
+}

From bbb2910da3113c07854af9fdab7094aea1f54efa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Signe=20Kirk=20Br=C3=B8db=C3=A6k?= <signekb@clin.au.dk>
Date: Fri, 8 Dec 2023 13:59:17 +0100
Subject: [PATCH 07/18] feat: create test health insurance df using
 create_test_hi_df()

---
 R/create_test_data.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/create_test_data.R b/R/create_test_data.R
index 252e6c9..b3adff5 100644
--- a/R/create_test_data.R
+++ b/R/create_test_data.R
@@ -180,8 +180,8 @@ med_df[pnr %in% c(sprintf("%03d", 195:200)), `:=` (ATC = "A10BJ06",
 
 # Health Service data -----------------------------------------------------
 
-
-
+# create test health insurance df with 100 rows
+health_insurance_df <- create_test_hi_df(num_samples = 100)
 
 # Laboratory data ---------------------------------------------------------
 

From 17e40d26edbb9f74acb22bdf34ef6ba4f1f5e106 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Signe=20Kirk=20Br=C3=B8db=C3=A6k?= <signekb@clin.au.dk>
Date: Fri, 8 Dec 2023 13:59:51 +0100
Subject: [PATCH 08/18] style: remove old parenthesis from comment

---
 R/create_test_data.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/create_test_data.R b/R/create_test_data.R
index b3adff5..804ec7c 100644
--- a/R/create_test_data.R
+++ b/R/create_test_data.R
@@ -185,5 +185,5 @@ health_insurance_df <- create_test_hi_df(num_samples = 100)
 
 # Laboratory data ---------------------------------------------------------
 
-# create test lab df with 100 rows (one row per individual)
+# create test lab df with 100 rows
 lab_df <- create_test_lab_df(num_samples = 100)

From e09b902a7c24eea0d22c45d470fcb19b28e7b67f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Signe=20Kirk=20Br=C3=B8db=C3=A6k?= <signekb@clin.au.dk>
Date: Fri, 8 Dec 2023 14:01:05 +0100
Subject: [PATCH 09/18] fix: move set.seed up

This way it's clear that the seed is set for all test datasets, not only the medication data.
---
 R/create_test_data.R | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/R/create_test_data.R b/R/create_test_data.R
index 804ec7c..db7402b 100644
--- a/R/create_test_data.R
+++ b/R/create_test_data.R
@@ -9,15 +9,15 @@ library(here)
 # Load functions
 source(here::here("R/functions.R"))
 
+# Set seed for reproducibility
+set.seed(123)
+
 # MEDICATION DATA ---------------------------------------------------------
 
 # Pseudo-lmdb:
 
 #### Non-diabetes data:
 
-# Set seed for reproducibility
-set.seed(123)
-
 # Create a dataframe with 1000 rows from 200 individuals
 med_df <- data.frame(
   pnr = sprintf("%03d", 1:200),

From ea8e089ffeab39b7250942367646077caa589894 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Signe=20Kirk=20Br=C3=B8db=C3=A6k?= <signekb@clin.au.dk>
Date: Sun, 10 Dec 2023 13:21:16 +0100
Subject: [PATCH 10/18] refactor: move functions to create test lab and hi data
 into create_test_data.R and delete empty functions.R

---
 R/create_test_data.R | 120 +++++++++++++++++++++++++++++++++++++------
 R/functions.R        |  75 ---------------------------
 2 files changed, 105 insertions(+), 90 deletions(-)
 delete mode 100644 R/functions.R

diff --git a/R/create_test_data.R b/R/create_test_data.R
index db7402b..79f6cd4 100644
--- a/R/create_test_data.R
+++ b/R/create_test_data.R
@@ -6,9 +6,6 @@ library(data.table)
 library(tidyverse)
 library(here)
 
-# Load functions
-source(here::here("R/functions.R"))
-
 # Set seed for reproducibility
 set.seed(123)
 
@@ -23,7 +20,8 @@ med_df <- data.frame(
   pnr = sprintf("%03d", 1:200),
   # ID variable
   eksd = as.Date(sample(
-    seq(as.Date("1995-01-01"), as.Date("2014-12-31"), by = "day"), 1000, replace = TRUE
+    seq(as.Date("1995-01-01"), as.Date("2014-12-31"), by = "day"), 1000,
+    replace = TRUE
   )),
   # Date of purchase
   apk = sample(1:3, 1000, replace = TRUE),
@@ -63,7 +61,8 @@ med_a10_df <- data.frame(
   pnr = sprintf("%03d", 1:50),
   # ID variable
   eksd = as.Date(sample(
-    seq(as.Date("1995-01-01"), as.Date("2014-12-31"), by = "day"), 1000, replace = TRUE
+    seq(as.Date("1995-01-01"), as.Date("2014-12-31"), by = "day"), 1000,
+    replace = TRUE
   )),
   # Date of purchase
   apk = sample(1:3, 1000, replace = TRUE),
@@ -100,10 +99,11 @@ med_a10_df <- data.frame(
 
 # Hardcode half of purchases to be metformin, Liraglutide or semaglutide:
 
-med_a10_df[sample(nrow(med_a10_df), nrow(med_a10_df) / 2),]$ATC <-
+med_a10_df[sample(nrow(med_a10_df), nrow(med_a10_df) / 2), ]$ATC <-
   sample(c("A10BA02", "A10BJ02", "A10BJ06"),
-         nrow(med_a10_df) / 2,
-         replace = TRUE)
+    nrow(med_a10_df) / 2,
+    replace = TRUE
+  )
 
 generateDrugName <- function(atc) {
   # You can implement your own logic to generate drug names based on ATC codes
@@ -123,8 +123,10 @@ replaceDrugNames <- function(data) {
   }
 
   # Define replacement mappings for ATC codes
-  replacement_mappings <- list("A10BJ02" = "Saxenda",
-                               "A10BJ06" = "Wegovy Flextouch")
+  replacement_mappings <- list(
+    "A10BJ02" = "Saxenda",
+    "A10BJ06" = "Wegovy Flextouch"
+  )
 
   # Iterate through rows and make replacements
   for (i in 1:nrow(data)) {
@@ -149,19 +151,23 @@ med_df <- rbind(med_df, med_a10_df)
 setDT(med_df)
 
 # Handcode a few false-positive cases with purchases of metformin:
-med_df[pnr %in% c(sprintf("%03d", 180:190)), `:=` (
+med_df[pnr %in% c(sprintf("%03d", 180:190)), `:=`(
   indo = sample(c("0000092", "0000276", "0000781"), 55, replace = TRUE),
   ATC = "A10BA02",
   drugname = "Metformin"
 )]
 
 # Handcode a few false-positive cases with purchases of Saxenda:
-med_df[pnr %in% c(sprintf("%03d", 190:195)), `:=` (ATC = "A10BJ02",
-                                                   drugname = "Saxenda")]
+med_df[pnr %in% c(sprintf("%03d", 190:195)), `:=`(
+  ATC = "A10BJ02",
+  drugname = "Saxenda"
+)]
 
 # Handcode a few false-positive cases with purchases of Wegovy:
-med_df[pnr %in% c(sprintf("%03d", 195:200)), `:=` (ATC = "A10BJ06",
-                                                   drugname = "Wegovy Flextouch")]
+med_df[pnr %in% c(sprintf("%03d", 195:200)), `:=`(
+  ATC = "A10BJ06",
+  drugname = "Wegovy Flextouch"
+)]
 
 
 
@@ -180,10 +186,94 @@ med_df[pnr %in% c(sprintf("%03d", 195:200)), `:=` (ATC = "A10BJ06",
 
 # Health Service data -----------------------------------------------------
 
+#' Create synthetic health insurance data
+#'
+#' @param num_samples Number of samples to create
+#'
+#' @return Data.frame with columns pnr, BARNMAK, SPECIALE, and HONUGE
+#'
+#' @examples
+#' create_test_hi_df(num_samples = 100)
+create_test_hi_df <- function(num_samples) {
+  data.frame(
+    # pnr: patientID (chr)
+    # random values from 001-100
+    pnr = sprintf("%03d", sample(1:100, num_samples, replace = TRUE)),
+
+    # BARNMAK: service performed on patient' child or not (binary)
+    # 1 = child, 0 = not, 5% are 1's
+    BARNMAK = sample(c(0, 1), num_samples,
+      replace = TRUE,
+      prob = c(0.95, 0.05)
+    ),
+
+    # SPECIALE: service code (6-digit int)
+    # 50% random samples between 100000 and 600000
+    # 50% random samples from 540000 to 549999
+    SPECIALE = ifelse(
+      # repeat 0 and 1 num_samples times and randomise them
+      sample(rep(c(0, 1), length.out = num_samples)),
+      # sample 100000:600000 for all 1's
+      sample(100000:600000, num_samples, replace = TRUE),
+      # sample 540000:549999 for all 0's
+      sample(540000:549999, num_samples, replace = TRUE)
+    ),
+
+    # HONUGE: year/week of the service being billed (4-digit chr)
+    # first and second digits are random numbers between 01-52
+    # third and fourth digits are random numbers between 00-99
+    HONUGE = sprintf(
+      "%02d%02d",
+      sample(1:52, num_samples, replace = TRUE),
+      sample(0:99, num_samples, replace = TRUE)
+    )
+  )
+}
+
 # create test health insurance df with 100 rows
 health_insurance_df <- create_test_hi_df(num_samples = 100)
 
 # Laboratory data ---------------------------------------------------------
 
+#' Create synthetic lab data
+#'
+#' @param num_samples Number of samples to create
+#'
+#' @return Data.frame with columns pnr, SAMPLINGDATE, ANALYSISCODE, and VALUE
+#'
+#' @examples
+#' create_test_lab_df(num_samples = 100)
+create_test_lab_df <- function(num_samples) {
+  data.frame(
+    # pnr: patient ID (chr)
+    # random ID's from 001-100 (even if num_samples > 100)
+    pnr = sprintf("%03d", sample(1:100, num_samples, replace = TRUE)),
+
+    # SAMPLINGDATE: date of sample (date)
+    # random dates between 1995-01-01 and 2015-12-31
+    SAMPLINGDATE = sample(
+      seq(as.Date("1995-01-01"), as.Date("2015-12-31"), by = "day"),
+      num_samples,
+      replace = TRUE
+    ),
+
+    # ANALYSISCODE: npu code of analysis type (chr)
+    # 50% is either NPU27300 or NPU03835
+    # other 50% is 'NPU'+random sample from 10000:99999
+    ANALYSISCODE = ifelse(
+      # repeat 0 and 1 num_samples times and randomise them
+      sample(rep(c(0, 1), length.out = num_samples)),
+      # sample 'NPU27300' and 'NPU03835' for all 1's
+      sample(c("NPU27300", "NPU03835"), num_samples, replace = TRUE),
+      # sample NPU + random digit between 10000:99999 for all 0's
+      paste0("NPU", sample(10000:99999, num_samples, replace = TRUE))
+    ),
+
+    # VALUE: numerical result of test (num)
+    # random decimal number between 0.1-99.9
+    VALUE = runif(num_samples, 0.1, 99.9)
+  )
+}
+
 # create test lab df with 100 rows
 lab_df <- create_test_lab_df(num_samples = 100)
diff --git a/R/functions.R b/R/functions.R
deleted file mode 100644
index fb51f88..0000000
--- a/R/functions.R
+++ /dev/null
@@ -1,75 +0,0 @@
-#' Create synthetic lab data
-#'
-#' @param num_samples Number of samples to create
-#'
-#' @return Data.frame with columns pnr, SAMPLINGDATE, ANALYSISCODE, and VALUE
-create_test_lab_df <- function(num_samples) {
-  data.frame(
-    # pnr: patient ID (chr)
-    # random ID's from 001-100 (even if num_samples > 100)
-    pnr = sprintf("%03d", sample(1:100, num_samples, replace = TRUE)),
-
-    # SAMPLINGDATE: date of sample (date)
-    # random dates between 1995-01-01 and 2015-12-31
-    SAMPLINGDATE = sample(
-      seq(as.Date("1995-01-01"), as.Date("2015-12-31"), by = "day"),
-      num_samples,
-      replace = TRUE
-    ),
-
-    # ANALYSISCODE: npu code of analysis type (chr)
-    # 50% is either NPU27300 or NPU03835
-    # other 50% is 'NPU'+random sample from 10000:99999
-    ANALYSISCODE = ifelse(
-      # repeat 0 and 1 num_samples times and randomise them
-      sample(rep(c(0, 1), length.out = num_samples)),
-      # sample 'NPU27300' and 'NPU03835' for all 1's
-      sample(c("NPU27300", "NPU03835"), num_samples, replace = TRUE),
-      # sample NPU + random digit between 10000:99999 for all 0's
-      paste0("NPU", sample(10000:99999, num_samples, replace = TRUE))
-    ),
-
-    # VALUE: numerical result of test (num)
-    # random decimal number between 0.1-99.9
-    VALUE = runif(num_samples, 0.1, 99.9)
-  )
-}
-
-#' Create synthetic health insurance data
-#'
-#' @param num_samples Number of samples to create
-#'
-#' @return Data.frame with columns pnr, BARNMAK, SPECIALE, and HONUGE
-create_test_hi_df <- function(num_samples) {
-  data.frame(
-    # pnr: patientID (chr)
-    # random values from 001-100
-    pnr = sprintf("%03d", sample(1:100, num_samples, replace = TRUE)),
-
-    # BARNMAK: service performed on patient' child or not (binary)
-    # 1 = child, 0 = not, 5% are 1's
-    BARNMAK = sample(c(0, 1), num_samples, replace = TRUE,
-                     prob = c(0.95, 0.05)),
-
-    # SPECIALE: service code (6-digit int)
-    # 50% random samples between 100000 and 600000
-    # 50% random samples from 540000 to 549999
-    SPECIALE = ifelse(
-      # repeat 0 and 1 num_samples times and randomise them
-      sample(rep(c(0, 1), length.out = num_samples)),
-      # sample 100000:600000 for all 1's
-      sample(100000:600000, num_samples, replace = TRUE),
-      # sample 540000:549999 for all 0's
-      sample(540000:549999, num_samples, replace = TRUE)
-    ),
-
-    # HONUGE: year/week of the service being billed (4-digit chr)
-    # first and second digits are random numbers between 01-52
-    # third and fourth digits are random numbers between 00-99
-    HONUGE = sprintf(
-      "%02d%02d",
-      sample(1:52, num_samples, replace = TRUE),
-      sample(0:99, num_samples, replace = TRUE)
-    )
-  )
-}

From 2fc565591451d8476e6c5de60ed18d62c57b515c Mon Sep 17 00:00:00 2001
From: "Luke W. Johnston" <lwjohnst@gmail.com>
Date: Tue, 12 Dec 2023 19:15:23 +0100
Subject: [PATCH 11/18] chore: add setup for making fake data using
 usethis::use_data_raw().

---
 .Rbuildignore       | 1 +
 data-raw/testdata.R | 3 +++
 2 files changed, 4 insertions(+)
 create mode 100644 data-raw/testdata.R

diff --git a/.Rbuildignore b/.Rbuildignore
index e13c405..732aa56 100644
--- a/.Rbuildignore
+++ b/.Rbuildignore
@@ -1,3 +1,4 @@
 ^.*\.Rproj$
 ^\.Rproj\.user$
 ^LICENSE\.md$
+^data-raw$
diff --git a/data-raw/testdata.R b/data-raw/testdata.R
new file mode 100644
index 0000000..aaea9d1
--- /dev/null
+++ b/data-raw/testdata.R
@@ -0,0 +1,3 @@
+## code to prepare `testdata` dataset goes here
+
+usethis::use_data(testdata, overwrite = TRUE)

From 17dfea6a1a447eaf6e702c894c03acf80025d6bb Mon Sep 17 00:00:00 2001
From: "Luke W. Johnston" <lwjohnst@gmail.com>
Date: Tue, 12 Dec 2023 19:17:04 +0100
Subject: [PATCH 12/18] chore: Moved code over into data-raw folder

---
 R/create_test_data.R | 279 ------------------------------------------
 data-raw/testdata.R  | 282 ++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 280 insertions(+), 281 deletions(-)
 delete mode 100644 R/create_test_data.R

diff --git a/R/create_test_data.R b/R/create_test_data.R
deleted file mode 100644
index 79f6cd4..0000000
--- a/R/create_test_data.R
+++ /dev/null
@@ -1,279 +0,0 @@
-# Script to generate synthetic data for tests
-
-# Load required libraries
-library(stringr)
-library(data.table)
-library(tidyverse)
-library(here)
-
-# Set seed for reproducibility
-set.seed(123)
-
-# MEDICATION DATA ---------------------------------------------------------
-
-# Pseudo-lmdb:
-
-#### Non-diabetes data:
-
-# Create a dataframe with 1000 rows from 200 individuals
-med_df <- data.frame(
-  pnr = sprintf("%03d", 1:200),
-  # ID variable
-  eksd = as.Date(sample(
-    seq(as.Date("1995-01-01"), as.Date("2014-12-31"), by = "day"), 1000,
-    replace = TRUE
-  )),
-  # Date of purchase
-  apk = sample(1:3, 1000, replace = TRUE),
-  # Number of packages
-  indo = ifelse(runif(1000) <= 0.05, "", sprintf(
-    "%07d", sample(1:9999999, 50, replace = TRUE)
-  )),
-  # Indication for treatment
-  ATC = paste(
-    sample(LETTERS, 1000, replace = TRUE),
-    sample(0:9, 1000, replace = TRUE),
-    sample(0:9, 1000, replace = TRUE),
-    sample(LETTERS, 1000, replace = TRUE),
-    sample(LETTERS, 1000, replace = TRUE),
-    sample(0:9, 1000, replace = TRUE),
-    sample(0:9, 1000, replace = TRUE),
-    sep = ""
-  ),
-  # ATC code
-  volume = sample(20:100, 1000, replace = TRUE) # Volume
-)
-
-# Create a function to generate drug names based on ATC codes (replace this with your own drug name generation logic)
-generateDrugName <- function(atc) {
-  # You can implement your own logic to generate drug names based on ATC codes
-  # Here, we are using a placeholder logic that simply returns the atc code.
-  return(atc)
-}
-
-# Apply the function to create drug names
-med_df$drugname <- sapply(med_df$ATC, generateDrugName)
-
-#### Diabetes data:
-
-# Create a dataframe with 1000 rows from 50 individuals
-med_a10_df <- data.frame(
-  pnr = sprintf("%03d", 1:50),
-  # ID variable
-  eksd = as.Date(sample(
-    seq(as.Date("1995-01-01"), as.Date("2014-12-31"), by = "day"), 1000,
-    replace = TRUE
-  )),
-  # Date of purchase
-  apk = sample(1:3, 1000, replace = TRUE),
-  # Number of packages
-  indo = ifelse(runif(1000) <= 0.05, "", sprintf(
-    "%07d", sample(1:9999999, 50, replace = TRUE)
-  )),
-  # Indication for treatment
-  ATC = paste(
-    rep(
-      c(
-        "A10AB",
-        "A10AC",
-        "A10AD",
-        "A10AE",
-        "A10BA",
-        "A10BB",
-        "A10BD",
-        "A10BG",
-        "A10BH",
-        "A10BJ",
-        "A10BK",
-        "A10BX"
-      ),
-      80
-    ),
-    sample(0:9, 1000, replace = TRUE),
-    sample(0:9, 1000, replace = TRUE),
-    sep = ""
-  ),
-  # ATC code
-  volume = sample(20:100, 1000, replace = TRUE) # Volume
-)
-
-# Hardcode half of purchases to be metformin, Liraglutide or semaglutide:
-
-med_a10_df[sample(nrow(med_a10_df), nrow(med_a10_df) / 2), ]$ATC <-
-  sample(c("A10BA02", "A10BJ02", "A10BJ06"),
-    nrow(med_a10_df) / 2,
-    replace = TRUE
-  )
-
-generateDrugName <- function(atc) {
-  # You can implement your own logic to generate drug names based on ATC codes
-  # Here, we are using a placeholder logic that simply returns the atc code.
-  return(atc)
-}
-
-# Apply the function to create drug names
-med_a10_df$drugname <- sapply(med_a10_df$ATC, generateDrugName)
-
-
-replaceDrugNames <- function(data) {
-  # Check if the data frame contains the necessary columns
-  if (!all(c("ATC", "drugname") %in% colnames(data))) {
-    cat("Required columns not found in the data frame.\n")
-    return(NULL)
-  }
-
-  # Define replacement mappings for ATC codes
-  replacement_mappings <- list(
-    "A10BJ02" = "Saxenda",
-    "A10BJ06" = "Wegovy Flextouch"
-  )
-
-  # Iterate through rows and make replacements
-  for (i in 1:nrow(data)) {
-    atc_code <- data$ATC[i]
-    if (atc_code %in% names(replacement_mappings)) {
-      # Check if the ATC code is in the mappings
-      if (runif(1) < 0.5) {
-        # Replace with the corresponding drug name with 50% probability
-        data$drugname[i] <- replacement_mappings[[atc_code]]
-      }
-    }
-  }
-
-  return(data)
-}
-
-# Apply the function to create drug names
-med_a10_df <- replaceDrugNames(med_a10_df)
-
-med_df <- rbind(med_df, med_a10_df)
-
-setDT(med_df)
-
-# Handcode a few false-positive cases with purchases of metformin:
-med_df[pnr %in% c(sprintf("%03d", 180:190)), `:=`(
-  indo = sample(c("0000092", "0000276", "0000781"), 55, replace = TRUE),
-  ATC = "A10BA02",
-  drugname = "Metformin"
-)]
-
-# Handcode a few false-positive cases with purchases of Saxenda:
-med_df[pnr %in% c(sprintf("%03d", 190:195)), `:=`(
-  ATC = "A10BJ02",
-  drugname = "Saxenda"
-)]
-
-# Handcode a few false-positive cases with purchases of Wegovy:
-med_df[pnr %in% c(sprintf("%03d", 195:200)), `:=`(
-  ATC = "A10BJ06",
-  drugname = "Wegovy Flextouch"
-)]
-
-
-
-# Hospital diagnoses ------------------------------------------------------
-
-
-
-# lpr_adm -----------------------------------------------------------------
-
-
-
-
-# lpr_diag ----------------------------------------------------------------
-
-
-
-# Health Service data -----------------------------------------------------
-
-#' Create synthetic health insurance data
-#'
-#' @param num_samples Number of samples to create
-#'
-#' @return Data.frame with columns pnr, BARNMAK, SPECIALE, and HONUGE
-#'
-#' @examples
-#' create_test_hi_df(num_samples = 100)
-create_test_hi_df <- function(num_samples) {
-  data.frame(
-    # pnr: patientID (chr)
-    # random values from 001-100
-    pnr = sprintf("%03d", sample(1:100, num_samples, replace = TRUE)),
-
-    # BARNMAK: service performed on patient' child or not (binary)
-    # 1 = child, 0 = not, 5% are 1's
-    BARNMAK = sample(c(0, 1), num_samples,
-      replace = TRUE,
-      prob = c(0.95, 0.05)
-    ),
-
-    # SPECIALE: service code (6-digit int)
-    # 50% random samples between 100000 and 600000
-    # 50% random samples from 540000 to 549999
-    SPECIALE = ifelse(
-      # repeat 0 and 1 num_samples times and randomise them
-      sample(rep(c(0, 1), length.out = num_samples)),
-      # sample 100000:600000 for all 1's
-      sample(100000:600000, num_samples, replace = TRUE),
-      # sample 540000:549999 for all 0's
-      sample(540000:549999, num_samples, replace = TRUE)
-    ),
-
-    # HONUGE: year/week of the service being billed (4-digit chr)
-    # first and second digits are random numbers between 01-52
-    # third and fourth digits are random numbers between 00-99
-    HONUGE = sprintf(
-      "%02d%02d",
-      sample(1:52, num_samples, replace = TRUE),
-      sample(0:99, num_samples, replace = TRUE)
-    )
-  )
-}
-
-# create test health insurance df with 100 rows
-health_insurance_df <- create_test_hi_df(num_samples = 100)
-
-# Laboratory data ---------------------------------------------------------
-
-#' Create synthetic lab data
-#'
-#' @param num_samples Number of samples to create
-#'
-#' @return Data.frame with columns pnr, SAMPLINGDATE, ANALYSISCODE, and VALUE
-#'
-#' @examples
-#' create_test_lab_df(num_samples = 100)
-create_test_lab_df <- function(num_samples) {
-  data.frame(
-    # pnr: patient ID (chr)
-    # random ID's from 001-100 (even if num_samples > 100)
-    pnr = sprintf("%03d", sample(1:100, num_samples, replace = TRUE)),
-
-    # SAMPLINGDATE: date of sample (date)
-    # random dates between 1995-01-01 and 2015-12-31
-    SAMPLINGDATE = sample(
-      seq(as.Date("1995-01-01"), as.Date("2015-12-31"), by = "day"),
-      num_samples,
-      replace = TRUE
-    ),
-
-    # ANALYSISCODE: npu code of analysis type (chr)
-    # 50% is either NPU27300 or NPU03835
-    # other 50% is 'NPU'+random sample from 10000:99999
-    ANALYSISCODE = ifelse(
-      # repeat 0 and 1 num_samples times and randomise them
-      sample(rep(c(0, 1), length.out = num_samples)),
-      # sample 'NPU27300' and 'NPU03835' for all 1's
-      sample(c("NPU27300", "NPU03835"), num_samples, replace = TRUE),
-      # sample NPU + random digit between 10000:99999 for all 0's
-      paste0("NPU", sample(10000:99999, num_samples, replace = TRUE))
-    ),
-
-    # VALUE: numerical result of test (num)
-    # random decimal number between 0.1-99.9
-    VALUE = runif(num_samples, 0.1, 99.9)
-  )
-}
-
-# create test lab df with 100 rows
-lab_df <- create_test_lab_df(num_samples = 100)
diff --git a/data-raw/testdata.R b/data-raw/testdata.R
index aaea9d1..855cde7 100644
--- a/data-raw/testdata.R
+++ b/data-raw/testdata.R
@@ -1,3 +1,281 @@
-## code to prepare `testdata` dataset goes here
+# Script to generate synthetic data for tests
 
-usethis::use_data(testdata, overwrite = TRUE)
+# Load required libraries
+library(stringr)
+library(data.table)
+library(tidyverse)
+library(here)
+
+# Set seed for reproducibility
+set.seed(123)
+
+# MEDICATION DATA ---------------------------------------------------------
+
+# Pseudo-lmdb:
+
+#### Non-diabetes data:
+
+# Create a dataframe with 1000 rows from 200 individuals
+med_df <- data.frame(
+  pnr = sprintf("%03d", 1:200),
+  # ID variable
+  eksd = as.Date(sample(
+    seq(as.Date("1995-01-01"), as.Date("2014-12-31"), by = "day"), 1000,
+    replace = TRUE
+  )),
+  # Date of purchase
+  apk = sample(1:3, 1000, replace = TRUE),
+  # Number of packages
+  indo = ifelse(runif(1000) <= 0.05, "", sprintf(
+    "%07d", sample(1:9999999, 50, replace = TRUE)
+  )),
+  # Indication for treatment
+  ATC = paste(
+    sample(LETTERS, 1000, replace = TRUE),
+    sample(0:9, 1000, replace = TRUE),
+    sample(0:9, 1000, replace = TRUE),
+    sample(LETTERS, 1000, replace = TRUE),
+    sample(LETTERS, 1000, replace = TRUE),
+    sample(0:9, 1000, replace = TRUE),
+    sample(0:9, 1000, replace = TRUE),
+    sep = ""
+  ),
+  # ATC code
+  volume = sample(20:100, 1000, replace = TRUE) # Volume
+)
+
+# Create a function to generate drug names based on ATC codes (replace this with your own drug name generation logic)
+generateDrugName <- function(atc) {
+  # You can implement your own logic to generate drug names based on ATC codes
+  # Here, we are using a placeholder logic that simply returns the atc code.
+  return(atc)
+}
+
+# Apply the function to create drug names
+med_df$drugname <- sapply(med_df$ATC, generateDrugName)
+
+#### Diabetes data:
+
+# Create a dataframe with 1000 rows from 50 individuals
+med_a10_df <- data.frame(
+  pnr = sprintf("%03d", 1:50),
+  # ID variable
+  eksd = as.Date(sample(
+    seq(as.Date("1995-01-01"), as.Date("2014-12-31"), by = "day"), 1000,
+    replace = TRUE
+  )),
+  # Date of purchase
+  apk = sample(1:3, 1000, replace = TRUE),
+  # Number of packages
+  indo = ifelse(runif(1000) <= 0.05, "", sprintf(
+    "%07d", sample(1:9999999, 50, replace = TRUE)
+  )),
+  # Indication for treatment
+  ATC = paste(
+    rep(
+      c(
+        "A10AB",
+        "A10AC",
+        "A10AD",
+        "A10AE",
+        "A10BA",
+        "A10BB",
+        "A10BD",
+        "A10BG",
+        "A10BH",
+        "A10BJ",
+        "A10BK",
+        "A10BX"
+      ),
+      80
+    ),
+    sample(0:9, 1000, replace = TRUE),
+    sample(0:9, 1000, replace = TRUE),
+    sep = ""
+  ),
+  # ATC code
+  volume = sample(20:100, 1000, replace = TRUE) # Volume
+)
+
+# Hardcode half of purchases to be metformin, Liraglutide or semaglutide:
+
+med_a10_df[sample(nrow(med_a10_df), nrow(med_a10_df) / 2), ]$ATC <-
+  sample(c("A10BA02", "A10BJ02", "A10BJ06"),
+    nrow(med_a10_df) / 2,
+    replace = TRUE
+  )
+
+generateDrugName <- function(atc) {
+  # You can implement your own logic to generate drug names based on ATC codes
+  # Here, we are using a placeholder logic that simply returns the atc code.
+  return(atc)
+}
+
+# Apply the function to create drug names
+med_a10_df$drugname <- sapply(med_a10_df$ATC, generateDrugName)
+
+
+replaceDrugNames <- function(data) {
+  # Check if the data frame contains the necessary columns
+  if (!all(c("ATC", "drugname") %in% colnames(data))) {
+    cat("Required columns not found in the data frame.\n")
+    return(NULL)
+  }
+
+  # Define replacement mappings for ATC codes
+  replacement_mappings <- list(
+    "A10BJ02" = "Saxenda",
+    "A10BJ06" = "Wegovy Flextouch"
+  )
+
+  # Iterate through rows and make replacements
+  for (i in 1:nrow(data)) {
+    atc_code <- data$ATC[i]
+    if (atc_code %in% names(replacement_mappings)) {
+      # Check if the ATC code is in the mappings
+      if (runif(1) < 0.5) {
+        # Replace with the corresponding drug name with 50% probability
+        data$drugname[i] <- replacement_mappings[[atc_code]]
+      }
+    }
+  }
+
+  return(data)
+}
+
+# Apply the function to create drug names
+med_a10_df <- replaceDrugNames(med_a10_df)
+
+med_df <- rbind(med_df, med_a10_df)
+
+setDT(med_df)
+
+# Handcode a few false-positive cases with purchases of metformin:
+med_df[pnr %in% c(sprintf("%03d", 180:190)), `:=`(
+  indo = sample(c("0000092", "0000276", "0000781"), 55, replace = TRUE),
+  ATC = "A10BA02",
+  drugname = "Metformin"
+)]
+
+# Handcode a few false-positive cases with purchases of Saxenda:
+med_df[pnr %in% c(sprintf("%03d", 190:195)), `:=`(
+  ATC = "A10BJ02",
+  drugname = "Saxenda"
+)]
+
+# Handcode a few false-positive cases with purchases of Wegovy:
+med_df[pnr %in% c(sprintf("%03d", 195:200)), `:=`(
+  ATC = "A10BJ06",
+  drugname = "Wegovy Flextouch"
+)]
+
+
+
+# Hospital diagnoses ------------------------------------------------------
+
+
+
+# lpr_adm -----------------------------------------------------------------
+
+
+
+
+# lpr_diag ----------------------------------------------------------------
+
+
+
+# Health Service data -----------------------------------------------------
+
+#' Create synthetic health insurance data
+#'
+#' @param num_samples Number of samples to create
+#'
+#' @return Data.frame with columns pnr, BARNMAK, SPECIALE, and HONUGE
+#'
+#' @examples
+#' create_test_hi_df(num_samples = 100)
+create_test_hi_df <- function(num_samples) {
+  data.frame(
+    # pnr: patientID (chr)
+    # random values from 001-100
+    pnr = sprintf("%03d", sample(1:100, num_samples, replace = TRUE)),
+
+    # BARNMAK: service performed on patient' child or not (binary)
+    # 1 = child, 0 = not, 5% are 1's
+    BARNMAK = sample(c(0, 1), num_samples,
+      replace = TRUE,
+      prob = c(0.95, 0.05)
+    ),
+
+    # SPECIALE: service code (6-digit int)
+    # 50% random samples between 100000 and 600000
+    # 50% random samples from 540000 to 549999
+    SPECIALE = ifelse(
+      # repeat 0 and 1 num_samples times and randomise them
+      sample(rep(c(0, 1), length.out = num_samples)),
+      # sample 100000:600000 for all 1's
+      sample(100000:600000, num_samples, replace = TRUE),
+      # sample 540000:549999 for all 0's
+      sample(540000:549999, num_samples, replace = TRUE)
+    ),
+
+    # HONUGE: year/week of the service being billed (4-digit chr)
+    # first and second digits are random numbers between 01-52
+    # third and fourth digits are random numbers between 00-99
+    HONUGE = sprintf(
+      "%02d%02d",
+      sample(1:52, num_samples, replace = TRUE),
+      sample(0:99, num_samples, replace = TRUE)
+    )
+  )
+}
+
+# create test health insurance df with 100 rows
+health_insurance_df <- create_test_hi_df(num_samples = 100)
+
+# Laboratory data ---------------------------------------------------------
+
+#' Create synthetic lab data
+#'
+#' @param num_samples Number of samples to create
+#'
+#' @return Data.frame with columns pnr, SAMPLINGDATE, ANALYSISCODE, and VALUE
+#'
+#' @examples
+#' create_test_lab_df(num_samples = 100)
+create_test_lab_df <- function(num_samples) {
+  data.frame(
+    # pnr: patient ID (chr)
+    # random ID's from 001-100 (even if num_samples > 100)
+    pnr = sprintf("%03d", sample(1:100, num_samples, replace = TRUE)),
+
+    # SAMPLINGDATE: date of sample (date)
+    # random dates between 1995-01-01 and 2015-12-31
+    SAMPLINGDATE = sample(
+      seq(as.Date("1995-01-01"), as.Date("2015-12-31"), by = "day"),
+      num_samples,
+      replace = TRUE
+    ),
+
+    # ANALYSISCODE: npu code of analysis type (chr)
+    # 50% is either NPU27300 or NPU03835
+    # other 50% is 'NPU'+random sample from 10000:99999
+    ANALYSISCODE = ifelse(
+      # repeat 0 and 1 num_samples times and randomise them
+      sample(rep(c(0, 1), length.out = num_samples)),
+      # sample 'NPU27300' and 'NPU03835' for all 1's
+      sample(c("NPU27300", "NPU03835"), num_samples, replace = TRUE),
+      # sample NPU + random digit between 10000:99999 for all 0's
+      paste0("NPU", sample(10000:99999, num_samples, replace = TRUE))
+    ),
+
+    # VALUE: numerical result of test (num)
+    # random decimal number between 0.1-99.9
+    VALUE = runif(num_samples, 0.1, 99.9)
+  )
+}
+
+# create test lab df with 100 rows
+fake_data <- create_test_lab_df(num_samples = 100)
+
+usethis::use_data(fake_data, overwrite = TRUE, internal = TRUE)

From 4eb894714ef4a27bd61bf00940a5158e682d0984 Mon Sep 17 00:00:00 2001
From: "Luke W. Johnston" <lwjohnst@gmail.com>
Date: Tue, 12 Dec 2023 20:00:16 +0100
Subject: [PATCH 13/18] refactor: Started refactoring but not sure what output
 should be.

---
 DESCRIPTION         |   1 +
 data-raw/testdata.R | 102 ++++++++++++++++++++------------------------
 2 files changed, 47 insertions(+), 56 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 83edf41..8975652 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -22,4 +22,5 @@ Imports:
     here,
     lubridate
 Suggests: 
+    codeCollection,
     DiagrammeR
diff --git a/data-raw/testdata.R b/data-raw/testdata.R
index 855cde7..69060b3 100644
--- a/data-raw/testdata.R
+++ b/data-raw/testdata.R
@@ -1,60 +1,68 @@
 # Script to generate synthetic data for tests
 
 # Load required libraries
-library(stringr)
-library(data.table)
 library(tidyverse)
+library(data.table)
 library(here)
+library(lubridate)
 
 # Set seed for reproducibility
 set.seed(123)
 
-# MEDICATION DATA ---------------------------------------------------------
+# Set number of rows to create for fake data
+number_rows <- 1000
+
+
+# Functions ---------------------------------------------------------------
+
+generate_fake_atc <- function(number_rows) {
+  codeCollection::ATCKoodit |>
+    tibble::as_tibble() |>
+    dplyr::filter(stringr::str_length(Koodi) == 7) |>
+    dplyr::pull(Koodi) |>
+    sample(number_rows, replace = TRUE)
+}
+
+generate_fake_indication <- function(number_rows) {
+  sample(1:9e8, number_rows, replace = TRUE) |>
+    stringr::str_trunc(width = 7, ellipsis = "") |>
+    stringr::str_pad(width = 7, pad = "0")
+}
+
+assign_drugname_from_atc <- function(data) {
+  codeCollection::ATCKoodit |>
+    tibble::as_tibble() |>
+    dplyr::select(ATC = Koodi, drugname = en) |>
+    dplyr::right_join(data, by = "ATC", relationship = "many-to-many")
+}
+
+# Medication data ---------------------------------------------------------
 
 # Pseudo-lmdb:
 
-#### Non-diabetes data:
+## Non-diabetes data:
 
-# Create a dataframe with 1000 rows from 200 individuals
-med_df <- data.frame(
-  pnr = sprintf("%03d", 1:200),
+# Create a tibble with 1000 rows from 200 individuals
+med_df <- tibble(
   # ID variable
-  eksd = as.Date(sample(
-    seq(as.Date("1995-01-01"), as.Date("2014-12-31"), by = "day"), 1000,
+  pnr = sample(sprintf("%03d", 1:200), number_rows, replace = TRUE),
+  # Date of purchase
+  eksd = as_date(sample(
+    seq(as_date("1995-01-01"), as_date("2014-12-31"), by = "day"), 1000,
     replace = TRUE
   )),
-  # Date of purchase
-  apk = sample(1:3, 1000, replace = TRUE),
   # Number of packages
-  indo = ifelse(runif(1000) <= 0.05, "", sprintf(
-    "%07d", sample(1:9999999, 50, replace = TRUE)
-  )),
+  apk = sample(1:3, 1000, replace = TRUE),
   # Indication for treatment
-  ATC = paste(
-    sample(LETTERS, 1000, replace = TRUE),
-    sample(0:9, 1000, replace = TRUE),
-    sample(0:9, 1000, replace = TRUE),
-    sample(LETTERS, 1000, replace = TRUE),
-    sample(LETTERS, 1000, replace = TRUE),
-    sample(0:9, 1000, replace = TRUE),
-    sample(0:9, 1000, replace = TRUE),
-    sep = ""
-  ),
+  indo = generate_fake_indication(number_rows),
   # ATC code
-  volume = sample(20:100, 1000, replace = TRUE) # Volume
-)
+  ATC = generate_fake_atc(number_rows),
+  # Volume
+  volume = sample(20:100, 1000, replace = TRUE)
+) |>
+  assign_drugname_from_atc()
 
-# Create a function to generate drug names based on ATC codes (replace this with your own drug name generation logic)
-generateDrugName <- function(atc) {
-  # You can implement your own logic to generate drug names based on ATC codes
-  # Here, we are using a placeholder logic that simply returns the atc code.
-  return(atc)
-}
-
-# Apply the function to create drug names
-med_df$drugname <- sapply(med_df$ATC, generateDrugName)
-
-#### Diabetes data:
+## Diabetes data:
 
 # Create a dataframe with 1000 rows from 50 individuals
 med_a10_df <- data.frame(
@@ -67,9 +75,7 @@ med_a10_df <- data.frame(
   # Date of purchase
   apk = sample(1:3, 1000, replace = TRUE),
   # Number of packages
-  indo = ifelse(runif(1000) <= 0.05, "", sprintf(
-    "%07d", sample(1:9999999, 50, replace = TRUE)
-  )),
+  indo = generate_fake_indication(number_rows),
   # Indication for treatment
   ATC = paste(
     rep(
@@ -105,16 +111,6 @@ med_a10_df[sample(nrow(med_a10_df), nrow(med_a10_df) / 2), ]$ATC <-
     replace = TRUE
   )
 
-generateDrugName <- function(atc) {
-  # You can implement your own logic to generate drug names based on ATC codes
-  # Here, we are using a placeholder logic that simply returns the atc code.
-  return(atc)
-}
-
-# Apply the function to create drug names
-med_a10_df$drugname <- sapply(med_a10_df$ATC, generateDrugName)
-
-
 replaceDrugNames <- function(data) {
   # Check if the data frame contains the necessary columns
   if (!all(c("ATC", "drugname") %in% colnames(data))) {
@@ -169,12 +165,8 @@ med_df[pnr %in% c(sprintf("%03d", 195:200)), `:=`(
   drugname = "Wegovy Flextouch"
 )]
 
-
-
 # Hospital diagnoses ------------------------------------------------------
 
-
-
 # lpr_adm -----------------------------------------------------------------
 
 
@@ -182,8 +174,6 @@ med_df[pnr %in% c(sprintf("%03d", 195:200)), `:=`(
 
 # lpr_diag ----------------------------------------------------------------
 
-
-
 # Health Service data -----------------------------------------------------
 
 #' Create synthetic health insurance data

From d35dc94cdcec24515d7b95f7a8c74ec02fa7128a Mon Sep 17 00:00:00 2001
From: "Luke W. Johnston" <lwjohnst@gmail.com>
Date: Tue, 12 Dec 2023 20:29:34 +0100
Subject: [PATCH 14/18] refactor: create function to make pnr, plus other small
 edits

---
 data-raw/testdata.R | 75 ++++++++++++++++++++++++++++-----------------
 1 file changed, 47 insertions(+), 28 deletions(-)

diff --git a/data-raw/testdata.R b/data-raw/testdata.R
index 69060b3..fe6f60d 100644
--- a/data-raw/testdata.R
+++ b/data-raw/testdata.R
@@ -15,15 +15,42 @@ number_rows <- 1000
 
 # Functions ---------------------------------------------------------------
 
-generate_fake_atc <- function(number_rows) {
+#' Create a vector of fake PNR (person ID number).
+#'
+#' IDs are sampled with replacement from `number_subjects` zero-padded integers.
+#'
+#' @param n The number (length) of items to output.
+#' @param number_subjects Number of distinct patient IDs to sample from.
+#'
+#' @return A character vector of length `n`.
+#'
+#' @examples
+#' create_fake_pnr(10)
+create_fake_pnr <- function(n, number_subjects = 200) {
+  1:number_subjects |>
+    # Pad the string to match the number values, so that if 1000 `number_subjects`
+    # is given, the string length of the output is 4 characters wide (e.g. "0001").
+    stringr::str_pad(width = stringr::str_length(number_subjects), pad = "0") |>
+    sample(n, replace = TRUE)
+}
+
+#' Create a vector of fake ATC Codes.
+#'
+#' @inheritParams create_fake_pnr
+#'
+#' @return A vector.
+#'
+#' @examples
+#' create_fake_atc(10)
+create_fake_atc <- function(n) {
   codeCollection::ATCKoodit |>
     tibble::as_tibble() |>
     dplyr::filter(stringr::str_length(Koodi) == 7) |>
     dplyr::pull(Koodi) |>
-    sample(number_rows, replace = TRUE)
+    sample(n, replace = TRUE)
 }
 
-generate_fake_indication <- function(number_rows) {
+create_fake_indication <- function(number_rows) {
   sample(1:9e8, number_rows, replace = TRUE) |>
     stringr::str_trunc(width = 7, ellipsis = "") |>
     stringr::str_pad(width = 7, pad = "0")
@@ -45,7 +72,7 @@ assign_drugname_from_atc <- function(data) {
 # Create a tibble with 1000 rows from 200 individuals
 med_df <- tibble(
   # ID variable
-  pnr = sample(sprintf("%03d", 1:200), number_rows, replace = TRUE),
+  pnr = create_fake_pnr(number_rows),
   # Date of purchase
   eksd = as_date(sample(
     seq(as_date("1995-01-01"), as_date("2014-12-31"), by = "day"), 1000,
@@ -54,9 +81,9 @@ med_df <- tibble(
   # Number of packages
   apk = sample(1:3, 1000, replace = TRUE),
   # Indication for treatment
-  indo = generate_fake_indication(number_rows),
+  indo = create_fake_indication(number_rows),
   # ATC code
-  ATC = generate_fake_atc(number_rows),
+  ATC = create_fake_atc(number_rows),
   # Volume
   volume = sample(20:100, 1000, replace = TRUE)
 ) |>
@@ -66,7 +93,7 @@ med_df <- tibble(
 
 # Create a dataframe with 1000 rows from 50 individuals
 med_a10_df <- data.frame(
-  pnr = sprintf("%03d", 1:50),
+  pnr = create_fake_pnr(number_rows, number_subjects = 50),
   # ID variable
   eksd = as.Date(sample(
     seq(as.Date("1995-01-01"), as.Date("2014-12-31"), by = "day"), 1000,
@@ -75,7 +102,7 @@ med_a10_df <- data.frame(
   # Date of purchase
   apk = sample(1:3, 1000, replace = TRUE),
   # Number of packages
-  indo = generate_fake_indication(number_rows),
+  indo = create_fake_indication(number_rows),
   # Indication for treatment
   ATC = paste(
     rep(
@@ -180,15 +207,13 @@ med_df[pnr %in% c(sprintf("%03d", 195:200)), `:=`(
 #'
 #' @param num_samples Number of samples to create
 #'
-#' @return Data.frame with columns pnr, BARNMAK, SPECIALE, and HONUGE
+#' @return A [tibble::tibble()] with columns `pnr`, `BARNMAK`, `SPECIALE`, and `HONUGE`.
 #'
 #' @examples
 #' create_test_hi_df(num_samples = 100)
 create_test_hi_df <- function(num_samples) {
-  data.frame(
-    # pnr: patientID (chr)
-    # random values from 001-100
-    pnr = sprintf("%03d", sample(1:100, num_samples, replace = TRUE)),
+  tibble::tibble(
+    pnr = create_fake_pnr(num_samples),
 
     # BARNMAK: service performed on patient' child or not (binary)
     # 1 = child, 0 = not, 5% are 1's
@@ -229,20 +254,18 @@ health_insurance_df <- create_test_hi_df(num_samples = 100)
 #'
 #' @param num_samples Number of samples to create
 #'
-#' @return Data.frame with columns pnr, SAMPLINGDATE, ANALYSISCODE, and VALUE
+#' @return A [tibble::tibble()] with columns `pnr`, `SAMPLINGDATE`, `ANALYSISCODE`, and `VALUE`.
 #'
 #' @examples
 #' create_test_lab_df(num_samples = 100)
 create_test_lab_df <- function(num_samples) {
-  data.frame(
-    # pnr: patient ID (chr)
-    # random ID's from 001-100 (even if num_samples > 100)
-    pnr = sprintf("%03d", sample(1:100, num_samples, replace = TRUE)),
+  tibble::tibble(
+    pnr = create_fake_pnr(num_samples),
 
     # SAMPLINGDATE: date of sample (date)
     # random dates between 1995-01-01 and 2015-12-31
     SAMPLINGDATE = sample(
-      seq(as.Date("1995-01-01"), as.Date("2015-12-31"), by = "day"),
+      seq(lubridate::as_date("1995-01-01"), lubridate::as_date("2015-12-31"), by = "day"),
       num_samples,
       replace = TRUE
     ),
@@ -250,13 +273,9 @@ create_test_lab_df <- function(num_samples) {
     # ANALYSISCODE: npu code of analysis type (chr)
     # 50% is either NPU27300 or NPU03835
     # other 50% is 'NPU'+random sample from 10000:99999
-    ANALYSISCODE = ifelse(
-      # repeat 0 and 1 num_samples times and randomise them
-      sample(rep(c(0, 1), length.out = num_samples)),
-      # sample 'NPU27300' and 'NPU03835' for all 1's
-      sample(c("NPU27300", "NPU03835"), num_samples, replace = TRUE),
-      # sample NPU + random digit between 10000:99999 for all 0's
-      paste0("NPU", sample(10000:99999, num_samples, replace = TRUE))
+    ANALYSISCODE = sample(
+      c(sample(c("NPU27300", "NPU03835"), num_samples / 2, replace = TRUE),
+      paste0("NPU", sample(10000:99999, num_samples / 2, replace = TRUE)))
     ),
 
     # VALUE: numerical result of test (num)
@@ -266,6 +285,6 @@ create_test_lab_df <- function(num_samples) {
 }
 
 # create test lab df with 100 rows
-fake_data <- create_test_lab_df(num_samples = 100)
+test_lab_df <- create_test_lab_df(num_samples = 100)
 
-usethis::use_data(fake_data, overwrite = TRUE, internal = TRUE)
+usethis::use_data(test_lab_df, overwrite = TRUE, internal = TRUE)

From f4e242e06f557a6c7ed3a6170285e8f3b9606700 Mon Sep 17 00:00:00 2001
From: Anders Aasted Isaksen <67263135+Aastedet@users.noreply.github.com>
Date: Tue, 13 Feb 2024 12:02:34 +0100
Subject: [PATCH 15/18] Update testdata.R

added assign_drugname_from_atc() to med_a10_df
---
 data-raw/testdata.R | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/data-raw/testdata.R b/data-raw/testdata.R
index fe6f60d..2f17d65 100644
--- a/data-raw/testdata.R
+++ b/data-raw/testdata.R
@@ -128,7 +128,8 @@ med_a10_df <- data.frame(
   ),
   # ATC code
   volume = sample(20:100, 1000, replace = TRUE) # Volume
-)
+) |>
+  assign_drugname_from_atc()
 
 # Hardcode half of purchases to be metformin, Liraglutide or semaglutide:
 

From cb9100fdc5f6f3a48e2de10231f8bdef4e17f3c4 Mon Sep 17 00:00:00 2001
From: Anders Aasted Isaksen <67263135+Aastedet@users.noreply.github.com>
Date: Tue, 13 Feb 2024 12:05:06 +0100
Subject: [PATCH 16/18] Update testdata.R

Fix to previous commit to assign drugnames to med_a10_df
---
 data-raw/testdata.R | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/data-raw/testdata.R b/data-raw/testdata.R
index 2f17d65..c4ef4c6 100644
--- a/data-raw/testdata.R
+++ b/data-raw/testdata.R
@@ -128,8 +128,7 @@ med_a10_df <- data.frame(
   ),
   # ATC code
   volume = sample(20:100, 1000, replace = TRUE) # Volume
-) |>
-  assign_drugname_from_atc()
+)
 
 # Hardcode half of purchases to be metformin, Liraglutide or semaglutide:
 
@@ -139,6 +138,10 @@ med_a10_df[sample(nrow(med_a10_df), nrow(med_a10_df) / 2), ]$ATC <-
     replace = TRUE
   )
 
+# Assign drugnames:
+med_a10_df |>
+  assign_drugname_from_atc()
+
 replaceDrugNames <- function(data) {
   # Check if the data frame contains the necessary columns
   if (!all(c("ATC", "drugname") %in% colnames(data))) {

From f5395c60a39e438a884eeeac3cc1ca3e2134879b Mon Sep 17 00:00:00 2001
From: Anders Aasted Isaksen <67263135+Aastedet@users.noreply.github.com>
Date: Tue, 13 Feb 2024 12:06:53 +0100
Subject: [PATCH 17/18] Update testdata.R

forgot to actually assign drug names to med_a10_df
---
 data-raw/testdata.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/data-raw/testdata.R b/data-raw/testdata.R
index c4ef4c6..58d7933 100644
--- a/data-raw/testdata.R
+++ b/data-raw/testdata.R
@@ -171,7 +171,7 @@ replaceDrugNames <- function(data) {
 }
 
 # Apply the function to create drug names
-med_a10_df <- replaceDrugNames(med_a10_df)
+med_a10_df <- med_a10_df <- replaceDrugNames(med_a10_df)
 
 med_df <- rbind(med_df, med_a10_df)
 

From 3222af8346bfd4c310d46cc3d916a3a5df4f9578 Mon Sep 17 00:00:00 2001
From: Anders Aasted Isaksen <ANDAAS@onerm.dk>
Date: Sat, 17 Feb 2024 23:30:32 +0100
Subject: [PATCH 18/18] testdata.R: - Added offset to pnr number generation to
 have more control when generating data for false-positive diabetes cases (for
 medication: 1-200: non-cases, 201-250: true cases). - Increased number of
 samples in health insurance/lab data and changed years covered by health
 insurance to match real world setting.

---
 data-raw/testdata.R | 36 ++++++++++++++++++++----------------
 1 file changed, 20 insertions(+), 16 deletions(-)

diff --git a/data-raw/testdata.R b/data-raw/testdata.R
index 58d7933..351a7e7 100644
--- a/data-raw/testdata.R
+++ b/data-raw/testdata.R
@@ -26,8 +26,8 @@ number_rows <- 1000
 #'
 #' @examples
 #' create_fake_pnr(10)
-create_fake_pnr <- function(n, number_subjects = 200) {
-  1:number_subjects |>
+create_fake_pnr <- function(n, number_subjects = 200, offset = 0) {
+  (1 + offset):(number_subjects + offset) |>
     # Pad the string to match the number values, so that if 1000 `number_subjects`
     # is given, the string length of the output is 4 characters wide (e.g. "0001").
     stringr::str_pad(width = stringr::str_length(number_subjects), pad = "0") |>
@@ -67,7 +67,7 @@ assign_drugname_from_atc <- function(data) {
 
 # Pseudo-lmdb:
 
-## Non-diabetes data:
+## Non-diabetes data (ID 1-200):
 
 # Create a tibble with 1000 rows from 200 individuals
 med_df <- tibble(
@@ -89,11 +89,11 @@ med_df <- tibble(
 ) |>
   assign_drugname_from_atc()
 
-## Diabetes data:
+## Diabetes data (ID 201-250):
 
 # Create a dataframe with 1000 rows from 50 individuals
 med_a10_df <- data.frame(
-  pnr = create_fake_pnr(number_rows, number_subjects = 50),
+  pnr = create_fake_pnr(number_rows, number_subjects = 50, offset = 200),
   # ID variable
   eksd = as.Date(sample(
     seq(as.Date("1995-01-01"), as.Date("2014-12-31"), by = "day"), 1000,
@@ -130,7 +130,7 @@ med_a10_df <- data.frame(
   volume = sample(20:100, 1000, replace = TRUE) # Volume
 )
 
-# Hardcode half of purchases to be metformin, Liraglutide or semaglutide:
+# Hardcode half of purchases to be metformin, liraglutid or semaglutid:
 
 med_a10_df[sample(nrow(med_a10_df), nrow(med_a10_df) / 2), ]$ATC <-
   sample(c("A10BA02", "A10BJ02", "A10BJ06"),
@@ -139,7 +139,7 @@ med_a10_df[sample(nrow(med_a10_df), nrow(med_a10_df) / 2), ]$ATC <-
   )
 
 # Assign drugnames:
-med_a10_df |>
+med_a10_df <- med_a10_df |>
   assign_drugname_from_atc()
 
 replaceDrugNames <- function(data) {
@@ -171,15 +171,14 @@ replaceDrugNames <- function(data) {
 }
 
 # Apply the function to create drug names
-med_a10_df <- med_a10_df <- replaceDrugNames(med_a10_df)
-
-med_df <- rbind(med_df, med_a10_df)
+med_a10_df <- replaceDrugNames(med_a10_df)
 
+setDT(med_a10_df)
 setDT(med_df)
 
 # Handcode a few false-positive cases with purchases of metformin:
 med_df[pnr %in% c(sprintf("%03d", 180:190)), `:=`(
-  indo = sample(c("0000092", "0000276", "0000781"), 55, replace = TRUE),
+  indo = sample(c("0000092", "0000276", "0000781"), .N, replace = TRUE),
   ATC = "A10BA02",
   drugname = "Metformin"
 )]
@@ -196,6 +195,11 @@ med_df[pnr %in% c(sprintf("%03d", 195:200)), `:=`(
   drugname = "Wegovy Flextouch"
 )]
 
+# Combine the two:
+med_df <- rbind(med_df, med_a10_df)
+
+setDT(med_df)
+
 # Hospital diagnoses ------------------------------------------------------
 
 # lpr_adm -----------------------------------------------------------------
@@ -240,17 +244,17 @@ create_test_hi_df <- function(num_samples) {
 
     # HONUGE: year/week of the service being billed (4-digit chr)
     # first and second digits are random numbers between 01-52
-    # third and fourth digits are random numbers between 00-99
+    # third and fourth digits are the two-digit year, 1990-2022 (90-99, 00-22)
     HONUGE = sprintf(
       "%02d%02d",
       sample(1:52, num_samples, replace = TRUE),
-      sample(0:99, num_samples, replace = TRUE)
+      sample(c(90:99, 0:22), num_samples, replace = TRUE)
     )
   )
 }
 
 # create test health insurance df with 100 rows
-health_insurance_df <- create_test_hi_df(num_samples = 100)
+health_insurance_df <- create_test_hi_df(num_samples = 1000)
 
 # Laboratory data ---------------------------------------------------------
 
@@ -289,6 +293,6 @@ create_test_lab_df <- function(num_samples) {
 }
 
 # create test lab df with 100 rows
-test_lab_df <- create_test_lab_df(num_samples = 100)
+test_lab_df <- create_test_lab_df(num_samples = 1000)
 
-usethis::use_data(test_lab_df, overwrite = TRUE, internal = TRUE)
+# usethis::use_data(test_lab_df, overwrite = TRUE, internal = TRUE)