From 154e6b44da8bc4043235e66704c75d5ae5570f43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Signe=20Kirk=20Br=C3=B8db=C3=A6k?= Date: Wed, 19 Jun 2024 15:01:14 +0200 Subject: [PATCH 1/7] feat: add join_lpr3 with tests --- R/joins.R | 20 ++++++++++++ tests/testthat/test-joins.R | 65 +++++++++++++++++++++++++++++++++++++ 2 files changed, 85 insertions(+) diff --git a/R/joins.R b/R/joins.R index 6438258..6dcd1e0 100644 --- a/R/joins.R +++ b/R/joins.R @@ -19,3 +19,23 @@ join_lpr2 <- function(lpr_diag, lpr_adm) { ) } +#' Join together the LPR3 (`diagnoser` and `kontakter`) registers. +#' +#' @param diagnoser The diagnosis register. +#' @param kontakter The contacts register. +#' +#' @return The same class as the input, defaults to a [tibble::tibble()]. +#' @keywords internal +#' +#' @examples +#' register_data$diagnoser |> +#' join_lpr3(register_data$kontakter) +join_lpr3 <- function(diagnoser, kontakter) { + verify_required_variables(diagnoser, "diagnoser") + verify_required_variables(kontakter, "kontakter") + dplyr::full_join( + column_names_to_lower(kontakter), + column_names_to_lower(diagnoser), + by = "dw_ek_kontakt" + ) +} diff --git a/tests/testthat/test-joins.R b/tests/testthat/test-joins.R index 1753b46..d6c4ca9 100644 --- a/tests/testthat/test-joins.R +++ b/tests/testthat/test-joins.R @@ -1,3 +1,5 @@ +# join_lpr2 ----------------------------------------------------------------- + actual_lpr_diag <- tibble::tibble( recnum = c(1:2), c_diag = 1:2, @@ -56,3 +58,66 @@ test_that("joining works for data.table", { expect_contains(class(actual), "data.table") }) + +# join_lpr3 ----------------------------------------------------------------- + +actual_diagnoser <- tibble::tibble( + dw_ek_kontakt = 1:2, + diagnosekode = c("DA071","DD075"), + diagnosetype = c("A", "B"), + senere_afkraefter = c("Nej", "Ja") +) + +actual_kontakter <- tibble::tibble( + cpr = c(1, 1, 2), + dw_ek_kontakt = 1:3, + dato_start = c("20230101", "20220101", "20200101"), + hovedspaciale_ans = c("Neurologi", "Akut medicin", "Kardiologi"), +) + +expected_lpr3 <- tibble::tibble( + cpr = c(1, 1, 2), + dw_ek_kontakt = c(1,2,3), + dato_start = c("20230101", "20220101", "20200101"), + hovedspaciale_ans = c("Neurologi", "Akut medicin", "Kardiologi"), + diagnosekode = c("DA071","DD075", NA), + diagnosetype = c("A", "B", NA), + senere_afkraefter = c("Nej", "Ja", NA), +) + +test_that("joining LPR3 correctly", { + actual <- join_lpr3( + actual_diagnoser, + actual_kontakter + ) + + expect_equal(actual, expected_lpr3) +}) + +test_that("joining works for DuckDB Database", { + actual <- arrow::to_duckdb(actual_diagnoser) |> + join_lpr3(arrow::to_duckdb(actual_kontakter)) + + expect_contains(class(actual), "tbl_duckdb_connection") +}) + +test_that("joining works for Arrow Tables (from Parquet)", { + actual <- arrow::as_arrow_table(actual_diagnoser) |> + join_lpr3(arrow::as_arrow_table(actual_kontakter)) + + expect_contains(class(actual), "arrow_dplyr_query") +}) + +test_that("joining works for data.frame", { + actual <- as.data.frame(actual_diagnoser) |> + join_lpr3(as.data.frame(actual_kontakter)) + + expect_contains(class(actual), "data.frame") +}) + +test_that("joining works for data.table", { + actual <- data.table::as.data.table(actual_diagnoser) |> + join_lpr3(data.table::as.data.table(actual_kontakter)) + + expect_contains(class(actual), "data.table") +}) From 8ecd57adc92b040f53e698339ffc550ab88fd6cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Signe=20Kirk=20Br=C3=B8db=C3=A6k?= <40836345+signekb@users.noreply.github.com> Date: Tue, 25 Jun 2024 15:24:50 +0200 Subject: [PATCH 2/7] fix: typos --- tests/testthat/test-joins.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/testthat/test-joins.R b/tests/testthat/test-joins.R index d6c4ca9..f59802e 100644 --- a/tests/testthat/test-joins.R +++ b/tests/testthat/test-joins.R @@ -65,24 +65,24 @@ actual_diagnoser <- tibble::tibble( dw_ek_kontakt = 1:2, diagnosekode = c("DA071","DD075"), diagnosetype = c("A", "B"), - senere_afkraefter = c("Nej", "Ja") + senere_afkraeftet = c("Nej", "Ja") ) actual_kontakter <- tibble::tibble( cpr = c(1, 1, 2), dw_ek_kontakt = 1:3, dato_start = c("20230101", "20220101", "20200101"), - hovedspaciale_ans = c("Neurologi", "Akut medicin", "Kardiologi"), + hovedspeciale_ans = c("Neurologi", "Akut medicin", "Kardiologi"), ) expected_lpr3 <- tibble::tibble( cpr = c(1, 1, 2), dw_ek_kontakt = c(1,2,3), dato_start = c("20230101", "20220101", "20200101"), - hovedspaciale_ans = c("Neurologi", "Akut medicin", "Kardiologi"), + hovedspeciale_ans = c("Neurologi", "Akut medicin", "Kardiologi"), diagnosekode = c("DA071","DD075", NA), diagnosetype = c("A", "B", NA), - senere_afkraefter = c("Nej", "Ja", NA), + senere_afkraeftet = c("Nej", "Ja", NA), ) test_that("joining LPR3 correctly", { From 509d9af7e4d31d2d99ab93cd5db0d7007890f131 Mon Sep 17 00:00:00 2001 From: "Luke W. Johnston" Date: Tue, 25 Jun 2024 17:32:12 +0200 Subject: [PATCH 3/7] test: update tests to include extra rows in each separate dataset --- tests/testthat/test-joins.R | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/tests/testthat/test-joins.R b/tests/testthat/test-joins.R index 0775123..c6332a2 100644 --- a/tests/testthat/test-joins.R +++ b/tests/testthat/test-joins.R @@ -97,27 +97,27 @@ test_that("joining works for data.table", { # join_lpr3 ----------------------------------------------------------------- actual_diagnoser <- tibble::tibble( - dw_ek_kontakt = 1:2, - diagnosekode = c("DA071","DD075"), - diagnosetype = c("A", "B"), - senere_afkraeftet = c("Nej", "Ja") + dw_ek_kontakt = 1:4, + diagnosekode = rep(c("DA071","DD075"), times = 2), + diagnosetype = rep(c("A", "B"), times = 2), + senere_afkraeftet = rep(c("Nej", "Ja"), times = 2) ) actual_kontakter <- tibble::tibble( - cpr = c(1, 1, 2), - dw_ek_kontakt = 1:3, - dato_start = c("20230101", "20220101", "20200101"), - hovedspeciale_ans = c("Neurologi", "Akut medicin", "Kardiologi"), + cpr = c(1, 1, 2, 3), + dw_ek_kontakt = 2:5, + dato_start = c("20230101", "20220101", "20200101", "20200101"), + hovedspeciale_ans = c("Neurologi", "Akut medicin", "Kardiologi", "Neurologi") ) expected_lpr3 <- tibble::tibble( cpr = c(1, 1, 2), - dw_ek_kontakt = c(1,2,3), + dw_ek_kontakt = 2:4, dato_start = c("20230101", "20220101", "20200101"), hovedspeciale_ans = c("Neurologi", "Akut medicin", "Kardiologi"), - diagnosekode = c("DA071","DD075", NA), - diagnosetype = c("A", "B", NA), - senere_afkraeftet = c("Nej", "Ja", NA), + diagnosekode = c("DD075","DA071", "DD075"), + diagnosetype = c("B", "A", "B"), + senere_afkraeftet = c("Ja", "Nej", "Ja") ) test_that("joining LPR3 correctly", { From 78515fa202a32de22c641e30d19683ddb22ca420 Mon Sep 17 00:00:00 2001 From: "Luke W. Johnston" Date: Tue, 25 Jun 2024 17:32:36 +0200 Subject: [PATCH 4/7] test: switch order of datasets to join by --- tests/testthat/test-joins.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/testthat/test-joins.R b/tests/testthat/test-joins.R index c6332a2..e961465 100644 --- a/tests/testthat/test-joins.R +++ b/tests/testthat/test-joins.R @@ -122,8 +122,8 @@ expected_lpr3 <- tibble::tibble( test_that("joining LPR3 correctly", { actual <- join_lpr3( - actual_diagnoser, - actual_kontakter + actual_kontakter, + actual_diagnoser ) expect_equal(actual, expected_lpr3) From 82b355c55d615ff8e4be34f67967c3b2e100c5d1 Mon Sep 17 00:00:00 2001 From: "Luke W. Johnston" Date: Tue, 25 Jun 2024 17:33:05 +0200 Subject: [PATCH 5/7] test: make sure datasets are provided in correct order --- tests/testthat/test-joins.R | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/testthat/test-joins.R b/tests/testthat/test-joins.R index e961465..a1c1a6e 100644 --- a/tests/testthat/test-joins.R +++ b/tests/testthat/test-joins.R @@ -129,6 +129,13 @@ test_that("joining LPR3 correctly", { expect_equal(actual, expected_lpr3) }) +test_that("kontakter and diagnoser are in correct order", { + expect_error(join_lpr3( + actual_diagnoser, + actual_kontakter + )) +}) + test_that("joining works for DuckDB Database", { actual <- arrow::to_duckdb(actual_diagnoser) |> join_lpr3(arrow::to_duckdb(actual_kontakter)) From 0d0b96310e06e3ea4b50f873321ccb6f9600e9fe Mon Sep 17 00:00:00 2001 From: "Luke W. Johnston" Date: Tue, 25 Jun 2024 17:33:26 +0200 Subject: [PATCH 6/7] test: expand on tests for other data formats --- tests/testthat/test-joins.R | 44 ++++++++++++++++++++++++++++++------- 1 file changed, 36 insertions(+), 8 deletions(-) diff --git a/tests/testthat/test-joins.R b/tests/testthat/test-joins.R index a1c1a6e..ab66278 100644 --- a/tests/testthat/test-joins.R +++ b/tests/testthat/test-joins.R @@ -137,29 +137,57 @@ test_that("kontakter and diagnoser are in correct order", { }) test_that("joining works for DuckDB Database", { - actual <- arrow::to_duckdb(actual_diagnoser) |> - join_lpr3(arrow::to_duckdb(actual_kontakter)) + actual <- arrow::to_duckdb(actual_kontakter) |> + join_lpr3(arrow::to_duckdb(actual_diagnoser)) + + actual_rows <- actual |> + dplyr::count() |> + dplyr::pull(n) |> + as.integer() expect_contains(class(actual), "tbl_duckdb_connection") + expect_identical(colnames(actual), colnames(expected_lpr3)) + expect_identical(actual_rows, nrow(expected_lpr3)) }) test_that("joining works for Arrow Tables (from Parquet)", { - actual <- arrow::as_arrow_table(actual_diagnoser) |> - join_lpr3(arrow::as_arrow_table(actual_kontakter)) + actual <- arrow::as_arrow_table(actual_kontakter) |> + join_lpr3(arrow::as_arrow_table(actual_diagnoser)) + + actual_rows <- actual |> + dplyr::count() |> + dplyr::pull(n) |> + as.integer() expect_contains(class(actual), "arrow_dplyr_query") + expect_identical(names(actual), colnames(expected_lpr3)) + expect_identical(actual_rows, nrow(expected_lpr3)) }) test_that("joining works for data.frame", { - actual <- as.data.frame(actual_diagnoser) |> - join_lpr3(as.data.frame(actual_kontakter)) + actual <- as.data.frame(actual_kontakter) |> + join_lpr3(as.data.frame(actual_diagnoser)) + + actual_rows <- actual |> + dplyr::count() |> + dplyr::pull(n) |> + as.integer() expect_contains(class(actual), "data.frame") + expect_identical(names(actual), colnames(expected_lpr3)) + expect_identical(actual_rows, nrow(expected_lpr3)) }) test_that("joining works for data.table", { - actual <- data.table::as.data.table(actual_diagnoser) |> - join_lpr3(data.table::as.data.table(actual_kontakter)) + actual <- data.table::as.data.table(actual_kontakter) |> + join_lpr3(data.table::as.data.table(actual_diagnoser)) + + actual_rows <- actual |> + dplyr::count() |> + dplyr::pull(n) |> + as.integer() expect_contains(class(actual), "data.table") + expect_identical(colnames(actual), colnames(expected_lpr3)) + expect_identical(actual_rows, nrow(expected_lpr3)) }) From e2fce26136d1f4efaa0faba601e626a8b88d950a Mon Sep 17 00:00:00 2001 From: "Luke W. Johnston" Date: Tue, 25 Jun 2024 17:35:04 +0200 Subject: [PATCH 7/7] refactor: kontakter should come first so cpr is first column, plus use inner join --- R/joins.R | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/R/joins.R b/R/joins.R index 5553294..2092cc6 100644 --- a/R/joins.R +++ b/R/joins.R @@ -28,12 +28,13 @@ join_lpr2 <- function(lpr_adm, lpr_diag) { #' @keywords internal #' #' @examples -#' register_data$diagnoser |> -#' join_lpr3(register_data$kontakter) -join_lpr3 <- function(diagnoser, kontakter) { - verify_required_variables(diagnoser, "diagnoser") +#' register_data$kontakter |> +#' join_lpr3(register_data$diagnoser) +join_lpr3 <- function(kontakter, diagnoser) { verify_required_variables(kontakter, "kontakter") - dplyr::full_join( + verify_required_variables(diagnoser, "diagnoser") + + dplyr::inner_join( column_names_to_lower(kontakter), column_names_to_lower(diagnoser), by = "dw_ek_kontakt"