atorus-research · bms63 · Sep 27, 2024 · Aug 19, 2024 · Aug 20, 2024 · Aug 20, 2024
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -42,6 +42,7 @@ Suggests:
     knitr,
     labelled,
     metacore,
+    pharmaverseadam,
     readxl,
     rmarkdown,
     testthat (>= 3.0.0),

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,9 @@
+# xportr 0.5.0
+
+## New Feature
+
+* New argument in `xportr_write()` allows users to specify the maximum file size (in GB) of their exported xpt files. (#268)
+
 # xportr 0.4.0
 
 ## New Features

diff --git a/R/write.R b/R/write.R
@@ -7,6 +7,8 @@
 #' @param .df A data frame to write.
 #' @param path Path where transport file will be written. File name sans will be
 #'   used as `xpt` name.
+#' @param max_size_gb Maximum size in GB of the exported file(s). If size of xpt file exceeds the specified maximum,
+#' it will split the data frame into multiple exported chunk(s).
 #' @param label `r lifecycle::badge("deprecated")` Previously used to to set the Dataset label.
 #' Use the `metadata` argument to set the dataset label.
 #' @param strict_checks If TRUE, xpt validation will report errors and not write
@@ -47,6 +49,7 @@
 #'
 xportr_write <- function(.df,
                          path,
+                         max_size_gb = NULL,
                          metadata = NULL,
                          domain = NULL,
                          strict_checks = FALSE,
@@ -73,6 +76,7 @@ xportr_write <- function(.df,
 
   assert_data_frame(.df)
   assert_string(path)
+  checkmate::assert_numeric(max_size_gb, null.ok = TRUE)
-  checkmate::assert_numeric(max_size_gb, null.ok = TRUE)
+assert_numeric(max_size_gb, null.ok = TRUE)
-  checkmate::assert_numeric(max_size_gb, null.ok = TRUE)
+assert_numeric(max_size_gb, null.ok = TRUE)
   assert_metadata(metadata, null.ok = TRUE)
   assert_logical(strict_checks)
 
@@ -88,6 +92,10 @@ xportr_write <- function(.df,
     assert(".df file name must be 8 characters or less.", .var.name = "path")
   }
 
+  if (is.numeric(max_size_gb) && max_size_gb <= 0) {
+    assert("max_size_gb must be NULL or a numeric value greater than 0.", .var.name = "max_size_gb")
+  }
+
   checks <- xpt_validate(.df)
 
   if (stringr::str_detect(name, "[^a-zA-Z0-9]")) {
@@ -107,22 +115,11 @@ xportr_write <- function(.df,
   tryCatch(
     {
       # If data is not split, data is just written out
-      if (is.null(attr(data, "_xportr.split_by_"))) {
+      if (is.null(max_size_gb)) {
         write_xpt(data, path = path, version = 5, name = name)
         check_xpt_size(path)
       } else {
-        # If data is split, perform the split and get an index for the for loop
-        split_data <- split(data, data[[attr(data, "_xportr.split_by_")]])
-        split_index <- unique(data[[attr(data, "_xportr.split_by_")]])
-        paths <- get_split_path(path, seq_along(split_index))
-        # Iterate on the unique values of the split
-        for (i in seq_along(split_index)) {
-          # Write out the split data, `get_split_path` will determine file name
-          write_xpt(split_data[[i]],
-            path = paths[i], version = 5, name = name
-          )
-          check_xpt_size(paths[i])
-        }
+        export_to_xpt(data, path = path, max_size_gb = max_size_gb, file_prefix = name)
       }
     },
     error = function(err) {
@@ -156,3 +153,71 @@ get_split_path <- function(path, ind) {
     tools::file_ext(path)
   )
 }
+
+#' Function to export data frame to xpt files ensuring each file does not exceed the maximum specified size in GB
+#'
+#' @param df A data frame to write.
+#' @param max_size_gb Maximum size in GB of the exported files.
+#' @param file_prefix Name of each exported file.
+#'
+#' @noRd
+
+export_to_xpt <- function(.df, path, max_size_gb, file_prefix) {
+  # Convert GB to bytes
+  max_size_bytes <- max_size_gb * 1000^3
+
+  temp_file <- tempfile()
+  write_xpt(.df, temp_file)
+
+  file_size <- file.info(temp_file)$size
+  unlink(temp_file) # Clean up the temporary file
+
+  dir_path <- dirname(path)
+
+  if (file_size <= max_size_bytes) {
+    chunk_counter <- 2
+    file_name <- sprintf("%s.xpt", file_prefix)
+    path <- file.path(dir_path, file_name)
+    write_xpt(.df, path = path, version = 5, name = file_prefix)
+  } else {
+    total_rows <- nrow(.df)
+    row_start <- 1
+    chunk_counter <- 1
+
+    while (row_start <= total_rows) {
+      # Binary search to find the maximum number of rows that fit within the size limit
+      low <- row_start
+      high <- total_rows
+      best_fit <- row_start
+
+      while (low <= high) {
+        mid <- floor((low + high) / 2)
+        write_xpt(.df[row_start:mid, ], temp_file)
+        file_size <- file.info(temp_file)$size
+
+        if (file_size <= max_size_bytes) {
+          best_fit <- mid
+          low <- mid + 1
+        } else {
+          high <- mid - 1
+        }
+
+        unlink(temp_file) # Clean up the temporary file
+      }
+
+      # Write the best fitting chunk to the final file
+      chunk <- .df[row_start:best_fit, ]
+
+      file_name <- sprintf("%s%d.xpt", file_prefix, chunk_counter)
+      path <- file.path(dir_path, file_name)
+
+      write_xpt(chunk, path = path, version = 5, name = file_prefix)
+
+      # Update counters
+      row_start <- best_fit + 1
+      chunk_counter <- chunk_counter + 1
+    }
+  }
+
+  message("Data frame exported to ", chunk_counter - 1, " xpt files.")
+}
diff --git a/inst/WORDLIST b/inst/WORDLIST
@@ -56,6 +56,7 @@ https
 incompletedatetime
 intervaldatetime
 iso
+lgl
 magrittr
 metacore
 num

diff --git a/man/xportr_write.Rd b/man/xportr_write.Rd
diff --git a/tests/testthat/test-write.R b/tests/testthat/test-write.R
@@ -212,51 +212,78 @@ test_that("write Test 13: Capture errors by haven and report them as such", {
   )
 })
 
-## Test 14: xportr_write: `split_by` attribute is used to split the data ----
-test_that("write Test 14: `split_by` attribute is used to split the data", {
+## Test 14: xportr_write: if file size is less than `max_size_gb`, export only one file  ----
+test_that("write Test 14: if file size is less than `max_size_gb`, export only one file", {
+  adlb <- pharmaverseadam::adlb
+
   tmpdir <- tempdir()
-  tmp <- file.path(tmpdir, "xyz.xpt")
 
-  on.exit(unlink(tmpdir))
+  # 5 GB
+  max_size_gb <- 5
 
-  dts <- data_to_save()
-  dts %>%
-    xportr_split(split_by = "X") %>%
-    xportr_write(path = tmp)
+  expect_message(
+    xportr_write(adlb,
+      path = paste0(tmpdir, "/adlb.xpt"),
+      domain = "adlb",
+      max_size_gb = max_size_gb,
+      strict_checks = FALSE
+    ),
+    "Data frame exported to 1 xpt files."
+  )
 
   expect_true(
-    file.exists(file.path(tmpdir, "xyz1.xpt"))
+    file.exists(file.path(tmpdir, "adlb.xpt")),
+    file.info(file.path(tmpdir, "adlb.xpt"))$size <= as.numeric(format(max_size_gb * 10^9, scientific = FALSE))
+  )
+})
+
+## Test 15: xportr_write: `max_size_gb` is used to split data frame into specified maximum file size ----
+test_that("write Test 15: `max_size_gb` is used to split data frame into specified maximum file size", {
+  adlb <- pharmaverseadam::adlb
+
+  tmpdir <- tempdir()
+
+  # 20 mb
+  max_size_gb <- 20 / 1000
+
+  expect_message(
+    xportr_write(adlb,
+      path = paste0(tmpdir, "/adlb.xpt"),
+      domain = "adlb",
+      max_size_gb = max_size_gb,
+      strict_checks = FALSE
+    ),
+    "Data frame exported to 5 xpt files."
   )
+
   expect_true(
-    file.exists(file.path(tmpdir, "xyz2.xpt"))
+    file.exists(file.path(tmpdir, "adlb1.xpt")),
+    file.info(file.path(tmpdir, "adlb1.xpt"))$size <= as.numeric(format(max_size_gb * 10^9, scientific = FALSE))
   )
+
   expect_true(
-    file.exists(file.path(tmpdir, "xyz3.xpt"))
+    file.exists(file.path(tmpdir, "adlb2.xpt")),
+    file.info(file.path(tmpdir, "adlb2.xpt"))$size <= as.numeric(format(max_size_gb * 10^9, scientific = FALSE))
   )
-  expect_equal(
-    read_xpt(file.path(tmpdir, "xyz1.xpt")) %>%
-      extract2("X") %>%
-      unique() %>%
-      length(),
-    1
+
+  expect_true(
+    file.exists(file.path(tmpdir, "adlb3.xpt")),
+    file.info(file.path(tmpdir, "adlb3.xpt"))$size <= as.numeric(format(max_size_gb * 10^9, scientific = FALSE))
   )
-  expect_equal(
-    read_xpt(file.path(tmpdir, "xyz2.xpt")) %>%
-      extract2("X") %>%
-      unique() %>%
-      length(),
-    1
+
+  expect_true(
+    file.exists(file.path(tmpdir, "adlb4.xpt")),
+    file.info(file.path(tmpdir, "adlb4.xpt"))$size <= as.numeric(format(max_size_gb * 10^9, scientific = FALSE))
   )
-  expect_equal(
-    read_xpt(file.path(tmpdir, "xyz3.xpt")) %>%
-      extract2("X") %>%
-      unique() %>%
-      length(),
-    1
+
+  expect_true(
+    file.exists(file.path(tmpdir, "adlb5.xpt")),
+    file.info(file.path(tmpdir, "adlb5.xpt"))$size <= as.numeric(format(max_size_gb * 10^9, scientific = FALSE))
   )
 })
 
-## Test 15: xportr_write: Large file sizes are reported and warned ----
+
+## Test 16: xportr_write: Large file sizes are reported and warned ----
 test_that("write Test 15: Large file sizes are reported and warned", {
   skip_if_not(test_large_files)
   tmpdir <- tempdir()