Skip to content

Commit

Permalink
Merge pull request #213 from atorus-research/91-max-length
Browse files Browse the repository at this point in the history
Closes #91 length attribute from max data length
  • Loading branch information
bms63 authored Feb 11, 2024
2 parents 63932b2 + 98c075f commit de2beba
Show file tree
Hide file tree
Showing 13 changed files with 222 additions and 26 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ export(length_log)
export(type_log)
export(var_names_log)
export(var_ord_msg)
export(variable_max_length)
export(xportr)
export(xportr_df_label)
export(xportr_format)
Expand Down
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@

* All core functions can be run together by using new function `xportr()` (#137)

* New argument `length_source` in `xportr_length()` selects where the assigned length comes from: the metadata (`"metadata"`, the previous behavior) or the calculated maximum length of each variable in the data (`"data"`) (#91)

## Documentation

## Deprecation and Breaking Changes
Expand Down
58 changes: 46 additions & 12 deletions R/length.R
Original file line number Diff line number Diff line change
@@ -1,13 +1,25 @@
#' Assign SAS Length
#'
#' Assigns SAS length from a metadata object to a given data frame. If a
#' length isn't present for a variable the length value is set to 200 for
#' character columns, and 8 for non-character columns. This value is stored in
#' the 'width' attribute of the column.
#' Assigns the SAS length to a specified data frame, either from a metadata object
#' or based on the calculated maximum data length. If a length isn't present for
#' a variable the length value is set to 200 for character columns, and 8
#' for non-character columns. This value is stored in the 'width' attribute of the column.
#'
#' @inheritParams xportr
#' @param metadata A data frame containing variable level metadata. See
#' 'Metadata' section for details.
#' @param domain Appropriate CDISC dataset name, e.g. ADAE, DM. Used to subset
#' the metadata object. If none is passed, then name of the dataset passed as
#' .df will be used.
#' @param verbose The action this function takes when an action is taken on the
#' dataset or function validation finds an issue. See 'Messaging' section for
#' details. Options are 'stop', 'warn', 'message', and 'none'
#' @param length_source Choose the assigned length from either metadata or data.
#'
#' If `"metadata"` is specified, the assigned length is from the metadata length.
#' If `"data"` is specified, the assigned length is determined by the calculated maximum data length.
#'
#' *Permitted Values*: `"metadata"`, `"data"`
#' @param metacore `r lifecycle::badge("deprecated")` Previously used to pass
#' metadata now renamed with `metadata`
#'
Expand Down Expand Up @@ -56,12 +68,14 @@
#' length = c(10, 8)
#' )
#'
#' adsl <- xportr_length(adsl, metadata, domain = "adsl")
#' adsl <- xportr_length(adsl, metadata, domain = "adsl", length_source = "metadata")
xportr_length <- function(.df,
metadata = NULL,
domain = NULL,
verbose = NULL,
length_source = c("metadata", "data"),
metacore = deprecated()) {
length_source <- match.arg(length_source)
if (!missing(metacore)) {
lifecycle::deprecate_stop(
when = "0.3.1.9005",
Expand Down Expand Up @@ -109,17 +123,37 @@ xportr_length <- function(.df,

length_log(miss_vars, verbose)

length <- metadata[[variable_length]]
names(length) <- metadata[[variable_name]]
if (length_source == "metadata") {
length_metadata <- metadata[[variable_length]]
names(length_metadata) <- metadata[[variable_name]]

for (i in names(.df)) {
if (i %in% miss_vars) {
attr(.df[[i]], "width") <- impute_length(.df[[i]])
} else {
attr(.df[[i]], "width") <- length_metadata[[i]]
}
}
}

# Assign length from data
if (length_source == "data") {
var_length_max <- variable_max_length(.df)

length_data <- var_length_max[[variable_length]]
names(length_data) <- var_length_max[[variable_name]]

for (i in names(.df)) {
if (i %in% miss_vars) {
attr(.df[[i]], "width") <- impute_length(.df[[i]])
} else {
attr(.df[[i]], "width") <- length[[i]]
for (i in names(.df)) {
attr(.df[[i]], "width") <- length_data[[i]]
}

length_msg <- left_join(var_length_max, metadata[, c(variable_name, variable_length)], by = variable_name) %>%
filter(length.x < length.y)

max_length_msg(length_msg, verbose)
}


.df
}

Expand Down
31 changes: 31 additions & 0 deletions R/messages.R
Original file line number Diff line number Diff line change
Expand Up @@ -181,3 +181,34 @@ var_ord_msg <- function(reordered_vars, moved_vars, verbose) {
cli_h2("All variables in dataset are ordered")
}
}

#' Report variables whose data length is shorter than the metadata length
#'
#' @param max_length Data frame with one row per flagged variable. The first
#'   three columns are read, in order, as the variable name, the length found
#'   in the data, and the length given in the metadata.
#' @param verbose Logging level handed to `xportr_logger()`; must be one of
#'   `.internal_verbose_choices`.
#'
#' @return Output to Console

max_length_msg <- function(max_length, verbose) {
  assert_data_frame(max_length)
  assert_choice(verbose, choices = .internal_verbose_choices)

  # Nothing to report when no variable was flagged.
  if (nrow(max_length) == 0) {
    return(invisible(NULL))
  }

  cli_h2("Variable length is shorter than the length specified in the metadata.")

  xportr_logger(
    glue("Update length in metadata to trim the variables:"),
    type = verbose
  )

  # Pad each field so the per-variable lines align in the console.
  var_names <- format(max_length[[1]], width = 8)
  data_len <- format(as.character(max_length[[2]]), width = 3)
  meta_len <- format(as.character(max_length[[3]]), width = 3)

  xportr_logger(
    glue(
      "{var_names} has a length of {data_len}",
      " and a length of {meta_len} in metadata"
    ),
    type = verbose
  )
}
40 changes: 39 additions & 1 deletion R/utils-xportr.R
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,7 @@ xpt_validate <- function(data) {

# 4.0 max length of Character variables <= 200 bytes
max_nchar <- data %>%
summarize(across(where(is.character), ~ max(nchar(., type = "bytes"))))
summarize(across(where(is.character), ~ max(0L, nchar(., type = "bytes"), na.rm = TRUE)))
nchar_gt_200 <- max_nchar[which(max_nchar > 200)]
if (length(nchar_gt_200) > 0) {
err_cnd <- c(
Expand Down Expand Up @@ -354,6 +354,44 @@ check_multiple_var_specs <- function(metadata,
}
}


#' Calculate the maximum length of variables
#'
#' Computes, for every column of a data frame, the length to assign to it:
#' the largest number of bytes found in any value for character columns, and
#' 8 for every other column.
#'
#' @inheritParams xportr_length
#'
#' @return A data frame with one row per column of `.df` and two columns,
#'   named by the `xportr.variable_name` and `xportr.length` options, holding
#'   the variable name and its maximum length (always numeric). A `.df`
#'   without columns yields a zero-row data frame with the same two columns.
#'
#' @export

variable_max_length <- function(.df) {
  assert_data_frame(.df)

  variable_length <- getOption("xportr.length")
  variable_name <- getOption("xportr.variable_name")

  # One pass over the columns; vapply() guarantees a numeric vector even when
  # `.df` has no columns, so the result never has to be grown row by row.
  max_lengths <- vapply(
    .df,
    function(column) {
      if (is.character(column)) {
        # `0L` keeps max() defined for empty or all-NA columns.
        # NOTE(review): such columns therefore get a length of 0 -- confirm
        # that downstream consumers accept a zero width.
        as.numeric(max(0L, nchar(column, type = "bytes"), na.rm = TRUE))
      } else {
        8
      }
    },
    numeric(1),
    USE.NAMES = FALSE
  )

  xport_max_length <- data.frame(
    names(.df),
    max_lengths,
    stringsAsFactors = FALSE
  )
  # Column names come from package options, so they are set after creation.
  names(xport_max_length) <- c(variable_name, variable_length)

  xport_max_length
}

#' Custom check for metadata object
#'
#' Improvement on the message clarity over the default assert(...) messages.
Expand Down
2 changes: 1 addition & 1 deletion R/xportr-package.R
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ globalVariables(c(
"abbr_parsed", "abbr_stem", "adj_orig", "adj_parsed", "col_pos", "dict_varname",
"lower_original_varname", "my_minlength", "num_st_ind", "original_varname",
"renamed_n", "renamed_var", "use_bundle", "viable_start", "type.x", "type.y",
"variable"
# length.x / length.y: suffixed join columns that xportr_length() filters on
"variable", "length.x", "length.y"
))

# The following block is used by usethis to automatically manage
Expand Down
19 changes: 19 additions & 0 deletions man/max_length_msg.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

17 changes: 17 additions & 0 deletions man/variable_max_length.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

18 changes: 13 additions & 5 deletions man/xportr_length.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

26 changes: 26 additions & 0 deletions tests/testthat/test-length.R
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,32 @@ test_that("xportr_length: Gets warning when metadata has multiple rows with same
multiple_vars_in_spec_helper2(xportr_length)
})

# Variable-level spec for `df`; USUBJID is declared wider (10) than any value
# actually present in the data below.
meta_example <- data.frame(
  dataset = rep("df", times = 2),
  variable = c("USUBJID", "WEIGHT"),
  length = c(10, 8)
)

# One character column (longest value is 3 bytes) and one numeric column.
df <- data.frame(
  USUBJID = c("1", "12", "123"),
  WEIGHT = c(85, 45, 121)
)

# Checks both values of `length_source` against the same data and metadata.
# NOTE(review): expect_attr_width() is a test helper defined elsewhere;
# presumably it compares the "width" attribute of each column -- confirm.
test_that("xportr_length: length assigned as expected from metadata or data", {
# "metadata": widths are taken from the `length` column of meta_example.
result <- df %>%
xportr_length(meta_example, domain = "df", length_source = "metadata") %>%
expect_attr_width(c(10, 8))

# "data": USUBJID gets 3 (its longest value is "123"), and the numeric
# WEIGHT column gets 8.
result <- df %>%
xportr_length(meta_example, domain = "df", length_source = "data") %>%
expect_attr_width(c(3, 8))
})

# meta_example declares USUBJID with length 10 while the data only needs 3,
# so the data length is below the metadata length and a message is expected.
# expect_message() is given no pattern here, so any message satisfies it.
test_that("xportr_length: Gets message when length in metadata longer than data length", {
result <- df %>%
xportr_length(meta_example, domain = "df", length_source = "data") %>%
expect_message()
})

test_that("xportr_length: Works as expected with only one domain in metadata", {
adsl <- data.frame(
Expand Down
8 changes: 8 additions & 0 deletions tests/testthat/test-utils-xportr.R
Original file line number Diff line number Diff line change
Expand Up @@ -117,3 +117,11 @@ test_that("xpt_validate: Get error message when the length of a non-ASCII charac
"Length of A must be 200 bytes or less."
)
})

# Column A holds one 201-byte string and one NA; the NA must not hide the
# over-length value, so the 200-byte error is still expected.
test_that("xpt_validate: Get error message when the length of a character variable is > 200 bytes and contains NAs", {
df <- data.frame(A = c(paste(rep("A", 201), collapse = ""), NA_character_))
expect_equal(
xpt_validate(df),
"Length of A must be 200 bytes or less."
)
})
22 changes: 17 additions & 5 deletions vignettes/deepdive.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@ Each of the core `{xportr}` functions requires several inputs: A valid dataframe
```{r, eval = FALSE}
adsl %>%
xportr_type(var_spec, "ADSL", "message") %>%
xportr_length(var_spec, "ADSL", "message") %>%
xportr_length(var_spec, "ADSL", verbose = "message") %>%
xportr_label(var_spec, "ADSL", "message") %>%
xportr_order(var_spec, "ADSL", "message") %>%
xportr_format(var_spec, "ADSL") %>%
Expand All @@ -194,7 +194,7 @@ To help reduce these repetitive calls, we have created `xportr_metadata()`. A us
adsl %>%
xportr_metadata(var_spec, "ADSL") %>%
xportr_type() %>%
xportr_length() %>%
xportr_length(length_source = "metadata") %>%
xportr_label() %>%
xportr_order() %>%
xportr_format() %>%
Expand Down Expand Up @@ -310,7 +310,13 @@ str(adsl)
```

```{r, echo = TRUE}
adsl_length <- xportr_length(.df = adsl, metadata = var_spec, domain = "ADSL", verbose = "warn")
adsl_length <- xportr_length(
.df = adsl,
metadata = var_spec,
domain = "ADSL",
verbose = "warn",
length_source = "metadata"
)
```

Using `xportr_length()` with `verbose = "warn"` we can apply the length column to all the columns in the dataset. The function detects that two variables, `TRTDUR` and `DCREASCD` are missing from the metadata file. Note that the variables have slight misspellings in the dataset and metadata, which is a great catch! However, lengths are still applied with TRTDUR being given a length of 8 and DCREASCD a length of 200.
Expand All @@ -325,7 +331,13 @@ str(adsl_length)
Just like we did for `xportr_type()`, setting `verbose = "stop"` immediately stops R from processing the lengths. Here the function detects the missing variables and will not apply any lengths to the dataset until corrective action is applied.

```{r, echo = TRUE, error = TRUE}
adsl_length <- xportr_length(.df = adsl, metadata = var_spec, domain = "ADSL", verbose = "stop")
adsl_length <- xportr_length(
.df = adsl,
metadata = var_spec,
domain = "ADSL",
verbose = "stop",
length_source = "metadata"
)
```


Expand Down Expand Up @@ -426,7 +438,7 @@ It is also note worthy that you can set the dataset label using the `xportr_df_l
adsl %>%
xportr_metadata(var_spec, "ADSL") %>%
xportr_type() %>%
xportr_length() %>%
xportr_length(length_source = "metadata") %>%
xportr_label() %>%
xportr_order() %>%
xportr_format() %>%
Expand Down
Loading

0 comments on commit de2beba

Please sign in to comment.