From cf4fc8dbc9450629c64155cdba103cb8756263cb Mon Sep 17 00:00:00 2001 From: Dan Knight Date: Fri, 10 Jan 2025 15:49:22 -0800 Subject: [PATCH 1/5] Load RCC header info with readxl --- R/read.xls.RCC.R | 80 +++++++++++++++++++++++++++++------------------- 1 file changed, 49 insertions(+), 31 deletions(-) diff --git a/R/read.xls.RCC.R b/R/read.xls.RCC.R index 74c28a1..cf957d7 100644 --- a/R/read.xls.RCC.R +++ b/R/read.xls.RCC.R @@ -22,59 +22,77 @@ read.xls.RCC <- function(xls, sheet = 1, perl, sample.id.row = "File.Name") { } # check if worksheet exists - sheet.names <- gdata::sheetNames(xls = xls, perl = perl); + sheet.names <- readxl::excel_sheets(xls); cat(paste("\nYou have chosen to import worksheet ", sheet, " named ", sheet.names[sheet], ". Does that sound correct?\n", sep = "")); cat(paste("The other sheet names are: \n")); cat(paste(paste(1:length(sheet.names), sheet.names, sep = ":"), collapse = "\n")); cat("\n\n"); - # define pattern of first line of sample names - pattern.first.line.header <- "File"; - - # call gdata::read.excel and load header with sample names - header <- gdata::read.xls( - xls = xls, - sheet = sheet, - pattern = pattern.first.line.header, - method = "tab", - perl = perl, - header = FALSE, - as.is = TRUE, - row.names = 1, - nrow = 16, - strip.white = TRUE - ); + prep.rcc <- function(path, sheet) { + data <- as.data.frame(readxl::read_excel( + xls, + sheet = sheet, + col_names = FALSE, + col_types = 'text', + trim_ws = TRUE + )); + + data.start.index <- min(which(data[, 1] == 'Reporter Counts')); + header <- data[1:(data.start.index - 1), ]; + data <- data[data.start.index:nrow(data), ]; + + return(list( + header = header, + counts = data + )); + } + rcc <- prep.rcc(xls, sheet); + + header <- rcc$header; if (is.null(header)) { stop("READ.XLS.RCC: There appears to be a problem with RCC file. 
No header found."); } + header <- header[!is.na(header[1]), ]; + rownames(header) <- header[, 1]; + header <- header[, -1]; + rownames(header) <- gsub(" $", "", rownames(header)); rownames(header) <- gsub(" ", ".", rownames(header)); rownames(header) <- tolower(rownames(header)); - if ("id" %in% rownames(header)) {rownames(header)[rownames(header) == "id"] <- "sample.id"} - + + if ('id' %in% rownames(header)) { + rownames(header)[rownames(header) == 'id'] <- 'sample.id'; + } if (!all(c("file.name", "sample.id", "binding.density") %in% rownames(header))) { stop("READ.XLS.RCC: There appears to be a problem with RCC file. Rownames in header are missing File name , Sample id, Binding density"); } # parse the header + header <- header[!rownames(header) %in% c('file.attributes', 'lane.attributes'), ]; + header['sample.date', ] <- format( + as.Date( + as.integer(header['sample.date', ]), + origin = '1899-12-30' + ), + format = '%Y/%m/%d' + ); + header['binding.density', ] <- as.numeric(header['binding.density', ]); + + prep.file.versions <- function(file.versions) { + result <- as.character(file.versions) + numeric.versions <- as.numeric(result); + result[!is.na(numeric.versions)] <- numeric.versions[!is.na(numeric.versions)]; + return(as.character(result)); + } + header['file.version', ] <- prep.file.versions(header['file.version', ]); + header <- header[, -c(1,2)]; - # drop missing rows - header <- header[!rownames(header) %in% c('file.attributes','lane.attributes'),]; - # drop missing columns - header <- header[,-c(1,2)]; - # drop trailing columns - header <- header[,!is.na(header[1,]) & !is.na(header[2,])]; - # get sample IDs sample.ids <- header[rownames(header) %in% tolower(sample.id.row),]; - - # change spaces to dots in sample names sample.ids <- gsub(" ", ".", sample.ids); - sample.ids <- gsub("^([0-9])", "X\\1" ,sample.ids); - - # add sample names + sample.ids <- gsub("^([0-9])", "X\\1", sample.ids); colnames(header) <- sample.ids; # define pattern of first line 
of count data From 5bcfc0e0f6c7bf869731f588100402b8392f26c3 Mon Sep 17 00:00:00 2001 From: Dan Knight Date: Fri, 10 Jan 2025 15:54:28 -0800 Subject: [PATCH 2/5] Restructure RCC Excel loading implementation --- R/read.xls.RCC.R | 53 +++++++++++++++++++++++------------------------- 1 file changed, 25 insertions(+), 28 deletions(-) diff --git a/R/read.xls.RCC.R b/R/read.xls.RCC.R index cf957d7..60f07a7 100644 --- a/R/read.xls.RCC.R +++ b/R/read.xls.RCC.R @@ -43,43 +43,41 @@ read.xls.RCC <- function(xls, sheet = 1, perl, sample.id.row = "File.Name") { return(list( header = header, - counts = data + x = data )); } rcc <- prep.rcc(xls, sheet); - - header <- rcc$header; - if (is.null(header)) { - stop("READ.XLS.RCC: There appears to be a problem with RCC file. No header found."); + if (is.null(rcc$header)) { + stop("READ.XLS.RCC: There appears to be a problem with RCC file. No rcc$header found."); } - header <- header[!is.na(header[1]), ]; - rownames(header) <- header[, 1]; - header <- header[, -1]; + rcc$header <- rcc$header[!is.na(rcc$header[1]), ]; + rownames(rcc$header) <- rcc$header[, 1]; + rcc$header <- rcc$header[, -1]; - rownames(header) <- gsub(" $", "", rownames(header)); - rownames(header) <- gsub(" ", ".", rownames(header)); - rownames(header) <- tolower(rownames(header)); + rownames(rcc$header) <- gsub(" $", "", rownames(rcc$header)); + rownames(rcc$header) <- gsub(" ", ".", rownames(rcc$header)); + rownames(rcc$header) <- tolower(rownames(rcc$header)); - if ('id' %in% rownames(header)) { - rownames(header)[rownames(header) == 'id'] <- 'sample.id'; + if ('id' %in% rownames(rcc$header)) { + rownames(rcc$header)[rownames(rcc$header) == 'id'] <- 'sample.id'; } - if (!all(c("file.name", "sample.id", "binding.density") %in% rownames(header))) { - stop("READ.XLS.RCC: There appears to be a problem with RCC file. 
Rownames in header are missing File name , Sample id, Binding density"); + if (!all(c("file.name", "sample.id", "binding.density") %in% rownames(rcc$header))) { + stop("READ.XLS.RCC: There appears to be a problem with RCC file. Rownames in rcc$header are missing File name , Sample id, Binding density"); } - # parse the header - header <- header[!rownames(header) %in% c('file.attributes', 'lane.attributes'), ]; - header['sample.date', ] <- format( + # parse the rcc$header + rcc$header <- rcc$header[!rownames(rcc$header) %in% c('file.attributes', 'lane.attributes'), ]; + rcc$header['sample.date', ] <- format( as.Date( - as.integer(header['sample.date', ]), + as.integer(rcc$header['sample.date', ]), origin = '1899-12-30' ), format = '%Y/%m/%d' ); - header['binding.density', ] <- as.numeric(header['binding.density', ]); + rcc$header['binding.density', ] <- as.numeric(rcc$header['binding.density', ]); prep.file.versions <- function(file.versions) { result <- as.character(file.versions) @@ -87,13 +85,13 @@ read.xls.RCC <- function(xls, sheet = 1, perl, sample.id.row = "File.Name") { result[!is.na(numeric.versions)] <- numeric.versions[!is.na(numeric.versions)]; return(as.character(result)); } - header['file.version', ] <- prep.file.versions(header['file.version', ]); - header <- header[, -c(1,2)]; + rcc$header['file.version', ] <- prep.file.versions(rcc$header['file.version', ]); + rcc$header <- rcc$header[, -c(1,2)]; - sample.ids <- header[rownames(header) %in% tolower(sample.id.row),]; + sample.ids <- rcc$header[rownames(rcc$header) %in% tolower(sample.id.row),]; sample.ids <- gsub(" ", ".", sample.ids); sample.ids <- gsub("^([0-9])", "X\\1", sample.ids); - colnames(header) <- sample.ids; + colnames(rcc$header) <- sample.ids; # define pattern of first line of count data pattern.first.line.counts <- "Code"; @@ -111,7 +109,7 @@ read.xls.RCC <- function(xls, sheet = 1, perl, sample.id.row = "File.Name") { ); if (is.null(x)) { - stop("READ.XLS.RCC: There appears to be a 
problem with RCC file. Likely couldnt find the count header specifically `Code Class`"); + stop("READ.XLS.RCC: There appears to be a problem with RCC file. Likely couldnt find the count rcc$header specifically `Code Class`"); } # drop any trailing columns @@ -146,7 +144,6 @@ read.xls.RCC <- function(xls, sheet = 1, perl, sample.id.row = "File.Name") { cat(paste("\n\nThere were", nrow(x), "genes imported with the following Code Class breakdown:")); print(table(x$Code.Class)); - x <- list(x = x, header = header); - class(x) <- 'NanoString'; - return(x); + class(rcc) <- 'NanoString'; + return(rcc); } From e94ed5803fdb866cb4888a6ed1a7cafc47eefa7e Mon Sep 17 00:00:00 2001 From: Dan Knight Date: Fri, 10 Jan 2025 16:27:07 -0800 Subject: [PATCH 3/5] Replace gdata Excel dependency with readxl --- DESCRIPTION | 8 +++-- NAMESPACE | 2 +- R/read.xls.RCC.R | 40 +++++++---------------- metadata.yaml | 2 +- vignettes/NanoStringNorm_Introduction.Rnw | 4 +-- 5 files changed, 21 insertions(+), 35 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 9235802..474b39b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -7,8 +7,12 @@ Authors@R: c( person(c("Daryl", "M."), "Waggott", role = "aut"), person("Paul", "Boutros", email = "PBoutros@mednet.ucla.edu", role = "cre"), person("Dan", "Knight", role = "ctb")) -Depends: R (>= 2.14.0), gdata (>= 2.8.2), XML (>= 3.98-1.5) -Imports: methods +Depends: + R (>= 2.14.0), + XML (>= 3.98-1.5) +Imports: + methods, + readxl Suggests: googleVis (>= 0.2.14), lme4, RUnit (>= 0.4.26) Description: A set of tools for normalizing, diagnostics and visualization of NanoString nCounter data. 
License: GPL-2 diff --git a/NAMESPACE b/NAMESPACE index 586431d..ad89d2c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -21,7 +21,7 @@ importFrom("utils", "download.file", "packageDescription", "read.table") importFrom("utils", "read.csv") import( - 'gdata', + 'readxl', 'XML' ) diff --git a/R/read.xls.RCC.R b/R/read.xls.RCC.R index 60f07a7..d7eca17 100644 --- a/R/read.xls.RCC.R +++ b/R/read.xls.RCC.R @@ -93,47 +93,29 @@ read.xls.RCC <- function(xls, sheet = 1, perl, sample.id.row = "File.Name") { sample.ids <- gsub("^([0-9])", "X\\1", sample.ids); colnames(rcc$header) <- sample.ids; - # define pattern of first line of count data - pattern.first.line.counts <- "Code"; - - # call gdata::read.excel and load counts - x <- gdata::read.xls( - xls = xls, - sheet = sheet, - pattern = pattern.first.line.counts, - method = "tab", - perl = perl, - header = TRUE, - strip.white = TRUE, - as.is = TRUE - ); - - if (is.null(x)) { + if (is.null(rcc$x)) { stop("READ.XLS.RCC: There appears to be a problem with RCC file. 
Likely couldnt find the count rcc$header specifically `Code Class`"); } - # drop any trailing columns - x <- x[,1:(3+length(sample.ids))]; + colnames(rcc$x) <- rcc$x[2, ]; + rcc$x <- rcc$x[-c(1:2), 1:(3 + length(sample.ids))]; # drop rows that have a missing code class or gene name - rows.with.missing.anno <- (x[,1] == '' | x[,2] == ''); + rows.with.missing.anno <- (rcc$x[, 1] == '' | rcc$x[, 2] == ''); if (any(rows.with.missing.anno)) { + rcc$x <- rcc$x[!rows.with.missing.anno,]; cat(paste("The following row(s)", paste(which(rows.with.missing.anno), collapse = ", "), "have been dropped due to missing annotation.\n\t You may want to double check the excel file.\n\n")); } - if (any(rows.with.missing.anno)) { - x <- x[!rows.with.missing.anno,]; - } - - # add sample names - colnames(x) <- c(colnames(x)[1:3], sample.ids); + colnames(rcc$x) <- gsub(" ", ".", colnames(rcc$x)); + colnames(rcc$x) <- c(colnames(rcc$x)[1:3], sample.ids); # print summary of samples cat(paste("There were", length(sample.ids), "samples imported. \nNote that spaces in sample names will be replaced by dots. 
\n")); - if ( length(sample.ids) > 5) { + if (length(sample.ids) > 5) { cat("The first and last 3 sample names found in the dataset are:\n"); - cat(paste(c(sample.ids[1:3],rev(sample.ids)[1:3]))); + cat(paste(c(sample.ids[1:3], rev(sample.ids)[1:3]))); } else { cat("The sample names found in the dataset are:\n"); @@ -141,8 +123,8 @@ read.xls.RCC <- function(xls, sheet = 1, perl, sample.id.row = "File.Name") { } # print summary of genes - cat(paste("\n\nThere were", nrow(x), "genes imported with the following Code Class breakdown:")); - print(table(x$Code.Class)); + cat(paste("\n\nThere were", nrow(rcc$x), "genes imported with the following Code Class breakdown:")); + print(table(rcc$x[, 'Code.Class'])); class(rcc) <- 'NanoString'; return(rcc); diff --git a/metadata.yaml b/metadata.yaml index c760d94..e2a18b3 100644 --- a/metadata.yaml +++ b/metadata.yaml @@ -8,7 +8,7 @@ Contributors: - Julie Livingstone Languages: R (>= 2.14.0) Dependencies: - - gdata (>= 2.8.2) + - readxl - XML (>= 3.98-1.5) - googleVis (>= 0.2.14) - lme4 diff --git a/vignettes/NanoStringNorm_Introduction.Rnw b/vignettes/NanoStringNorm_Introduction.Rnw index a63705f..b46fbb0 100644 --- a/vignettes/NanoStringNorm_Introduction.Rnw +++ b/vignettes/NanoStringNorm_Introduction.Rnw @@ -1,5 +1,5 @@ %\VignetteIndexEntry{Introduction to NanoStringNorm} -%\VignetteDepends{googleVis,gdata} +%\VignetteDepends{googleVis,readxl} %\VignetteKeywords{Expression Analysis} %\VignettePackage{NanoStringNorm} @@ -95,7 +95,7 @@ dev.off(); The input data usually comes in the form of a structured Excel spreadsheet. You can export the raw count data from Excel as a delimited text file for use with R. Start by opening the \emph{raw} worksheet in a blank Excel page for editing. Copy the count data (row 23) for each sample including the first 3 annotation columns (Code.Class, Name and Accession) to a separate worksheet or text file. Don't forget to add the sample IDs (row 5), and remove any incomplete rows or columns. 
The resulting tabular data can be saved as a tab delimited file for import into R. \\ \\ -Alternatively, you can import data directly from xls format into R using the function \Rfunction{read.xls.RCC} based on core functionality in the \Rpackage{gdata} package. +Alternatively, you can import data directly from xls format into R using the function \Rfunction{read.xls.RCC} based on core functionality in the \Rpackage{readxl} package. <>= # directly import the nCounter output From 1a2c0ca4f2b3a4e10aff18b1dc911e47b8f32f14 Mon Sep 17 00:00:00 2001 From: Dan Knight Date: Fri, 10 Jan 2025 16:28:49 -0800 Subject: [PATCH 4/5] Update changelog --- DESCRIPTION | 4 ++-- NEWS | 6 ++++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 474b39b..0ae8dec 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: NanoStringNorm Type: Package Title: Normalize NanoString miRNA and mRNA Data -Version: 2.0.0 -Date: 2023-03-21 +Version: 3.0.0 +Date: 2025-01-10 Authors@R: c( person(c("Daryl", "M."), "Waggott", role = "aut"), person("Paul", "Boutros", email = "PBoutros@mednet.ucla.edu", role = "cre"), diff --git a/NEWS b/NEWS index 9286f42..026ba19 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,9 @@ +NanoStringNorm 3.0.0 2025-01-10 +---------------------------------------------------------------- + +UPDATED +- Replaced gdata dependency with readxl for loading Excel files because gdata's Excel support is deprecated. 
+ NanoStringNorm 2.0.0 2023-03-21 ---------------------------------------------------------------- REMOVED From b120f154649de0f0b82cafd501970c5a75270671 Mon Sep 17 00:00:00 2001 From: Dan Knight Date: Fri, 10 Jan 2025 16:38:46 -0800 Subject: [PATCH 5/5] Update readxl dependency in GitHub action --- .github/workflows/R-CMD-check.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 99b8f41..e925ed1 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -17,6 +17,6 @@ jobs: with: path: NanoStringNorm - run: apt-get update && apt-get install -y libxml2-dev cmake - - run: R -e "install.packages(c('gdata', 'XML', 'googleVis', 'lme4', 'RUnit'))" + - run: R -e "install.packages(c('readxl', 'XML', 'googleVis', 'lme4', 'RUnit'))" - run: R CMD build --compact-vignettes="gs+qpdf" NanoStringNorm - run: R CMD check --as-cran --run-donttest NanoStringNorm_*.tar.gz