From fafdb7c95a91cdc596b98dbf86b47437b54b4cac Mon Sep 17 00:00:00 2001
From: alikhuseynov <52053807+alikhuseynov@users.noreply.github.com>
Date: Fri, 27 Oct 2023 15:29:30 +0200
Subject: [PATCH 1/3] fixed `formatTxSpots` to read already processed data

---
 R/read.R | 49 +++++++++++++++++++++++++------------------------
 1 file changed, 25 insertions(+), 24 deletions(-)

diff --git a/R/read.R b/R/read.R
index 6e6813f..59e4b57 100644
--- a/R/read.R
+++ b/R/read.R
@@ -455,7 +455,7 @@ readVizgen <- function(data_dir,
         parq_clean <- grep("cell_boundaries|micron_space", parq, value = TRUE)
-        message(">>> ", length(parq), " `.parquet` files exists:",
+        message(">>> ", length(parq), " `.parquet` file(s) exist:",
                 paste0("\n", parq), "\n", ">>> using -> " , parq_clean)
         parq <- parq_clean
         if (any(grepl("cell_boundaries.parquet", parq))) {
@@ -492,7 +492,7 @@
             polys$ZLevel <- 1.5 * (polys$ZIndex + 1L)
             polys <- polys[,c("ID", "ZIndex", "Type", "ZLevel", "geometry")]
         } else {
-            warning("No '.parquet' files present, check input directory -> `data_dir`")
+            warning("No '.parquet' or '.hdf5' files present, check input directory -> `data_dir`")
             polys <- NULL
         }
     } else {
         rlang::check_installed("rhdf5")
@@ -794,40 +794,41 @@ formatTxSpots <- function(file, dest = c("rowGeometry", "colGeometry"),
         if (!dir.exists(dirname(file_out))) dir.create(dirname(file_out))
         file_dir <- file_path_sans_ext(file_out)
-        # File already exists, skip processing
-        # make sure if z = "all" transcripts are read from ./detected_transcripts dir
-        if (file.exists(file_out) && !dir.exists(file_out) && z != "all") {
-            if (!return) return(file_out)
-            out <- sfarrow::st_read_parquet(file_out)
-            rownames(out) <- out$ID
-            return(out)
-        } else if (dir.exists(file_dir)) {
+        # File or dir already exists, skip processing
+        # read transcripts from ./detected_transcripts
+        if (dir.exists(file_dir) && z == "all" && z_option != "3d") {
             # Multiple files
             pattern <- "\\.parquet$"
             # Need to deal with z-planes
             if (z != "all") {
-                pattern <- paste0("_z", paste0(z, collapse = "|"), pattern)
+                pattern <- paste0("_z", paste0(z, collapse = "|"), pattern)
             }
             fns <- list.files(file_dir, pattern, full.names = TRUE)
             if (!length(fns) && length(z) == 1L) {
-                pattern <- "\\.parquet$"
-                fns <- list.files(file_dir, pattern, full.names = TRUE)
+                pattern <- "\\.parquet$"
+                fns <- list.files(file_dir, pattern, full.names = TRUE)
             }
             if (length(fns)) {
-                if (!return) return(file_dir)
-                out <- lapply(fns, sfarrow::st_read_parquet)
-                # add names to a list
-                names(out) <- gsub(".parquet", "",
-                                   x = list.files(file_dir, pattern))
-                out <- lapply(out, function(x) {
-                    # row names are dropped in st_read/write_parquet
-                    rownames(x) <- x$ID
-                    return(x)
-                })
+                if (!return) return(file_dir)
+                out <- lapply(fns, sfarrow::st_read_parquet)
+                # add names to a list
+                names(out) <- gsub(".parquet", "",
+                                   x = list.files(file_dir, pattern))
+                out <- lapply(out, function(x) {
+                    # row names are dropped in st_read/write_parquet
+                    rownames(x) <- x$ID
+                    return(x)
+                })
                 return(out)
+            }
+        # read transcripts from detected_transcripts.parquet
+        } else if (file.exists(file_out) && !dir.exists(file_dir) && z_option != "3d") {
+            if (!return) return(file_out)
+            out <- sfarrow::st_read_parquet(file_out)
+            rownames(out) <- out$ID
+            return(out)
         }
     }
-    }
     if (!is.numeric(z) && z != "all") {
         stop("z must either be numeric or be 'all' indicating all z-planes.")
     }

From 4269e1923928aebda5c687f5e585dbed4ddd1729 Mon Sep 17 00:00:00 2001
From: alikhuseynov <52053807+alikhuseynov@users.noreply.github.com>
Date: Mon, 20 Nov 2023 15:24:51 +0100
Subject: [PATCH 2/3] =?UTF-8?q?added=20`readXenium`=20=F0=9F=8E=89?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 R/read.R | 695 ++++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 537 insertions(+), 158 deletions(-)

diff --git a/R/read.R b/R/read.R
index 59e4b57..d07a309 100644
--- a/R/read.R
+++ b/R/read.R
@@ -78,7 +78,7 @@ read10xVisiumSFE <- function(samples = "",
     sfes <- lapply(seq_along(samples), function(i) {
         o <- read10xVisium(dirs[i], sample_id[i], type, data, images, load = FALSE)
         imgData(o) <- NULL
-        
+
         scalefactors <- fromJSON(file = file.path(
             dirs[i], "spatial", "scalefactors_json.json"
         ))
@@ -118,7 +118,7 @@
             fluo$in_tissue <- NULL
             colData(o) <- cbind(colData(o), fluo[row_inds,])
         }
-        
+
         names_use <- paste("tissue", images, "scalef", sep = "_")
         scale_imgs <- unlist(scalefactors[names_use])
         # Convert to microns and set extent for image
@@ -185,7 +185,7 @@
     # remove empty elements
     geometries <- geometries[inds]
     geometries <- lapply(geometries, function(m) st_polygon(list(t(m))))
-    
+
     # keep non-emplty elements
     df <- st_sf(geometry = sf::st_sfc(geometries),
                 ID = cell_ids[which(inds)],
@@ -216,7 +216,7 @@
         st_cast(st_sfc(x), "POLYGON")
     })
     areas <- lapply(polys_sep, st_area)
-    
+
     if (!is.null(min_area)) {
         which_keep <- lapply(areas, function(x) which(x > min_area))
         multi_inds <- which(lengths(which_keep) > 1L)
@@ -395,7 +395,7 @@ readVizgen <- function(data_dir,
     if ((any(z < 0) || any(z > 6)) && z != "all") {
        stop("z must be beween 0 and 6 (inclusive).")
    }
-    
+
    # Read images----------
    # sanity on image names
    # .."Cellbound" image usually has a digit, eg "Cellbound3"
@@ -403,7 +403,7 @@
    image_regex <- image
    if (any("Cellbound" %in% image)) {
        image_regex[which(image %in% "Cellbound")] <- paste0(grep("Cell", image_regex, value = TRUE), "\\d")
    }
-    
+
    if (z == "all") {
        img_pattern <- paste0("mosaic_(", paste(image_regex, collapse = "|"), ")_z-?\\d+\\.tif$")
    } else {
@@ -422,7 +422,7 @@
    do_flip <- .if_flip_img(img_fn, max_flip)
    if (!length(img_fn)) flip <- "none"
    else if (!any(do_flip) && flip == "image") flip <- "geometry"
-    
+
    # Read cell segmentation-------------
    # Use segmentation output from ".parquet" file
    # check if ".parquet" file is present
@@ -437,11 +437,11 @@
                      pattern = ".parquet$",
                      full.names = TRUE,
                      recursive = TRUE)
-    }
-    
-    # set to use .parquet" file if present
-    use.parquet <- any(length(parq)) & use_cellpose
-    if (use.parquet) {
+    }
+
+    # set to use ".parquet" file if present
+    use.parquet <- any(length(parq)) & use_cellpose
+    if (use.parquet) {
        # sanity check
        parq_sanity <- grepl("cell_boundaries|micron_space", parq)
@@ -470,7 +470,7 @@
        } else if (all(parq_sanity == FALSE)) { parq <- NULL }
    if (!is.null(parq)) {
-        rlang::check_installed("sfarrow")
+        check_installed("sfarrow")
        message(">>> Cell segmentations are found in `.parquet` file",
                if (any(grepl("hdf5s_micron", parq))) {
                    paste0("\n", ">>> processed hdf5 files will be used") })
@@ -491,11 +491,11 @@
        if (!"ZLevel" %in% names(polys)) # For reading what's written after HDF5
            polys$ZLevel <- 1.5 * (polys$ZIndex + 1L)
        polys <- polys[,c("ID", "ZIndex", "Type", "ZLevel", "geometry")]
-    } else {
-        warning("No '.parquet' or '.hdf5' files present, check input directory -> `data_dir`")
-        polys <- NULL }
    } else {
-        rlang::check_installed("rhdf5")
+        warning("No '.parquet' or '.hdf5' files present, check input directory -> `data_dir`")
+        polys <- NULL
+    }
+    } else {
+        check_installed("rhdf5")
        fns <- list.files(file.path(data_dir, "cell_boundaries"), "*.hdf5",
                          full.names = TRUE)
        if (length(fns)) {
@@ -516,19 +516,19 @@
        }
    }
    if (!is.null(polys) && nrow(polys) == 0L)
-        stop("No polygons left after filtering.")
+        stop("No polygons left after filtering.")
    if (flip == "geometry" && !is.null(polys)) {
        # Flip the coordinates
        mat_flip <- matrix(c(1,0,0,-1), ncol = 2)
        st_geometry(polys) <- st_geometry(polys) * mat_flip
    }
-    
+
    # get count data file
    mat_fn <- .check_vizgen_fns(data_dir, "cell_by_gene")
-    
+
    # Column without colname is read as V1
    mat <- fread(mat_fn, colClasses = list(character = 1))
-    
+
    # get spatial metadata file---------
    meta_fn <- .check_vizgen_fns(data_dir, "cell_metadata")
    metadata <- fread(meta_fn, colClasses = list(character = 1))
@@ -536,7 +536,7 @@
        message(">>> ..filtering `cell_metadata` - keep cells with `transcript_count` > 0")
        metadata <- metadata[metadata$transcript_count > 0,]
    }
-    
+
    if (!is.null(polys)) {
        # remove NAs when matching
        metadata <-
@@ -547,7 +547,7 @@
    if (flip == "geometry") {
        metadata$center_y <- -metadata$center_y
    }
-    
+
    # convert counts df to sparse matrix------------
    mat <- mat[match(rownames(metadata), mat[[1]]),] # polys already matched to metadata
    rns <- mat[[1]]
@@ -564,7 +564,7 @@
        metadata <- metadata[inds,]
        polys <- polys[inds,]
    }
-    
+
    # check matching cell ids in polygon geometries,
    # should match the count matrix cell ids
    if (!is.null(polys) && !identical(polys$ID, rns)) {
@@ -573,8 +573,8 @@
        message(">>> filtering geometries to match ",
                length(matched.cells), " cells with counts > 0")
        polys <- polys[matched.cells, , drop = FALSE]
-    }
-    
+    }
+
    if (any(if_exists)) {
        manifest <- fromJSON(file = file.path(data_dir, "images", "manifest.json"))
        extent <- setNames(manifest$bbox_microns, c("xmin", "ymin", "xmax", "ymax"))
@@ -596,7 +596,7 @@
                                    sample_id = sample_id,
                                    spatialCoordsNames = c("center_x", "center_y"),
                                    unit = "micron", BPPARAM = BPPARAM)
-    
+
    # If none of segmentations are present, make bounding boxes
    # NOTE: might take some time to run
    if (use_bboxes && is.null(polys)) {
@@ -612,15 +612,15 @@
        rownames(bboxes) <- rownames(metadata)
        cellSeg(sfe) <- bboxes
    }
-    
+
    if (!is.null(polys)) {
        rownames(polys) <- polys$ID
        polys$ID <- NULL
        cellSeg(sfe) <- polys
    }
-    
+
    if (any(if_exists)) { imgData(sfe) <- img_df }
-    
+
    if (add_molecules) {
        message(">>> Reading transcript coordinates")
        # get molecule coordiantes file
@@ -653,22 +653,45 @@
.mols2geo_split <- function(mols, dest, spatialCoordsNames, gene_col, cell_col,
                            BPPARAM, not_in_cell_id, split_col) {
-    if (!is.null(split_col) && split_col %in% names(mols)) {
-        mols <- split(mols, mols[[split_col]])
-        mols <- lapply(mols, .mols2geo, dest = dest, BPPARAM = BPPARAM,
-                       spatialCoordsNames = spatialCoordsNames,
-                       gene_col = gene_col, cell_col = cell_col,
-                       not_in_cell_id = not_in_cell_id)
-        if (dest == "colGeometry") {
-            # Will be a nested list
-            mols <- unlist(mols, recursive = FALSE)
-            # names will be something like nucleus.Gapdh if split by compartment
-        }
-    } else {
-        mols <- .mols2geo(mols, dest, spatialCoordsNames, gene_col, cell_col,
-                          BPPARAM, not_in_cell_id)
+    if (!is.null(split_col) && split_col %in% names(mols)) {
+        mols <- split(mols, mols[[split_col]])
+        mols <- lapply(mols, .mols2geo, dest = dest, BPPARAM = BPPARAM,
+                       spatialCoordsNames = spatialCoordsNames,
+                       gene_col = gene_col, cell_col = cell_col,
+                       not_in_cell_id = not_in_cell_id)
+        if (dest == "colGeometry") {
+            # Will be a nested list
+            mols <- unlist(mols, recursive = FALSE)
+            # names will be something like nucleus.Gapdh if split by compartment
        }
-    mols
+    } else {
+        mols <- .mols2geo(mols, dest, spatialCoordsNames, gene_col, cell_col,
+                          BPPARAM, not_in_cell_id)
+    }
+    mols
+}
+
+# helper function to convert from raw bytes to character
+.rawToChar_df <- function(input_df, BPPARAM = SerialParam()) {
+    convert_ids <-
+        lapply(input_df, function(x) is(x, "arrow_binary")) |> unlist() |> which()
+    if (length(convert_ids)) {
+        message(">>> Converting columns with raw bytes (i.e. 'arrow_binary') to character")
+        cols_converted <-
+            lapply(seq(convert_ids), function(i) {
+                bplapply(input_df[,convert_ids][[i]], function(x) {
+                    x <- rawToChar(x)
+                }, BPPARAM = BPPARAM)
+            })
+        # replace the converted columns
+        for (i in seq(cols_converted)) {
+            input_df[,convert_ids][[i]] <- unlist(cols_converted[[i]])
+        }
+    }
+    if (!is(input_df, "data.table")) {
+        input_df <- data.table::as.data.table(input_df)
+    }
+    return(input_df)
}

#' Read and process transcript spots geometry for SFE
@@ -773,7 +796,7 @@ formatTxSpots <- function(file, dest = c("rowGeometry", "colGeometry"),
                          spatialCoordsNames = c("global_x", "global_y", "global_z"),
                          gene_col = "gene", cell_col = "cell_id", z = 3L,
                          phred_col = "qv", min_phred = 20, split_col = NULL,
-                          not_in_cell_id = "-1",
+                          not_in_cell_id = c("-1", "UNASSIGNED"),
                          z_option = c("split", "3d"), file_out = NULL,
                          BPPARAM = SerialParam(),
                          return = TRUE) {
@@ -786,8 +809,8 @@
        if (is.null(file_out))
            stop("file_out must be specified for dest = 'colGeometry'.")
    }
-    if (!ext %in% c("csv", "tsv", "txt", "parquet")) {
-        stop("The file must be one of csv, tsv, txt, or parquet")
+    if (!ext %in% c("csv", "gz", "tsv", "txt", "parquet")) {
+        stop("The file must be one of csv, gz, tsv, txt, or parquet")
    }
    if (!is.null(file_out)) {
        file_out <- normalizePath(file_out, mustWork = FALSE)
@@ -820,21 +843,28 @@
                    return(x)
                })
                return(out)
-            }
-        # read transcripts from detected_transcripts.parquet
-        } else if (file.exists(file_out) && !dir.exists(file_dir) && z_option != "3d") {
-            if (!return) return(file_out)
-            out <- sfarrow::st_read_parquet(file_out)
-            rownames(out) <- out$ID
-            return(out)
        }
+        # read transcripts from detected_transcripts.parquet
+        } else if (file.exists(file_out) && !dir.exists(file_dir) && z_option != "3d") {
+            if (!return) return(file_out)
+            out <- sfarrow::st_read_parquet(file_out)
+            rownames(out) <- out$ID
+            return(out)
+        }
+    }
    if (!is.numeric(z) && z != "all") {
        stop("z must either be numeric or be 'all' indicating all z-planes.")
    }
    if (ext == "parquet") {
        check_installed("arrow")
-        mols <- arrow::read_parquet(file) |> data.table::as.data.table()
+        mols <- arrow::read_parquet(file)
+        # convert cols with raw bytes to character
+        # NOTE: can take a while.
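+        # NOTE on performance (editor's note, an assumption about typical use, not
+        #  part of the original patch): .rawToChar_df() calls rawToChar() once per
+        #  value via bplapply(), so a parallel backend, e.g.
+        #  BPPARAM = BiocParallel::MulticoreParam(2), may speed this up on large files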
+        mols <- .rawToChar_df(mols, BPPARAM = BPPARAM)
+        # sanity, convert to data.table
+        if (!is(mols, "data.table")) {
+            mols <- data.table::as.data.table(mols)
+        }
    } else {
        mols <- fread(file)
    }
@@ -851,23 +881,23 @@
    if (use_z) {
        zs <- mols[[spatialCoordsNames[3]]]
        if (is.null(zs)) { # z column not found
-            spatialCoordsNames <- spatialCoordsNames[-3]
-            use_z <- FALSE
+            spatialCoordsNames <- spatialCoordsNames[-3]
+            use_z <- FALSE
        }
        if (all(floor(zs) == zs)) { # integer z values
-            if (z != "all") {
-                if (all(!z %in% unique(zs)))
-                    stop("z plane(s) specified not found.")
-                inds <- mols[[spatialCoordsNames[3]]] %in% z
-                mols <- mols[inds,, drop = FALSE]
-                if (length(z) == 1L) {
-                    spatialCoordsNames <- spatialCoordsNames[-3]
-                    use_z <- FALSE
-                }
+            if (z != "all") {
+                if (all(!z %in% unique(zs)))
+                    stop("z plane(s) specified not found.")
+                inds <- mols[[spatialCoordsNames[3]]] %in% z
+                mols <- mols[inds,, drop = FALSE]
+                if (length(z) == 1L) {
+                    spatialCoordsNames <- spatialCoordsNames[-3]
+                    use_z <- FALSE
                }
+            }
        } else {
-            z <- "all" # Non-integer z values
-            z_option <- "3d"
+            z <- "all" # Non-integer z values
+            z_option <- "3d"
        }
    }
    if (phred_col %in% names(mols)) {
@@ -875,61 +905,61 @@
    }
    message(">>> Converting transcript spots to geometry")
    if (dest == "colGeometry") {
-        if (!length(cell_col) || any(!cell_col %in% names(mols)))
-            stop("Column indicating cell ID not found.")
-        mols <- mols[mols[[cell_col[1]]] != not_in_cell_id,]
-        if (length(cell_col) > 1L) {
-            if (!is.data.table(mols)) ..cell_col <- cell_col
-            cell_col_use <- do.call(paste, c(mols[,..cell_col], sep = "_"))
-            mols$cell_id_ <- cell_col_use
-            mols[,cell_col] <- NULL
-            cell_col <- "cell_id_"
-        }
+        if (!length(cell_col) || any(!cell_col %in% names(mols)))
+            stop("Column indicating cell ID not found.")
+        mols <- mols[!mols[[cell_col[1]]] %in% not_in_cell_id,]
+        if (length(cell_col) > 1L) {
+            if (!is.data.table(mols)) ..cell_col <- cell_col
+            cell_col_use <- do.call(paste, c(mols[,..cell_col], sep = "_"))
+            mols$cell_id_ <- cell_col_use
+            mols[,cell_col] <- NULL
+            cell_col <- "cell_id_"
+        }
    }
    if (z_option == "split" && use_z) {
-        mols <- split(mols, mols[[spatialCoordsNames[3]]])
-        mols <- lapply(mols, .mols2geo_split, dest = dest,
-                       spatialCoordsNames = spatialCoordsNames[1:2],
-                       gene_col = gene_col, cell_col = cell_col, BPPARAM = BPPARAM,
-                       not_in_cell_id = not_in_cell_id, split_col = split_col)
-        # If list of list, i.e. colGeometry, or do split
-        if (!is(mols[[1]], "sf")) {
-            names_use <- lapply(names(mols), function(n) {
-                names_int <- names(mols[[n]])
-                paste0(names_int, "_z", n)
-            }) |> unlist()
-            mols <- unlist(mols, recursive = FALSE)
-            names(mols) <- names_use
-        } else if (!is.null(file_out)) {
-            names(mols) <- paste0(basename(file_dir), "_z", names(mols))
-        } else {
+        mols <- split(mols, mols[[spatialCoordsNames[3]]])
+        mols <- lapply(mols, .mols2geo_split, dest = dest,
+                       spatialCoordsNames = spatialCoordsNames[1:2],
+                       gene_col = gene_col, cell_col = cell_col, BPPARAM = BPPARAM,
+                       not_in_cell_id = not_in_cell_id, split_col = split_col)
+        # If list of list, i.e. colGeometry, or do split
+        if (!is(mols[[1]], "sf")) {
+            names_use <- lapply(names(mols), function(n) {
+                names_int <- names(mols[[n]])
+                paste0(names_int, "_z", n)
+            }) |> unlist()
+            mols <- unlist(mols, recursive = FALSE)
+            names(mols) <- names_use
+        } else if (!is.null(file_out)) {
+            names(mols) <- paste0(basename(file_dir), "_z", names(mols))
+        } else {
            names(mols) <-
-                file_path_sans_ext(file) |>
-                basename() |>
-                paste0("_z", names(mols))
+                file_path_sans_ext(file) |>
+                basename() |>
+                paste0("_z", names(mols))
+        }
    } else {
-        mols <- .mols2geo_split(mols, dest, spatialCoordsNames, gene_col, cell_col,
-                                BPPARAM, not_in_cell_id, split_col)
+        mols <- .mols2geo_split(mols, dest, spatialCoordsNames, gene_col, cell_col,
+                                BPPARAM, not_in_cell_id, split_col)
    }
-    
+
    if (!is.null(file_out)) {
-        message(">>> Writing reformatted transcript spots to disk")
-        if (is(mols, "sf")) {
-            suppressWarnings(sfarrow::st_write_parquet(mols, file_out))
-            if (!return) return(file_out)
-        } else {
-            if (!dir.exists(file_dir)) dir.create(file_dir)
-            suppressWarnings({
-                bplapply(names(mols), function(n) {
-                    name_use <- gsub("/", ".", n)
-                    sfarrow::st_write_parquet(mols[[n]],
-                                              file.path(file_dir,
-                                                        paste0(name_use, ".parquet")))
-                }, BPPARAM = SerialParam(progressbar = TRUE))
-            })
-            if (!return) return(file_dir)
-        }
+        message(">>> Writing reformatted transcript spots to disk")
+        if (is(mols, "sf")) {
+            suppressWarnings(sfarrow::st_write_parquet(mols, file_out))
+            if (!return) return(file_out)
+        } else {
+            if (!dir.exists(file_dir)) dir.create(file_dir)
+            suppressWarnings({
+                bplapply(names(mols), function(n) {
+                    name_use <- gsub("/", ".", n)
+                    sfarrow::st_write_parquet(mols[[n]],
+                                              file.path(file_dir,
+                                                        paste0(name_use, ".parquet")))
+                }, BPPARAM = SerialParam(progressbar = TRUE))
+            })
+            if (!return) return(file_dir)
+        }
    }
    return(mols)
}
@@ -950,9 +980,9 @@ addTxSpots <- function(sfe, file, sample_id = NULL,
                          z_option = z_option, file_out = file_out,
                          BPPARAM = BPPARAM)
    if (is(mols, "sf")) {
-        txSpots(sfe, withDimnames = TRUE) <- mols
+        txSpots(sfe, withDimnames = TRUE) <- mols
    } else if (is.list(mols)) {
-        rowGeometries(sfe) <- c(rowGeometries(sfe), mols)
+        rowGeometries(sfe) <- c(rowGeometries(sfe), mols)
    }
    sfe
}
@@ -983,44 +1013,393 @@
readCosMX <- function(data_dir,
                      split_cell_comps = FALSE,
                      BPPARAM = SerialParam(),
                      file_out = file.path(data_dir, "tx_spots.parquet"), ...)
{
-    data_dir <- normalizePath(data_dir, mustWork = TRUE)
-    fns <- list.files(data_dir, pattern = "\\.csv$", full.names = TRUE)
-    fn_metadata <- grep("metadata", fns, value = TRUE)
-    fn_mat <- grep("exprMat", fns, value = TRUE)
-    fn_polys <- grep("polygons", fns, value = TRUE)
-    
-    meta <- fread(fn_metadata)
-    mat <- fread(fn_mat)
-    polys <- fread(fn_polys)
-    
-    meta$cell_ID <- paste(meta$cell_ID, meta$fov, sep = "_")
-    mat$cell_ID <- paste(mat$cell_ID, mat$fov, sep = "_")
-    polys$cellID <- paste(polys$cellID, polys$fov, sep = "_")
-    
-    mat <- mat[match(meta$cell_ID, mat$cell_ID),]
-    cell_ids <- mat$cell_ID
-    mat <- mat[,3:ncol(mat)] |>
-        as.matrix() |>
-        as("CsparseMatrix") |> Matrix::t()
-    colnames(mat) <- cell_ids
-    message(">>> Constructing cell polygons")
-    polys <- df2sf(polys, spatialCoordsNames = c("x_global_px", "y_global_px"),
-                   geometryType = "POLYGON",
-                   id_col = "cellID", BPPARAM = BPPARAM)
-    polys <- polys[match(meta$cell_ID, polys$ID),]
-    sfe <- SpatialFeatureExperiment(list(counts = mat), colData = meta,
-                                    spatialCoordsNames = c("CenterX_global_px", "CenterY_global_px"),
-                                    unit = "full_res_image_pixel")
-    cellSeg(sfe) <- polys
+    data_dir <- normalizePath(data_dir, mustWork = TRUE)
+    fns <- list.files(data_dir, pattern = "\\.csv$", full.names = TRUE)
+    fn_metadata <- grep("metadata", fns, value = TRUE)
+    fn_mat <- grep("exprMat", fns, value = TRUE)
+    fn_polys <- grep("polygons", fns, value = TRUE)
+
+    meta <- fread(fn_metadata)
+    mat <- fread(fn_mat)
+    polys <- fread(fn_polys)
+
+    meta$cell_ID <- paste(meta$cell_ID, meta$fov, sep = "_")
+    mat$cell_ID <- paste(mat$cell_ID, mat$fov, sep = "_")
+    polys$cellID <- paste(polys$cellID, polys$fov, sep = "_")
+
+    mat <- mat[match(meta$cell_ID, mat$cell_ID),]
+    cell_ids <- mat$cell_ID
+    mat <- mat[,3:ncol(mat)] |>
+        as.matrix() |>
+        as("CsparseMatrix") |> Matrix::t()
+    colnames(mat) <- cell_ids
+    message(">>> Constructing cell polygons")
+    polys <- df2sf(polys, spatialCoordsNames = c("x_global_px", "y_global_px"),
+                   geometryType = "POLYGON",
+                   id_col = "cellID", BPPARAM = BPPARAM)
+    polys <- polys[match(meta$cell_ID, polys$ID),]
+    sfe <- SpatialFeatureExperiment(list(counts = mat), colData = meta,
+                                    spatialCoordsNames = c("CenterX_global_px", "CenterY_global_px"),
+                                    unit = "full_res_image_pixel")
+    cellSeg(sfe) <- polys
+
+    if (add_molecules) {
+        message(">>> Reading transcript coordinates")
+        fn <- grep("tx_file.csv", fns, value = TRUE)
+        split_col <- if (split_cell_comps) "CellComp" else NULL
+        sfe <- addTxSpots(sfe, fn, spatialCoordsNames = c("x_global_px", "y_global_px", "z"),
+                          gene_col = "target", split_col = split_col,
+                          file_out = file_out, z = z,
+                          BPPARAM = BPPARAM, ...)
+    }
+    sfe
+}
-    if (add_molecules) {
-        message(">>> Reading transcript coordinates")
-        fn <- grep("tx_file.csv", fns, value = TRUE)
-        split_col <- if (split_cell_comps) "CellComp" else NULL
-        sfe <- addTxSpots(sfe, fn, spatialCoordsNames = c("x_global_px", "y_global_px", "z"),
-                          gene_col = "target", split_col = split_col,
-                          file_out = file_out, z = z,
-                          BPPARAM = BPPARAM, ...)
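+# NOTE (editor's comment, summarizing the behavior of the helper below):
+# given a keyword such as "cells." or "transcripts", this returns the matching
+# file in `data_dir`, preferring the .csv flavor over .parquet because the
+# .parquet columns may hold raw bytes (see `.rawToChar_df` above)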
+.check_xenium_fns <- function(data_dir, keyword) {
+    fn <-
+        list.files(data_dir,
+                   pattern = keyword,
+                   full.names = TRUE)
+    if (any(grep("\\.csv", fn))) {
+        # prioritize reading .csv data
+        #..since .parquet has cols with raw bytes format
+        fn <- grep(".csv", fn, value = TRUE)
+    } else if (any(grep("\\.parquet", fn))) {
+        fn <- grep(".parquet", fn, value = TRUE)
+    }
+    if (!length(fn)) {
+        stop("No `", keyword, "` file is available")
+    }
+    fn
+}
+
+#' Read 10X Xenium output as SpatialFeatureExperiment
+#'
+#' This function reads the standard 10X Xenium output into an SFE object.
+#' @inheritParams readVizgen
+#' @param image Which image(s) to load; any combination of "morphology_mip" and
+#' "morphology_focus".
+#' @param segmentations Which segmentation outputs to read; any combination of
+#' "cell" and "nucleus".
+#' @param read.image_args A list of arguments to be passed to `RBioFormats::read.image`.
+#' @param image_threshold Integer; image values below this threshold are set to
+#' `NA` (e.g. `30L`), which removes some background artifacts.
+#'
+#' @return An SFE object.
+#' @export
+#'
+#' @importFrom sf st_area st_geometry<- st_as_sf
+#' @importFrom terra rast ext vect
+#' @importFrom BiocParallel bpmapply bplapply
+#' @importFrom rlang check_installed
+#' @importFrom SpatialExperiment imgData<-
+#' @importFrom SummarizedExperiment assay
+#' @importFrom data.table fread merge.data.table rbindlist is.data.table
+#' @importFrom DropletUtils read10xCounts
+#' @examples
+#' # TODO: Example code for Xenium toy data
+#'
+#' # custom example run:
+#' sfe <-
+#'     readXenium(data_dir = data_dir,
+#'                sample_id = "test_xenium",
+#'                image = c("morphology_focus", "morphology_mip"),
+#'                segmentations = c("cell", "nucleus"),
+#'                read.image_args = # list of arguments to be passed to RBioFormats::read.image
+#'                    list("resolution" = 4L,
+#'                         "filter.metadata" = TRUE,
+#'                         "read.metadata" = FALSE,
+#'                         "normalize" = FALSE),
+#'                image_threshold = 30,
+#'                flip = "geometry",
+#'                filter_counts = TRUE,
+#'                add_molecules = TRUE,
+#'                BPPARAM = BiocParallel::MulticoreParam(14, tasks = 80L, force.GC = FALSE, progressbar = TRUE),
+#'                file_out = NULL)
+#'
+readXenium <- function(data_dir,
+                       sample_id = "sample01",
+                       image = c("morphology_focus", "morphology_mip"),
+                       segmentations = c("cell", "nucleus"),
+                       read.image_args = # list of arguments for RBioFormats::read.image
+                           list("resolution" = 4L,
+                                "filter.metadata" = TRUE,
+                                "read.metadata" = FALSE,
+                                "normalize" = FALSE),
+                       image_threshold = NULL,
+                       flip = c("geometry", "image", "none"),
+                       max_flip = "50 MB",
+                       filter_counts = FALSE,
+                       add_molecules = FALSE,
+                       BPPARAM = SerialParam(),
+                       file_out = file.path(data_dir, "transcripts_sf.parquet"), ...) {
+    data_dir <- normalizePath(data_dir, mustWork = TRUE)
+    flip <- match.arg(flip)
+    image <- match.arg(image, several.ok = TRUE)
+
+    # Read images ----
+    # supports 2 images
+    # `morphology_mip.ome.tif` - 2D maximum intensity projection (MIP) image of the tissue morphology image.
+    # `morphology_focus.ome.tif` - 2D autofocus projection image of the tissue morphology image.
+    img_fn <-
+        list.files(data_dir, full.names = TRUE,
+                   pattern = "morphology_")
+    if_exists <- vapply(image, function(img) any(grepl(img, img_fn, ignore.case = TRUE)),
+                        FUN.VALUE = logical(1))
+    if (!all(if_exists)) {
+        warning("The image file(s) for ", "`", paste0(image[!if_exists], collapse = "|"), "`",
+                " don't exist, or have non-standard file name(s).")
+    }
+    if (any(if_exists)) { image <- image[if_exists] }
+
+    # convert OME-TIFF images, if no `.tif` images are present for `terra::rast`
+    img_tif <- grep(".ome.tif", img_fn, invert = TRUE, value = TRUE)
+    # check if images requested are converted already
+    if (!length(img_tif) == 0) {
+        image_match <-
+            match.arg(image, gsub(".tif", "", basename(img_tif)), several.ok = TRUE)
+    } else { image_match <- NaN }
+
+    if (!all(image == image_match)) {
+        # check which remaining image to convert
+        if (any(image == image_match)) {
+            img_fn_add <-
+                grep(image[which(!image == image_match)], img_fn, value = TRUE)
+        } else { img_fn_add <- NULL }
+        if (is.null(img_fn_add)) {
+            message(">>> Images with OME-TIFF format are found:", paste0("\n", basename(img_fn)))
+            if (is.list(read.image_args) && !is.null(read.image_args)) {
+                # add file name args
+                read.image_args <-
+                    lapply(seq(img_fn), function(x) {
+                        read.image_args$file <- img_fn[x]
+                        return(read.image_args) })
+                message(">>> Reading images with RBioFormats, resolution = ", read.image_args[[1]]$resolution)
+                imgs <-
+                    lapply(read.image_args, function(i) do.call(RBioFormats::read.image, i) )
+            } else {
+                message(">>> Reading images with RBioFormats, resolution = ", 4)
+                imgs <-
+                    lapply(seq(img_fn), function(i) {
+                        RBioFormats::read.image(file = img_fn[i],
+                                                resolution = 4,
+                                                filter.metadata = TRUE,
+                                                read.metadata = FALSE,
+                                                normalize = FALSE)
+                    })
+            }
+            # given image_threshold, set some low values to NA
+            if (!is.null(image_threshold)) {
+                # make sure it is integer
+                if (is.numeric(image_threshold)) {
+                    image_threshold <-
+                        floor(image_threshold) |> as.integer()
+                } else {
+                    # set some default value
+                    image_threshold <- 30L
+                }
+                message(">>> Filtering image values with `image_threshold` = ", image_threshold)
+                imgs <-
+                    lapply(imgs, function(x) {
+                        x@.Data[x@.Data < image_threshold] <- NA
+                        return(x)})
+            }
+            # new files
+            img_fn <- gsub("ome.", "", img_fn)
+            message(">>> Saving lower resolution images with `.tif` (non OME-TIFF) format:",
+                    paste0("\n", img_fn))
+            # export as .tif
+            for (x in seq(imgs)) {
+                write.image(imgs[[x]], file = img_fn[x], force = TRUE)}
+            # combine image files
+            # if only 1 image was converted and another one was already present
+            if (!length(img_tif) == 0) {
+                img_fn <- c(img_tif, img_fn)
+            }
+        }
+    } else {
+        img_fn <- img_tif
+        message(">>> Images with `.tif` (non OME-TIFF) format will be used:",
+                paste0("\n", basename(img_fn)))
+    }
+
+    do_flip <- .if_flip_img(img_fn, max_flip)
+    if (!length(img_fn)) {
+        flip <- "none"
+    } else if (!any(do_flip) && flip == "image") { flip <- "geometry" }
+
+    # ----
+
+    # Read cell/nucleus segmentation ----
+    if (!is.null(segmentations)) {
+        segmentations <- sort(segmentations)
+        # get files .parquet or .csv
+        segs <- paste0(sort(segmentations), "_boundaries", collapse = "|")
+        fn_segs <- .check_xenium_fns(data_dir, segs)
+        if (length(fn_segs) == 0) {
+            warning("No segmentation files are found, check input directory -> `data_dir`")
+            polys <- NULL
+        }
+        if (any(grep("\\.csv", fn_segs))) {
+            message(">>> Cell segmentations are found in `.csv` file(s)", "\n",
+                    ">>> Reading ",
+                    if (length(fn_segs) > 1) { paste0(segmentations, collapse = " and ")
+                    } else { segmentations }, " segmentations")
+            # read .csv data
+            polys <- lapply(fn_segs, fread)
+        } else if (any(grep("\\.parquet", fn_segs))) {
+            check_installed("arrow")
+            message(">>> Cell segmentations are found in `.parquet` file(s)", "\n",
+                    ">>> Reading ",
+                    if (length(fn_segs) > 1) { paste0(segmentations, collapse = " and ")
+                    } else { segmentations }, " segmentations")
+            polys <- lapply(fn_segs, arrow::read_parquet)
+            # convert cell ids, from raw bytes to character
+            polys <- lapply(polys, function(x)
+                .rawToChar_df(x, BPPARAM = BPPARAM))
+        }
+        # generate sf dataframe with geometries
+        message(">>> Making POLYGON geometries")
+        polys <-
+            lapply(polys, function(x) {
+                df2sf(x, c("vertex_x", "vertex_y"), id_col = "cell_id",
+                      geometryType = "POLYGON", BPPARAM = BPPARAM) })
+        # add names to polys list
+        names(polys) <- segmentations
+        for (i in seq(polys)) {
+            if (flip == "geometry" && !is.null(polys[[i]])) {
+                # Flip the coordinates
+                mat_flip <- matrix(c(1,0,0,-1), ncol = 2)
+                st_geometry(polys[[i]]) <- st_geometry(polys[[i]]) * mat_flip
+            }
+        }
+        # keep only single segmentation file
+        if (length(polys) == 1) { polys <- polys[[1]] }
+    } else { polys <- NULL }
+
+    # Read metadata ----
+    fn_meta <- .check_xenium_fns(data_dir, "cells.")
+    if (length(fn_meta) == 0) {
+        warning("No metadata files are found, check input directory -> `data_dir`")
+        metadata <- NULL
+    }
+    if (any(grep(".csv", fn_meta))) {
+        message(">>> Reading cell metadata -> `cells.csv`")
+        # read .csv data
+        metadata <- fread(fn_meta)
+    } else if (any(grep(".parquet", fn_meta))) {
+        check_installed("arrow")
+        metadata <- arrow::read_parquet(fn_meta)
+        message(">>> Reading cell metadata -> `cells.parquet`")
+        # convert cell ids, from raw bytes to character
+        metadata <- .rawToChar_df(metadata, BPPARAM = BPPARAM)
+    }
+
+    # Read count matrix or SCE ----
+    # all feature types are read in single count matrix and stored in rowData(mat)$Type
+    #..ie -> 'Negative Control Probe, 'Negative Control Codeword', 'Unassigned Codeword'
+    if (file.exists(file.path(data_dir, "cell_feature_matrix.h5"))) {
+        sce <- read10xCounts(file.path(data_dir, "cell_feature_matrix.h5"))
+    } else if (dir.exists(file.path(data_dir, "cell_feature_matrix"))) {
+        sce <- read10xCounts(file.path(data_dir, "cell_feature_matrix"))
+    } else { stop("No `cell_feature_matrix` files are found, check input directory -> `data_dir`") }
+    mat <- assay(sce, "counts", withDimnames = TRUE)
+    mat <- as(mat, "CsparseMatrix")
+    colnames(mat) <- sce$Barcode
+
+    # Filtering count matrix, metadata and segmentations ----
+    # filtering metadata and count matrix
+    if (any(names(metadata) == "transcript_counts") && filter_counts) {
+        message(">>> ..filtering cell metadata - keep cells with `transcript_counts` > 0")
+        metadata <- metadata[metadata$transcript_counts > 0,]
+        mat <- mat[,match(metadata$cell_id, colnames(mat)) |> stats::na.omit()]
+    } else {
+        # if metadata isn't already filtered
+        if (!"transcript_counts" %in% names(metadata) && filter_counts) {
+            inds <- colSums(mat) > 0
+            mat <- mat[,inds]
+            metadata <- metadata[inds,]
+        }
+    }
+    # filtering segmentations
+    if (!is.null(polys)) {
+        if (is.list(polys)) {
+            for (i in seq(polys)) {
+                # filter geometries
+                matched.cells <- match(colnames(mat), polys[[i]]$ID) |> stats::na.omit()
+                message(">>> filtering ", names(polys)[i],
+                        " geometries to match ",
+                        length(matched.cells), " cells with counts > 0")
+                polys[[i]] <- polys[[i]][matched.cells, , drop = FALSE] }
+        } else if (is(polys, "sf")) {
+            matched.cells <- match(colnames(mat), polys$ID) |> stats::na.omit()
+            message(">>> filtering ", if (!is.null(segmentations) || exists("segmentations")) segmentations,
+                    " geometries to match ", length(matched.cells), " cells with counts > 0")
+            polys <- polys[matched.cells, , drop = FALSE]
+        }
+    }
+    rownames(metadata) <- metadata$cell_id
+    metadata[,1] <- NULL
+    if (flip == "geometry") {
+        metadata$y_centroid <- -metadata$y_centroid
+    }
+
+    # Make SFE object ----
+    sfe <- SpatialFeatureExperiment(assays = list(counts = mat),
+                                    colData = metadata,
+                                    sample_id = sample_id,
+                                    spatialCoordsNames = c("x_centroid", "y_centroid"),
+                                    unit = "micron", BPPARAM = BPPARAM)
+    # add rowData from sce
+    rowData(sfe) <- rowData(sce)
+    # replace gene IDs with gene Symbols
+    if (any(grep("ENSG", rownames(sfe))) &&
+        "Symbol" %in% names(rowData(sce))) {
+        message(">>> Replacing gene IDs with Symbols")
+        rownames(sfe) <- rowData(sce)$Symbol
+    }
+
+    # add segmentation geometries
+    if (!is.null(polys)) {
+        if (is.list(polys)) {
+            colGeometries(sfe) <- c(colGeometries(sfe), polys)
+        } else if (is(polys, "sf")) {
+            rownames(polys) <- polys$ID
+            polys$ID <- NULL
+            cellSeg(sfe) <- polys
+        }
+    }
+
+    # add images
+    if (any(if_exists)) {
+        # use the bounding box of the cell segmentation geometries
+        extent <- colGeometry(sfe, 1) |> st_geometry() |> st_bbox()
+
+        # Set up ImgData
+        img_dfs <- lapply(img_fn, function(fn) {
+            id_use <- sub("\\.tif$", "", basename(fn))
+            .get_imgData(fn, sample_id = sample_id,
+                         image_id = id_use, extent = extent,
+                         flip = (flip == "image"))
+        })
+        img_df <- do.call(rbind, img_dfs)
+        imgData(sfe) <- img_df
+    }
+
+    # TODO: sometimes images don't overlap 100% with segmentations ----
+    # try to register images with cell segmentation centroids
+
+
+    # Read transcript coordinates ----
+    # NOTE z-planes are non-integer, cannot select or use `z` as in `readVizgen`
+    if (add_molecules) {
+        message(">>> Reading transcript coordinates")
+        # get molecule coordinates file
+        fn_mols <- .check_xenium_fns(data_dir, "transcripts")
+        sfe <- addTxSpots(sfe, fn_mols,
+                          sample_id,
+                          gene_col = "feature_name",
+                          spatialCoordsNames = c("x_location", "y_location", "z_location"),
+                          BPPARAM = BPPARAM,
+                          file_out = file_out, ...)
+    }
+    sfe
}

From 8b3fcedd0203c47910c42bd4f3891bf312fade0f Mon Sep 17 00:00:00 2001
From: alikhuseynov <52053807+alikhuseynov@users.noreply.github.com>
Date: Mon, 20 Nov 2023 16:15:07 +0100
Subject: [PATCH 3/3] add `RBioFormats::write.image`

---
 R/read.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/read.R b/R/read.R
index d07a309..9b57ac3 100644
--- a/R/read.R
+++ b/R/read.R
@@ -1209,7 +1209,7 @@ readXenium <- function(data_dir,
                 paste0("\n", img_fn))
     # export as .tif
     for (x in seq(imgs)) {
-        write.image(imgs[[x]], file = img_fn[x], force = TRUE)}
+        RBioFormats::write.image(imgs[[x]], file = img_fn[x], force = TRUE)}
     # combine image files
     # if only 1 image was converted and another one was already present
     if (!length(img_tif) == 0) {
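
Usage sketch for review (editor's addition, not part of the patch series; the
directory path and BPPARAM below are placeholders, argument names follow the
`readXenium` signature added in PATCH 2/3):

    library(SpatialFeatureExperiment)
    library(BiocParallel)
    # hypothetical Xenium output bundle
    xen_dir <- "path/to/xenium_output"
    sfe <- readXenium(
        data_dir = xen_dir,
        sample_id = "sample01",
        image = "morphology_mip",
        segmentations = c("cell", "nucleus"),
        image_threshold = 30L,   # pixel values < 30 set to NA
        flip = "geometry",
        filter_counts = TRUE,    # keep cells with transcript counts > 0
        add_molecules = TRUE,    # also parse transcript spots via addTxSpots()
        BPPARAM = SerialParam()
    )
    # A repeated call with the same default file_out should short-circuit and
    # read the cached `transcripts_sf.parquet` written on the first run, which
    # is the `formatTxSpots` caching behavior fixed in PATCH 1/3.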