BorchLab · ncborcherding · Aug 11, 2023 · Aug 2, 2023 · Aug 2, 2023 · Aug 8, 2023
diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml
@@ -20,9 +20,7 @@ jobs:
         config:
           - {os: macos-latest,   r: 'release'}
           - {os: windows-latest, r: 'release'}
-          - {os: ubuntu-latest,   r: 'devel', http-user-agent: 'release'}
           - {os: ubuntu-latest,   r: 'release'}
-          - {os: ubuntu-latest,   r: 'oldrel-1'}
 
     env:
       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}

diff --git a/.gitignore b/.gitignore
@@ -3,3 +3,4 @@
 .Rhistory
 .Rhistory
 local_tests.R
+.RData
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -11,7 +11,8 @@ RoxygenNote: 7.2.3
 biocViews: Software, ImmunoOncology, SingleCell, Classification, Annotation, Sequencing
 Depends: 
 	ggplot2, 
-	R (>= 4.0)
+	R (>= 4.0),
+	Seurat
 Imports: 
     stringdist,
     dplyr,
@@ -33,16 +34,19 @@ Imports:
     tidygraph,
     SeuratObject,
     stats,
-    Seurat
+    Rcpp
 Suggests: 
     knitr,
     rmarkdown,
     BiocStyle,
     circlize,
     scales,
     scater,
+    spelling,
     testthat (>= 3.0.0),
-    spelling
+    vdiffr
 VignetteBuilder: knitr
 Config/testthat/edition: 3
 Language: en-US
+LinkingTo: 
+    Rcpp
diff --git a/NAMESPACE b/NAMESPACE
@@ -33,6 +33,7 @@ export(vizGenes)
 exportClasses(StartracOut)
 import(dplyr)
 import(ggplot2)
+importFrom(Rcpp,sourceCpp)
 importFrom(SeuratObject,Embeddings)
 importFrom(SeuratObject,Idents)
 importFrom(SingleCellExperiment,colData)
@@ -97,3 +98,4 @@ importFrom(utils,combn)
 importFrom(utils,head)
 importFrom(vegan,diversity)
 importFrom(vegan,estimateR)
+useDynLib(scRepertoire, .registration = TRUE)
diff --git a/R/RcppExports.R b/R/RcppExports.R
@@ -0,0 +1,7 @@
+# Generated by using Rcpp::compileAttributes() -> do not edit by hand
+# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
+
+rcpp_hello_world <- function() {
+    invisible(.Call(`_scRepertoire_rcpp_hello_world`))
+}
+
diff --git a/R/combineContigs.R b/R/combineContigs.R
@@ -13,20 +13,25 @@ data1_lines <- c("TCR1", "cdr3", "cdr3_nt")
 data2_lines <- c("TCR2", "cdr3", "cdr3_nt")
 CT_lines <- c("CTgene", "CTnt", "CTaa", "CTstrict")
 
-utils::globalVariables(c("heavy_lines", "light_lines", "l_lines", "k_lines", 
-            "h_lines", "tcr1_lines", "tcr2_lines", "data1_lines", 
-            "data2_lines", "CT_lines"))
+utils::globalVariables(c(
+    "heavy_lines", "light_lines", "l_lines", "k_lines", "h_lines", "tcr1_lines",
+    "tcr2_lines", "data1_lines", "data2_lines", "CT_lines"
+))
 
-#' Combining the list of T Cell Receptor contigs
+#' @title Combining the list of T Cell Receptor contigs
 #'
-#' This function consolidates a list of TCR sequencing results to the level of 
-#' the individual cell barcodes. Using the samples and ID parameters, the 
-#' function will add the strings as prefixes to prevent issues with repeated 
-#' barcodes. The resulting new barcodes will need to match the Seurat or SCE 
-#' object in order to use, \code{\link{combineExpression}}. Several 
-#' levels of filtering exist - remove or filterMulti are parameters that 
-#' control how  the function  deals with barcodes with multiple chains 
-#' recovered.
+#' @description This function consolidates a list of TCR sequencing results to
+#' the level of the individual cell barcodes. Using the samples and ID
+#' parameters, the function will add the strings as prefixes to prevent issues
+#' with repeated barcodes. The resulting new barcodes will need to match the
+#' Seurat or SCE object in order to use \code{\link{combineExpression}}.
+#' Several levels of filtering exist - `removeNA`, `removeMulti`, and
+#' `filterMulti` are parameters that control how the function deals with
+#' barcodes with multiple chains recovered.
+#' 
+#' @details For single-sample TCR sequencing experiments, where the input is
+#' just a single data.frame, the function will add the consolidated information
+#' columns after the existing raw clonotype data.
 #' 
 #' @examples
 #' combineTCR(contig_list, 
@@ -44,6 +49,7 @@ utils::globalVariables(c("heavy_lines", "light_lines", "l_lines", "k_lines",
 #' @import dplyr
 #' @export
 #' @return List of clonotypes for individual cell barcodes
+#' 
 combineTCR <- function(df, 
                        samples = NULL, 
                        ID = NULL, 
@@ -83,10 +89,10 @@ combineTCR <- function(df,
     } else {
       out <- df
     }
-    for (i in seq_along(out)) { 
+    for (i in seq_along(out)) { # ideally the nested code could be in a function for a better development/testing experience
         data2 <- out[[i]]
         data2 <- makeGenes(cellType = "T", data2)
-        unique_df <- unique(data2$barcode)
+        unique_df <- unique(data2$barcode) # could potentially display % here
         Con.df <- data.frame(matrix(NA, length(unique_df), 7))
         colnames(Con.df) <- c("barcode",tcr1_lines, tcr2_lines)
         Con.df$barcode <- unique_df
@@ -104,22 +110,24 @@ combineTCR <- function(df,
         }
         final[[i]] <- data3 
     }
-    names <- NULL
+    name_vector <- character(length(samples))
     for (i in seq_along(samples)) { 
-      if (!is.null(samples) & !is.null(ID)) {
-          c <- paste(samples[i], "_", ID[i], sep="")
-      } else if (!is.null(samples) & is.null(ID)) {
-          c <- paste(samples[i], sep="")
-      }
-        names <- c(names, c)
+        if (!is.null(samples) & !is.null(ID)) {
+            curr <- paste(samples[i], "_", ID[i], sep="")
+        } else if (!is.null(samples) & is.null(ID)) {
+            curr <- paste(samples[i], sep="")
+        }
+        name_vector[i] <- curr
     }
-    names(final) <- names
+    names(final) <- name_vector
     for (i in seq_along(final)){
-        final[[i]]<-final[[i]][!duplicated(final[[i]]$barcode),]
-        final[[i]]<-final[[i]][rowSums(is.na(final[[i]])) < 10, ]}
-    if (removeNA == TRUE) { final <- removingNA(final)}
-    if (removeMulti == TRUE) { final <- removingMulti(final) }
-    return(final) }
+      final[[i]]<-final[[i]][!duplicated(final[[i]]$barcode),]
+      final[[i]]<-final[[i]][rowSums(is.na(final[[i]])) < 10, ]
+    }
+    if (removeNA) { final <- removingNA(final)}
+    if (removeMulti) { final <- removingMulti(final) }
+    final
+}
 
 #' Combining the list of B Cell Receptor contigs
 #'
@@ -133,8 +141,8 @@ combineTCR <- function(df,
 #' and the corresponding v-gene. This index automatically calculates 
 #' the Levenshtein distance between sequences with the same V gene and will 
 #' index sequences with <= 0.15 normalized Levenshtein distance with the same 
-#' ID. After which, clonotype clusters are called using the igraph 
-#' component() function. Clonotype that are clustered across multiple 
+#' ID. After which, clonotype clusters are called using the
+#' `igraph::components()` function. Clonotypes that are clustered across multiple
 #' sequences will then be labeled with "LD" in the CTstrict header.
 #'
 #' @examples
@@ -188,9 +196,9 @@ combineBCR <- function(df,
         }
     }
     if (!is.null(samples)) {
-      out <- modifyBarcodes(df, samples, ID)
+        out <- modifyBarcodes(df, samples, ID)
     } else {
-      out <- df
+        out <- df
     }
     for (i in seq_along(out)) { 
         data2 <- data.frame(out[[i]])
@@ -226,26 +234,27 @@ combineBCR <- function(df,
         if (!is.null(sample) & !is.null(ID)) {
           final[[i]]<- final[[i]][, c("barcode", "sample", "ID", 
               heavy_lines[c(1,2,3)], light_lines[c(1,2,3)], CT_lines)]
-          }
+        }
         else if (!is.null(sample) & is.null(ID)) {
           final[[i]]<- final[[i]][, c("barcode", "sample", 
                     heavy_lines[c(1,2,3)], light_lines[c(1,2,3)], CT_lines)]
         }
     }
     names <- NULL
     for (i in seq_along(samples)) { 
-      if (!is.null(samples) & !is.null(ID)) {
-        c <- paste(samples[i], "_", ID[i], sep="")
-      } else if (!is.null(samples) & is.null(ID)) {
-        c <- paste(samples[i], sep="")
-      }
-      names <- c(names, c)}
+        if (!is.null(samples) & !is.null(ID)) {
+            c <- paste(samples[i], "_", ID[i], sep="")
+        } else if (!is.null(samples) & is.null(ID)) {
+            c <- paste(samples[i], sep="")
+        }
+        names <- c(names, c)
+    }
     names(final) <- names
     for (i in seq_along(final)) {
         final[[i]] <- final[[i]][!duplicated(final[[i]]$barcode),]
         final[[i]]<-final[[i]][rowSums(is.na(final[[i]])) < 10, ]}
-    if (removeNA == TRUE) { final <- removingNA(final) }
-    if (removeMulti == TRUE) { final <- removingMulti(final) }
+    if (removeNA) { final <- removingNA(final) }
+    if (removeMulti) { final <- removingMulti(final) }
     return(final) 
 }
 

diff --git a/R/contig_list.R b/R/contig_list.R
diff --git a/R/data.R b/R/data.R
@@ -0,0 +1,64 @@
+#' A data set of T cell contigs as a list outputted from the
+#' filtered_contig_annotation files.
+#' @docType data
+#' @name contig_list
+#' 
+NULL
+
+#' A Seurat object of 100 single T cells derived
+#' from 3 clear cell renal carcinoma patients.
+#' 
+#' @description The object is compatible with `contig_list` and the TCR
+#' sequencing data can be added with `combineExpression`.
+#' 
+#' @name screp_example
+#' @docType data
+#'
+NULL
+
+#' Processed subset of `contig_list`
+#' 
+#' @description A list of 6 dataframes of T cell contigs outputted from the
+#' `filtered_contig_annotation` files, but subsetted to about 92 valid T cells
+#' which correspond to the same barcodes found in `screp_example`.
+#'
+#' @usage data("combined_mini_contig_list")
+#'
+#' @format An R `list` of `data.frame` objects
+#' 
+#' @docType data
+#'
+#' @seealso \code{\link{contig_list}}
+#'
+"combined_mini_contig_list"
+
+# # Code used for creating the combined_mini_contig_list:
+
+# library(hash); library(usethis)
+# 
+#data("contig_list", "screp_example")
+
+#combined_mini_contig_list <- combineTCR(
+#	contig_list,
+#	samples = c("PY", "PY", "PX", "PX", "PZ","PZ"),
+#	ID = c("P", "T", "P", "T", "P", "T")
+#)
+#all_barcodes <- names(screp_example@active.ident)
+#barcode_set <- hash::hash(all_barcodes, all_barcodes) # a worse version of a set
+#col_names <- names(combined_mini_contig_list[[1]])
+
+#for (i in seq_along(combined_mini_contig_list)) {
+#	curr_df <- setNames(
+#		data.frame(replicate(length(col_names), character(0))), col_names
+#	)
+#	len <- 0
+#	for (j in seq_along(combined_mini_contig_list[[i]][[1]])) {
+#		if (is.null(barcode_set[[combined_mini_contig_list[[i]][[1]][[j]]]])) {
+#			next
+#		}
+#		len <- len + 1
+#		curr_df[len, ] <- combined_mini_contig_list[[i]][j, ]
+#	}
+#	combined_mini_contig_list[[i]] <- curr_df
+#}
+#usethis::use_data(combined_mini_contig_list)
diff --git a/R/processing.R b/R/processing.R
@@ -9,12 +9,16 @@
 #' stripBarcode(contig_list[[1]], column = 1, connector = "_", num_connects = 1)
 #' @export
 #' @return list with the suffixes of the barcodes removed.
-stripBarcode <- function(contigs, column = 1, connector = "_", 
-                            num_connects = 3) {
-    count <- as.data.frame(t(data.frame(strsplit(contigs[,column], 
-                            paste("['", connector, "']", sep="")), 
-                            stringsAsFactors = FALSE)), 
-                            stringsAsFactors = FALSE)[num_connects]
+stripBarcode <- function(contigs, column = 1, connector = "_", num_connects = 3)
+{
+    count <- as.data.frame(
+        t(data.frame(
+            strsplit(contigs[,column], paste("['", connector, "']", sep="")), 
+            stringsAsFactors = FALSE
+        )), 
+        stringsAsFactors = FALSE
+    )[num_connects]
+
     contigs[,column] <- count
     return(contigs)
 }

diff --git a/R/scRepertoire-package.R b/R/scRepertoire-package.R
@@ -0,0 +1,8 @@
+#' @keywords internal
+"_PACKAGE"
+
+## usethis namespace: start
+#' @importFrom Rcpp sourceCpp
+#' @useDynLib scRepertoire, .registration = TRUE
+## usethis namespace: end
+NULL
diff --git a/R/screp_example.R b/R/screp_example.R