Skip to content

Commit

Permalink
Merge pull request hyginn#21 from wonjunetai/master
Browse files Browse the repository at this point in the history
importFilterHypermutators function
  • Loading branch information
hyginn authored Mar 31, 2017
2 parents 7e0e1f7 + 74d2436 commit 945b76b
Show file tree
Hide file tree
Showing 6 changed files with 372 additions and 0 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ export(fastMapUpdate)
export(findUUID)
export(getProvenance)
export(getUUID)
export(importFilterHypermutators)
export(importM.COSMIC)
export(importNet.STRING)
export(isHGNCsymbol)
Expand Down
225 changes: 225 additions & 0 deletions R/importFilterHypermutators.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,225 @@
# importFilterHypermutators.R

#' Filter hypermutators out of rSNV and rCNA datasets
#'
#' \code{importFilterHypermutators} counts, for every sample, the number of SNV
#' records (rows of the rSNV files) and the number of non-zero copy number
#' values (cells of the sample's column in the rCNA files). Samples whose
#' combined count exceeds \code{xS} are treated as hypermutators: their columns
#' are dropped from each rCNA dataset and their rows are dropped from each rSNV
#' dataset. Every filtered dataset is saved in \code{dOut} as
#' \code{filteredHypermutators_<input file name>}.
#'
#' The first three columns of an rCNA dataset are not treated as samples; all
#' remaining columns are. Dashes in \code{Tumor_Sample_Barcode} are replaced by
#' periods before SNV barcodes are matched against rCNA column names.
#'
#' @param rSNVFileIn A vector of local file names of rSNV
#' @param rCNAFileIn A vector of local file names of rCNA
#' @param dOut Directory to store output, defaults to getwd().
#' @param xS Mutation threshold. 400 by default. Samples with strictly more
#'   than \code{xS} mutations (SNV + CNA) are removed.
#' @param silent Controls whether output to console should be suppressed. FALSE
#'   by default.
#' @param writeLog Controls whether writing the result to the global logfile is
#'   enabled. TRUE by default.
#' @param writeDetailedLog Flag for extra details about log. TRUE by default.
#'   When TRUE, per-category sample fractions and the names of all removed
#'   samples are added to the log notes.
#'
#' @return Called for its side effects (filtered .rds files and a log entry).
#'   The value is whatever \code{logEvent} returns when \code{writeLog} is
#'   TRUE, and an invisible NULL otherwise.
#'
#' @examples
#' \dontrun{
#' importFilterHypermutators(rSNVFileIn, rCNAFileIn, dOut, xS)
#' }
#'
#' @export
importFilterHypermutators <- function(rSNVFileIn = c(),
                                      rCNAFileIn = c(),
                                      dOut = getwd(),
                                      xS = 400,
                                      silent = FALSE,
                                      writeLog = TRUE,
                                      writeDetailedLog = TRUE) {

    ## VALIDATE PARAMS ########
    # .checkArgs() reports problems as a non-empty value, which is passed on
    # to stop(). Input files are checked first (SNV, then CNA).
    for (file in c(rSNVFileIn, rCNAFileIn)) {
        r <- .checkArgs(file, like = "FILE_E", checkSize = TRUE)
        if (length(r) > 0) {
            stop(r)
        }
    }

    # make sure that output directory is valid
    r <- .checkArgs(dOut, like = "DIR", checkSize = TRUE)
    if (length(r) > 0) {
        stop(r)
    }

    r <- .checkArgs(xS, like = 1, checkSize = TRUE)
    if (length(r) > 0) {
        stop(r)
    }

    ## COUNT MUTATIONS ########
    # Per-sample counts are accumulated in named numeric vectors, keyed by
    # sample ID, so that several input files can contribute to one sample.
    mergeCounts <- function(acc, new) {
        if (length(new) == 0) {
            return(acc)
        }
        ids <- names(new)
        acc[setdiff(ids, names(acc))] <- 0
        acc[ids] <- acc[ids] + new
        acc
    }

    cnaCounts <- numeric(0)
    snvCounts <- numeric(0)
    nFiles <- length(rCNAFileIn) + length(rSNVFileIn)
    nDone <- 0

    for (file in rCNAFileIn) {
        rCNA <- readRDS(file)
        # Everything after the first three columns is a sample. Negative
        # indexing stays correct when there are fewer than four columns.
        sampleCols <- unique(colnames(rCNA)[-(1:3)])
        nAberrations <- vapply(sampleCols,
                               function(id) {
                                   cn <- as.double(as.vector(rCNA[[id]]))
                                   sum(cn != 0, na.rm = TRUE)
                               },
                               numeric(1))
        cnaCounts <- mergeCounts(cnaCounts, nAberrations)

        nDone <- nDone + 1
        if (!silent) {
            .pBar(nDone, nFiles)
        }
    }

    for (file in rSNVFileIn) {
        rSNV <- readRDS(file)
        # need to substitute dashes with period for consistency in files
        ids <- gsub("-", ".", rSNV$Tumor_Sample_Barcode)
        # one SNV per row: the number of rows per barcode is the SNV count
        snvCounts <- mergeCounts(snvCounts, lengths(split(ids, ids)))

        nDone <- nDone + 1
        if (!silent) {
            .pBar(nDone, nFiles)
        }
    }

    ## ASSESS MUTATIONS AND LOG STATISTICS #######

    # Every distinct sample is counted once, whether it appears in the CNA
    # data, the SNV data, or both.
    allSamples <- as.character(union(names(cnaCounts), names(snvCounts)))
    totalSamples <- length(allSamples)

    alignCounts <- function(counts) {
        out <- counts[allSamples]
        out[is.na(out)] <- 0   # sample absent from this data type
        names(out) <- allSamples
        out
    }
    nCNA <- alignCounts(cnaCounts)
    nSNV <- alignCounts(snvCounts)
    nTotal <- nCNA + nSNV

    removedSamples <- sort(allSamples[nTotal > xS])
    numRemovedSamples <- length(removedSamples)

    numSamplesOnlyCNA <- sum(nCNA > 0 & nSNV == 0)
    numSamplesOnlySNV <- sum(nSNV > 0 & nCNA == 0)
    numSamplesBothSNVAndCNA <- sum(nCNA > 0 & nSNV > 0)
    numSamplesExceedThresh <- numRemovedSamples
    numSamplesNoChange <- sum(nTotal <= xS)

    ## PROCESS FILES AND UPDATE METADATA ##########

    for (file in rCNAFileIn) {
        rCNA <- readRDS(file)

        # samples are columns in rCNA: drop the columns of removed samples
        newRCNA <- rCNA[, !(names(rCNA) %in% removedSamples), drop = FALSE]

        # update metadata
        # NOTE(review): the value returned by getUUID() is discarded here;
        # confirm that this call actually updates the object's UUID.
        getUUID("newRCNA", overwrite = TRUE)
        # ToDo: record updated UUID in log-file
        # ToDo: test that UUID is _actually_ being updated.

        # save new CNAFile with "filteredHypermutators_" prepended to filename
        saveRDS(newRCNA,
                file = file.path(dOut,
                                 paste0("filteredHypermutators_",
                                        basename(file))))
    }

    for (file in rSNVFileIn) {
        rSNV <- readRDS(file)

        # samples are rows in rSNV: drop the rows of removed samples
        isRemoved <- gsub("-", ".", rSNV$Tumor_Sample_Barcode) %in%
            removedSamples
        newRSNV <- rSNV[!isRemoved, , drop = FALSE]

        # update metadata
        # NOTE(review): the value returned by getUUID() is discarded here;
        # confirm that this call actually updates the object's UUID.
        getUUID("newRSNV", overwrite = TRUE)
        # ToDo: record updated UUID in log-file
        # ToDo: test that UUID is _actually_ being updated.

        # save new SNVFile
        saveRDS(newRSNV,
                file = file.path(dOut,
                                 paste0("filteredHypermutators_",
                                        basename(file))))
    }

    ## LOGGING ###############

    if (writeLog) {

        logTitle <- "importFilterHypermutators"

        # Compile function call record
        logCall <- character()
        logCall[1] <- "importFilterHypermutators("
        logCall[2] <- sprintf("rSNVFileIn = (%s), ",
                              paste(rSNVFileIn, collapse = ", "))
        logCall[3] <- sprintf("rCNAFileIn = (%s), ",
                              paste(rCNAFileIn, collapse = ", "))
        logCall[4] <- sprintf("dOut = \"%s\", ", dOut)
        logCall[5] <- sprintf("xS = \"%s\", ", as.character(xS))
        logCall[6] <- sprintf("silent = %s, ", as.character(silent))
        logCall[7] <- sprintf("writeLog = %s)", as.character(writeLog))
        logCall <- paste0(logCall, collapse = "")

        # Record progress information
        logNotes <- character()
        logNotes <- c(logNotes,
                      sprintf("Removed %s of %s samples",
                              numRemovedSamples, totalSamples))

        if (writeDetailedLog) {
            # fraction of all samples; 0 rather than NaN if there are none
            fraction <- function(k) {
                if (totalSamples > 0) k / totalSamples else 0
            }
            logNotes <- c(logNotes,
                          sprintf("%s of samples had both SNV and CNA",
                                  fraction(numSamplesBothSNVAndCNA)),
                          sprintf("%s of samples had only CNA",
                                  fraction(numSamplesOnlyCNA)),
                          sprintf("%s of samples had only SNV",
                                  fraction(numSamplesOnlySNV)),
                          sprintf("%s of samples had no change",
                                  fraction(numSamplesNoChange)),
                          sprintf("%s of samples exceeded threshold of %s",
                                  fraction(numSamplesExceedThresh), xS))

            # Accumulate all removed samples
            logNotes <- c(logNotes,
                          sprintf("%s was removed", removedSamples))
        }

        # send info to log file
        logEvent(eventTitle = logTitle,
                 eventCall = logCall,
                 notes = logNotes)
    }

}

# [END]
38 changes: 38 additions & 0 deletions man/importFilterHypermutators.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Binary file added tests/testthat/devCNA.rds
Binary file not shown.
Binary file added tests/testthat/devSNV.rds
Binary file not shown.
108 changes: 108 additions & 0 deletions tests/testthat/testFilter.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
# testFilter
#
# testthat script for importFilterHypermutators(). It runs the function on the
# devSNV.rds / devCNA.rds fixtures, writes output to the working directory and
# redirects the package logfile to tempdir() for the duration of the tests.
context("Tests for importFilterHypermutators.R")

# ==== BEGIN SETUP AND PREPARE =================================================
OLOG <- as.character(getOption("rete.logfile")) # remember the original logfile
logFileName(fPath = tempdir(), setOption = TRUE) # log into tempdir() instead
logName <- unlist(getOption("rete.logfile"))
if (file.exists(logName)) { file.remove(logName)}

SNVfileName <- "devSNV.rds"
CNAfileName <- "devCNA.rds"

# Paths of the files importFilterHypermutators() writes for the two fixtures.
filteredSNVfileName <- paste0(getwd(), "/filteredHypermutators_",
                              basename(SNVfileName))
filteredCNAfileName <- paste0(getwd(), "/filteredHypermutators_",
                              basename(CNAfileName))

# Run the function on the two fixtures; further arguments (dOut, xS) are
# passed through unchanged, omitted ones keep the function's defaults.
runFilter <- function(...) {
    importFilterHypermutators(rSNVFileIn = SNVfileName,
                              rCNAFileIn = CNAfileName,
                              silent = FALSE,
                              writeLog = TRUE,
                              ...)
}

# Delete the filtered output files if a previous run left them behind.
removeFilteredFiles <- function() {
    for (artifact in c(filteredSNVfileName, filteredCNAfileName)) {
        if (file.exists(artifact)) {
            file.remove(artifact)
        }
    }
}
# ==== END SETUP AND PREPARE ===================================================


# ==== importFilterHypermutators() =============================================

test_that("importFilterHypermutators works correctly on valid input", {
    # valid input must not raise an error
    expect_error(runFilter(dOut = getwd(), xS = 400), NA)

    removeFilteredFiles()
})

test_that("importFilterHypermutators rejects invalid dOut arguments", {
    # NULL is not a directory
    expect_error(runFilter(dOut = NULL, xS = 400))
    # neither is a path that does not exist
    expect_error(runFilter(dOut = "no/such/path", xS = 400))
})

test_that("importFilterHypermutators rejects invalid xS arguments", {
    # NULL threshold
    expect_error(runFilter(xS = NULL))
    # character threshold
    expect_error(runFilter(xS = "400"))
})

test_that("importFilterHypermutators correctly removes hypermutators", {
    testCNA <- readRDS(CNAfileName)
    testSNV <- readRDS(SNVfileName)

    # the fixtures hold a known number of samples / records
    inputSamples <- colnames(testCNA)[4:length(colnames(testCNA))]
    expect_equal(length(inputSamples), 579)
    expect_equal(length(testSNV$Tumor_Sample_Barcode), 15)

    # a low threshold turns many samples into hypermutators
    expect_error(runFilter(xS = 15), NA)

    # inspect what was written
    processedCNA <- readRDS(filteredCNAfileName)
    processedSNV <- readRDS(filteredSNVfileName)

    # 458 samples are expected to be dropped from rCNA (579 - 458 = 121)
    keptSamples <- colnames(processedCNA)[4:length(colnames(processedCNA))]
    expect_equal(length(keptSamples), 121)

    # rSNV is expected to keep all of its records
    expect_equal(length(processedSNV$Tumor_Sample_Barcode), 15)

    # test cleanup
    removeFilteredFiles()
})

# ==== BEGIN TEARDOWN AND RESTORE ==============================================
logName <- unlist(getOption("rete.logfile"))
if (file.exists(logName)) { file.remove(logName)}
options("rete.logfile" = OLOG)

# make sure no output files are left behind
removeFilteredFiles()
# ==== END TEARDOWN AND RESTORE ===============================================

# [END]

0 comments on commit 945b76b

Please sign in to comment.