diff --git a/DESCRIPTION b/DESCRIPTION
index 0fa9a949..3424dc57 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -82,4 +82,7 @@ Imports:
     XVector,
     yaml
 Suggests:
-    knitr
+    knitr,
+    testthat,
+    mockery,
+    readr
diff --git a/R/cleanup.R b/R/cleanup.R
index 4fe074ee..9bc646b6 100755
--- a/R/cleanup.R
+++ b/R/cleanup.R
@@ -160,9 +160,10 @@ removeEmptyRows <- function(prot, by_column = "DomArch") {
     prot <- prot %>%
         as_tibble() %>%
         # filter(grepl("\\*", {{by_column}})) %>% # Keep only rows with Query (*) for GenContext
-        filter(!grepl("^-$", {{ by_column }})) %>% # remove "-"
-        filter(!grepl("^NA$", {{ by_column }})) %>% # remove "NA"
-        filter(!grepl("^$", {{ by_column }})) # remove empty rows
+        filter(!grepl("^-$", .data[[by_column]])) %>% # remove "-" (by_column is a string, so use the .data pronoun)
+        filter(!grepl("^NA$", .data[[by_column]])) %>% # remove the literal string "NA"
+        filter(!grepl("^$", .data[[by_column]])) %>% # remove empty rows
+        filter(!grepl("^\\s*$", .data[[by_column]])) # remove rows with only spaces
 
     return(prot)
 }
diff --git a/R/fa2domain.R b/R/fa2domain.R
index 6dc6f622..55517c85 100644
--- a/R/fa2domain.R
+++ b/R/fa2domain.R
@@ -22,6 +22,16 @@ runIPRScan <- function(
     # destPartition = "LocalQ",
     # destQoS = "shortjobs"
 ) {
+    # Validate inputs: each path must be a single, non-NA, non-empty string
+    if (!isTRUE(nzchar(filepath_fasta, keepNA = TRUE))) {
+        stop("filepath_fasta cannot be NULL or empty")
+    }
+    if (!isTRUE(nzchar(filepath_out, keepNA = TRUE))) {
+        stop("filepath_out cannot be NULL or empty")
+    }
+    if (!all(appl %in% c("Pfam", "Gene3D"))) {
+        stop("Invalid application specified")
+    }
     # construct interproscan command
     cmd_iprscan <- stringr::str_glue(
         "iprscan -i {filepath_fasta} -b {filepath_out} --cpu 4 -f TSV ",
@@ -283,7 +293,7 @@ getDomainsFromFA <- function(
         if (verbose) {
             msg <- stringr::str_glue(
                 "accession number: {header} had no domains for the ",
-                "selected analyes: {paste(analysis, collapse = ',')}\n"
+                "selected analyses: {paste(analysis, collapse = ',')}\n"
             )
             warning(msg)
         }
diff --git a/man/acc2FA.Rd b/man/acc2FA.Rd
new file mode 100644
index 00000000..1a7a27e9
--- /dev/null
+++ b/man/acc2FA.Rd
@@ -0,0 +1,31 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/CHANGED-pre-msa-tree.R
+\name{acc2FA}
+\alias{acc2FA}
+\title{acc2FA converts protein accession numbers to a fasta format.}
+\usage{
+acc2FA(accessions, outpath, plan = "sequential")
+}
+\arguments{
+\item{accessions}{Character vector containing protein accession numbers to generate fasta sequences for.
+Function may not work for vectors of length > 10,000}
+
+\item{outpath}{\link{str} Location where fasta file should be written to.}
+
+\item{plan}{}
+}
+\description{
+Resulting fasta file is written to the outpath.
+}
+\examples{
+\dontrun{
+acc2FA(accessions = c("ACU53894.1", "APJ14606.1", "ABK37082.1"), outpath = "my_proteins.fasta")
+Entrez:accessions <- rep("ANY95992.1", 201) |> acc2FA(outpath = "entrez.fa")
+EBI:accessions <- c("P12345", "Q9UHC1", "O15530", "Q14624", "P0DTD1") |> acc2FA(outpath = "ebi.fa")
+}
+}
+\author{
+Samuel Chen, Janani Ravi
+}
+\keyword{accnum,}
+\keyword{fasta}
diff --git a/tests/testthat.R b/tests/testthat.R
new file mode 100644
index 00000000..2a5dc27b
--- /dev/null
+++ b/tests/testthat.R
@@ -0,0 +1,4 @@
+library(testthat)
+library(MolEvolvR)
+
+test_check("MolEvolvR")
diff --git a/tests/testthat/test-cleanup.R b/tests/testthat/test-cleanup.R
new file mode 100644
index 00000000..c465ed57
--- /dev/null
+++ b/tests/testthat/test-cleanup.R
@@ -0,0 +1,585 @@
+context("cleanup")
+test_that("cleanup", {
+    # cleanup
+    # Test with normal string
+    expect_equal(cleanString("Hello World"), "Hello_World")
+
+    # Test with multiple spaces
+    expect_equal(cleanString("Hello World"), "Hello_World")
+
+    # Test with special characters
+    expect_equal(cleanString("Hello@World!"), "HelloWorld")
+
+    # Test with alphanumeric characters and underscore
+    expect_equal(cleanString("Test_String 123"), "Test_String_123")
+
+    # Test with dots
+    expect_equal(cleanString("Version 1.0.0"), "Version_1.0.0")
+
+    # Test with empty string
+    expect_equal(cleanString(""), "")
+
+    # Test with only spaces
+    expect_equal(cleanString(" "), "_")
+
+    # Test with non-alphanumeric characters
+    expect_equal(cleanString("~!@#$%^&*()"), "")
+
+    # Test with mixed characters
+    expect_equal(cleanString("Hello !@#$% World"), "Hello__World")
+
+    # Test with trailing and leading spaces
+    expect_equal(cleanString(" Test "), "_Test_")
+
+    # Test with numbers and underscores
+    expect_equal(cleanString("Name_123 Test"), "Name_123_Test")
+
+    # extractAccNum
+    # Test with a string containing a pipe character
+    expect_equal(extractAccNum("ID|ABC1234 Some Description"), "ABC1234")
+
+    # Test with a string containing multiple spaces
+    expect_equal(extractAccNum("ID|DEF5678 More Info"), "DEF5678")
+
+    # Test with a string without a pipe character
+    expect_equal(extractAccNum("ABC9876 Some Description"), "ABC9876")
+
+    # Test with a string that has leading spaces
+    expect_equal(extractAccNum(" ID|GHI1357 Description"), "GHI1357")
+
+    # Test with a string that has trailing spaces
+    expect_equal(extractAccNum("ID|JKL2468 Description "), "JKL2468")
+
+    # Test with only an accession number
+    expect_equal(extractAccNum("XYZ1234"), "XYZ1234")
+
+    # Test with a string with only spaces
+    expect_equal(extractAccNum(" "), "")
+
+    # Test with a string that contains special characters
+    expect_equal(extractAccNum("ID|MNO5678_Extra Info"), "MNO5678_Extra")
+
+    # ensureUniqAccNum
+    # Test with unique accession numbers
+    accnums1 <- c("ABC1234", "DEF5678", "GHI9012")
+    expect_equal(ensureUniqAccNum(accnums1), c("ABC1234_1", "DEF5678_1",
+                                               "GHI9012_1"))
+
+    # Test with duplicate accession numbers
+    accnums2 <- c("ABC1234", "ABC1234", "DEF5678", "DEF5678", "GHI9012")
+    expect_equal(ensureUniqAccNum(accnums2),
+                 c("ABC1234_1", "ABC1234_2", "DEF5678_1",
+                   "DEF5678_2", "GHI9012_1"))
+
+    # Test with all identical accession numbers
+    accnums3 <- c("XYZ9999", "XYZ9999", "XYZ9999")
+    expect_equal(ensureUniqAccNum(accnums3),
+                 c("XYZ9999_1", "XYZ9999_2", "XYZ9999_3"))
+
+    # Test with empty input
+    accnums4 <- character(0)
+    expect_equal(ensureUniqAccNum(accnums4), character(0))
+
+    # Test with a single accession number
+    accnums5 <- c("SINGLE_ACC")
+    expect_equal(ensureUniqAccNum(accnums5), c("SINGLE_ACC_1"))
+
+    # Test with mixed duplicate and unique accession numbers
+    accnums6 <- c("A", "B", "A", "C", "B", "B")
+    expect_equal(ensureUniqAccNum(accnums6),
+                 c("A_1", "A_2", "B_1", "B_2", "B_3", "C_1"))
+
+    # cleanFAHeaders
+    fasta_sample <- c(
+        ">sp|P12345|ProteinA Description 1",
+        ">sp|P67890|ProteinB Description 2",
+        ">sp|P12345|ProteinA Description 3",
+        ">sp|P67890|ProteinB Description 4"
+    )
+    names(fasta_sample) <- fasta_sample # Set names to headers
+
+    # Run the function
+    cleaned_fasta <- cleanFAHeaders(fasta_sample)
+
+    # Expected headers after processing
+    expected_headers <- c("P12345_1", "P12345_2", "P67890_1", "P67890_2")
+
+    # Check if the names of cleaned_fasta match expected_headers
+    expect_equal(names(cleaned_fasta), expected_headers)
+
+    # Check that the contents of cleaned_fasta remain unchanged
+    expect_equal(as.vector(cleaned_fasta), as.vector(fasta_sample))
+
+    fasta_unique <- c(
+        ">sp|P12345|UniqueProteinA",
+        ">sp|P67890|UniqueProteinB"
+    )
+    names(fasta_unique) <- fasta_unique
+
+    cleaned_unique_fasta <- cleanFAHeaders(fasta_unique)
+
+    expected_unique_headers <- c("P12345_1", "P67890_1")
+    expect_equal(names(cleaned_unique_fasta), expected_unique_headers)
+
+    # Sample input data
+    prot_data <- tibble::tibble(
+        DomArch = c("ABC123", "-", "NA", "", "XYZ789", " "),
+        other_col = c(1, 2, 3, 4, 5, 6)
+    )
+
+    # Expected output after removing rows
+    expected_output <- tibble::tibble(
+        DomArch = c("ABC123", "XYZ789"),
+        other_col = c(1, 5)
+    )
+
+    # Run the function
+    result <- removeEmptyRows(prot_data, by_column = "DomArch")
+
+    # Check if the result matches the expected output
+    expect_equal(result, expected_output)
+
+    # Case 1: No rows removed
+    prot_data_no_removal <- tibble::tibble(
+        DomArch = c("ABC123", "XYZ789"),
+        other_col = c(1, 2)
+    )
+    expect_equal(removeEmptyRows(prot_data_no_removal), prot_data_no_removal)
+
+    # Case 2: All rows removed
+    prot_data_all_removed <- tibble::tibble(
+        DomArch = c("-", "NA", "", " "),
+        other_col = c(1, 2, 3, 4)
+    )
+    expect_equal(removeEmptyRows(prot_data_all_removed),
+                 tibble::tibble(DomArch = character(0), other_col = numeric(0)))
+
+    # Case 3: Empty data frame
+    prot_data_empty <- tibble::tibble(DomArch = character(0),
+                                      other_col = numeric(0))
+    expect_equal(removeEmptyRows(prot_data_empty), prot_data_empty)
+
+    # Input data with repeated domains (TODO(review): never asserted on; overwritten below)
+    prot_data <- tibble::tibble(
+        DomArch = c("A B B C", "X X Y", "P P P Q", "R R R S"),
+        other_col = c(1, 2, 3, 4)
+    )
+
+    # Input data with repeated domains (TODO(review): never asserted on; overwritten below)
+    prot_data <- tibble::tibble(
+        DomArch = c("A A A", "B B", "C C C D D"),
+        other_col = c(1, 2, 3)
+    )
+
+    # Input data with repeated and single question marks (TODO(review): never asserted on; overwritten below)
+    prot_data <- tibble::tibble(
+        GenContext = c("A ? ? B", "? ?", "C ?? C", "D ? > ? D"),
+        other_col = c(1, 2, 3, 4)
+    )
+
+    # Input data with single question marks only
+    prot_data <- tibble::tibble(
+        GenContext = c("?", "? ? ?", "A ? B"),
+        other_col = c(1, 2, 3)
+    )
+
+    # Expected output after replacing single question marks
+    expected_output <- tibble::tibble(
+        GenContext = c("X", "X(s)", "A X B"),
+        other_col = c(1, 2, 3)
+    )
+
+    # Run the function
+    result <- replaceQuestionMarks(prot_data, by_column = "GenContext")
+
+    # Check if the result matches the expected output
+    expect_equal(result, expected_output)
+
+    # Input data containing asterisks
+    query_data <- tibble::tibble(
+        GenContext = c("A * B", "*C*D*", "E*F*"),
+        other_col = c(1, 2, 3)
+    )
+
+    # Expected output after removing asterisks
+    expected_output <- tibble::tibble(
+        GenContext = c("A B", "CD", "EF"),
+        other_col = c(1, 2, 3)
+    )
+
+    # Run the function
+    result <- removeAsterisks(query_data, colname = "GenContext")
+
+    # Check if the result matches the expected output
+    expect_equal(result, expected_output)
+
+    # Input data with no asterisks
+    query_data <- tibble::tibble(
+        GenContext = c("A B", "C D", "E F"),
+        other_col = c(1, 2, 3)
+    )
+
+    # Expected output (no changes)
+    expected_output <- query_data
+
+    # Run the function
+    result <- removeAsterisks(query_data, colname = "GenContext")
+
+    # Check if the result matches the expected output
+    expect_equal(result, expected_output)
+
+    # Sample input data
+    prot <- tibble::tibble(
+        DomArch = c("A", "B", "A", "C", "D"),
+        value = c(1, 2, 3, 4, 5)
+    )
+
+    # Expected output after removing rows where `DomArch` appears only once
+    expected_output <- tibble::tibble(
+        DomArch = c("A", "A"),
+        value = c(1, 3)
+    )
+
+    # Run the function
+    result <- removeTails(prot, by_column = "DomArch", keep_domains = FALSE)
+
+    # Check if the result matches the expected output
+    expect_equal(result, expected_output)
+
+    # Input data with no single occurrence rows
+    prot <- tibble::tibble(
+        DomArch = c("A", "A", "B", "B"),
+        value = c(1, 2, 3, 4)
+    )
+
+    # Expected output (should remain unchanged)
+    expected_output <- prot
+
+    # Run the function
+    result <- removeTails(prot, by_column = "DomArch", keep_domains = FALSE)
+
+    # Check if the result matches the expected output
+    expect_equal(result, expected_output)
+
+    # Sample input data with special characters and extra spaces in the species names
+    prot <- tibble::tibble(
+        Species.orig = c("Escherichia coli sp.",
+                         "Bacillus str. subtilis",
+                         "Lactobacillus = plantarum",
+                         "Staphylococcus aureus"),
+        value = c(1, 2, 3, 4)
+    )
+
+    # Expected output after cleaning species names
+    expected_output <- tibble::tibble(
+        Species.orig = c("Escherichia coli sp.",
+                         "Bacillus str. subtilis",
+                         "Lactobacillus = plantarum",
+                         "Staphylococcus aureus"),
+        value = c(1, 2, 3, 4),
+        Species = c("Escherichia coli sp",
+                    "Bacillus str subtilis",
+                    "Lactobacillus plantarum",
+                    "Staphylococcus aureus")
+    )
+
+    # Run the function
+    result <- cleanSpecies(prot, removeEmptyRows = FALSE)
+
+    # Check if the result matches the expected output
+    expect_equal(result, expected_output)
+
+    # Sample input data with an empty Species entry
+    prot <- tibble::tibble(
+        Species.orig = c("Escherichia coli sp.", "",
+                         "Lactobacillus = plantarum",
+                         "Staphylococcus aureus"),
+        value = c(1, 2, 3, 4)
+    )
+
+    # Expected output after cleaning and removing empty rows
+    expected_output <- tibble::tibble(
+        Species.orig = c("Escherichia coli sp.",
+                         "Lactobacillus = plantarum",
+                         "Staphylococcus aureus"),
+        value = c(1, 3, 4),
+        Species = c("Escherichia coli sp",
+                    "Lactobacillus plantarum",
+                    "Staphylococcus aureus")
+    )
+
+    # Run the function with removeEmptyRows = TRUE
+    result <- cleanSpecies(prot, removeEmptyRows = TRUE)
+
+    # Check if the result matches the expected output
+    expect_equal(result, expected_output)
+
+    # Sample input data with original ClustName
+    prot <- tibble::tibble(
+        ClustName.orig = c("SIG+TM+TM", "ABC+DEF", "XYZ+SIG", "TM+TM"),
+        value = c(1, 2, 3, 4)
+    )
+
+    # Domains to rename
+    domains_rename <- tibble::tibble(
+        old = c("SIG", "ABC"),
+        new = c("Signal", "ABC_Transporter")
+    )
+
+    # Domains to keep
+    domains_keep <- tibble::tibble(
+        domains = c("Signal", "ABC_Transporter")
+    )
+
+    # Expected output after renaming and filtering
+    expected_output <- tibble::tibble(
+        ClustName.orig = c("SIG+TM+TM", "ABC+DEF", "XYZ+SIG"),
+        value = c(1, 2, 3),
+        ClustName = c("Signal+TM+TM", "ABC_Transporter+DEF", "XYZ+Signal")
+    )
+
+    # Run the function
+    result <- cleanClusters(prot, domains_rename, domains_keep,
+                            condenseRepeatedDomains = FALSE,
+                            removeTails = FALSE,
+                            removeEmptyRows = FALSE)
+
+    # Check if the result matches the expected output
+    expect_equal(result, expected_output)
+
+    # Sample input data with ClustName containing tails (TODO(review): fixture and expected output below are overwritten without a cleanClusters() call)
+    prot <- tibble::tibble(
+        ClustName.orig = c("SIG+TM+1", "ABC+DEF", "XYZ+SIG+2"),
+        value = c(1, 2, 3)
+    )
+
+    # Domains to rename (empty for this test)
+    domains_rename <- tibble::tibble(
+        old = character(0),
+        new = character(0)
+    )
+
+    # Domains to keep (empty for this test)
+    domains_keep <- tibble::tibble(
+        domains = character(0)
+    )
+
+    # Expected output after removing tails
+    expected_output <- tibble::tibble(
+        ClustName.orig = c("ABC+DEF"),
+        value = c(2),
+        ClustName = c("ABC+DEF")
+    )
+
+    # Sample input data
+    prot <- tibble::tibble(
+        DomArch.orig = c("SIG+TM+TM", "ABC+DEF", "XYZ+SIG", "TM+TM"),
+        value = c(1, 2, 3, 4)
+    )
+
+    # Domains to rename
+    domains_rename <- tibble::tibble(
+        old = c("SIG", "ABC"),
+        new = c("Signal", "ABC_Transporter")
+    )
+
+    # Domains to keep
+    domains_keep <- tibble::tibble(
+        domains = c("Signal", "ABC_Transporter")
+    )
+
+    # Expected output after renaming and filtering
+    expected_output <- tibble::tibble(
+        DomArch.orig = c("SIG+TM+TM", "ABC+DEF", "XYZ+SIG"),
+        value = c(1, 2, 3),
+        DomArch = c("Signal+TM+TM", "ABC_Transporter+DEF", "XYZ+Signal")
+    )
+
+    # Run the function
+    result <- cleanDomainArchitecture(prot, old = "DomArch.orig", new = "DomArch",
+                                      domains_keep = domains_keep,
+                                      domains_rename = domains_rename,
+                                      condenseRepeatedDomains = FALSE,
+                                      removeTails = FALSE,
+                                      removeEmptyRows = FALSE)
+
+    # Check if the result matches the expected output
+    expect_equal(result, expected_output)
+
+    # Sample input data with repeated domains
+    prot <- tibble::tibble(
+        DomArch.orig = c("SIG+TM+TM+TM", "ABC+ABC+DEF", "XYZ+SIG+SIG"),
+        value = c(1, 2, 3)
+    )
+
+    # Domains to rename (empty for this test)
+    domains_rename <- tibble::tibble(
+        old = character(0),
+        new = character(0)
+    )
+
+    # Domains to keep
+    domains_keep <- tibble::tibble(
+        domains = c("SIG", "ABC")
+    )
+
+    # Expected output after condensing repeated domains
+    expected_output <- tibble::tibble(
+        DomArch.orig = c("SIG+TM+TM+TM", "ABC+ABC+DEF", "XYZ+SIG+SIG"),
+        value = c(1, 2, 3),
+        DomArch = c("SIG+TM(s)", "ABC(s)+DEF", "XYZ+SIG(s)")
+    )
+
+    # Run the function with condenseRepeatedDomains = TRUE
+    result <- cleanDomainArchitecture(prot, old = "DomArch.orig", new = "DomArch",
+                                      domains_keep = domains_keep,
+                                      domains_rename = domains_rename,
+                                      condenseRepeatedDomains = TRUE,
+                                      removeTails = FALSE,
+                                      removeEmptyRows = FALSE)
+
+    # Check if the result matches the expected output
+    expect_equal(result, expected_output)
+
+    # Sample input data with an empty DomArch entry
+    prot <- tibble::tibble(
+        DomArch.orig = c("SIG+TM+TM", "", "ABC+DEF"),
+        value = c(1, 2, 3)
+    )
+
+    # Domains to rename (empty for this test)
+    domains_rename <- tibble::tibble(
+        old = character(0),
+        new = character(0)
+    )
+
+    # Domains to keep
+    domains_keep <- tibble::tibble(
+        domains = c("SIG", "ABC")
+    )
+
+    # Expected output after removing empty rows
+    expected_output <- tibble::tibble(
+        DomArch.orig = c("SIG+TM+TM", "ABC+DEF"),
+        value = c(1, 3),
+        DomArch = c("SIG+TM+TM", "ABC+DEF")
+    )
+
+    # Run the function with removeEmptyRows = TRUE
+    result <- cleanDomainArchitecture(prot, old = "DomArch.orig", new = "DomArch",
+                                      domains_keep = domains_keep,
+                                      domains_rename = domains_rename,
+                                      condenseRepeatedDomains = FALSE,
+                                      removeTails = FALSE,
+                                      removeEmptyRows = TRUE)
+
+    # Check if the result matches the expected output
+    expect_equal(result, expected_output)
+
+    # Sample input data with question marks
+    prot <- tibble::tibble(
+        DomArch.orig = c("SIG+TM???", "ABC+???DEF", "XYZ+SIG"),
+        value = c(1, 2, 3)
+    )
+
+    # Domains to rename (empty for this test)
+    domains_rename <- tibble::tibble(
+        old = character(0),
+        new = character(0)
+    )
+
+    # Domains to keep
+    domains_keep <- tibble::tibble(
+        domains = c("SIG", "ABC")
+    )
+
+    # Expected output after replacing question marks
+    expected_output <- tibble::tibble(
+        DomArch.orig = c("SIG+TM???", "ABC+???DEF", "XYZ+SIG"),
+        value = c(1, 2, 3),
+        DomArch = c("SIG+TMXXX", "ABC+XXXDEF", "XYZ+SIG")
+    )
+
+    # Run the function with question mark replacement
+    result <- cleanDomainArchitecture(prot, old = "DomArch.orig", new = "DomArch",
+                                      domains_keep = domains_keep,
+                                      domains_rename = domains_rename,
+                                      condenseRepeatedDomains = FALSE,
+                                      removeTails = FALSE,
+                                      removeEmptyRows = FALSE)
+
+    # Check if the result matches the expected output
+    expect_equal(result, expected_output)
+
+    # Sample input data
+    prot <- tibble::tibble(
+        GeneDescription = c("Gene A.",
+                            "Protein B%2C protein C.",
+                            "Enzyme%2C catalytic."),
+        value = c(1, 2, 3)
+    )
+
+    # Expected output after cleaning
+    expected_output <- tibble::tibble(
+        GeneDescription = c("Gene A.",
+                            "Protein B%2C protein C.",
+                            "Enzyme%2C catalytic."),
+        value = c(1, 2, 3),
+        GeneDesc = c("Gene A.",
+                     "Protein B, protein C.",
+                     "Enzyme, catalytic.")
+    )
+
+    # Run the function
+    result <- cleanGeneDescription(prot, "GeneDescription")
+
+    # Check if the result matches the expected output
+    expect_equal(result, expected_output)
+
+    # Sample input data
+    prot <- tibble::tibble(
+        AccNum = c("A1", "A1", "B1", "B1", "C1"),
+        Description = c("Short",
+                        "Longer Description",
+                        "Medium", "Shortest", "Unique")
+    )
+
+    # Expected output after selecting longest duplicates
+    expected_output <- tibble::tibble(
+        AccNum = c("A1", "B1", "C1"),
+        Description = c("Longer Description", "Shortest", "Unique")
+    )
+
+    # Run the function
+    result <- selectLongestDuplicate(prot, "Description")
+
+    # Check if the result matches the expected output
+    expect_equal(result, expected_output)
+
+    # Sample input data
+    prot <- tibble::tibble(
+        Lineage = c("Bacteria; Firmicutes; Bacilli; Lactobacillales",
+                    "Bacteria; Proteobacteria; Gammaproteobacteria",
+                    "Archaea; Euryarchaeota; Methanobacteria")
+    )
+
+    # Rename mapping
+    lins_rename <- tibble::tibble(
+        old = c("Bacteria", "Firmicutes", "Archaea"),
+        new = c("Bacterium", "Firmicute", "Archaean")
+    )
+
+    # Expected output after renaming
+    expected_output <- tibble::tibble(
+        Lineage = c("Bacterium; Firmicute; Bacilli; Lactobacillales",
+                    "Bacterium; ProteoBacterium; GammaproteoBacterium",
+                    "Archaean; Euryarchaeota; MethanoBacterium")
+    )
+
+    # Run the function
+    result <- cleanLineage(prot, lins_rename)
+
+    # Check if the result matches the expected output
+    expect_equal(result, expected_output)
+
+})
diff --git a/tests/testthat/test-fa2domain.R b/tests/testthat/test-fa2domain.R
new file mode 100644
index 00000000..2f508c3a
--- /dev/null
+++ b/tests/testthat/test-fa2domain.R
@@ -0,0 +1,223 @@
+context("fa2domain")
+test_that("fa2domain", {
+    library(mockery)
+    library(readr)
+    # runIPRScan
+    # Define file paths using system.file to locate files in the package
+    filepath_fasta <- system.file("tests", "example_fasta.fa", package = "MolEvolvR")
+    filepath_out <- tempfile() # Temporary file for output
+
+    # Set application options
+    mock_appl_single <- "Pfam"
+    mock_appl_multiple <- c("Pfam", "Gene3D")
+
+    # Locate the sample InterProScan TSV fixture
+    sample_tsv_path <- system.file("tests", "example_iprscan_valid.tsv", package = "MolEvolvR")
+
+    # Read the TSV file into a dataframe
+    sample_tsv <- read.csv(sample_tsv_path, sep = "\t", header = TRUE)
+
+    # Mock the system function to avoid running the real command
+    mock_system <- mock(0L) # Simulate successful system call
+
+    # Patch the system and readIPRScanTSV functions
+    stub(runIPRScan, "system", mock_system)
+    stub(runIPRScan, "readIPRScanTSV", function(x) read.csv(sample_tsv_path, sep = "\t"))
+
+    ## TEST 1: Command construction for single application
+    result_single <- runIPRScan(filepath_fasta, filepath_out, appl = mock_appl_single)
+    expected_cmd_single <- stringr::str_glue("iprscan -i {filepath_fasta} -b {filepath_out} --cpu 4 -f TSV ",
+                                             "--appl {mock_appl_single}")
+
+    # Capture the actual command from the mock
+    actual_cmd_single <- mock_args(mock_system)[[1]]
+
+    # Verify that the expected command matches the actual command
+    expect_equal(as.character(unlist(actual_cmd_single)), as.character(expected_cmd_single))
+
+    # Clear the mock calls for the next test
+    mock_system <- mock(0L)
+    stub(runIPRScan, "system", mock_system)
+
+    ## TEST 2: Real result from reading TSV file
+    expect_equal(result_single, sample_tsv)
+
+    ## TEST 3: Error handling when system command fails
+    mock_system_fail <- mock(1L) # Simulate non-zero exit code
+    stub(runIPRScan, "system", mock_system_fail)
+
+    # Expect a warning and return NULL on failure
+    expect_warning(result_fail <- runIPRScan(filepath_fasta, filepath_out, appl = mock_appl_single),
+                   regexp = "interproscan exited with non-zero code")
+    expect_null(result_fail)
+
+    ## TEST 4: Error handling for missing or invalid inputs
+    # Invalid `filepath_fasta`
+    expect_error(runIPRScan(NULL, filepath_out, appl = mock_appl_single),
+                 "filepath_fasta cannot be NULL or empty")
+
+    # Invalid `filepath_out`
+    expect_error(runIPRScan(filepath_fasta, NULL, appl = mock_appl_single),
+                 "filepath_out cannot be NULL or empty")
+
+    # Invalid `appl`
+    expect_error(runIPRScan(filepath_fasta, filepath_out, appl = "InvalidApp"),
+                 "Invalid application specified")
+
+    # readIPRScanTSV
+    # Read the TSV file using the function
+    df_ipr <- readIPRScanTSV(sample_tsv_path)
+
+    # Check that the returned object is a data frame
+    expect_s3_class(df_ipr, "data.frame")
+
+    # getIPRScanColNames
+    # Call the function to get the column names
+    col_names <- getIPRScanColNames()
+
+    # Check that the result is a character vector
+    expect_type(col_names, "character")
+
+    # Define the expected column names
+    expected_col_names <- c(
+        "AccNum", "SeqMD5Digest", "SLength", "Analysis",
+        "DB.ID", "SignDesc", "StartLoc", "StopLoc", "Score",
+        "Status", "RunDate", "IPRAcc", "IPRDesc"
+    )
+
+    # Check that the column names match exactly
+    expect_equal(col_names, expected_col_names)
+
+    # Ensure there are exactly 13 columns
+    expect_length(col_names, 13)
+
+    # getIPRScanColTypes
+    col_types <- getIPRScanColTypes()
+
+    # Check that col_types is of the expected class
+    # readr::cols() returns col_spec object
+    expect_s3_class(col_types, "col_spec")
+
+    # Verify that each column has the correct type
+    expect_equal(col_types$cols$AccNum, col_character())
+    expect_equal(col_types$cols$SeqMD5Digest, col_character())
+    expect_equal(col_types$cols$SLength, col_integer())
+    expect_equal(col_types$cols$Analysis, col_character())
+    expect_equal(col_types$cols$DB.ID, col_character())
+    expect_equal(col_types$cols$SignDesc, col_character())
+    expect_equal(col_types$cols$StartLoc, col_integer())
+    expect_equal(col_types$cols$StopLoc, col_integer())
+    expect_equal(col_types$cols$Score, col_double())
+    expect_equal(col_types$cols$Status, col_character())
+    expect_equal(col_types$cols$RunDate, col_character())
+    expect_equal(col_types$cols$IPRAcc, col_character())
+    expect_equal(col_types$cols$IPRDesc, col_character())
+
+    # Optionally, check that there are no additional columns defined
+    expect_length(col_types$cols, 13)
+
+    # createIPRScanDomainTable
+
+    # Load the sample FASTA file
+    fasta <- Biostrings::readAAStringSet(filepath_fasta)
+
+    # Read the sample InterProScan TSV file
+    df_iprscan <- readIPRScanTSV(sample_tsv_path)
+
+    # Example accession number for testing
+    accnum <- df_iprscan$AccNum[1]
+
+    # Test case 1: Valid inputs
+    df_iprscan_domains <- createIPRScanDomainTable(accnum, fasta, df_iprscan)
+
+    # Check that the output is a data frame
+    expect_s3_class(df_iprscan_domains, "data.frame")
+
+    # Validate the structure of the output
+    expect_true(all(c("AccNum", "DB.ID", "StartLoc", "StopLoc", "seq_domain",
+                      "id_domain") %in% names(df_iprscan_domains)))
+
+    # Validate the content of the seq_domain column
+    # Ensure no empty sequences
+    expect_true(all(nchar(df_iprscan_domains$seq_domain) > 0))
+
+    # Validate the id_domain structure
+    expect_true(all(grepl("^(~*\\w+(-\\w+-\\d+_\\d+)?)+$", df_iprscan_domains$id_domain)))
+
+    # Test case 2: No matching accession number
+    empty_df <- createIPRScanDomainTable("non_existent_accnum", fasta, df_iprscan)
+    expect_s3_class(empty_df, "data.frame")
+    expect_equal(nrow(empty_df), 0)
+
+    # Test case 3: No domains in input data frame
+    empty_iprscan <- df_iprscan[0, ] # Create an empty df_iprscan
+    empty_domains_df <- createIPRScanDomainTable(accnum, fasta, empty_iprscan)
+    expect_s3_class(empty_domains_df, "data.frame")
+    expect_equal(nrow(empty_domains_df), 0)
+
+    # convertIPRScanDomainTable2FA
+
+    # Test case 1: Valid domain data
+    fasta_domains <- convertIPRScanDomainTable2FA(df_iprscan_domains)
+
+    # Check that the output is an AAStringSet
+    expect_s4_class(fasta_domains, "AAStringSet")
+
+    # Check that the correct number of sequences are returned
+    expect_equal(length(fasta_domains), nrow(df_iprscan_domains))
+
+    # Check that the names of the sequences match the id_domain column
+    expect_equal(names(fasta_domains), as.character(df_iprscan_domains$id_domain))
+
+    # Test case 2: Empty input data frame
+    empty_domains <- convertIPRScanDomainTable2FA(data.frame())
+    expect_s4_class(empty_domains, "AAStringSet")
+    expect_equal(length(empty_domains), 0)
+
+    # Test case 3: Data frame with no domains
+    empty_df_iprscan <- df_iprscan[0, ] # Create an empty df_iprscan
+    empty_domains_df <- convertIPRScanDomainTable2FA(empty_df_iprscan)
+    expect_s4_class(empty_domains_df, "AAStringSet")
+    expect_equal(length(empty_domains_df), 0)
+
+    # getDomainsFromFA
+    # Test case 1: Valid input
+    fasta_domains <- getDomainsFromFA(fasta, df_iprscan)
+
+    # Check that the output is an AAStringSet
+    expect_s4_class(fasta_domains, "AAStringSet")
+
+    # Check that the output contains the expected sequences
+    expect_true(length(fasta_domains) > 0) # Ensure there are some domains extracted
+
+    # Test case 2: Empty input FASTA
+    empty_fasta <- Biostrings::AAStringSet()
+    empty_fasta_domains <- getDomainsFromFA(empty_fasta, df_iprscan)
+
+    expect_s4_class(empty_fasta_domains, "AAStringSet")
+    expect_equal(length(empty_fasta_domains), 0)
+
+    # Test case 3: Empty input df_iprscan
+    empty_iprscan <- data.frame() # Create an empty df_iprscan
+    empty_domains_iprscan <- getDomainsFromFA(fasta, empty_iprscan)
+
+    expect_s4_class(empty_domains_iprscan, "AAStringSet")
+    expect_equal(length(empty_domains_iprscan), 0)
+
+    # Test case 4: Verbose output
+    analysis <- c("Pfam", "Gene3D")
+    expect_warning(
+        getDomainsFromFA(fasta, empty_iprscan, verbose = TRUE),
+        regexp = stringr::str_glue(
+            "accession number: aaeB_6~~~aaeB_4 had no domains for the selected analyses: ",
+            "{paste(unique(analysis), collapse = ',')}\n"
+        )
+    )
+
+    # Test case 5: Verbose output for some valid accession numbers
+    fasta_domains_verbose <- getDomainsFromFA(fasta, df_iprscan, verbose = TRUE)
+
+    # Check that the output is still an AAStringSet
+    expect_s4_class(fasta_domains_verbose, "AAStringSet")
+
+})