Merge pull request #48 from phac-nml/merging

Merged branch with all of Sam and Rylan's changes
phac-nml · Dec 13, 2017 · e4295ba · e4295ba
2 parents f5a37cc + 0dbfb22
commit e4295ba
Show file tree

Hide file tree

Showing 22 changed files with 403 additions and 6,287 deletions.
diff --git a/README.md b/README.md
@@ -1,6 +1,6 @@
 # ECTyper (an easy typer)
 **ECTyper** wraps a standalone serotyping module.  
-Support _fasta_ and _fastq_ format
+Supports _fasta_ and _fastq_ file formats
 
 # Dependencies:
 - python 3.6.3.*
@@ -30,12 +30,37 @@ Support _fasta_ and _fastq_ format
           `python setup.py install`
 
 # Basic Usage
-1. Put all you fasta/fastq file in one folder (concatenate paired files if you want to result to be considered as single entity)
+1. Put all of your fasta/fastq files in one folder (concatenate paired files if you want the result to be considered a single entity)
 1. `ectyper -i [file path]`
-1. View result on console or in `output/output.csv`
-* If you want to enable species identification, you might need to wait for **ectyper** to download reference sequence when it is the first time you run
+1. View the results on the console or in `output/[datetime]/output.csv`
 
 # Example Usage
-* `ectyper -i ecoliA.fasta`  for single file
-* `ectyper -i ecoliA.fasta,ecoliB.fastq,ecoliC.fna`	for multiple file  
-* `ectyper -i ecoli_folder`	for folder
+* `ectyper -i ecoliA.fasta`  for a single file
+* `ectyper -i ecoliA.fasta,ecoliB.fastq,ecoliC.fna`	for multiple files  
+* `ectyper -i ecoli_folder`	for a folder
+
+# Advanced Usage
+```
+usage: ectyper [-h] -i INPUT [-d PERCENTIDENTITY] [-l PERCENTLENGTH]
+               [--verify] [-s] [--detailed] [-o OUTPUT]
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -i INPUT, --input INPUT
+                        Location of new file(s). Can be a single file or a
+                        directory
+  -d PERCENTIDENTITY, --percentIdentity PERCENTIDENTITY
+                        Percentage of identity wanted to use against the
+                        database. From 0 to 100, default is 90%.
+  -l PERCENTLENGTH, --percentLength PERCENTLENGTH
+                        Percentage of length wanted to use against the
+                        database. From 0 to 100, default is 50%.
+  --verify              Enable E. Coli. verification
+  -s, --species         Enable species identification when non-ecoli genome is
+                        found Note: refseq downloading is required when
+                        running this option for the first time.
+  --detailed            Enable detailed output
+  -o OUTPUT, --output OUTPUT
+                        Directory location of output files.
+```
+* The first time species identification is enabled you will need to wait for **ectyper** to download the reference sequence.
diff --git a/ectyper/blastFunctions.py b/ectyper/blastFunctions.py
@@ -16,9 +16,12 @@ def create_blast_db(filelist, temp_dir):
     Creating a blast DB using the makeblastdb command.
     The database is created in the temporary folder of the system.
 
-    :param filelist: genome list that was given by the user on the commandline.
-    :param temp_dir: temp directory to store blastdb
-    :return full path of DB
+    Args:
+        filelist: list of genomes that was given by the user on the command line.
+        temp_dir: temporary directory to store the blastdb in.
+
+    Returns:
+        Full path to the DB
     """
     blast_db_path = os.path.join(temp_dir, 'ectyper_blastdb')
 
@@ -34,15 +37,18 @@ def create_blast_db(filelist, temp_dir):
     return blast_db_path
 
 
-def run_blast(query_file, blast_db, args, chunk_size):
+def run_blast(query_file, blast_db, args):
     """
     Execute a blastn run given the query files and blastdb
 
-    :param query_file: one or both of the VF / Serotype input files
-    :param blast_db: validated fasta files from the user, in DB form
-    :param args: parsed commandline options from the user
-    :param chunk_size: number of genome in database
-    :return: the blast output file
+    Args:
+        query_file (str): one or both of the VF / Serotype input files
+        blast_db (str): validated fasta files from the user, in DB form
+        args (Namespace object): parsed command-line options from the user
+
+    Returns:
+        The blast output file
     """
     percent_identity = args.percentIdentity
     percent_length = args.percentLength
@@ -61,7 +67,6 @@ def run_blast(query_file, blast_db, args, chunk_size):
         '-qcov_hsp_perc', str(percent_length),
         '-max_hsps', '1', # each allele only need to hit once
         # use default max_target_seqs=500
-        # '-max_target_seqs', str(chunk_size*5), # at most 5 genome hit per query
         "-outfmt",
         '6 qseqid qlen sseqid length pident sstart send sframe qcovhsp',
         "-word_size", "11"
@@ -77,9 +82,12 @@ def run_blast_for_identification(query_file, blast_db):
     Execute a blastn run given the query files and blastdb
     with special configuration for high performance identification
 
-    :param query_file: one or both of the VF / Serotype input files
-    :param blast_db: validated fasta files from the user, in DB form
-    :return: the blast output file
+    Args:
+        query_file: one or both of the VF / Serotype input files
+        blast_db: validated fasta files from the user, in DB form
+
+    Returns:
+        blast_output_file (str): path to the blast output file
     """
 
     LOG.debug('Running blast query {0} against database {1} '.format(

diff --git a/ectyper/commandLineOptions.py b/ectyper/commandLineOptions.py
@@ -5,10 +5,15 @@
 
 def parse_command_line(args=None):
     """
-    The options
-    for both the serotyper, and virulence finder.
+    The options for both the serotyper, and virulence finder.
     The returned object is used by both, but the options do not
     necessarily apply to both.
+
+    Args:
+        args: Optional args to be passed to argparse.parse_args()
+
+    Returns:
+        The populated argparse Namespace
     """
 
     def check_percentage(value):
@@ -64,8 +69,7 @@ def check_percentage(value):
     )
 
     parser.add_argument(
-        '-v',
-        '--verbose',
+        '--detailed',
         action='store_true',
         help='Enable detailed output'
     )

diff --git a/ectyper/definitions.py b/ectyper/definitions.py
@@ -14,11 +14,6 @@
 SEROTYPE_ALLELE_JSON = os.path.join(DATA_DIR, 'ectyper_dict.json')
 COMBINED = os.path.join(DATA_DIR, 'combined.fasta')
 
-LEGACY_SEROTYPE_FILE = os.path.join(DATA_DIR, 'legacy_ectyper_data.fasta')
-LEGACY_SEROTYPE_ALLELE_JSON = os.path.join(DATA_DIR, 'legacy_ectyper_dict.json')
-LEGACY_COMBINED = os.path.join(DATA_DIR, 'legacy_combined.fasta')
-
-REFERENCE_INDEX = os.path.join(DATA_DIR, 'bowtie_index/serotype_dict')
 ECOLI_MARKERS = os.path.join(DATA_DIR, 'ecoli_specific_markers.fasta')
 SAMTOOLS = 'samtools'
 REFSEQ_SUMMARY = os.path.join(DATA_DIR, 'assembly_summary_refseq.txt')

diff --git a/ectyper/ectyper.py b/ectyper/ectyper.py
@@ -87,14 +87,8 @@ def run_program():
         predictionFunctions.report_result(predictions_file)
 
 def download_refseq():
+    '''Download refseq file with progress bar
     '''
-    Download refseq file with progress bar
-    '''
-    def download_file(url, dst):
-        '''
-        download file with progress bar
-        '''
-        urlretrieve(url, dst, reporthook)
 
 
     def reporthook(blocknum, blocksize, totalsize):
@@ -117,12 +111,24 @@ def reporthook(blocknum, blocksize, totalsize):
     if not os.path.isfile(definitions.REFSEQ_SKETCH):
         refseq_url = 'https://gembox.cbcb.umd.edu/mash/refseq.genomes.k21s1000.msh'
         LOG.info("No refseq found. Downloading reference file for species identification...")
-        download_file(refseq_url, definitions.REFSEQ_SKETCH)
+        urlretrieve(refseq_url, definitions.REFSEQ_SKETCH, reporthook)
         LOG.info("Download complete.")
 
 def create_tmp_files(temp_dir, output_dir=None):
-    """
-    Return a dictionary of temporary files used by ectyper
+    """Create a dictionary of temporary files used by ectyper
+
+    Args:
+        temp_dir: program scope temporary directory
+        output_dir(str, optional):
+            directory to store output
+
+    Returns:
+        a dictionary of temporary files
+        example:
+            {'assemble_temp_dir': 'test/temp/assemblies',
+             'fasta_temp_dir': 'test/temp/fastas',
+             'output_dir': os.path.abspath('output')+'/',
+             'output_file': os.path.abspath('output/output.csv')}
     """
 
     # Get the correct files and directories
@@ -160,12 +166,18 @@ def create_tmp_files(temp_dir, output_dir=None):
 
 
 def run_prediction(genome_files, args, predictions_file):
-    '''
-    Core prediction functionality
-    :param genome_files:
-    :param args:
-    :param predictions_file:
-    :returns predictions_file
+    '''Core prediction functionality
+    
+    Args:
+        genome_files:
+            list of genome files
+        args:
+            commandline arguments
+        predictions_file:
+            filename of prediction output
+    
+    Returns:
+        predictions_file with prediction written in it
     '''
     query_file = definitions.SEROTYPE_FILE
     ectyper_dict_file = definitions.SEROTYPE_ALLELE_JSON
@@ -182,21 +194,25 @@ def run_prediction(genome_files, args, predictions_file):
             blast_db = blastFunctions.create_blast_db(chunk, temp_dir)
 
             LOG.info("Start blast alignment on database #{0}".format(index + 1))
-            blast_output_file = blastFunctions.run_blast(
-                query_file, blast_db, args, len(chunk))
+            blast_output_file = blastFunctions.run_blast(query_file, blast_db, args)
             LOG.info("Start serotype prediction for database #{0}".format(index + 1))
             predictions_file = predictionFunctions.predict_serotype(
                 blast_output_file, ectyper_dict_file, predictions_file,
-                args.verbose)
+                args.detailed)
         return predictions_file
 
 
 def get_raw_files(raw_files):
-    """
-    Take all the raw files, and filter not fasta / fastq
-
-    :param raw_files:
-    :return (raw_fasta_files, raw_fastq_files)
+    """Take all the raw files, and filter not fasta / fastq
+    
+    Args:
+        raw_files(str): list of files from user input
+    
+    Returns:
+        A dictionary collection of fasta and fastq files
+        example:
+        {'raw_fasta_files':[],
+         'raw_fastq_files':[]}
     """
     fasta_files = []
     fastq_files = []
@@ -215,11 +231,21 @@ def get_raw_files(raw_files):
 
 
 def filter_for_ecoli_files(raw_dict, temp_files, verify=False, species=False):
-    """
-    :param raw_dict{fasta:list_of_files, fastq:list_of_files}:
-    :parapm temp_file:
-    :param verify:
-    :param species:
+    """filter ecoli, identify species, assemble fastq
+    Assemble fastq files to fasta files,
+    then filter all files by reference method if verify is enabled,
+    if identified as non-ecoli, identify species by mash method if species is enabled.
+    
+    Args:
+        raw_dict{fasta:list_of_files, fastq:list_of_files}:
+            dictionary collection of fasta and fastq files
+        temp_files: temporary directory
+        verify(bool):
+            whether to perform ecoli verification
+        species(bool):
+            whether to perform species identification for non-ecoli genome
+    Returns:
+        List of filtered and assembled genome files in fasta format
     """
     final_files = []
     for f in raw_dict.keys():
@@ -236,15 +262,22 @@ def filter_for_ecoli_files(raw_dict, temp_files, verify=False, species=False):
     return final_files
 
 def filter_file_by_species(genome_file, genome_format, temp_dir, verify=False, species=False):
-    '''
-    Core species recognition functionality
-    :param genome_file:
-    :param genome_format:
-    :param temp_dir:
-    :param verify:
-    :param mash:
-    :returns filtered_file
-    '''
+    """filter ecoli, identify species, assemble fastq
+    Assemble fastq file to fasta file,
+    then filter the file by reference method if verify is enabled,
+    if identified as non-ecoli, identify species by mash method if species is enabled.
+    
+    Args:
+        genome_file: input genome file
+        genome_format(str): fasta or fastq
+        temp_dir: temporary directory
+        verify(bool):
+            whether to perform ecoli verification
+        species(bool):
+            whether to perform species identification for non-ecoli genome
+    Returns:
+        The filtered and assembled genome files in fasta format
+    """
     combined_file = definitions.COMBINED
     filtered_file = None
     if genome_format == 'fastq':