Skip to content

Commit

Permalink
Merge pull request #48 from phac-nml/merging
Browse files Browse the repository at this point in the history
Merged branch with all of Sam and Rylan's changes
  • Loading branch information
chadlaing authored Dec 13, 2017
2 parents f5a37cc + 0dbfb22 commit e4295ba
Show file tree
Hide file tree
Showing 22 changed files with 403 additions and 6,287 deletions.
39 changes: 32 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# ECTyper (an easy typer)
**ECTyper** wraps a standalone serotyping module.
Support _fasta_ and _fastq_ format
Supports _fasta_ and _fastq_ file formats

# Dependencies:
- python 3.6.3.*
Expand Down Expand Up @@ -30,12 +30,37 @@ Support _fasta_ and _fastq_ format
`python setup.py install`

# Basic Usage
1. Put all you fasta/fastq file in one folder (concatenate paired files if you want to result to be considered as single entity)
1. Put all of your fasta/fastq files in one folder (concatenate paired files if you want the result to be considered a single entity)
1. `ectyper -i [file path]`
1. View result on console or in `output/output.csv`
* If you want to enable species identification, you might need to wait for **ectyper** to download reference sequence when it is the first time you run
1. View the results on the console or in `output/[datetime]/output.csv`

# Example Usage
* `ectyper -i ecoliA.fasta` for single file
* `ectyper -i ecoliA.fasta,ecoliB.fastq,ecoliC.fna` for multiple file
* `ectyper -i ecoli_folder` for folder
* `ectyper -i ecoliA.fasta` for a single file
* `ectyper -i ecoliA.fasta,ecoliB.fastq,ecoliC.fna` for multiple files
* `ectyper -i ecoli_folder` for a folder

# Advanced Usage
```
usage: ectyper [-h] -i INPUT [-d PERCENTIDENTITY] [-l PERCENTLENGTH]
[--verify] [-s] [--detailed] [-o OUTPUT]
optional arguments:
-h, --help show this help message and exit
-i INPUT, --input INPUT
Location of new file(s). Can be a single file or a
directory
-d PERCENTIDENTITY, --percentIdentity PERCENTIDENTITY
Percentage of identity wanted to use against the
database. From 0 to 100, default is 90%.
-l PERCENTLENGTH, --percentLength PERCENTLENGTH
Percentage of length wanted to use against the
database. From 0 to 100, default is 50%.
--verify Enable E. Coli. verification
-s, --species Enable species identification when non-ecoli genome is
found Note: refseq downloading is required when
running this option for the first time.
--detailed Enable detailed output
-o OUTPUT, --output OUTPUT
Directory location of output files.
```
* The first time species identification is enabled you will need to wait for **ectyper** to download the reference sequence.
34 changes: 21 additions & 13 deletions ectyper/blastFunctions.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,12 @@ def create_blast_db(filelist, temp_dir):
Creating a blast DB using the makeblastdb command.
The database is created in the temporary folder of the system.
:param filelist: genome list that was given by the user on the commandline.
:param temp_dir: temp directory to store blastdb
:return full path of DB
Args:
filelist: list of genomes that was given by the user on the command line.
temp_dir: temporary directory to store the blastdb in.
Returns:
Full path to the DB
"""
blast_db_path = os.path.join(temp_dir, 'ectyper_blastdb')

Expand All @@ -34,15 +37,18 @@ def create_blast_db(filelist, temp_dir):
return blast_db_path


def run_blast(query_file, blast_db, args, chunk_size):
def run_blast(query_file, blast_db, args):
"""
Execute a blastn run given the query files and blastdb
:param query_file: one or both of the VF / Serotype input files
:param blast_db: validated fasta files from the user, in DB form
:param args: parsed commandline options from the user
:param chunk_size: number of genome in database
:return: the blast output file
Args:
query_file (str): one or both of the VF / Serotype input files
blast_db (str): validated fasta files from the user, in DB form
args (Namespace object): parsed commandline options from the user
chunck_size: number of genomes in the database
Returns:
The blast output file
"""
percent_identity = args.percentIdentity
percent_length = args.percentLength
Expand All @@ -61,7 +67,6 @@ def run_blast(query_file, blast_db, args, chunk_size):
'-qcov_hsp_perc', str(percent_length),
'-max_hsps', '1', # each allele only need to hit once
# use default max_target_seqs=500
# '-max_target_seqs', str(chunk_size*5), # at most 5 genome hit per query
"-outfmt",
'6 qseqid qlen sseqid length pident sstart send sframe qcovhsp',
"-word_size", "11"
Expand All @@ -77,9 +82,12 @@ def run_blast_for_identification(query_file, blast_db):
Execute a blastn run given the query files and blastdb
with special configuration for high performance identification
:param query_file: one or both of the VF / Serotype input files
:param blast_db: validated fasta files from the user, in DB form
:return: the blast output file
Args:
query_file: one or both of the VF / Serotype input files
blast_db: validated fasta files from the user, in DB form
Returns:
blast_output_file (str): path to the blast output file
"""

LOG.debug('Running blast query {0} against database {1} '.format(
Expand Down
12 changes: 8 additions & 4 deletions ectyper/commandLineOptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,15 @@

def parse_command_line(args=None):
"""
The options
for both the serotyper, and virulence finder.
The options for both the serotyper, and virulence finder.
The returned object is used by both, but the options do not
necessarily apply to both.
Args:
args: Optional args to be passed to argparse.parse_args()
Returns:
The populated argparse Namespace
"""

def check_percentage(value):
Expand Down Expand Up @@ -64,8 +69,7 @@ def check_percentage(value):
)

parser.add_argument(
'-v',
'--verbose',
'--detailed',
action='store_true',
help='Enable detailed output'
)
Expand Down
5 changes: 0 additions & 5 deletions ectyper/definitions.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,6 @@
SEROTYPE_ALLELE_JSON = os.path.join(DATA_DIR, 'ectyper_dict.json')
COMBINED = os.path.join(DATA_DIR, 'combined.fasta')

LEGACY_SEROTYPE_FILE = os.path.join(DATA_DIR, 'legacy_ectyper_data.fasta')
LEGACY_SEROTYPE_ALLELE_JSON = os.path.join(DATA_DIR, 'legacy_ectyper_dict.json')
LEGACY_COMBINED = os.path.join(DATA_DIR, 'legacy_combined.fasta')

REFERENCE_INDEX = os.path.join(DATA_DIR, 'bowtie_index/serotype_dict')
ECOLI_MARKERS = os.path.join(DATA_DIR, 'ecoli_specific_markers.fasta')
SAMTOOLS = 'samtools'
REFSEQ_SUMMARY = os.path.join(DATA_DIR, 'assembly_summary_refseq.txt')
Expand Down
109 changes: 71 additions & 38 deletions ectyper/ectyper.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,14 +87,8 @@ def run_program():
predictionFunctions.report_result(predictions_file)

def download_refseq():
'''Download refseq file with progress bar
'''
Download refseq file with progress bar
'''
def download_file(url, dst):
'''
download file with progress bar
'''
urlretrieve(url, dst, reporthook)


def reporthook(blocknum, blocksize, totalsize):
Expand All @@ -117,12 +111,24 @@ def reporthook(blocknum, blocksize, totalsize):
if not os.path.isfile(definitions.REFSEQ_SKETCH):
refseq_url = 'https://gembox.cbcb.umd.edu/mash/refseq.genomes.k21s1000.msh'
LOG.info("No refseq found. Downloading reference file for species identification...")
download_file(refseq_url, definitions.REFSEQ_SKETCH)
urlretrieve(refseq_url, definitions.REFSEQ_SKETCH, reporthook)
LOG.info("Download complete.")

def create_tmp_files(temp_dir, output_dir=None):
"""
Return a dictionary of temporary files used by ectyper
"""Create a dictionary of temporary files used by ectyper
Args:
temp_dir: program scope temporary directory
output_dir(str, optional):
directory to store output
Returns:
a dictionary of temporary files
example:
{'assemble_temp_dir': 'test/temp/assemblies',
'fasta_temp_dir': 'test/temp/fastas',
'output_dir': os.path.abspath('output')+'/',
'output_file': os.path.abspath('output/output.csv')}
"""

# Get the correct files and directories
Expand Down Expand Up @@ -160,12 +166,18 @@ def create_tmp_files(temp_dir, output_dir=None):


def run_prediction(genome_files, args, predictions_file):
'''
Core prediction functionality
:param genome_files:
:param args:
:param predictions_file:
:returns predictions_file
'''Core prediction functionality
Args:
genome_files:
list of genome files
args:
commandline arguments
predictions_file:
filename of prediction output
Returns:
predictions_file with prediction written in it
'''
query_file = definitions.SEROTYPE_FILE
ectyper_dict_file = definitions.SEROTYPE_ALLELE_JSON
Expand All @@ -182,21 +194,25 @@ def run_prediction(genome_files, args, predictions_file):
blast_db = blastFunctions.create_blast_db(chunk, temp_dir)

LOG.info("Start blast alignment on database #{0}".format(index + 1))
blast_output_file = blastFunctions.run_blast(
query_file, blast_db, args, len(chunk))
blast_output_file = blastFunctions.run_blast(query_file, blast_db, args)
LOG.info("Start serotype prediction for database #{0}".format(index + 1))
predictions_file = predictionFunctions.predict_serotype(
blast_output_file, ectyper_dict_file, predictions_file,
args.verbose)
args.detailed)
return predictions_file


def get_raw_files(raw_files):
"""
Take all the raw files, and filter not fasta / fastq
:param raw_files:
:return (raw_fasta_files, raw_fastq_files)
"""Take all the raw files, and filter not fasta / fastq
Args:
raw_files(str): list of files from user input
Returns:
A dictionary collection of fasta and fastq files
example:
{'raw_fasta_files':[],
'raw_fastq_files':[]}
"""
fasta_files = []
fastq_files = []
Expand All @@ -215,11 +231,21 @@ def get_raw_files(raw_files):


def filter_for_ecoli_files(raw_dict, temp_files, verify=False, species=False):
"""
:param raw_dict{fasta:list_of_files, fastq:list_of_files}:
:parapm temp_file:
:param verify:
:param species:
"""filter ecoli, identify species, assemble fastq
Assemble fastq files to fasta files,
then filter all files by reference method if verify is enabled,
if identified as non-ecoli, identify species by mash method if species is enabled.
Args:
raw_dict{fasta:list_of_files, fastq:list_of_files}:
dictionary collection of fasta and fastq files
temp_files: temporary directory
verify(bool):
whether to perform ecoli verification
species(bool):
whether to perform species identification for non-ecoli genome
Returns:
List of filtered and assembled genome files in fasta format
"""
final_files = []
for f in raw_dict.keys():
Expand All @@ -236,15 +262,22 @@ def filter_for_ecoli_files(raw_dict, temp_files, verify=False, species=False):
return final_files

def filter_file_by_species(genome_file, genome_format, temp_dir, verify=False, species=False):
'''
Core species recognition functionality
:param genome_file:
:param genome_format:
:param temp_dir:
:param verify:
:param mash:
:returns filtered_file
'''
"""filter ecoli, identify species, assemble fastq
Assemble fastq file to fasta file,
then filter the file by reference method if verify is enabled,
if identified as non-ecoli, identify species by mash method if species is enabled.
Args:
genome_file: input genome file
genome_format(str): fasta or fastq
temp_dir: temporary directory
verify(bool):
whether to perform ecoli verification
species(bool):
whether to perform species identification for non-ecoli genome
Returns:
The filtered and assembled genome files in fasta format
"""
combined_file = definitions.COMBINED
filtered_file = None
if genome_format == 'fastq':
Expand Down
Loading

0 comments on commit e4295ba

Please sign in to comment.