diff --git a/CHANGELOG.md b/CHANGELOG.md index 839593a..dde6ada 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,19 @@ # Changelog +# mpwt 0.8.2 (2022-12-16) + +## Modify: + +- decrease the number of calls to get Pathway Tools version. +- error message when not finding Pathway Tools version (issue #84). +- remove unused code. + +## Fix: + +- issue with compressed files not being checked in output folder. +- issue when no arguments are given as input. +- issue when mpwt does not exit correctly when detecting an error during check input step. + # mpwt 0.8.1 (2022-09-30) ## Fix: diff --git a/README.rst b/README.rst index c4ba454..0472506 100755 --- a/README.rst +++ b/README.rst @@ -4,7 +4,7 @@ .. image:: https://img.shields.io/badge/doi-10.7554/eLife.61968-blueviolet.svg :target: https://doi.org/10.7554/eLife.61968 -.. image:: https://img.shields.io/badge/Pathway%20Tools-26.0-brightgreen +.. image:: https://img.shields.io/badge/Pathway%20Tools-26.5-brightgreen :target: https://bioinformatics.ai.sri.com/ptools/release-notes.html mpwt: Multiprocessing Pathway Tools diff --git a/mpwt/__init__.py b/mpwt/__init__.py index 7d19350..1adf047 100755 --- a/mpwt/__init__.py +++ b/mpwt/__init__.py @@ -17,4 +17,4 @@ from mpwt.utils import cleaning, cleaning_input, find_ptools_path, list_pgdb, pubmed_citations, remove_pgdbs from mpwt.to_pathologic import create_pathologic_file -__version__='0.8.1' \ No newline at end of file +__version__='0.8.2' \ No newline at end of file diff --git a/mpwt/__main__.py b/mpwt/__main__.py index b968864..57c2684 100755 --- a/mpwt/__main__.py +++ b/mpwt/__main__.py @@ -316,7 +316,7 @@ def run_mpwt(): permission = args.permission # If no argument print the help. 
- if len(sys.argv) == 1: + if len(sys.argv) == 1 or (len(sys.argv) == 2 and verbose): parser.print_help() sys.exit(1) diff --git a/mpwt/mpwt_workflow.py b/mpwt/mpwt_workflow.py index 6dd7652..4d8dc4f 100755 --- a/mpwt/mpwt_workflow.py +++ b/mpwt/mpwt_workflow.py @@ -31,7 +31,7 @@ from mpwt import utils from mpwt.pwt_wrapper import run_pwt, run_pwt_flat, run_move_pgdb from mpwt.results_check import check_dat, check_mpwt_pathologic_runs -from mpwt.pathologic_input import check_input_and_existing_pgdb, pwt_input_files, create_only_flat_lisp, create_flat_creation_script, read_taxon_id +from mpwt.pathologic_input import check_input_and_existing_pgdb, pwt_input_files, create_only_flat_lisp from multiprocessing import Pool logger = logging.getLogger(__name__) @@ -79,6 +79,10 @@ def multiprocess_pwt(input_folder=None, output_folder=None, patho_inference=None if error: sys.exit(1) + # Check if patho_inference is launched with input_folder. + if patho_inference and not input_folder: + sys.exit('To use --patho/patho_inference you need to add the -f/input_folder argument.') + # Check if patho_hole_filler or patho_log are launched with patho_inference. 
if (patho_hole_filler and not patho_inference) or (patho_log and not patho_inference): sys.exit('To use either --hf/patho_hole_filler or --log/patho_log, you need to add the --patho/patho_inference argument.') @@ -161,7 +165,7 @@ def multiprocess_pwt(input_folder=None, output_folder=None, patho_inference=None no_download_articles, flat_creation, dat_extraction, xml_extraction, owl_extraction, col_extraction, size_reduction, number_cpu_to_use, patho_log, - pathway_score, taxon_file, permission) + pathway_score, taxon_file, permission, ptools_version) def close_mpwt(mpwt_pool, no_download_articles, pathway_score=None, old_pathway_score=None): @@ -211,7 +215,7 @@ def run_mpwt(run_folder=None, input_folder=None, run_input_files_creation=None, run_output_folder=None, output_folder=None, run_patho_inference=None, pathologic_options=None, run_flat_creation=None, move_options=None, - taxon_file=None, permission=None): + taxon_file=None, permission=None, ptools_version=None): """ Single run of mpwt on one folder. Used in multiprocessing in independent_mpwt. @@ -227,6 +231,7 @@ def run_mpwt(run_folder=None, input_folder=None, run_input_files_creation=None, move_options (list): list of bool for: dat_extraction, size_reduction, xml_extraction, owl_extraction, col_extraction taxon_file (str): pathname to the mpwt taxon ID file permission (str): Choose permission access to PGDB in ptools-local and output files, either 'all' or 'group' (by default it is user) + ptools_version (tuple, None): Version number of Pathway Tools (obtained from get_ptools_version function). 
Returns: run_folder (str): name of the folder containing input files input_error_status (bool): if True an error occurs during pathologic input files creation @@ -243,8 +248,6 @@ def run_mpwt(run_folder=None, input_folder=None, run_input_files_creation=None, flat_error_status = False move_error_status = False - ptools_version = utils.get_ptools_version() - if input_folder: run_folder_path = os.path.join(input_folder, run_folder) @@ -293,7 +296,8 @@ def independent_mpwt(input_folder, output_folder=None, patho_inference=None, no_download_articles=None, flat_creation=None, dat_extraction=None, xml_extraction=None, owl_extraction=None, col_extraction=None, size_reduction=None, number_cpu_to_use=None, patho_log=None, - pathway_score=None, taxon_file=None, permission=None): + pathway_score=None, taxon_file=None, permission=None, + ptools_version= None): """ Function managing the workflow for independent run of mpwt. Each process of Pathway Tools on an organism are run separatly so if one failed the other that passed will succeed. @@ -318,6 +322,7 @@ def independent_mpwt(input_folder, output_folder=None, patho_inference=None, pathway_score (float): score between 0 and 1 to accept or reject pathway taxon_file (str): pathname to the mpwt taxon ID file permission (str): Choose permission access to PGDB in ptools-local and output files, either 'all' or 'group' (by default it is user). + ptools_version (tuple, None): Version number of Pathway Tools (obtained from get_ptools_version function). 
""" logger.info('---------- Launching mpwt ----------') ptools_local_path = utils.find_ptools_path() @@ -355,6 +360,9 @@ def independent_mpwt(input_folder, output_folder=None, patho_inference=None, if input_folder: run_ids = [folder_id for folder_id in next(os.walk(input_folder))[1]] run_patho_flat_ids, run_flat_ids = check_input_and_existing_pgdb(run_ids, input_folder, output_folder, number_cpu_to_use) + if run_patho_flat_ids is None and run_flat_ids is None: + logger.critical('/!\\ Issue during input check.') + sys.exit() # Create path for lisp if there is no folder given. # Create the input for the creation of BioPAX/attribute-values files. @@ -420,8 +428,13 @@ def independent_mpwt(input_folder, output_folder=None, patho_inference=None, run_output_folder = False logger.info('/!\\ {0} contains already {1}, output files will not be moved.'.format(output_folder, run_id)) + # Test for compressed PGDB. + if os.path.exists(run_id_output_folder+'.zip'): + run_output_folder = False + logger.info('/!\\ {0} contains already compressed {1}, output files will not be moved.'.format(output_folder, run_id)) + multiprocess_run_mpwt = [run_id, run_input_folder, run_input_files_creation, run_output_folder, output_folder, run_patho_inference, pathologic_options, - run_flat_creation, move_options, taxon_file, permission] + run_flat_creation, move_options, taxon_file, permission, ptools_version] multiprocess_run_mpwts.append(multiprocess_run_mpwt) @@ -474,4 +487,3 @@ def independent_mpwt(input_folder, output_folder=None, patho_inference=None, end_time = time.time() logger.info('-------------- mpwt has finished in {0:.2f}s! Thank you for using it. 
--------------'.format(end_time - start_time)) - diff --git a/mpwt/pathologic_input.py b/mpwt/pathologic_input.py index 0e2be4b..faf1474 100755 --- a/mpwt/pathologic_input.py +++ b/mpwt/pathologic_input.py @@ -131,7 +131,8 @@ def check_input_and_existing_pgdb(run_ids, input_folder, output_folder, number_c if output_folder: if os.path.exists(output_folder): if os.path.isdir(output_folder): - already_present_outputs = [output_pgdb for output_pgdb in os.listdir(output_folder)] + # To handle PGDB created with size_reduction option, remove .zip extension. + already_present_outputs = [output_pgdb.replace('.zip', '') for output_pgdb in os.listdir(output_folder)] new_run_ids = clean_run_ids - set(already_present_outputs) new_run_ids = list(new_run_ids) for pgdb in already_present_outputs: @@ -520,39 +521,39 @@ def create_flats_and_lisp(run_folder, taxon_file): elif all([True for species_file in os.listdir(run_folder) if '.pf' in species_file or '.fasta' in species_file or '.fsa' in species_file]): genetic_writer = csv.writer(genetic_file, delimiter='\t', lineterminator='\n') for species_file in os.listdir(run_folder): - if '.pf' in species_file: - species_file_name = os.path.splitext(species_file)[0] - genetic_writer.writerow(['NAME', species_file.replace('.pf', '')]) - genetic_writer.writerow(['ID', species_file.replace('.pf', '')]) - genetic_writer.writerow(['ANNOT-FILE', species_file]) - fasta_path = os.path.join(run_folder, species_file.replace('.pf', '.fasta')) - fsa_path = os.path.join(run_folder, species_file.replace('.pf', '.fsa')) - if os.path.exists(fasta_path): - genetic_writer.writerow(['SEQ-FILE', species_file.replace('.pf', '.fasta')]) - elif os.path.exists(fsa_path): - genetic_writer.writerow(['SEQ-FILE', species_file.replace('.pf', '.fsa')]) - - if species_file_name in taxon_datas: - if 'circular' in taxon_datas[species_file_name]: - circular = taxon_datas[species_file_name]['circular'] - genetic_writer.writerow(['CIRCULAR?', circular]) - if 'element_type' 
in taxon_datas[species_file_name]: - element_type = taxon_datas[species_file_name]['element_type'] - genetic_writer.writerow(['TYPE', element_type]) - if 'codon_table' in taxon_datas[species_file_name]: - codon_table = taxon_datas[species_file_name]['codon_table'] - genetic_writer.writerow(['CODON-TABLE', codon_table]) - else: - if 'circular' in taxon_datas: - circular = taxon_datas['circular'] - genetic_writer.writerow(['CIRCULAR?', circular]) - if 'element_type' in taxon_datas: - element_type = taxon_datas['element_type'] - genetic_writer.writerow(['TYPE', element_type]) - if 'codon_table' in taxon_datas: - codon_table = taxon_datas['codon_table'] - genetic_writer.writerow(['CODON-TABLE', codon_table]) - genetic_writer.writerow(['//']) + if '.pf' in species_file: + species_file_name = os.path.splitext(species_file)[0] + genetic_writer.writerow(['NAME', species_file.replace('.pf', '')]) + genetic_writer.writerow(['ID', species_file.replace('.pf', '')]) + genetic_writer.writerow(['ANNOT-FILE', species_file]) + fasta_path = os.path.join(run_folder, species_file.replace('.pf', '.fasta')) + fsa_path = os.path.join(run_folder, species_file.replace('.pf', '.fsa')) + if os.path.exists(fasta_path): + genetic_writer.writerow(['SEQ-FILE', species_file.replace('.pf', '.fasta')]) + elif os.path.exists(fsa_path): + genetic_writer.writerow(['SEQ-FILE', species_file.replace('.pf', '.fsa')]) + + if species_file_name in taxon_datas: + if 'circular' in taxon_datas[species_file_name]: + circular = taxon_datas[species_file_name]['circular'] + genetic_writer.writerow(['CIRCULAR?', circular]) + if 'element_type' in taxon_datas[species_file_name]: + element_type = taxon_datas[species_file_name]['element_type'] + genetic_writer.writerow(['TYPE', element_type]) + if 'codon_table' in taxon_datas[species_file_name]: + codon_table = taxon_datas[species_file_name]['codon_table'] + genetic_writer.writerow(['CODON-TABLE', codon_table]) + else: + if 'circular' in taxon_datas: + circular = 
taxon_datas['circular'] + genetic_writer.writerow(['CIRCULAR?', circular]) + if 'element_type' in taxon_datas: + element_type = taxon_datas['element_type'] + genetic_writer.writerow(['TYPE', element_type]) + if 'codon_table' in taxon_datas: + codon_table = taxon_datas['codon_table'] + genetic_writer.writerow(['CODON-TABLE', codon_table]) + genetic_writer.writerow(['//']) if not os.path.exists(lisp_pathname): # Create the lisp script. @@ -563,64 +564,6 @@ def create_flats_and_lisp(run_folder, taxon_file): return all([os.path.isfile(organism_dat), os.path.isfile(genetic_dat), check_lisp_file]) -def read_taxon_id(run_folder): - """ - Search for Taxon ID in genbank or GFF files. - For GenBank file searc for ''taxon:' key in 'db_xref' qualifier. - For GFF file search for 'taxon' in dbxref feature. - - Args: - run_folder (str): path to the input folder - """ - taxon_ids = {} - - for input_folder in os.listdir(run_folder): - input_folder_path = os.path.join(run_folder, input_folder) - for input_file in os.listdir(input_folder_path): - if '.gbk' in input_file: - gbk_pathname = os.path.join(input_folder_path, input_file) - # Take the species name and the taxon id from the genbank file. - with open(gbk_pathname, "r") as gbk: - # Take the first record of the genbank (first contig/chromosome) to retrieve the species name. - first_seq_record = next(SeqIO.parse(gbk, "genbank")) - # Take the source feature of the first record. - # This feature contains the taxon ID in the db_xref qualifier. 
- src_features = [feature for feature in first_seq_record.features if feature.type == "source"] - for src_feature in src_features: - try: - src_dbxref_qualifiers = src_feature.qualifiers['db_xref'] - for src_dbxref_qualifier in src_dbxref_qualifiers: - if 'taxon:' in src_dbxref_qualifier: - taxon_id = src_dbxref_qualifier.replace('taxon:', '') - except KeyError: - logger.info('No taxon ID in the Genbank {0} In the FEATURES source you must have: /db_xref="taxon:taxonid" Where taxonid is the Id of your organism. You can find it on the NCBI.'.format(gbk_pathname)) - - elif '.gff' in input_file: - gff_pathname = os.path.join(input_folder_path, input_file) - - # Instead of parsing and creating a database from the GFF, parse the file and extract the first region feature. - try: - region_feature = [feature for feature in DataIterator(gff_pathname) if feature.featuretype == 'region'][0] - except IndexError: - raise IndexError('No region feature in the GFF file of {0}, GFF file must have region features.'.format(input_folder)) - - try: - region_feature.attributes['Dbxref'] - except KeyError: - raise KeyError('No Dbxref in GFF file of {0} GFF file must have a ;Dbxref=taxon:taxonid; in the region feature.'.format(input_folder)) - - for dbxref in region_feature.attributes['Dbxref']: - if 'taxon' in dbxref: - taxon_id = dbxref.split('taxon:')[1] - - elif '.pf' in input_file: - logger.info('No taxon ID associated to a PathoLogic Format. {0} will have a missing taxon_id'.format(input_folder)) - taxon_id = "missing" - taxon_ids[input_folder] = taxon_id - - return taxon_ids - - def pwt_input_files(run_folder, taxon_file): """ Check if files needed by Pathway Tools are available, if not create them. 
diff --git a/mpwt/utils.py b/mpwt/utils.py index f573970..7721ee5 100755 --- a/mpwt/utils.py +++ b/mpwt/utils.py @@ -85,6 +85,11 @@ def get_ptools_version(): if 'Pathway Tools version ' in ptools_line: ptools_version = tuple([int(nb_version) for nb_version in ptools_line.split('Pathway Tools version ')[1].split(' :::')[0].split('.')]) + if ptools_version is None: + logger.critical('mpwt could not find the version of Pathway Tools.') + logger.critical('It is possibly an issue with the installation of Pathway Tools (maybe it is not in the PATH). Or it can be due to a change in the output of pathway-tools -id command.') + sys.exit() + return ptools_version diff --git a/mpwt_pipeline.svg b/mpwt_pipeline.svg deleted file mode 100644 index a3e4259..0000000 --- a/mpwt_pipeline.svg +++ /dev/null @@ -1,1452 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - image/svg+xml - - - - - - - PathoLogic, GenbBank, GFF - PathoLogic, GenBank, GFF - taxon_id.tsv - - - - ptools-local folder - PGDBs creation - attribute-values flat files creation - --flat - output folder - - attribute-values flat files - - input folder - PathoLogic input files - - - - - - - - Species A - Species B - Species C - - - input folder - - --patho - -f - - - - - - Hole Filler: --hf - Operon Predictor: --op - Transport Inference: --tp - - mpwt - - All files - -o - --md - PGDBs storage - - - - - - Pathway Tools - - - --taxon-file - - metabolic-reactions.xml - --mx - biopax owl files - --mo - tabular files - --mc - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -