Merge pull request #85 from AuReMe/mpwt_0.8.2

Mpwt 0.8.2
AuReMe · Dec 16, 2022 · c9a2eed · c9a2eed
2 parents 6511c06 + 9450aff
commit c9a2eed
Show file tree

Hide file tree

Showing 8 changed files with 77 additions and 1,555 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,19 @@
 # Changelog
 
+# mpwt 0.8.2 (2022-12-16)
+
+## Modify:
+
+- decrease the number of calls to get Pathway Tools version.
+- error message when not finding Pathway Tools version (issue #84).
+- remove unused code.
+
+## Fix:
+
+- issue with compressed files not being checked in output folder.
+- issue when no arguments are given as input.
+- issue where mpwt does not exit correctly after detecting an error during the input check step.
+
 # mpwt 0.8.1 (2022-09-30)
 
 ## Fix:

diff --git a/README.rst b/README.rst
@@ -4,7 +4,7 @@
 .. image:: https://img.shields.io/badge/doi-10.7554/eLife.61968-blueviolet.svg
     :target: https://doi.org/10.7554/eLife.61968
 
-.. image:: https://img.shields.io/badge/Pathway%20Tools-26.0-brightgreen
+.. image:: https://img.shields.io/badge/Pathway%20Tools-26.5-brightgreen
     :target: https://bioinformatics.ai.sri.com/ptools/release-notes.html
 
 mpwt: Multiprocessing Pathway Tools

diff --git a/mpwt/__init__.py b/mpwt/__init__.py
@@ -17,4 +17,4 @@
 from mpwt.utils import cleaning, cleaning_input, find_ptools_path, list_pgdb, pubmed_citations, remove_pgdbs
 from mpwt.to_pathologic import create_pathologic_file
 
-__version__='0.8.1'
+__version__='0.8.2'
diff --git a/mpwt/__main__.py b/mpwt/__main__.py
@@ -316,7 +316,7 @@ def run_mpwt():
     permission = args.permission
 
     # If no argument print the help.
-    if len(sys.argv) == 1:
+    if len(sys.argv) == 1 or (len(sys.argv) == 2 and verbose):
         parser.print_help()
         sys.exit(1)
 

diff --git a/mpwt/mpwt_workflow.py b/mpwt/mpwt_workflow.py
@@ -31,7 +31,7 @@
 from mpwt import utils
 from mpwt.pwt_wrapper import run_pwt, run_pwt_flat, run_move_pgdb
 from mpwt.results_check import check_dat, check_mpwt_pathologic_runs
-from mpwt.pathologic_input import check_input_and_existing_pgdb, pwt_input_files, create_only_flat_lisp, create_flat_creation_script, read_taxon_id
+from mpwt.pathologic_input import check_input_and_existing_pgdb, pwt_input_files, create_only_flat_lisp
 from multiprocessing import Pool
 
 logger = logging.getLogger(__name__)
@@ -79,6 +79,10 @@ def multiprocess_pwt(input_folder=None, output_folder=None, patho_inference=None
     if error:
         sys.exit(1)
 
+    # Check if patho_inference is launched with input_folder.
+    if patho_inference and not input_folder:
+        sys.exit('To use --patho/patho_inference you need to add the -f/input_folder argument.')
+
     # Check if patho_hole_filler or patho_log are launched with patho_inference.
     if (patho_hole_filler and not patho_inference) or (patho_log and not patho_inference):
         sys.exit('To use either --hf/patho_hole_filler or --log/patho_log, you need to add the --patho/patho_inference argument.')
@@ -161,7 +165,7 @@ def multiprocess_pwt(input_folder=None, output_folder=None, patho_inference=None
                         no_download_articles, flat_creation, dat_extraction,
                         xml_extraction, owl_extraction, col_extraction,
                         size_reduction, number_cpu_to_use, patho_log,
-                        pathway_score, taxon_file, permission)
+                        pathway_score, taxon_file, permission, ptools_version)
 
 
 def close_mpwt(mpwt_pool, no_download_articles, pathway_score=None, old_pathway_score=None):
@@ -211,7 +215,7 @@ def run_mpwt(run_folder=None, input_folder=None, run_input_files_creation=None,
                 run_output_folder=None, output_folder=None,
                 run_patho_inference=None, pathologic_options=None,
                 run_flat_creation=None, move_options=None,
-                taxon_file=None, permission=None):
+                taxon_file=None, permission=None, ptools_version=None):
     """ Single run of mpwt on one folder.
     Used in multiprocessing in independent_mpwt.
 
@@ -227,6 +231,7 @@ def run_mpwt(run_folder=None, input_folder=None, run_input_files_creation=None,
         move_options (list): list of bool for: dat_extraction, size_reduction, xml_extraction, owl_extraction, col_extraction
         taxon_file (str): pathname to the mpwt taxon ID file
         permission (str): Choose permission access to PGDB in ptools-local and output files, either 'all' or 'group' (by default it is user)
+        ptools_version (tuple, None): Version number of Pathway Tools (obtained from get_ptools_version function).
     Returns:
         run_folder (str): name of the folder containing input files
         input_error_status (bool): if True an error occurs during pathologic input files creation
@@ -243,8 +248,6 @@ def run_mpwt(run_folder=None, input_folder=None, run_input_files_creation=None,
     flat_error_status = False
     move_error_status = False
 
-    ptools_version = utils.get_ptools_version()
-
     if input_folder:
         run_folder_path = os.path.join(input_folder, run_folder)
 
@@ -293,7 +296,8 @@ def independent_mpwt(input_folder, output_folder=None, patho_inference=None,
                      no_download_articles=None, flat_creation=None, dat_extraction=None,
                      xml_extraction=None, owl_extraction=None, col_extraction=None,
                      size_reduction=None, number_cpu_to_use=None, patho_log=None,
-                     pathway_score=None, taxon_file=None, permission=None):
+                     pathway_score=None, taxon_file=None, permission=None,
+                     ptools_version= None):
     """
     Function managing the workflow for independent run of mpwt.
     Each process of Pathway Tools on an organism are run separatly so if one failed the other that passed will succeed.
@@ -318,6 +322,7 @@ def independent_mpwt(input_folder, output_folder=None, patho_inference=None,
         pathway_score (float): score between 0 and 1 to accept or reject pathway
         taxon_file (str): pathname to the mpwt taxon ID file
         permission (str): Choose permission access to PGDB in ptools-local and output files, either 'all' or 'group' (by default it is user).
+        ptools_version (tuple, None): Version number of Pathway Tools (obtained from get_ptools_version function).
     """
     logger.info('---------- Launching mpwt ----------')
     ptools_local_path = utils.find_ptools_path()
@@ -355,6 +360,9 @@ def independent_mpwt(input_folder, output_folder=None, patho_inference=None,
     if input_folder:
         run_ids = [folder_id for folder_id in next(os.walk(input_folder))[1]]
         run_patho_flat_ids, run_flat_ids = check_input_and_existing_pgdb(run_ids, input_folder, output_folder, number_cpu_to_use)
+        if run_patho_flat_ids is None and run_flat_ids is None:
+            logger.critical('/!\\ Issue during input check.')
+            sys.exit()
 
     # Create path for lisp if there is no folder given.
     # Create the input for the creation of BioPAX/attribute-values files.
@@ -420,8 +428,13 @@ def independent_mpwt(input_folder, output_folder=None, patho_inference=None,
                 run_output_folder = False
                 logger.info('/!\\ {0} contains already {1}, output files will not be moved.'.format(output_folder, run_id))
 
+            # Test for compressed PGDB.
+            if os.path.exists(run_id_output_folder+'.zip'):
+                run_output_folder = False
+                logger.info('/!\\ {0} contains already compressed {1}, output files will not be moved.'.format(output_folder, run_id))
+
         multiprocess_run_mpwt = [run_id, run_input_folder, run_input_files_creation, run_output_folder, output_folder, run_patho_inference, pathologic_options,
-                                run_flat_creation, move_options, taxon_file, permission]
+                                run_flat_creation, move_options, taxon_file, permission, ptools_version]
 
         multiprocess_run_mpwts.append(multiprocess_run_mpwt)
 
@@ -474,4 +487,3 @@ def independent_mpwt(input_folder, output_folder=None, patho_inference=None,
     end_time = time.time()
 
     logger.info('-------------- mpwt has finished in {0:.2f}s! Thank you for using it. --------------'.format(end_time - start_time))
-
diff --git a/mpwt/pathologic_input.py b/mpwt/pathologic_input.py
@@ -131,7 +131,8 @@ def check_input_and_existing_pgdb(run_ids, input_folder, output_folder, number_c
     if output_folder:
         if os.path.exists(output_folder):
             if os.path.isdir(output_folder):
-                already_present_outputs = [output_pgdb for output_pgdb in os.listdir(output_folder)]
+                # To handle PGDB created with size_reduction option, remove .zip extension.
+                already_present_outputs = [output_pgdb.replace('.zip', '') for output_pgdb in os.listdir(output_folder)]
                 new_run_ids = clean_run_ids - set(already_present_outputs)
                 new_run_ids = list(new_run_ids)
                 for pgdb in already_present_outputs:
@@ -520,39 +521,39 @@ def create_flats_and_lisp(run_folder, taxon_file):
             elif all([True for species_file in os.listdir(run_folder) if '.pf' in species_file or '.fasta' in species_file or '.fsa' in species_file]):
                 genetic_writer = csv.writer(genetic_file, delimiter='\t', lineterminator='\n')
                 for species_file in os.listdir(run_folder):
-                        if '.pf' in species_file:
-                            species_file_name = os.path.splitext(species_file)[0]
-                            genetic_writer.writerow(['NAME', species_file.replace('.pf', '')])
-                            genetic_writer.writerow(['ID', species_file.replace('.pf', '')])
-                            genetic_writer.writerow(['ANNOT-FILE', species_file])
-                            fasta_path = os.path.join(run_folder, species_file.replace('.pf', '.fasta'))
-                            fsa_path = os.path.join(run_folder, species_file.replace('.pf', '.fsa'))
-                            if os.path.exists(fasta_path):
-                                genetic_writer.writerow(['SEQ-FILE', species_file.replace('.pf', '.fasta')])
-                            elif os.path.exists(fsa_path):
-                                genetic_writer.writerow(['SEQ-FILE', species_file.replace('.pf', '.fsa')])
-
-                            if species_file_name in taxon_datas:
-                                if 'circular' in taxon_datas[species_file_name]:
-                                    circular = taxon_datas[species_file_name]['circular']
-                                    genetic_writer.writerow(['CIRCULAR?', circular])
-                                if 'element_type' in taxon_datas[species_file_name]:
-                                    element_type = taxon_datas[species_file_name]['element_type']
-                                    genetic_writer.writerow(['TYPE', element_type])
-                                if 'codon_table' in taxon_datas[species_file_name]:
-                                    codon_table = taxon_datas[species_file_name]['codon_table']
-                                    genetic_writer.writerow(['CODON-TABLE', codon_table])
-                            else:
-                                if 'circular' in taxon_datas:
-                                    circular = taxon_datas['circular']
-                                    genetic_writer.writerow(['CIRCULAR?', circular])
-                                if 'element_type' in taxon_datas:
-                                    element_type = taxon_datas['element_type']
-                                    genetic_writer.writerow(['TYPE', element_type])
-                                if 'codon_table' in taxon_datas:
-                                    codon_table = taxon_datas['codon_table']
-                                    genetic_writer.writerow(['CODON-TABLE', codon_table])
-                            genetic_writer.writerow(['//'])
+                    if '.pf' in species_file:
+                        species_file_name = os.path.splitext(species_file)[0]
+                        genetic_writer.writerow(['NAME', species_file.replace('.pf', '')])
+                        genetic_writer.writerow(['ID', species_file.replace('.pf', '')])
+                        genetic_writer.writerow(['ANNOT-FILE', species_file])
+                        fasta_path = os.path.join(run_folder, species_file.replace('.pf', '.fasta'))
+                        fsa_path = os.path.join(run_folder, species_file.replace('.pf', '.fsa'))
+                        if os.path.exists(fasta_path):
+                            genetic_writer.writerow(['SEQ-FILE', species_file.replace('.pf', '.fasta')])
+                        elif os.path.exists(fsa_path):
+                            genetic_writer.writerow(['SEQ-FILE', species_file.replace('.pf', '.fsa')])
+
+                        if species_file_name in taxon_datas:
+                            if 'circular' in taxon_datas[species_file_name]:
+                                circular = taxon_datas[species_file_name]['circular']
+                                genetic_writer.writerow(['CIRCULAR?', circular])
+                            if 'element_type' in taxon_datas[species_file_name]:
+                                element_type = taxon_datas[species_file_name]['element_type']
+                                genetic_writer.writerow(['TYPE', element_type])
+                            if 'codon_table' in taxon_datas[species_file_name]:
+                                codon_table = taxon_datas[species_file_name]['codon_table']
+                                genetic_writer.writerow(['CODON-TABLE', codon_table])
+                        else:
+                            if 'circular' in taxon_datas:
+                                circular = taxon_datas['circular']
+                                genetic_writer.writerow(['CIRCULAR?', circular])
+                            if 'element_type' in taxon_datas:
+                                element_type = taxon_datas['element_type']
+                                genetic_writer.writerow(['TYPE', element_type])
+                            if 'codon_table' in taxon_datas:
+                                codon_table = taxon_datas['codon_table']
+                                genetic_writer.writerow(['CODON-TABLE', codon_table])
+                        genetic_writer.writerow(['//'])
 
     if not os.path.exists(lisp_pathname):
     # Create the lisp script.
@@ -563,64 +564,6 @@ def create_flats_and_lisp(run_folder, taxon_file):
     return all([os.path.isfile(organism_dat), os.path.isfile(genetic_dat), check_lisp_file])
 
 
-def read_taxon_id(run_folder):
-    """
-    Search for Taxon ID in genbank or GFF files.
-    For GenBank file searc for ''taxon:' key in 'db_xref' qualifier.
-    For GFF file search for 'taxon' in dbxref feature.
-
-    Args:
-        run_folder (str): path to the input folder
-    """
-    taxon_ids = {}
-
-    for input_folder in os.listdir(run_folder):
-        input_folder_path = os.path.join(run_folder, input_folder)
-        for input_file in os.listdir(input_folder_path):
-            if '.gbk' in input_file:
-                gbk_pathname = os.path.join(input_folder_path, input_file)
-                # Take the species name and the taxon id from the genbank file.
-                with open(gbk_pathname, "r") as gbk:
-                    # Take the first record of the genbank (first contig/chromosome) to retrieve the species name.
-                    first_seq_record = next(SeqIO.parse(gbk, "genbank"))
-                    # Take the source feature of the first record.
-                    # This feature contains the taxon ID in the db_xref qualifier.
-                    src_features = [feature for feature in first_seq_record.features if feature.type == "source"]
-                    for src_feature in src_features:
-                        try:
-                            src_dbxref_qualifiers = src_feature.qualifiers['db_xref']
-                            for src_dbxref_qualifier in src_dbxref_qualifiers:
-                                if 'taxon:' in src_dbxref_qualifier:
-                                    taxon_id = src_dbxref_qualifier.replace('taxon:', '')
-                        except KeyError:
-                            logger.info('No taxon ID in the Genbank {0} In the FEATURES source you must have: /db_xref="taxon:taxonid" Where taxonid is the Id of your organism. You can find it on the NCBI.'.format(gbk_pathname))
-
-            elif '.gff' in input_file:
-                gff_pathname = os.path.join(input_folder_path, input_file)
-
-                # Instead of parsing and creating a database from the GFF, parse the file and extract the first region feature.
-                try:
-                    region_feature = [feature for feature in DataIterator(gff_pathname) if feature.featuretype == 'region'][0]
-                except IndexError:
-                    raise IndexError('No region feature in the GFF file of {0}, GFF file must have region features.'.format(input_folder))
-
-                try:
-                    region_feature.attributes['Dbxref']
-                except KeyError:
-                    raise KeyError('No Dbxref in GFF file of {0} GFF file must have a ;Dbxref=taxon:taxonid; in the region feature.'.format(input_folder))
-
-                for dbxref in region_feature.attributes['Dbxref']:
-                    if 'taxon' in dbxref:
-                        taxon_id = dbxref.split('taxon:')[1]
-
-            elif '.pf' in input_file:
-                logger.info('No taxon ID associated to a PathoLogic Format. {0} will have a missing taxon_id'.format(input_folder))
-                taxon_id = "missing"
-        taxon_ids[input_folder] = taxon_id
-
-    return taxon_ids
-
-
 def pwt_input_files(run_folder, taxon_file):
     """
     Check if files needed by Pathway Tools are available, if not create them.

diff --git a/mpwt/utils.py b/mpwt/utils.py
@@ -85,6 +85,11 @@ def get_ptools_version():
         if 'Pathway Tools version ' in ptools_line:
             ptools_version = tuple([int(nb_version) for nb_version in ptools_line.split('Pathway Tools version ')[1].split('  :::')[0].split('.')])
 
+    if ptools_version is None:
+        logger.critical('mpwt could not find the version of Pathway Tools.')
+        logger.critical('It is possibly an issue with the installation of Pathway Tools (maybe it is not in the PATH). Or it can be due to a change in the output of pathway-tools -id command.')
+        sys.exit()
+
     return ptools_version