Merge pull request #74 from AuReMe/mpwt_0.7.1

Mpwt 0.7.1
AuReMe · Mar 18, 2022 · d4345a0 · d4345a0
2 parents c6bc771 + 5e49cb3
commit d4345a0
Show file tree

Hide file tree

Showing 6 changed files with 98 additions and 39 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,21 @@
 # Changelog
 
+# mpwt 0.7.1 (2022-03-18)
+
+## Add:
+
+- support for specifying reference PGDB (with taxon_id file) to be used in PathoLogic.
+- warning message when there is a missing pathologic.log file during log file creation (before mpwt crashes with a Python error associated with `species_pathologic_informations`).
+
+## Fix:
+
+- an issue where mpwt does not stop if there is an error in pwt_input_files.
+- input files not created if mpwt uses PGDB from ptools-local.
+
+## Modify:
+
+- update readme.
+
 # mpwt 0.7.0 (2022-02-03)
 
 This version should be compatible with Pathway Tools 25.5.

diff --git a/README.rst b/README.rst
@@ -210,7 +210,14 @@ You have to provide one nucleotide sequence (either '.fasta' or '.fsa' extension
     >scaffold_1
     ATGATGCTGATACTGACTTAGCAT
 
-Also to add the taxon ID we need the **taxon_id.tsv** (a tsv file with two values: the name of the folder containing the PF files and the taxon ID corresponding).
+You also need to add the taxon ID in the **taxon_id.tsv** file (a tsv file with two values: the name of the folder containing the PF files and the corresponding taxon ID).
+
+taxon_id.tsv file
++++++++++++++++++
+
+This tabulated file is required when using PathoLogic Format as input. It can also be used to give more information to Pathway Tools.
+
+A simple file looks like this:
 
 +------------+------------+
 |species     |taxon_id    |
@@ -222,17 +229,20 @@ If you don't have taxon ID in your Genbank or GFF file, you can add one in this
 
 You can also add more informations for the genetic elements like **circularity of genome** (Y or N), **type of genetic element** (:CHRSM, :PLASMID, :MT (mitochondrial chromosome), :PT (chloroplast chromosome), or :CONTIG) or **codon table** (see the corresponding code below).
 
+You can also specify a reference PGDB. This can be useful if you have a PGDB with manual curation, especially with reactions or pathways not present in MetaCyc.
+These reactions or pathways will be added into MetaCyc before reaction and pathway prediction (if the reactions or pathways are supported by evidence other than computational ones).
+
 Example:
 
-+------------+------------+------------+------------+------------+-------------------+
-|species     |taxon_id    |  circular  |element_type| codon_table| corresponding_file|
-+============+============+============+============+============+===================+
-|species_1   |10          |    Y       | :CHRSM     |1           |                   |
-+------------+------------+------------+------------+------------+-------------------+
-|species_4   |4           |    N       | :CHRSM     |1           |  scaffold_1       |
-+------------+------------+------------+------------+------------+-------------------+
-|species_4   |4           |    N       | :MT        |1           |  scaffold_2       |
-+------------+------------+------------+------------+------------+-------------------+
++------------+------------+------------+------------+------------+-------------------+----------------+
+|species     |taxon_id    |  circular  |element_type| codon_table| corresponding_file| reference_pgdb |
++============+============+============+============+============+===================+================+
+|species_1   |10          |    Y       | :CHRSM     |1           |                   |    pgdb_id     |
++------------+------------+------------+------------+------------+-------------------+----------------+
+|species_4   |4           |    N       | :CHRSM     |1           |  scaffold_1       |                |
++------------+------------+------------+------------+------------+-------------------+----------------+
+|species_4   |4           |    N       | :MT        |1           |  scaffold_2       |                |
++------------+------------+------------+------------+------------+-------------------+----------------+
 
 As you can see for **PF file** (species_4) you can use the column **corresponding_file** to add information for each PF files.
 
@@ -788,7 +798,7 @@ Output
 ~~~~~~
 
 If you did not use the output argument, results (PGDB with/without BioPAX/flat files) will be inside your ptools-local folder ready to be used with Pathway Tools.
-Have in mind that mpwt does not create the cellular overview and does not used the hole-filler. So if you want these results you should run them after.
+Keep in mind that mpwt does not create the cellular overview. So if you want it, you should generate it afterwards.
 
 The different file formats created are described on `Pathway Tools data-file format site <https://bioinformatics.ai.sri.com/ptools/flatfile-format.html>`__.
 

diff --git a/mpwt/__init__.py b/mpwt/__init__.py
@@ -17,4 +17,4 @@
 from mpwt.utils import cleaning, cleaning_input, find_ptools_path, list_pgdb, pubmed_citations, remove_pgdbs
 from mpwt.to_pathologic import create_pathologic_file
 
-__version__='0.7.0'
+__version__='0.7.1'
diff --git a/mpwt/mpwt_workflow.py b/mpwt/mpwt_workflow.py
@@ -156,9 +156,11 @@ def multiprocess_pwt(input_folder=None, output_folder=None, patho_inference=None
 def close_mpwt(mpwt_pool, no_download_articles, pathway_score=None, old_pathway_score=None):
     """End multiprocessing Pool and restore ptools-init.dat
 
-    mpwt_pool (multiprocessing Pool): mpwt multiprocessing Pool
-    no_download_articles (bool): turning off loading of PubMed citations (True/False)
-    pathway_score (float): score between 0 and 1 to accept or reject pathway
+    Args:
+        mpwt_pool (multiprocessing Pool): mpwt multiprocessing Pool
+        no_download_articles (bool): turning off loading of PubMed citations (True/False)
+        pathway_score (float): score between 0 and 1 to accept or reject pathway
+        old_pathway_score (float): original value of pathway score
     """
     mpwt_pool.close()
     mpwt_pool.join()
@@ -212,12 +214,19 @@ def run_mpwt(run_folder=None, input_folder=None, run_input_files_creation=None,
         run_flat_creation (bool): if True flat files will be created
         move_options (list): list of bool for: dat_extraction, size_reduction, xml_extraction, owl_extraction, col_extraction
         taxon_file (str): pathname to the mpwt taxon ID file
-        permission (str): Choose permission access to PGDB in ptools-local and output files, either 'all' or 'group' (by default it is user).
+        permission (str): Choose permission access to PGDB in ptools-local and output files, either 'all' or 'group' (by default it is user)
+    Returns:
+        run_folder (str): name of the folder containing input files
+        input_error_status (bool): if True an error occurs during pathologic input files creation
+        patho_error_status (bool): if True an error occurs during PathoLogic run
+        flat_error_status (bool): if True an error occurs during flat fiels creation
+        move_error_status (bool): if True an error occurs when moving output files
     """
     ptools_local_path = utils.find_ptools_path()
     pgdbs_folder_path = os.path.join(*[ptools_local_path, 'pgdbs', 'user'])
     species_pgdb_folder = os.path.join(pgdbs_folder_path, run_folder.lower() + 'cyc')
 
+    input_error_status = False
     patho_error_status = False
     flat_error_status = False
     move_error_status = False
@@ -226,31 +235,33 @@ def run_mpwt(run_folder=None, input_folder=None, run_input_files_creation=None,
         run_folder_path = os.path.join(input_folder, run_folder)
 
     if run_input_files_creation:
-        pwt_input_files(run_folder_path, taxon_file)
+        input_error_status = pwt_input_files(run_folder_path, taxon_file)
+        if input_error_status:
+            return run_folder, input_error_status, patho_error_status, flat_error_status, move_error_status
 
     if run_patho_inference:
         patho_error_status = run_pwt(run_folder_path, *pathologic_options)
         if patho_error_status:
-            return run_folder, patho_error_status, flat_error_status, move_error_status
+            return run_folder, input_error_status, patho_error_status, flat_error_status, move_error_status
 
     if run_flat_creation:
         flat_error_status = run_pwt_flat(run_folder_path)
         check_dat(run_folder_path, species_pgdb_folder)
         if flat_error_status:
-            return run_folder, patho_error_status, flat_error_status, move_error_status
+            return run_folder, input_error_status, patho_error_status, flat_error_status, move_error_status
 
     if permission:
         give_permission(permission, species_pgdb_folder)
 
     if run_output_folder:
         move_error_status = run_move_pgdb(run_folder, species_pgdb_folder, output_folder, *move_options)
         if move_error_status:
-            return run_folder, patho_error_status, flat_error_status, move_error_status
+            return run_folder, input_error_status, patho_error_status, flat_error_status, move_error_status
 
     if permission and output_folder:
         give_permission(permission, output_folder)
 
-    return run_folder, patho_error_status, flat_error_status, move_error_status
+    return run_folder, input_error_status, patho_error_status, flat_error_status, move_error_status
 
 
 def independent_mpwt(input_folder, output_folder=None, patho_inference=None,
@@ -325,12 +336,12 @@ def independent_mpwt(input_folder, output_folder=None, patho_inference=None,
     # Create the input for the creation of BioPAX/attribute-values files.
     if (flat_creation and not input_folder) or (output_folder and not input_folder):
         # Create a temporary folder in ptools-local where lisp script will be stored.
-        tmp_folder = os.path.join(ptools_local_path, 'tmp')
-        if not os.path.exists(tmp_folder):
-            os.mkdir(tmp_folder)
+        tmp_folder_path = os.path.join(ptools_local_path, 'tmp')
+        if not os.path.exists(tmp_folder_path):
+            os.mkdir(tmp_folder_path)
 
         # Create a lisp script file for each PGDB in the ptools-local folder.
-        run_ids = list(create_only_flat_lisp(pgdbs_folder_path, tmp_folder))
+        run_ids = list(create_only_flat_lisp(pgdbs_folder_path, tmp_folder_path))
         if flat_creation:
             run_flat_ids = list(run_ids)
         else:
@@ -350,16 +361,18 @@ def independent_mpwt(input_folder, output_folder=None, patho_inference=None,
     tmp_folder = False
 
     for run_id in run_ids:
+        if input_folder:
+            run_input_files_creation = True
+            run_input_folder = input_folder
         # For species without PGDB in ptools-local, launch input files creations, PathoLogic reconstruction, flat files creation and moving output files (according to user input)
         if run_patho_flat_ids and run_id in run_patho_flat_ids:
-            run_input_files_creation = True
             if patho_inference:
                 run_patho_inference = True
             if flat_creation:
                 run_flat_creation = True
             if output_folder:
                 run_output_folder = True
-        # For speccies with PGDB in ptools-local, launch only flat files creation and moving output files (according to user input)
+        # For species with PGDB in ptools-local, launch only flat files creation and moving output files (according to user input)
         if run_flat_ids and run_id in run_flat_ids:
             if flat_creation:
                 run_flat_creation = True
@@ -372,14 +385,14 @@ def independent_mpwt(input_folder, output_folder=None, patho_inference=None,
             # If flat_creation, flat files of these PGDBs will be created and moved to the output folder.
             if flat_creation:
                 run_flat_creation = True
-                input_folder = os.path.join(ptools_local_path, 'tmp')
+                run_input_folder = os.path.join(ptools_local_path, 'tmp')
             if output_folder:
                 if not os.path.exists(os.path.join(output_folder, run_id)):
                     run_output_folder = True
                 else:
                     logger.info('{0} contains already {1}, output files will not be moved'.format(output_folder, run_id))
 
-        multiprocess_run_mpwt = [run_id, input_folder, run_input_files_creation, run_output_folder, output_folder, run_patho_inference, pathologic_options,
+        multiprocess_run_mpwt = [run_id, run_input_folder, run_input_files_creation, run_output_folder, output_folder, run_patho_inference, pathologic_options,
                                 run_flat_creation, move_options, taxon_file, permission]
 
         multiprocess_run_mpwts.append(multiprocess_run_mpwt)

diff --git a/mpwt/pathologic_input.py b/mpwt/pathologic_input.py
@@ -330,6 +330,10 @@ def extract_taxon_id(run_folder, pgdb_id, taxon_id, taxon_file):
                     if codon_table is not None:
                         taxon_datas['codon_table'] = codon_table
 
+                if 'reference_pgdb' in data:
+                    if data['reference_pgdb'] != '':
+                        taxon_datas['reference_pgdbs'] = data['reference_pgdb'].split(',')
+
     if pgdb_id not in known_species and taxon_id == '':
         logger.critical('Missing pgdb ID for {0} in {1}.'.format(pgdb_id, taxon_id_path))
         return True, None, None
@@ -489,6 +493,9 @@ def create_flats_and_lisp(run_folder, taxon_file):
         organism_writer.writerow(['STORAGE', "FILE"])
         organism_writer.writerow(['NCBI-TAXON-ID', taxon_id])
         organism_writer.writerow(['NAME', species_name])
+        if 'reference_pgdbs' in taxon_datas:
+            for reference_pgdb in taxon_datas['reference_pgdbs']:
+                organism_writer.writerow(['REF-ORGID', reference_pgdb])
 
     # Create the genetic-elements dat file.
     with open(genetic_dat, 'w', encoding='utf-8') as genetic_file:
@@ -634,7 +641,7 @@ def pwt_input_files(run_folder, taxon_file):
         missing_string = 'Missing {0}'.format('; '.join(required_files.difference(files_in))) + '. Inputs file created for {0}'.format(species_folder)
         check_datas_lisp = create_flats_and_lisp(run_folder, taxon_file)
         if check_datas_lisp is None:
-            logger.critical('Error with the creation of input files of {0}.'.format(run_folder))
+            logger.critical('|Input Check|{0}| Error with the creation of input files of {1}.'.format(species_folder, run_folder))
             error_found = True
             return error_found
 

diff --git a/mpwt/results_check.py b/mpwt/results_check.py
@@ -153,17 +153,30 @@ def check_mpwt_pathologic_runs(species_input_folder_paths, patho_log_folder):
 
     failed_inferences = []
     passed_inferences = []
+    no_pathologic_files = []
     for species_input_folder_path in species_input_folder_paths:
         patho_log = os.path.join(species_input_folder_path, 'pathologic.log')
-        species_pathologic_informations = extract_pathologic(patho_log)
-
-        mpwt_pathologic_informations.append(species_pathologic_informations)
-        if species_pathologic_informations[3] is not None:
-            failed_inferences.append(species_pathologic_informations[0])
-        elif species_pathologic_informations[4] is not None:
-            passed_inferences.append(species_pathologic_informations[0])
-        elif species_pathologic_informations[3] is None and species_pathologic_informations[4] is not None:
-            failed_inferences.append(species_pathologic_informations[0])
+        if os.path.exists(patho_log):
+            species_pathologic_informations = extract_pathologic(patho_log)
+
+            mpwt_pathologic_informations.append(species_pathologic_informations)
+            if species_pathologic_informations[3] is not None:
+                failed_inferences.append(species_pathologic_informations[0])
+            elif species_pathologic_informations[4] is not None:
+                passed_inferences.append(species_pathologic_informations[0])
+            elif species_pathologic_informations[3] is None and species_pathologic_informations[4] is not None:
+                failed_inferences.append(species_pathologic_informations[0])
+        else:
+            logger.info('|Output Check|WARNING: No pathologic.log file for {0}, could not write log.'.format(species_input_folder_path))
+            base_name = os.path.basename(species_input_folder_path)
+            no_pathologic_files.append(base_name)
+            log_str = ''
+            log_str += '------------ Species: '
+            log_str += base_name + '\n'
+            log_str += 'No pathologic.log\n'
+            log_str += '------------\n\n'
+            mpwt_pathologic_informations.append([base_name, *['']*10, log_str, [['No pathologic.log', '', '', '', '']]])
+
 
     number_passed_inference = len(passed_inferences)
     number_failed_inference = len(failed_inferences)