Skip to content

Commit

Permalink
Merge pull request #85 from AuReMe/mpwt_0.8.2
Browse files Browse the repository at this point in the history
Mpwt 0.8.2
  • Loading branch information
ArnaudBelcour authored Dec 16, 2022
2 parents 6511c06 + 9450aff commit c9a2eed
Show file tree
Hide file tree
Showing 8 changed files with 77 additions and 1,555 deletions.
14 changes: 14 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,19 @@
# Changelog

# mpwt 0.8.2 (2022-12-16)

## Modify:

- decrease the number of calls to get Pathway Tools version.
- error message when not finding Pathway Tools version (issue #84).
- remove unused code.

## Fix:

- issue with compressed files not being checked in output folder.
- issue when no arguments are given as input.
- issue where mpwt did not exit correctly after detecting an error during the input check step.

# mpwt 0.8.1 (2022-09-30)

## Fix:
Expand Down
2 changes: 1 addition & 1 deletion README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
.. image:: https://img.shields.io/badge/doi-10.7554/eLife.61968-blueviolet.svg
:target: https://doi.org/10.7554/eLife.61968

.. image:: https://img.shields.io/badge/Pathway%20Tools-26.0-brightgreen
.. image:: https://img.shields.io/badge/Pathway%20Tools-26.5-brightgreen
:target: https://bioinformatics.ai.sri.com/ptools/release-notes.html

mpwt: Multiprocessing Pathway Tools
Expand Down
2 changes: 1 addition & 1 deletion mpwt/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,4 @@
from mpwt.utils import cleaning, cleaning_input, find_ptools_path, list_pgdb, pubmed_citations, remove_pgdbs
from mpwt.to_pathologic import create_pathologic_file

__version__='0.8.1'
__version__='0.8.2'
2 changes: 1 addition & 1 deletion mpwt/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -316,7 +316,7 @@ def run_mpwt():
permission = args.permission

# If no argument print the help.
if len(sys.argv) == 1:
if len(sys.argv) == 1 or (len(sys.argv) == 2 and verbose):
parser.print_help()
sys.exit(1)

Expand Down
28 changes: 20 additions & 8 deletions mpwt/mpwt_workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
from mpwt import utils
from mpwt.pwt_wrapper import run_pwt, run_pwt_flat, run_move_pgdb
from mpwt.results_check import check_dat, check_mpwt_pathologic_runs
from mpwt.pathologic_input import check_input_and_existing_pgdb, pwt_input_files, create_only_flat_lisp, create_flat_creation_script, read_taxon_id
from mpwt.pathologic_input import check_input_and_existing_pgdb, pwt_input_files, create_only_flat_lisp
from multiprocessing import Pool

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -79,6 +79,10 @@ def multiprocess_pwt(input_folder=None, output_folder=None, patho_inference=None
if error:
sys.exit(1)

# Check if patho_inference is launched with input_folder.
if patho_inference and not input_folder:
sys.exit('To use --patho/patho_inference you need to add the -f/input_folder argument.')

# Check if patho_hole_filler or patho_log are launched with patho_inference.
if (patho_hole_filler and not patho_inference) or (patho_log and not patho_inference):
sys.exit('To use either --hf/patho_hole_filler or --log/patho_log, you need to add the --patho/patho_inference argument.')
Expand Down Expand Up @@ -161,7 +165,7 @@ def multiprocess_pwt(input_folder=None, output_folder=None, patho_inference=None
no_download_articles, flat_creation, dat_extraction,
xml_extraction, owl_extraction, col_extraction,
size_reduction, number_cpu_to_use, patho_log,
pathway_score, taxon_file, permission)
pathway_score, taxon_file, permission, ptools_version)


def close_mpwt(mpwt_pool, no_download_articles, pathway_score=None, old_pathway_score=None):
Expand Down Expand Up @@ -211,7 +215,7 @@ def run_mpwt(run_folder=None, input_folder=None, run_input_files_creation=None,
run_output_folder=None, output_folder=None,
run_patho_inference=None, pathologic_options=None,
run_flat_creation=None, move_options=None,
taxon_file=None, permission=None):
taxon_file=None, permission=None, ptools_version=None):
""" Single run of mpwt on one folder.
Used in multiprocessing in independent_mpwt.
Expand All @@ -227,6 +231,7 @@ def run_mpwt(run_folder=None, input_folder=None, run_input_files_creation=None,
move_options (list): list of bool for: dat_extraction, size_reduction, xml_extraction, owl_extraction, col_extraction
taxon_file (str): pathname to the mpwt taxon ID file
permission (str): Choose permission access to PGDB in ptools-local and output files, either 'all' or 'group' (by default it is user)
ptools_version (tuple, None): Version number of Pathway Tools (obtained from get_ptools_version function).
Returns:
run_folder (str): name of the folder containing input files
input_error_status (bool): if True an error occurs during pathologic input files creation
Expand All @@ -243,8 +248,6 @@ def run_mpwt(run_folder=None, input_folder=None, run_input_files_creation=None,
flat_error_status = False
move_error_status = False

ptools_version = utils.get_ptools_version()

if input_folder:
run_folder_path = os.path.join(input_folder, run_folder)

Expand Down Expand Up @@ -293,7 +296,8 @@ def independent_mpwt(input_folder, output_folder=None, patho_inference=None,
no_download_articles=None, flat_creation=None, dat_extraction=None,
xml_extraction=None, owl_extraction=None, col_extraction=None,
size_reduction=None, number_cpu_to_use=None, patho_log=None,
pathway_score=None, taxon_file=None, permission=None):
pathway_score=None, taxon_file=None, permission=None,
ptools_version= None):
"""
Function managing the workflow for independent run of mpwt.
Each Pathway Tools process is run separately for each organism, so if one fails the others can still succeed.
Expand All @@ -318,6 +322,7 @@ def independent_mpwt(input_folder, output_folder=None, patho_inference=None,
pathway_score (float): score between 0 and 1 to accept or reject pathway
taxon_file (str): pathname to the mpwt taxon ID file
permission (str): Choose permission access to PGDB in ptools-local and output files, either 'all' or 'group' (by default it is user).
ptools_version (tuple, None): Version number of Pathway Tools (obtained from get_ptools_version function).
"""
logger.info('---------- Launching mpwt ----------')
ptools_local_path = utils.find_ptools_path()
Expand Down Expand Up @@ -355,6 +360,9 @@ def independent_mpwt(input_folder, output_folder=None, patho_inference=None,
if input_folder:
run_ids = [folder_id for folder_id in next(os.walk(input_folder))[1]]
run_patho_flat_ids, run_flat_ids = check_input_and_existing_pgdb(run_ids, input_folder, output_folder, number_cpu_to_use)
if run_patho_flat_ids is None and run_flat_ids is None:
logger.critical('/!\\ Issue during input check.')
sys.exit()

# Create path for lisp if there is no folder given.
# Create the input for the creation of BioPAX/attribute-values files.
Expand Down Expand Up @@ -420,8 +428,13 @@ def independent_mpwt(input_folder, output_folder=None, patho_inference=None,
run_output_folder = False
logger.info('/!\\ {0} contains already {1}, output files will not be moved.'.format(output_folder, run_id))

# Test for compressed PGDB.
if os.path.exists(run_id_output_folder+'.zip'):
run_output_folder = False
logger.info('/!\\ {0} contains already compressed {1}, output files will not be moved.'.format(output_folder, run_id))

multiprocess_run_mpwt = [run_id, run_input_folder, run_input_files_creation, run_output_folder, output_folder, run_patho_inference, pathologic_options,
run_flat_creation, move_options, taxon_file, permission]
run_flat_creation, move_options, taxon_file, permission, ptools_version]

multiprocess_run_mpwts.append(multiprocess_run_mpwt)

Expand Down Expand Up @@ -474,4 +487,3 @@ def independent_mpwt(input_folder, output_folder=None, patho_inference=None,
end_time = time.time()

logger.info('-------------- mpwt has finished in {0:.2f}s! Thank you for using it. --------------'.format(end_time - start_time))

127 changes: 35 additions & 92 deletions mpwt/pathologic_input.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,8 @@ def check_input_and_existing_pgdb(run_ids, input_folder, output_folder, number_c
if output_folder:
if os.path.exists(output_folder):
if os.path.isdir(output_folder):
already_present_outputs = [output_pgdb for output_pgdb in os.listdir(output_folder)]
# To handle PGDB created with size_reduction option, remove .zip extension.
already_present_outputs = [output_pgdb.replace('.zip', '') for output_pgdb in os.listdir(output_folder)]
new_run_ids = clean_run_ids - set(already_present_outputs)
new_run_ids = list(new_run_ids)
for pgdb in already_present_outputs:
Expand Down Expand Up @@ -520,39 +521,39 @@ def create_flats_and_lisp(run_folder, taxon_file):
elif all([True for species_file in os.listdir(run_folder) if '.pf' in species_file or '.fasta' in species_file or '.fsa' in species_file]):
genetic_writer = csv.writer(genetic_file, delimiter='\t', lineterminator='\n')
for species_file in os.listdir(run_folder):
if '.pf' in species_file:
species_file_name = os.path.splitext(species_file)[0]
genetic_writer.writerow(['NAME', species_file.replace('.pf', '')])
genetic_writer.writerow(['ID', species_file.replace('.pf', '')])
genetic_writer.writerow(['ANNOT-FILE', species_file])
fasta_path = os.path.join(run_folder, species_file.replace('.pf', '.fasta'))
fsa_path = os.path.join(run_folder, species_file.replace('.pf', '.fsa'))
if os.path.exists(fasta_path):
genetic_writer.writerow(['SEQ-FILE', species_file.replace('.pf', '.fasta')])
elif os.path.exists(fsa_path):
genetic_writer.writerow(['SEQ-FILE', species_file.replace('.pf', '.fsa')])

if species_file_name in taxon_datas:
if 'circular' in taxon_datas[species_file_name]:
circular = taxon_datas[species_file_name]['circular']
genetic_writer.writerow(['CIRCULAR?', circular])
if 'element_type' in taxon_datas[species_file_name]:
element_type = taxon_datas[species_file_name]['element_type']
genetic_writer.writerow(['TYPE', element_type])
if 'codon_table' in taxon_datas[species_file_name]:
codon_table = taxon_datas[species_file_name]['codon_table']
genetic_writer.writerow(['CODON-TABLE', codon_table])
else:
if 'circular' in taxon_datas:
circular = taxon_datas['circular']
genetic_writer.writerow(['CIRCULAR?', circular])
if 'element_type' in taxon_datas:
element_type = taxon_datas['element_type']
genetic_writer.writerow(['TYPE', element_type])
if 'codon_table' in taxon_datas:
codon_table = taxon_datas['codon_table']
genetic_writer.writerow(['CODON-TABLE', codon_table])
genetic_writer.writerow(['//'])
if '.pf' in species_file:
species_file_name = os.path.splitext(species_file)[0]
genetic_writer.writerow(['NAME', species_file.replace('.pf', '')])
genetic_writer.writerow(['ID', species_file.replace('.pf', '')])
genetic_writer.writerow(['ANNOT-FILE', species_file])
fasta_path = os.path.join(run_folder, species_file.replace('.pf', '.fasta'))
fsa_path = os.path.join(run_folder, species_file.replace('.pf', '.fsa'))
if os.path.exists(fasta_path):
genetic_writer.writerow(['SEQ-FILE', species_file.replace('.pf', '.fasta')])
elif os.path.exists(fsa_path):
genetic_writer.writerow(['SEQ-FILE', species_file.replace('.pf', '.fsa')])

if species_file_name in taxon_datas:
if 'circular' in taxon_datas[species_file_name]:
circular = taxon_datas[species_file_name]['circular']
genetic_writer.writerow(['CIRCULAR?', circular])
if 'element_type' in taxon_datas[species_file_name]:
element_type = taxon_datas[species_file_name]['element_type']
genetic_writer.writerow(['TYPE', element_type])
if 'codon_table' in taxon_datas[species_file_name]:
codon_table = taxon_datas[species_file_name]['codon_table']
genetic_writer.writerow(['CODON-TABLE', codon_table])
else:
if 'circular' in taxon_datas:
circular = taxon_datas['circular']
genetic_writer.writerow(['CIRCULAR?', circular])
if 'element_type' in taxon_datas:
element_type = taxon_datas['element_type']
genetic_writer.writerow(['TYPE', element_type])
if 'codon_table' in taxon_datas:
codon_table = taxon_datas['codon_table']
genetic_writer.writerow(['CODON-TABLE', codon_table])
genetic_writer.writerow(['//'])

if not os.path.exists(lisp_pathname):
# Create the lisp script.
Expand All @@ -563,64 +564,6 @@ def create_flats_and_lisp(run_folder, taxon_file):
return all([os.path.isfile(organism_dat), os.path.isfile(genetic_dat), check_lisp_file])


def read_taxon_id(run_folder):
    """
    Search for Taxon ID in GenBank or GFF files.
    For GenBank file search for 'taxon:' key in 'db_xref' qualifier of the source feature(s) of the first record.
    For GFF file search for 'taxon:' in the Dbxref attribute of the first region feature.
    PathoLogic Format (.pf) files carry no taxon ID, so the taxon ID is set to "missing".

    Args:
        run_folder (str): path to the input folder (containing one sub-folder per organism)
    Returns:
        taxon_ids (dict): input sub-folder name (key) associated with its taxon ID (str), "missing" if no taxon ID was found
    Raises:
        IndexError: if a GFF file does not contain any region feature
        KeyError: if the first region feature of a GFF file has no Dbxref attribute
    """
    taxon_ids = {}

    for input_folder in os.listdir(run_folder):
        input_folder_path = os.path.join(run_folder, input_folder)
        # Reset the taxon ID for each organism: otherwise a folder without any usable file
        # would either fail on an unbound variable or inherit the ID of the previous organism.
        taxon_id = "missing"

        for input_file in os.listdir(input_folder_path):
            if '.gbk' in input_file:
                gbk_pathname = os.path.join(input_folder_path, input_file)
                # Take the taxon id from the genbank file.
                with open(gbk_pathname, "r") as gbk:
                    # Only the first record of the genbank (first contig/chromosome) is needed.
                    first_seq_record = next(SeqIO.parse(gbk, "genbank"))
                # Take the source feature of the first record.
                # This feature contains the taxon ID in the db_xref qualifier.
                src_features = [feature for feature in first_seq_record.features if feature.type == "source"]
                for src_feature in src_features:
                    try:
                        src_dbxref_qualifiers = src_feature.qualifiers['db_xref']
                    except KeyError:
                        logger.info('No taxon ID in the Genbank {0} In the FEATURES source you must have: /db_xref="taxon:taxonid" Where taxonid is the Id of your organism. You can find it on the NCBI.'.format(gbk_pathname))
                        continue
                    for src_dbxref_qualifier in src_dbxref_qualifiers:
                        if 'taxon:' in src_dbxref_qualifier:
                            taxon_id = src_dbxref_qualifier.replace('taxon:', '')

            elif '.gff' in input_file:
                gff_pathname = os.path.join(input_folder_path, input_file)

                # Instead of parsing and creating a database from the GFF, iterate on the file
                # and stop at the first region feature.
                region_feature = next((feature for feature in DataIterator(gff_pathname) if feature.featuretype == 'region'), None)
                if region_feature is None:
                    raise IndexError('No region feature in the GFF file of {0}, GFF file must have region features.'.format(input_folder))

                try:
                    dbxrefs = region_feature.attributes['Dbxref']
                except KeyError:
                    raise KeyError('No Dbxref in GFF file of {0} GFF file must have a ;Dbxref=taxon:taxonid; in the region feature.'.format(input_folder))

                for dbxref in dbxrefs:
                    # Test for 'taxon:' (with the colon) as it is the separator used by the split.
                    if 'taxon:' in dbxref:
                        taxon_id = dbxref.split('taxon:')[1]

            elif '.pf' in input_file:
                logger.info('No taxon ID associated to a PathoLogic Format. {0} will have a missing taxon_id'.format(input_folder))
                taxon_id = "missing"

        taxon_ids[input_folder] = taxon_id

    return taxon_ids


def pwt_input_files(run_folder, taxon_file):
"""
Check if files needed by Pathway Tools are available, if not create them.
Expand Down
5 changes: 5 additions & 0 deletions mpwt/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,11 @@ def get_ptools_version():
if 'Pathway Tools version ' in ptools_line:
ptools_version = tuple([int(nb_version) for nb_version in ptools_line.split('Pathway Tools version ')[1].split(' :::')[0].split('.')])

if ptools_version is None:
logger.critical('mpwt could not find the version of Pathway Tools.')
logger.critical('It is possibly an issue with the installation of Pathway Tools (maybe it is not in the PATH). Or it can be due to a change in the output of pathway-tools -id command.')
sys.exit()

return ptools_version


Expand Down
Loading

0 comments on commit c9a2eed

Please sign in to comment.