From 5594ccf6540c11cc478e70e7e170d62b9c528aa2 Mon Sep 17 00:00:00 2001 From: gbouras13 Date: Mon, 20 Nov 2023 10:31:18 +1030 Subject: [PATCH 1/7] fix canu correct error messages --- src/plassembler/__init__.py | 2 +- src/plassembler/utils/external_tools.py | 14 ++++++++++++-- src/plassembler/utils/run_canu.py | 4 ++-- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/src/plassembler/__init__.py b/src/plassembler/__init__.py index 60449c8..df82a19 100644 --- a/src/plassembler/__init__.py +++ b/src/plassembler/__init__.py @@ -1497,7 +1497,7 @@ def long( corrected_fasta_to_fastq(canu_reads, corrected_fastqs) except: logger.warning( - "canu correct failed to correct any reads. Advancing with uncorrected reads" + "Advancing with uncorrected reads" ) corrected_fastqs = entropy_filtered_fastq diff --git a/src/plassembler/utils/external_tools.py b/src/plassembler/utils/external_tools.py index 59a22cf..b942de8 100644 --- a/src/plassembler/utils/external_tools.py +++ b/src/plassembler/utils/external_tools.py @@ -115,13 +115,23 @@ def run_tool( "Dnaapler failed to reorient any putative plasmids to begin with repA." ) logger.warning("Continuing with the un-reoriented contigs.") - elif tool.tool_str == "canu": # for dnaapler errors + elif tool.tool_str == "canu": # for canu errors logger.warning( - "canu failed to assemble anything from the unmapped reads." + "Canu failed to assemble anything from the unmapped reads." ) logger.warning( f"If you think your sample should still have plasmids, please check stdout log file: {tool.out_log} and stderr log file: {tool.err_log}" ) + elif tool.tool_str == "canu -correct": # for canu errors + logger.warning( + "Canu failed to correct any reads." + ) + logger.warning( + "This probably means there is low depth, don't be too concerned." + ) + logger.warning( + f"If you are concerned, check stdout log file: {tool.out_log} and stderr log file: {tool.err_log}." 
+ ) else: logger.warning( f"Error calling {tool.command_as_str} (return code {error.returncode})" diff --git a/src/plassembler/utils/run_canu.py b/src/plassembler/utils/run_canu.py index 196d45a..d447847 100644 --- a/src/plassembler/utils/run_canu.py +++ b/src/plassembler/utils/run_canu.py @@ -30,10 +30,10 @@ def run_canu_correct( total_flye_plasmid_length = round(total_flye_plasmid_length / 1000000, 5) try: canu = ExternalTool( - tool="canu", + tool="canu -correct", input="", output="", - params=f" -correct -p canu -d {canu_output_dir} genomeSize={total_flye_plasmid_length}m maxInputCoverage={coverage} stopOnLowCoverage=1 maxThreads={threads} -{canu_nano_or_pacbio} correctedErrorRate={corrected_error_rate} {longreads}", + params=f" -p canu -d {canu_output_dir} genomeSize={total_flye_plasmid_length}m maxInputCoverage={coverage} stopOnLowCoverage=1 maxThreads={threads} -{canu_nano_or_pacbio} correctedErrorRate={corrected_error_rate} {longreads}", logdir=logdir, outfile="", ) From 2f0025ee272d1e0345b15a5b1764905695d94d25 Mon Sep 17 00:00:00 2001 From: gbouras13 Date: Mon, 20 Nov 2023 10:34:31 +1030 Subject: [PATCH 2/7] replace space with underscore in log name --- src/plassembler/utils/external_tools.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/plassembler/utils/external_tools.py b/src/plassembler/utils/external_tools.py index b942de8..10d362a 100644 --- a/src/plassembler/utils/external_tools.py +++ b/src/plassembler/utils/external_tools.py @@ -31,7 +31,10 @@ def __init__( logdir.mkdir(parents=True, exist_ok=True) command_hash = hashlib.sha256(self.command_as_str.encode("utf-8")).hexdigest() tool_name = Path(tool).name - logfile_prefix: Path = logdir / f"{tool_name}_{command_hash}" + # to make sure no spaces or - + tool_name_with_underscores = tool_name.replace(" ", "_") + tool_name_with_underscores = tool_name_with_underscores.replace("-", "_") + logfile_prefix: Path = logdir / f"{tool_name_with_underscores}_{command_hash}" 
self.out_log = f"{logfile_prefix}.out" self.err_log = f"{logfile_prefix}.err" self.outfile = outfile From a5e37d2d652ff5e9e6b0e57ba041938eeddef405 Mon Sep 17 00:00:00 2001 From: gbouras13 Date: Mon, 20 Nov 2023 11:36:13 +1030 Subject: [PATCH 3/7] update munging code to new PLSDB --- src/plassembler/utils/db.py | 4 +- src/plassembler/utils/plass_class.py | 105 +++++++++------------------ src/plassembler/utils/run_mash.py | 2 +- 3 files changed, 39 insertions(+), 72 deletions(-) diff --git a/src/plassembler/utils/db.py b/src/plassembler/utils/db.py index 1d44cde..3cc858a 100755 --- a/src/plassembler/utils/db.py +++ b/src/plassembler/utils/db.py @@ -23,12 +23,14 @@ def check_db_installation(db_dir: Path, install_flag: bool): """ # Mash files - mash_db_names = ["plsdb.msh", "plsdb.tsv"] + mash_db_names = ["plsdb_2023_11_03_v2.msh", "plsdb_2023_11_03_v2.tsv"] f1: Path = db_dir / f"{mash_db_names[0]}" f2: Path = db_dir / f"{mash_db_names[1]}" if f1.exists() and f2.exists(): + logger.info(f"PLSDB Database mash sketch at {f1} exists.") + logger.info(f"PLSDB Database tsv metadata file at {f2} exists.") logger.info(f"PLSDB Database at {db_dir} has already been downloaded") else: for file_name in mash_db_names: diff --git a/src/plassembler/utils/plass_class.py b/src/plassembler/utils/plass_class.py index 4870ee3..60d4e8c 100644 --- a/src/plassembler/utils/plass_class.py +++ b/src/plassembler/utils/plass_class.py @@ -520,7 +520,7 @@ def process_mash_tsv(self, plassembler_db_dir): mash_tsv = os.path.join(outdir, "mash.tsv") col_list = [ "contig", - "ACC_NUCCORE", + "NUCCORE_ACC", "mash_distance", "mash_pval", "mash_matching_hashes", @@ -560,7 +560,7 @@ def process_mash_tsv(self, plassembler_db_dir): [ tmp_df.contig, "Yes", - tmp_df.ACC_NUCCORE, + tmp_df.NUCCORE_ACC, tmp_df.mash_distance, tmp_df.mash_pval, tmp_df.mash_matching_hashes, @@ -574,7 +574,7 @@ def process_mash_tsv(self, plassembler_db_dir): columns=[ "contig", "PLSDB_hit", - "ACC_NUCCORE", + "NUCCORE_ACC", 
"mash_distance", "mash_pval", "mash_matching_hashes", @@ -588,7 +588,7 @@ def process_mash_tsv(self, plassembler_db_dir): columns=[ "contig", "PLSDB_hit", - "ACC_NUCCORE", + "NUCCORE_ACC", "mash_distance", "mash_pval", "mash_matching_hashes", @@ -601,7 +601,7 @@ def process_mash_tsv(self, plassembler_db_dir): plsdb_tsv_file = os.path.join(plassembler_db_dir, "plsdb.tsv") cols = [ "UID_NUCCORE", - "ACC_NUCCORE", + "NUCCORE_ACC", "Description_NUCCORE", "CreateDate_NUCCORE", "Topology_NUCCORE", @@ -668,7 +668,7 @@ def process_mash_tsv(self, plassembler_db_dir): low_memory=False, ) combined_mash_df = tophits_mash_df.merge( - plsdb_tsv, on="ACC_NUCCORE", how="left" + plsdb_tsv, on="NUCCORE_ACC", how="left" ) self.mash_df = combined_mash_df @@ -933,7 +933,7 @@ def process_mash_tsv(self, plassembler_db_dir, plasmid_fasta): mash_tsv = os.path.join(outdir, "mash.tsv") col_list = [ "contig", - "ACC_NUCCORE", + "NUCCORE_ACC", "mash_distance", "mash_pval", "mash_matching_hashes", @@ -973,7 +973,7 @@ def process_mash_tsv(self, plassembler_db_dir, plasmid_fasta): [ tmp_df.contig, "Yes", - tmp_df.ACC_NUCCORE, + tmp_df.NUCCORE_ACC, tmp_df.mash_distance, tmp_df.mash_pval, tmp_df.mash_matching_hashes, @@ -987,7 +987,7 @@ def process_mash_tsv(self, plassembler_db_dir, plasmid_fasta): columns=[ "contig", "PLSDB_hit", - "ACC_NUCCORE", + "NUCCORE_ACC", "mash_distance", "mash_pval", "mash_matching_hashes", @@ -1001,7 +1001,7 @@ def process_mash_tsv(self, plassembler_db_dir, plasmid_fasta): columns=[ "contig", "PLSDB_hit", - "ACC_NUCCORE", + "NUCCORE_ACC", "mash_distance", "mash_pval", "mash_matching_hashes", @@ -1011,67 +1011,32 @@ def process_mash_tsv(self, plassembler_db_dir, plasmid_fasta): tophits_mash_df.loc[contig - 1] = [contig, "", "", "", "", ""] # read in the plasdb tsv to get the description - plsdb_tsv_file = os.path.join(plassembler_db_dir, "plsdb.tsv") + plsdb_tsv_file = os.path.join(plassembler_db_dir, "plsdb_2023_11_03_v2.tsv") + cols = [ - "UID_NUCCORE", - "ACC_NUCCORE", 
- "Description_NUCCORE", - "CreateDate_NUCCORE", - "Topology_NUCCORE", - "Completeness_NUCCORE", - "TaxonID_NUCCORE", - "Genome_NUCCORE", - "Length_NUCCORE", - "Source_NUCCORE", - "UID_ASSEMBLY", - "Status_ASSEMBLY", - "SeqReleaseDate_ASSEMBLY", - "SubmissionDate_ASSEMBLY", - "Latest_ASSEMBLY", - "UID_BIOSAMPLE", - "ACC_BIOSAMPLE", - "Location_BIOSAMPLE", - "Coordinates_BIOSAMPLE", - "IsolationSource_BIOSAMPLE", - "Host_BIOSAMPLE", - "CollectionDate_BIOSAMPLE", - "Host_DISEASE", - "SamplType_BIOSAMPLE", - "taxon_name", - "taxon_rank", - "lineage", - "taxon_species_id", - "taxon_species_name", - "taxon_genus_id", - "taxon_genus_name", - "taxon_family_id", - "taxon_family_name", - "taxon_order_id", - "taxon_order_name", - "taxon_class_id", - "taxon_class_name", - "taxon_phylum_id", - "taxon_phylum_name", - "taxon_superkingdom_id", - "taxon_superkingdom_name", - "loc_lat", - "loc_lng", - "loc_parsed", - "GC_NUCCORE", - "Identical", - "OldVersion", - "hits_rMLST", - "hitscount_rMLST", - "inclusions", - "Host_BIOSAMPLE_processed", - "Host_DISEASE_processed", - "D1", - "D2", - "plasmidfinder", - "pmlst", - "relaxase_type(s)", - "mpf_type", + ## NUCCORE + "NUCCORE_UID", "NUCCORE_ACC", "NUCCORE_Description", "NUCCORE_CreateDate", + "NUCCORE_Topology", "NUCCORE_Completeness", "NUCCORE_TaxonID", + "NUCCORE_Genome", "NUCCORE_Length", "NUCCORE_GC", "NUCCORE_Source", + ## BIOSAMPLE + "BIOSAMPLE_UID", "BIOSAMPLE_ACC", "BIOSAMPLE_Location", + "BIOSAMPLE_Coordinates", "BIOSAMPLE_IsolationSource", "BIOSAMPLE_Host", + "BIOSAMPLE_CollectionDate", "BIOSAMPLE_HostDisease", "BIOSAMPLE_SampleType", + "BIOSAMPLE_Host_label", "BIOSAMPLE_HostDisease_processed", + ## ASSEMBLY + "ASSEMBLY_UID", "ASSEMBLY_Status", "ASSEMBLY_SeqReleaseDate", + "ASSEMBLY_SubmissionDate", "ASSEMBLY_Lastest", + ## TAXONOMY + "TAXONOMY_taxon_name", "TAXONOMY_taxon_rank", "TAXONOMY_taxon_lineage", + "TAXONOMY_superkingdom", "TAXONOMY_phylum", "TAXONOMY_class", + "TAXONOMY_order", "TAXONOMY_family", "TAXONOMY_genus", 
+ "TAXONOMY_species","TAXONOMY_superkingdom_id", "TAXONOMY_phylum_id", + "TAXONOMY_class_id", "TAXONOMY_order_id", "TAXONOMY_family_id", + "TAXONOMY_genus_id", "TAXONOMY_species_id", + ## rMLST + "rMLST_hits", "rMLST_hitscount" ] + plsdb_tsv = pd.read_csv( plsdb_tsv_file, delimiter="\t", @@ -1081,7 +1046,7 @@ def process_mash_tsv(self, plassembler_db_dir, plasmid_fasta): low_memory=False, ) combined_mash_df = tophits_mash_df.merge( - plsdb_tsv, on="ACC_NUCCORE", how="left" + plsdb_tsv, on="NUCCORE_ACC", how="left" ) self.mash_df = combined_mash_df diff --git a/src/plassembler/utils/run_mash.py b/src/plassembler/utils/run_mash.py index f056681..5988725 100644 --- a/src/plassembler/utils/run_mash.py +++ b/src/plassembler/utils/run_mash.py @@ -41,7 +41,7 @@ def run_mash(out_dir, plassembler_db_dir, logdir): :return: """ - plsdb_sketch: Path = Path(f"{plassembler_db_dir}/plsdb.msh") + plsdb_sketch: Path = Path(f"{plassembler_db_dir}/plsdb_2023_11_03_v2.msh") plasmid_sketch: Path = Path(f"{out_dir}/plasmids.fasta.msh") mash_tsv: Path = Path(f"{out_dir}/mash.tsv") From e8252ab9da739b83c16830977c94dc892ba2cdb6 Mon Sep 17 00:00:00 2001 From: gbouras13 Date: Mon, 20 Nov 2023 13:45:00 +1030 Subject: [PATCH 4/7] update to new PLSDB --- pyproject.toml | 2 +- src/plassembler/__init__.py | 4 +- src/plassembler/utils/VERSION | 2 +- src/plassembler/utils/db.py | 8 +- src/plassembler/utils/external_tools.py | 4 +- src/plassembler/utils/plass_class.py | 202 +++++++++++++++--------- 6 files changed, 138 insertions(+), 84 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 46ab3ef..4414504 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "plassembler" -version = "1.4.1" # change VERSION too +version = "1.5.0" # change VERSION too description = "Quickly and accurately assemble plasmids in hybrid sequenced bacterial isolates" authors = ["George Bouras "] license = "MIT" diff --git a/src/plassembler/__init__.py 
b/src/plassembler/__init__.py index df82a19..79462fa 100644 --- a/src/plassembler/__init__.py +++ b/src/plassembler/__init__.py @@ -1496,9 +1496,7 @@ def long( corrected_fastqs: Path = Path(outdir) / "corrected_plasmid_long.fastq" corrected_fasta_to_fastq(canu_reads, corrected_fastqs) except: - logger.warning( - "Advancing with uncorrected reads" - ) + logger.warning("Advancing with uncorrected reads") corrected_fastqs = entropy_filtered_fastq # remove canu directory diff --git a/src/plassembler/utils/VERSION b/src/plassembler/utils/VERSION index 13175fd..3e1ad72 100644 --- a/src/plassembler/utils/VERSION +++ b/src/plassembler/utils/VERSION @@ -1 +1 @@ -1.4.1 \ No newline at end of file +1.5.0 \ No newline at end of file diff --git a/src/plassembler/utils/db.py b/src/plassembler/utils/db.py index 3cc858a..65b9ab3 100755 --- a/src/plassembler/utils/db.py +++ b/src/plassembler/utils/db.py @@ -50,10 +50,10 @@ def check_db_installation(db_dir: Path, install_flag: bool): def get_database_zenodo(db_dir: Path): logger.info("Downloading Plassembler Database.") - tarball = "plsdb_110222_plassembler_v0.1.4_databases.tar.gz" + tarball = "201123_plassembler_v1.5.0_databases.tar.gz" tar_path = Path(f"{db_dir}/{tarball}") - db_url = "https://zenodo.org/record/7499200/files/plsdb_110222_plassembler_v0.1.4_databases.tar.gz" - requiredmd5 = "f5144045e6e5d0d5a6b7f78d0c08840d" + db_url = "https://zenodo.org/record/10158040/files/201123_plassembler_v1.5.0_databases.tar.gz" + requiredmd5 = "3a24bacc05bb857dc044fc6662b58db7" # remvoe the directory if os.path.exists(db_dir): @@ -117,7 +117,7 @@ def untar(tarball_path: Path, output_path: Path): # get untarred directory untarpath = os.path.join( - output_path, "plsdb_110222_plassembler_v0.1.4_databases" + output_path, "201123_plassembler_v1.5.0_databases" ) # Get a list of all files in the source directory diff --git a/src/plassembler/utils/external_tools.py b/src/plassembler/utils/external_tools.py index 10d362a..05d413f 100644 --- 
a/src/plassembler/utils/external_tools.py +++ b/src/plassembler/utils/external_tools.py @@ -126,9 +126,7 @@ def run_tool( f"If you think your sample should still have plasmids, please check stdout log file: {tool.out_log} and stderr log file: {tool.err_log}" ) elif tool.tool_str == "canu -correct": # for canu errors - logger.warning( - "Canu failed to correct any reads." - ) + logger.warning("Canu failed to correct any reads.") logger.warning( "This probably means there is low depth, don't be too concerned." ) diff --git a/src/plassembler/utils/plass_class.py b/src/plassembler/utils/plass_class.py index 60d4e8c..61c51a1 100644 --- a/src/plassembler/utils/plass_class.py +++ b/src/plassembler/utils/plass_class.py @@ -598,67 +598,78 @@ def process_mash_tsv(self, plassembler_db_dir): tophits_mash_df.loc[contig - 1] = [contig, "", "", "", "", ""] # read in the plasdb tsv to get the description - plsdb_tsv_file = os.path.join(plassembler_db_dir, "plsdb.tsv") + plsdb_tsv_file = os.path.join(plassembler_db_dir, "plsdb_2023_11_03_v2.tsv") cols = [ - "UID_NUCCORE", + "NUCCORE_UID", "NUCCORE_ACC", - "Description_NUCCORE", - "CreateDate_NUCCORE", - "Topology_NUCCORE", - "Completeness_NUCCORE", - "TaxonID_NUCCORE", - "Genome_NUCCORE", - "Length_NUCCORE", - "Source_NUCCORE", - "UID_ASSEMBLY", - "Status_ASSEMBLY", - "SeqReleaseDate_ASSEMBLY", - "SubmissionDate_ASSEMBLY", - "Latest_ASSEMBLY", - "UID_BIOSAMPLE", - "ACC_BIOSAMPLE", - "Location_BIOSAMPLE", - "Coordinates_BIOSAMPLE", - "IsolationSource_BIOSAMPLE", - "Host_BIOSAMPLE", - "CollectionDate_BIOSAMPLE", - "Host_DISEASE", - "SamplType_BIOSAMPLE", - "taxon_name", - "taxon_rank", - "lineage", - "taxon_species_id", - "taxon_species_name", - "taxon_genus_id", - "taxon_genus_name", - "taxon_family_id", - "taxon_family_name", - "taxon_order_id", - "taxon_order_name", - "taxon_class_id", - "taxon_class_name", - "taxon_phylum_id", - "taxon_phylum_name", - "taxon_superkingdom_id", - "taxon_superkingdom_name", + "NUCCORE_Description", 
+ "NUCCORE_CreateDate", + "NUCCORE_Topology", + "NUCCORE_Completeness", + "NUCCORE_TaxonID", + "NUCCORE_Genome", + "NUCCORE_Length", + "NUCCORE_DuplicatedEntry", + "NUCCORE_Source", + "NUCCORE_BiosampleID", + "BIOSAMPLE_UID", + "BIOSAMPLE_ACC", + "BIOSAMPLE_Location", + "BIOSAMPLE_Coordinates", + "BIOSAMPLE_IsolationSource", + "BIOSAMPLE_Host", + "BIOSAMPLE_CollectionDate", + "BIOSAMPLE_HostDisease", + "BIOSAMPLE_SampleType", + "ASSEMBLY_UID", + "ASSEMBLY_ACC", + "ASSEMBLY_Status", + "ASSEMBLY_coverage", + "ASSEMBLY_SeqReleaseDate", + "ASSEMBLY_SubmissionDate", + "ASSEMBLY_Lastest", + "ASSEMBLY_BiosampleID", + "TAXONOMY_superkingdom", + "TAXONOMY_phylum", + "TAXONOMY_class", + "TAXONOMY_order", + "TAXONOMY_family", + "TAXONOMY_genus", + "TAXONOMY_species", + "TAXONOMY_strain", + "TAXONOMY_UID", + "TAXONOMY_taxon_rank", + "TAXONOMY_taxon_name", + "TAXONOMY_taxon_lineage", + "TAXONOMY_superkingdom_id", + "TAXONOMY_phylum_id", + "TAXONOMY_class_id", + "TAXONOMY_order_id", + "TAXONOMY_family_id", + "TAXONOMY_genus_id", + "TAXONOMY_species_id", + "TAXONOMY_strain_id", + "has_biosample", + "has_assembly", + "has_location", + "rMLST_hits", + "rMLST_hitscount", + "inclusions", + "NUCCORE_GC", + "Length", + "BIOSAMPLE_Host_processed", + "BIOSAMPLE_Host_processed_source", + "BIOSAMPLE_Host_label", + "BIOSAMPLE_HostDisease_processed", "loc_lat", "loc_lng", "loc_parsed", - "GC_NUCCORE", - "Identical", - "OldVersion", - "hits_rMLST", - "hitscount_rMLST", - "inclusions", - "Host_BIOSAMPLE_processed", - "Host_DISEASE_processed", "D1", "D2", "plasmidfinder", "pmlst", - "relaxase_type(s)", - "mpf_type", ] + plsdb_tsv = pd.read_csv( plsdb_tsv_file, delimiter="\t", @@ -1014,27 +1025,74 @@ def process_mash_tsv(self, plassembler_db_dir, plasmid_fasta): plsdb_tsv_file = os.path.join(plassembler_db_dir, "plsdb_2023_11_03_v2.tsv") cols = [ - ## NUCCORE - "NUCCORE_UID", "NUCCORE_ACC", "NUCCORE_Description", "NUCCORE_CreateDate", - "NUCCORE_Topology", "NUCCORE_Completeness", 
"NUCCORE_TaxonID", - "NUCCORE_Genome", "NUCCORE_Length", "NUCCORE_GC", "NUCCORE_Source", - ## BIOSAMPLE - "BIOSAMPLE_UID", "BIOSAMPLE_ACC", "BIOSAMPLE_Location", - "BIOSAMPLE_Coordinates", "BIOSAMPLE_IsolationSource", "BIOSAMPLE_Host", - "BIOSAMPLE_CollectionDate", "BIOSAMPLE_HostDisease", "BIOSAMPLE_SampleType", - "BIOSAMPLE_Host_label", "BIOSAMPLE_HostDisease_processed", - ## ASSEMBLY - "ASSEMBLY_UID", "ASSEMBLY_Status", "ASSEMBLY_SeqReleaseDate", - "ASSEMBLY_SubmissionDate", "ASSEMBLY_Lastest", - ## TAXONOMY - "TAXONOMY_taxon_name", "TAXONOMY_taxon_rank", "TAXONOMY_taxon_lineage", - "TAXONOMY_superkingdom", "TAXONOMY_phylum", "TAXONOMY_class", - "TAXONOMY_order", "TAXONOMY_family", "TAXONOMY_genus", - "TAXONOMY_species","TAXONOMY_superkingdom_id", "TAXONOMY_phylum_id", - "TAXONOMY_class_id", "TAXONOMY_order_id", "TAXONOMY_family_id", - "TAXONOMY_genus_id", "TAXONOMY_species_id", - ## rMLST - "rMLST_hits", "rMLST_hitscount" + "NUCCORE_UID", + "NUCCORE_ACC", + "NUCCORE_Description", + "NUCCORE_CreateDate", + "NUCCORE_Topology", + "NUCCORE_Completeness", + "NUCCORE_TaxonID", + "NUCCORE_Genome", + "NUCCORE_Length", + "NUCCORE_DuplicatedEntry", + "NUCCORE_Source", + "NUCCORE_BiosampleID", + "BIOSAMPLE_UID", + "BIOSAMPLE_ACC", + "BIOSAMPLE_Location", + "BIOSAMPLE_Coordinates", + "BIOSAMPLE_IsolationSource", + "BIOSAMPLE_Host", + "BIOSAMPLE_CollectionDate", + "BIOSAMPLE_HostDisease", + "BIOSAMPLE_SampleType", + "ASSEMBLY_UID", + "ASSEMBLY_ACC", + "ASSEMBLY_Status", + "ASSEMBLY_coverage", + "ASSEMBLY_SeqReleaseDate", + "ASSEMBLY_SubmissionDate", + "ASSEMBLY_Lastest", + "ASSEMBLY_BiosampleID", + "TAXONOMY_superkingdom", + "TAXONOMY_phylum", + "TAXONOMY_class", + "TAXONOMY_order", + "TAXONOMY_family", + "TAXONOMY_genus", + "TAXONOMY_species", + "TAXONOMY_strain", + "TAXONOMY_UID", + "TAXONOMY_taxon_rank", + "TAXONOMY_taxon_name", + "TAXONOMY_taxon_lineage", + "TAXONOMY_superkingdom_id", + "TAXONOMY_phylum_id", + "TAXONOMY_class_id", + "TAXONOMY_order_id", + 
"TAXONOMY_family_id", + "TAXONOMY_genus_id", + "TAXONOMY_species_id", + "TAXONOMY_strain_id", + "has_biosample", + "has_assembly", + "has_location", + "rMLST_hits", + "rMLST_hitscount", + "inclusions", + "NUCCORE_GC", + "Length", + "BIOSAMPLE_Host_processed", + "BIOSAMPLE_Host_processed_source", + "BIOSAMPLE_Host_label", + "BIOSAMPLE_HostDisease_processed", + "loc_lat", + "loc_lng", + "loc_parsed", + "D1", + "D2", + "plasmidfinder", + "pmlst", ] plsdb_tsv = pd.read_csv( From dede90580734ea9272df68c065973810d719f2fd Mon Sep 17 00:00:00 2001 From: gbouras13 Date: Mon, 20 Nov 2023 14:01:40 +1030 Subject: [PATCH 5/7] format --- src/plassembler/utils/db.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/plassembler/utils/db.py b/src/plassembler/utils/db.py index 65b9ab3..4393d3b 100755 --- a/src/plassembler/utils/db.py +++ b/src/plassembler/utils/db.py @@ -116,9 +116,7 @@ def untar(tarball_path: Path, output_path: Path): tar_file.extractall(path=str(output_path)) # get untarred directory - untarpath = os.path.join( - output_path, "201123_plassembler_v1.5.0_databases" - ) + untarpath = os.path.join(output_path, "201123_plassembler_v1.5.0_databases") # Get a list of all files in the source directory files_to_move = [ From 2d5ee62fd3d52e418358bf1e18f257211d89fc32 Mon Sep 17 00:00:00 2001 From: gbouras13 Date: Mon, 20 Nov 2023 15:13:45 +1030 Subject: [PATCH 6/7] fix test db files --- tests/test_data/Plassembler_Test_DB/plsdb.tsv | 2 -- ...{plsdb.fasta => plsdb_2023_11_03_v2.fasta} | 2 +- .../{plsdb.msh => plsdb_2023_11_03_v2.msh} | Bin 8280 -> 8288 bytes .../plsdb_2023_11_03_v2.tsv | 2 ++ 4 files changed, 3 insertions(+), 3 deletions(-) delete mode 100644 tests/test_data/Plassembler_Test_DB/plsdb.tsv rename tests/test_data/Plassembler_Test_DB/{plsdb.fasta => plsdb_2023_11_03_v2.fasta} (96%) rename tests/test_data/Plassembler_Test_DB/{plsdb.msh => plsdb_2023_11_03_v2.msh} (93%) create mode 100644 
tests/test_data/Plassembler_Test_DB/plsdb_2023_11_03_v2.tsv diff --git a/tests/test_data/Plassembler_Test_DB/plsdb.tsv b/tests/test_data/Plassembler_Test_DB/plsdb.tsv deleted file mode 100644 index 506a7a3..0000000 --- a/tests/test_data/Plassembler_Test_DB/plsdb.tsv +++ /dev/null @@ -1,2 +0,0 @@ -UID_NUCCORE ACC_NUCCORE Description_NUCCORE CreateDate_NUCCORE Topology_NUCCORE Completeness_NUCCORE TaxonID_NUCCORE Genome_NUCCORE Length_NUCCORE Source_NUCCORE UID_ASSEMBLY Status_ASSEMBLY SeqReleaseDate_ASSEMBLY SubmissionDate_ASSEMBLY Latest_ASSEMBLY UID_BIOSAMPLE ACC_BIOSAMPLE Location_BIOSAMPLE Coordinates_BIOSAMPLE IsolationSource_BIOSAMPLE Host_BIOSAMPLE CollectionDate_BIOSAMPLE Host_DISEASE SamplType_BIOSAMPLE taxon_name taxon_rank lineage taxon_species_id taxon_species_name taxon_genus_id taxon_genus_name taxon_family_id taxon_family_name taxon_order_id taxon_order_name taxon_class_id taxon_class_name taxon_phylum_id taxon_phylum_name taxon_superkingdom_id taxon_superkingdom_name loc_lat loc_lng loc_parsed GC_NUCCORE Identical OldVersion hits_rMLST hitscount_rMLST inclusions Host_BIOSAMPLE_processed Host_DISEASE_processed D1 D2 plasmidfinder pmlst relaxase_type(s) mpf_type -410655417 NC_018969.1 "Staphylococcus aureus plasmid p19321-P01, complete sequence" 3/11/2012 circular complete 1280 plasmid 2473 RefSeq 14224382 SAMN14224382 Staphylococcus aureus species cellular organisms; Bacteria; Terrabacteria group; Firmicutes; Bacilli; Bacillales; Staphylococcaceae; Staphylococcus 1280 Staphylococcus aureus 1279 Staphylococcus 90964 Staphylococcaceae 1385 Bacillales 91061 Bacilli 1239 Firmicutes 2 Bacteria 30.812778 CP002148.1 0 -4.021508 9.8135605 "rep10_4_repL(pDLK1), GU562624, " \ No newline at end of file diff --git a/tests/test_data/Plassembler_Test_DB/plsdb.fasta b/tests/test_data/Plassembler_Test_DB/plsdb_2023_11_03_v2.fasta similarity index 96% rename from tests/test_data/Plassembler_Test_DB/plsdb.fasta rename to 
tests/test_data/Plassembler_Test_DB/plsdb_2023_11_03_v2.fasta index 9595971..81c7f33 100644 --- a/tests/test_data/Plassembler_Test_DB/plsdb.fasta +++ b/tests/test_data/Plassembler_Test_DB/plsdb_2023_11_03_v2.fasta @@ -1,4 +1,4 @@ ->NC_018969.1 Staphylococcus aureus plasmid p19321-P01, complete sequence +>CP127756.1 Staphylococcus aureus strain C222 plasmid pC222_1, complete sequence CTAGTCCTTGAAAGAATAATAATCAGATAATGCATTTTCTTGTTTTTCATTTGCCTCTTGCTCAAAGTTC CCAAATTCGAGTAAGAGGTATTTTTGTTTTTGGTCGTCGCCTCTCATTAGTAGTTCAGGGTTTAACATTA ATACTCCAGTTTTTCTTTTTATAATATTTCCTTCTTCTAAGATTTTAAGTGTTGTTATTACTGTTTGTAG diff --git a/tests/test_data/Plassembler_Test_DB/plsdb.msh b/tests/test_data/Plassembler_Test_DB/plsdb_2023_11_03_v2.msh similarity index 93% rename from tests/test_data/Plassembler_Test_DB/plsdb.msh rename to tests/test_data/Plassembler_Test_DB/plsdb_2023_11_03_v2.msh index d7d038dfda0abc3530917595b9ecfb9bc046903d..758dccd12d56cf5495ef4e2e838cfb1cd757e645 100644 GIT binary patch delta 112 zcmccN@W6qUk%56haU!cRX9AGHz`)3mI?>UYD+k%56hVIr$BXAF?Rz`)3mIMLCWD+$OH1>#4H6Z@om{G8(r3@t3pEcFaQ zY8Zk`5(_dabMlk(laot}6%tE}Qh`)KPGWIxW{N_Ap{22rp>BYIp^idwer`cdYDua> RacW^{YF=_G!^S@;@&MqnA Date: Tue, 21 Nov 2023 12:06:42 +1030 Subject: [PATCH 7/7] update docs and readme [skip ci] --- HISTORY.md | 6 ++++++ README.md | 40 ++++++++++++++++++++++++++-------------- 2 files changed, 32 insertions(+), 14 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 2e16ac1..f73bf92 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,5 +1,11 @@ # History +1.5.0 (2023-11-21) +------------------ + +* **If you upgrade to v1.5.0, you will need to update the database using `plassembler download`** +* Plassembler v1.5.0 incorporates a new database thanks to the recent PLSDB release [2023_11_03_v2](https://ccb-microbe.cs.uni-saarland.de/plsdb/). Thanks @[biobrad](https://github.com/biobrad) for the heads up. 
+ 1.4.1 (2023-10-30) ------------------ diff --git a/README.md b/README.md index c74f275..bd44979 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,22 @@ Additionally, I would recommend reading the following guides to bacterial genome * [Perfect Bacterial Assembly Tutorial](https://github.com/rrwick/Perfect-bacterial-genome-tutorial) * [Perfect bacterial assembly Paper](https://doi.org/10.1371/journal.pcbi.1010905) +## Quick Start + +The easiest way to install `plassembler` is via conda: + +`conda install -c bioconda plassembler` + +Followed by database download and installation: + +`plassembler download -d <database directory>` + +And finally run `plassembler`: + +`plassembler run -d <database directory> -l <long read fastq> -o <output dir> -1 < short read R1 fastq> -2 < short read R2 fastq> -c <estimated chromosome length>` + +Please read the [Installation](#installation) section for more details, especially if you are an inexperienced command line user. + ## Manuscript `plassembler` has been recently published in *Bioinformatics*: @@ -32,13 +48,19 @@ George Bouras, Anna E. Sheppard, Vijini Mallawaarachchi, Sarah Vreugde, Plassemb If you use `plassembler`, please see the full [Citations](#citations) section for a list of all programs `plassembler` uses under the hood, in order to fully recognise the creators of these tools for their work. +## Documentation + +The full documentation for Plassembler can be found [here](https://plassembler.readthedocs.io/en/latest). 
+ ## Table of Contents - [plassembler](#plassembler) - [Automated Bacterial Plasmid Assembly Program](#automated-bacterial-plasmid-assembly-program) + - [Quick Start](#quick-start) - [Manuscript](#manuscript) + - [Documentation](#documentation) - [Table of Contents](#table-of-contents) - - [Quick Start](#quick-start) + - [`plassembler` v1.5.0 Update New Database (21 November 2023)](#plassembler-v150-update-new-database-21-november-2023) - [`plassembler` v1.3.0 Updates (24 October 2023)](#plassembler-v130-updates-24-october-2023) - [Why Does Plassembler Exist?](#why-does-plassembler-exist) - [Why Not Just Use Unicycler?](#why-not-just-use-unicycler) @@ -58,21 +80,11 @@ If you use `plassembler`, please see the full [Citations](#citations) section fo - [Bugs and Suggestions](#bugs-and-suggestions) - [Citations](#citations) -## Quick Start - -The easiest way to install `plassembler` is via conda: - -`conda install -c bioconda plassembler` - -Followed by database download and installation: - -`plassembler download -d <database directory>` - -And finally run `plassembler`: +## `plassembler` v1.5.0 Update New Database (21 November 2023) -`plassembler run -d <database directory> -l <long read fastq> -o <output dir> -1 < short read R1 fastq> -2 < short read R2 fastq> -c <estimated chromosome length>` +* **If you upgrade to v1.5.0, you will need to update the database using `plassembler download`** +* Plassembler v1.5.0 incorporates a new expanded database thanks to the recent PLSDB release [2023_11_03_v2](https://ccb-microbe.cs.uni-saarland.de/plsdb/). Thanks @[biobrad](https://github.com/biobrad) for the heads up. -Please read the [Installation](#installation) section for more details, especially if you are an inexperienced command line user. ## `plassembler` v1.3.0 Updates (24 October 2023)