Skip to content

Commit

Permalink
summarise: collapse-to-sample-name: Accept archive lists.
Browse files (browse the repository at this point in the history)
Suggested by: Joshua Mitchell.
  • Loading branch information
wwood committed Oct 2, 2024
1 parent badfe68 commit af43d09
Show file tree
Hide file tree
Showing 3 changed files with 48 additions and 19 deletions.
10 changes: 7 additions & 3 deletions singlem/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -363,7 +363,7 @@ def add_less_common_pipe_arguments(argument_group):
summarise_otu_table_input_args = summarise_parser.add_argument_group('OTU table input')
summarise_otu_table_input_args.add_argument('--input-otu-tables', '--input-otu-table', nargs='+', help="Summarise these tables")
summarise_otu_table_input_args.add_argument('--input-otu-tables-list', help="Summarise the OTU table files newline separated in this file")
summarise_otu_table_input_args.add_argument('--input-archive-otu-tables', '--input-archive-otu-table', nargs='+', help="Summarise these tables")
summarise_otu_table_input_args.add_argument('--input-archive-otu-tables', '--input-archive-otu-table', nargs='+', help="Summarise these tables", default=[])
summarise_otu_table_input_args.add_argument('--input-archive-otu-table-list',
help="Summarise the archive tables newline separated in this file")
summarise_otu_table_input_args.add_argument('--input-gzip-archive-otu-table-list',
Expand Down Expand Up @@ -419,7 +419,7 @@ def add_less_common_pipe_arguments(argument_group):
read_fraction_uncommon_args = read_fraction_parser.add_argument_group('other options')
read_fraction_uncommon_args.add_argument('--accept-missing-samples', action='store_true', help="If a sample is missing from the input-metagenome-sizes file, skip analysis of it without croaking.")
read_fraction_uncommon_args.add_argument('--output-tsv', help="Output file [default: stdout]")
read_fraction_uncommon_args.add_argument('--output-per-taxon-read-fractions', help="Output a fraction for each taxon to this TSV [default: D o not output anything]")
read_fraction_uncommon_args.add_argument('--output-per-taxon-read-fractions', help="Output a fraction for each taxon to this TSV [default: Do not output anything]")

renew_description = 'Reannotate an OTU table with an updated taxonomy'
renew_parser = bird_argparser.new_subparser('renew', renew_description, parser_group='Tools')
Expand Down Expand Up @@ -823,7 +823,7 @@ def get_min_taxon_coverage(args, subparser='pipe'):
if args.collapse_to_sample_name:
if args.input_otu_tables:
raise Exception("--collapse-to-sample-name currently only works with archive tables")
elif not len(args.input_archive_otu_tables) >= 1:
elif not len(args.input_archive_otu_tables) >= 1 and not args.input_archive_otu_table_list and not args.input_gzip_archive_otu_table_list:
raise Exception("--collapse-to-sample-name currently only works with archive tables as input")
if args.collapse_paired_with_unpaired_archive_otu_table:
if args.input_otu_tables:
Expand Down Expand Up @@ -974,12 +974,16 @@ def get_min_taxon_coverage(args, subparser='pipe'):
with open(args.output_archive_otu_table, 'w') as f:
Summariser.write_collapsed_paired_with_unpaired_otu_table(
archive_otu_tables = args.input_archive_otu_tables,
archive_otu_table_list = args.input_archive_otu_table_list,
gzip_archive_otu_table_list = args.input_gzip_archive_otu_table_list,
output_table_io = f,
set_sample_name = args.collapse_to_sample_name)
elif args.collapse_paired_with_unpaired_archive_otu_table:
with open(args.collapse_paired_with_unpaired_archive_otu_table,'w') as output_io:
Summariser.write_collapsed_paired_with_unpaired_otu_table(
archive_otu_tables = args.input_archive_otu_tables,
archive_otu_table_list = args.input_archive_otu_table_list,
gzip_archive_otu_table_list = args.input_gzip_archive_otu_table_list,
output_table_io = output_io)
elif args.unaligned_sequences_dump_file:
with open(args.unaligned_sequences_dump_file, 'w') as f:
Expand Down
57 changes: 41 additions & 16 deletions singlem/summariser.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,14 @@
import Bio
import pandas as pd
import polars as pl
import gzip

from .otu_table import OtuTable
from .rarefier import Rarefier
from .ordered_set import OrderedSet
from .archive_otu_table import ArchiveOtuTable
from .taxonomy import QUERY_BASED_ASSIGNMENT_METHOD, DIAMOND_ASSIGNMENT_METHOD, NO_ASSIGNMENT_METHOD
from .condense import CondensedCommunityProfile, WordNode
from .condense import CondensedCommunityProfile

class Summariser:
@staticmethod
Expand Down Expand Up @@ -337,36 +338,60 @@ def print_chunk(printed_header, seq_to_otus, output_table_io):
# output_table_io = open(args.collapse_paired_with_unpaired,'w'))
def write_collapsed_paired_with_unpaired_otu_table(**kwargs):
archive_otu_tables = kwargs.pop('archive_otu_tables')
# archive_otu_table_list = args.input_archive_otu_table_list,
# gzip_archive_otu_table_list = args.input_gzip_archive_otu_table_list,
archive_otu_table_list = kwargs.pop('archive_otu_table_list')
gzip_archive_otu_table_list = kwargs.pop('gzip_archive_otu_table_list')
output_table_io = kwargs.pop('output_table_io')
set_sample_name = kwargs.pop('set_sample_name', None) # For merging OTU tables
if len(kwargs) > 0:
raise Exception("Unexpected arguments detected: %s" % kwargs)

# Read all OTU tables
df = None
for a in archive_otu_tables:
with open(a) as f:
logging.debug("Reading archive table {} into RAM ..".format(a))
ar = ArchiveOtuTable.read(f)
overall_df = None
ar = None

def read_archive_table(df, f, prev_ar):
logging.debug("Reading archive table {} into RAM ..".format(a))
ar = ArchiveOtuTable.read(f)
if df is None:
version = ar.version
fields = ar.fields
alignment_hmm_sha256s = ar.alignment_hmm_sha256s
singlem_package_sha256s = ar.singlem_package_sha256s
# version = ar.version
# fields = ar.fields
# alignment_hmm_sha256s = ar.alignment_hmm_sha256s
# singlem_package_sha256s = ar.singlem_package_sha256s
df = pandas.DataFrame(ar.data)
df.columns = fields
df.columns = ar.fields
else:
if version != ar.version:
if prev_ar.version != ar.version:
raise Exception("Version mismatch between archives")
elif fields != ar.fields:
elif prev_ar.fields != ar.fields:
raise Exception("Fields mismatch between archives")
elif alignment_hmm_sha256s != ar.alignment_hmm_sha256s:
elif prev_ar.alignment_hmm_sha256s != ar.alignment_hmm_sha256s:
raise Exception("Alignment HMM SHA256 mismatch between archives")
elif singlem_package_sha256s != ar.singlem_package_sha256s:
elif prev_ar.singlem_package_sha256s != ar.singlem_package_sha256s:
raise Exception("Singlem package SHA256 mismatch between archives")
df2 = pandas.DataFrame(ar.data)
df2.columns = fields
df2.columns = prev_ar.fields
df = pd.concat([df, df2], ignore_index=True)
return df, ar

for a in archive_otu_tables:
with open(a) as f:
overall_df, ar = read_archive_table(overall_df, f, ar)
if archive_otu_table_list:
with open(archive_otu_table_list) as f:
for a in f:
with open(a.strip()) as g:
overall_df, ar = read_archive_table(overall_df, g, ar)
if gzip_archive_otu_table_list:
with open(gzip_archive_otu_table_list) as f:
lines = f.readlines()
print(lines)
for a in lines:
logging.debug("Reading gzip archive table {} ..".format(a))
with gzip.open(a.strip()) as g:
overall_df, ar = read_archive_table(overall_df, g, ar)
df = overall_df

# Remove suffixes
if set_sample_name is None:
Expand Down
Binary file added test/data/small.v4.json.gz
Binary file not shown.

0 comments on commit af43d09

Please sign in to comment.