From d66fe85627325dac88737166a4435d75cab6de42 Mon Sep 17 00:00:00 2001 From: Ben Woodcroft Date: Sat, 9 Nov 2024 06:38:09 +1000 Subject: [PATCH] condense: taxonomic_level_coverage_table: Abstract method from Summariser. --- singlem/condense.py | 34 +++++++++++++++++++++++++++++++++- singlem/summariser.py | 23 +---------------------- 2 files changed, 34 insertions(+), 23 deletions(-) diff --git a/singlem/condense.py b/singlem/condense.py index 3858b484..b83ad187 100644 --- a/singlem/condense.py +++ b/singlem/condense.py @@ -4,9 +4,10 @@ import numpy as np import extern import sys - from queue import Queue +import polars as pl + from .archive_otu_table import ArchiveOtuTable, ArchiveOtuTableEntry from .metapackage import Metapackage from .taxonomy import * @@ -942,6 +943,37 @@ def each_sample_wise(io): if current_sample is not None: yield CondensedCommunityProfile(current_sample, current_root) + def taxonomic_level_coverage_table(self): + '''Return a pl DataFrame with the coverage and relative abundance of + each taxonomic level. If there are 7 or 8 levels, then the standard + [root], domain, phylum, etc. levels are assumed. Returning a polars + dataframe maybe isn't the most pythonic, and so this might be changed in + the future. But eh for now.''' + name_to_coverage = {} + for node in self.breadth_first_iter(): + node_level = node.calculate_level() + if node_level == 0: + continue + if node_level not in name_to_coverage: + name_to_coverage[node_level] = 0. + name_to_coverage[node_level] += node.coverage + result = pl.DataFrame({ + 'level': list(name_to_coverage.keys()), + 'coverage': list(name_to_coverage.values()) + }).with_columns(pl.lit(self.sample).alias('sample')).with_columns( + ((pl.col('coverage') / pl.col('coverage').sum()).alias('relative_abundance') * 100).round(2), + ) + + if len(result.select(pl.col('level')).group_by('level').count()) in [7, 8]: + # If there's 7 or 8 (including 0) levels, then assume that this is a regular taxonomy going on. 
+ levels = ['root', 'domain', 'phylum', 'class', 'order', 'family', 'genus', 'species'] + level_id_to_level_name = {i: levels[i] for i in range(len(levels))} + result = result.with_columns( + level=pl.col('level').replace_strict(level_id_to_level_name, return_dtype=pl.Utf8) + ) + return result + + class CondensedCommunityProfileKronaWriter: @staticmethod def write_krona(condensed_profiles, output_file): diff --git a/singlem/summariser.py b/singlem/summariser.py index 02f79991..8315e3bb 100644 --- a/singlem/summariser.py +++ b/singlem/summariser.py @@ -556,28 +556,7 @@ def write_taxonomic_level_coverage_table(**kwargs): for profile_file in input_taxonomic_profiles: with open(profile_file) as f: for profile in CondensedCommunityProfile.each_sample_wise(f): - name_to_coverage = {} - for node in profile.breadth_first_iter(): - node_level = node.calculate_level() - if node_level == 0: - continue - if node_level not in name_to_coverage: - name_to_coverage[node_level] = 0. - name_to_coverage[node_level] += node.coverage - result = pl.DataFrame({ - 'level': list(name_to_coverage.keys()), - 'coverage': list(name_to_coverage.values()) - }).with_columns(pl.lit(profile.sample).alias('sample')).with_columns( - ((pl.col('coverage') / pl.col('coverage').sum()).alias('relative_abundance') * 100).round(2), - ) - - if len(result.select(pl.col('level')).group_by('level').count()) in [7, 8]: - # If there's 7 or 8 (including 0) levels, then assume that this is a regular taxonomy going on. - levels = ['root','domain','phylum','class','order','family','genus','species'] - level_id_to_level_name = {i: levels[i] for i in range(len(levels))} - result = result.with_columns( - level = pl.col('level').replace_strict(level_id_to_level_name, return_dtype=pl.Utf8) - ) + result = profile.taxonomic_level_coverage_table() result = result.select([ 'sample',