condense: taxonomic_profile_coverage: Abstract method from Summariser.

wwood · Nov 8, 2024 · d66fe85 · d66fe85
1 parent 3219d65
commit d66fe85
Show file tree

Hide file tree

Showing 2 changed files with 34 additions and 23 deletions.
diff --git a/singlem/condense.py b/singlem/condense.py
@@ -4,9 +4,10 @@
 import numpy as np
 import extern
 import sys
-
 from queue import Queue
 
+import polars as pl
+
 from .archive_otu_table import ArchiveOtuTable, ArchiveOtuTableEntry
 from .metapackage import Metapackage
 from .taxonomy import *
@@ -942,6 +943,37 @@ def each_sample_wise(io):
         if current_sample is not None:
             yield CondensedCommunityProfile(current_sample, current_root)
 
+    def taxonomic_level_coverage_table(self):
+        '''Return a polars DataFrame with one row per taxonomic level other
+        than level 0: columns level, coverage, sample and relative_abundance
+        (percent of the summed coverage, rounded to 2 d.p.). When there are 7
+        or 8 levels, level numbers become names (domain, phylum, ...);
+        replace_strict errors on any level outside 0-7. Returning polars
+        may change in future.'''
+        level_to_coverage = {}
+        for node in self.breadth_first_iter():
+            node_level = node.calculate_level()
+            if node_level == 0:
+                continue  # level 0 nodes are left out of the table
+            level_to_coverage[node_level] = level_to_coverage.get(node_level, 0.) + node.coverage
+        relative_abundance = (pl.col('coverage') / pl.col('coverage').sum() * 100).round(2)
+        result = pl.DataFrame({
+            'level': list(level_to_coverage.keys()),
+            'coverage': list(level_to_coverage.values())
+        }).with_columns(pl.lit(self.sample).alias('sample')).with_columns(
+            relative_abundance.alias('relative_abundance'),
+        )
+
+        # Dict keys are already unique, so no (deprecated) group_by().count()
+        # is needed to count levels. 7 or 8 levels => assume the standard ranks.
+        if len(level_to_coverage) in [7, 8]:
+            levels = ['root', 'domain', 'phylum', 'class', 'order', 'family', 'genus', 'species']
+            result = result.with_columns(
+                level=pl.col('level').replace_strict(dict(enumerate(levels)), return_dtype=pl.Utf8)
+            )
+        return result
+
+
 class CondensedCommunityProfileKronaWriter:
     @staticmethod
     def write_krona(condensed_profiles, output_file):

diff --git a/singlem/summariser.py b/singlem/summariser.py
@@ -556,28 +556,7 @@ def write_taxonomic_level_coverage_table(**kwargs):
             for profile_file in input_taxonomic_profiles:
                 with open(profile_file) as f:
                     for profile in CondensedCommunityProfile.each_sample_wise(f):
-                        name_to_coverage = {}
-                        for node in profile.breadth_first_iter():
-                            node_level = node.calculate_level()
-                            if node_level == 0:
-                                continue
-                            if node_level not in name_to_coverage:
-                                name_to_coverage[node_level] = 0.
-                            name_to_coverage[node_level] += node.coverage
-                        result = pl.DataFrame({
-                            'level': list(name_to_coverage.keys()),
-                            'coverage': list(name_to_coverage.values())
-                        }).with_columns(pl.lit(profile.sample).alias('sample')).with_columns(
-                            ((pl.col('coverage') / pl.col('coverage').sum()).alias('relative_abundance') * 100).round(2),
-                        )
-
-                        if len(result.select(pl.col('level')).group_by('level').count()) in [7, 8]:
-                            # If there's 7 or 8 (including 0) levels, then assume that this is a regular taxonomy going on.
-                            levels = ['root','domain','phylum','class','order','family','genus','species']
-                            level_id_to_level_name = {i: levels[i] for i in range(len(levels))}
-                            result = result.with_columns(
-                                level = pl.col('level').replace_strict(level_id_to_level_name, return_dtype=pl.Utf8)
-                            )
+                        result = profile.taxonomic_level_coverage_table()
 
                         result = result.select([
                             'sample',