Skip to content

Commit

Permalink
condense: taxonomic_profile_coverage: Abstract method from Summariser.
Browse files Browse the repository at this point in the history
  • Loading branch information
wwood committed Nov 8, 2024
1 parent 3219d65 commit d66fe85
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 23 deletions.
34 changes: 33 additions & 1 deletion singlem/condense.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,10 @@
import numpy as np
import extern
import sys

from queue import Queue

import polars as pl

from .archive_otu_table import ArchiveOtuTable, ArchiveOtuTableEntry
from .metapackage import Metapackage
from .taxonomy import *
Expand Down Expand Up @@ -942,6 +943,37 @@ def each_sample_wise(io):
if current_sample is not None:
yield CondensedCommunityProfile(current_sample, current_root)

def taxonomic_level_coverage_table(self):
    '''Return a polars DataFrame of coverage summed per taxonomic level.

    The coverage of every node yielded by breadth_first_iter() is added to
    the total for that node's level (as given by calculate_level()). Nodes
    at level 0 are skipped.

    The returned DataFrame has one row per observed level and these columns:

    * level: the level id, or the level name (domain, phylum, ...) when the
      profile looks like a standard taxonomy - see below.
    * coverage: summed coverage of the nodes at that level.
    * sample: self.sample, repeated on every row.
    * relative_abundance: the level's coverage as a percentage of the
      coverage summed over all levels, rounded to 2 decimal places.

    If 7 or 8 distinct levels are observed, and each of them has an entry
    in the standard [root], domain, phylum, class, order, family, genus,
    species list, level ids are replaced by those names. Otherwise the
    level ids are returned unchanged.

    Returning a polars DataFrame maybe isn't the most pythonic, and so this
    might be changed in the future.'''
    level_to_coverage = {}
    for node in self.breadth_first_iter():
        node_level = node.calculate_level()
        if node_level == 0:
            # Level 0 ('root' in the standard naming below) is not reported.
            continue
        level_to_coverage[node_level] = level_to_coverage.get(node_level, 0.) + node.coverage

    # Percentage of the coverage summed across every reported level.
    relative_abundance = (
        pl.col('coverage') / pl.col('coverage').sum() * 100
    ).round(2).alias('relative_abundance')

    result = pl.DataFrame({
        'level': list(level_to_coverage.keys()),
        'coverage': list(level_to_coverage.values()),
    }).with_columns(
        pl.lit(self.sample).alias('sample'),
    ).with_columns(
        relative_abundance,
    )

    levels = ['root', 'domain', 'phylum', 'class', 'order', 'family', 'genus', 'species']
    level_id_to_level_name = dict(enumerate(levels))

    # The dict keys are already distinct, so their number is the number of
    # levels; no need to group the DataFrame to count them.
    looks_standard = len(level_to_coverage) in [7, 8]
    # replace_strict raises on any value missing from the mapping, so only
    # rename when every observed level id has a name.
    all_levels_named = all(
        level in level_id_to_level_name for level in level_to_coverage
    )
    if looks_standard and all_levels_named:
        # 7 or 8 levels: assume that this is a regular taxonomy going on.
        result = result.with_columns(
            level=pl.col('level').replace_strict(level_id_to_level_name, return_dtype=pl.Utf8)
        )
    return result


class CondensedCommunityProfileKronaWriter:
@staticmethod
def write_krona(condensed_profiles, output_file):
Expand Down
23 changes: 1 addition & 22 deletions singlem/summariser.py
Original file line number Diff line number Diff line change
Expand Up @@ -556,28 +556,7 @@ def write_taxonomic_level_coverage_table(**kwargs):
for profile_file in input_taxonomic_profiles:
with open(profile_file) as f:
for profile in CondensedCommunityProfile.each_sample_wise(f):
name_to_coverage = {}
for node in profile.breadth_first_iter():
node_level = node.calculate_level()
if node_level == 0:
continue
if node_level not in name_to_coverage:
name_to_coverage[node_level] = 0.
name_to_coverage[node_level] += node.coverage
result = pl.DataFrame({
'level': list(name_to_coverage.keys()),
'coverage': list(name_to_coverage.values())
}).with_columns(pl.lit(profile.sample).alias('sample')).with_columns(
((pl.col('coverage') / pl.col('coverage').sum()).alias('relative_abundance') * 100).round(2),
)

if len(result.select(pl.col('level')).group_by('level').count()) in [7, 8]:
# If there's 7 or 8 (including 0) levels, then assume that this is a regular taxonomy going on.
levels = ['root','domain','phylum','class','order','family','genus','species']
level_id_to_level_name = {i: levels[i] for i in range(len(levels))}
result = result.with_columns(
level = pl.col('level').replace_strict(level_id_to_level_name, return_dtype=pl.Utf8)
)
result = profile.taxonomic_level_coverage_table()

result = result.select([
'sample',
Expand Down

0 comments on commit d66fe85

Please sign in to comment.