more docstrings
wasade committed Feb 3, 2025
1 parent 2bc1b48 commit 796c759
Showing 2 changed files with 119 additions and 5 deletions.
117 changes: 114 additions & 3 deletions micov/_io.py
@@ -18,16 +18,44 @@


class SetOfAll:
# forgot the formal name for this
"""A universal set."""

def __contains__(self, other):
return True
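
# Illustrative usage sketch (not part of this commit): because __contains__
# always returns True, an instance can stand in for a "keep everything" filter.
keep_everything = SetOfAll()
assert "G000006785" in keep_everything
assert 12345 in keep_everything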


def parse_bed_cov_to_df(data):
"""BED3 -> DataFrame.
Parameters
----------
data : IO-like
The data to parse
Returns
-------
pl.DataFrame
The BED3 data expressed within a DataFrame
"""
return _parse_bed_cov(data, None, None, False)
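
# Illustrative usage sketch (not part of this commit), assuming headerless BED3
# rows in an in-memory buffer are parsed the same way as an open file handle.
example_bed = io.BytesIO(b"G000006785\t0\t100\nG000006785\t250\t500\n")
example_bed_df = parse_bed_cov_to_df(example_bed)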


def _parse_bed_cov(data, feature_drop, feature_keep, lazy):
"""BED3 -> DataFrame.
Parameters
----------
data : IO-like
The data to parse
feature_drop : iterable
Any features to explicitly drop (all others are kept)
feature_keep : iterable
Any features to explicitly keep (all others are dropped)
lazy : bool
Return LazyFrame or DataFrame
"""
first_line = data.readline()
data.seek(0)

@@ -57,6 +85,18 @@ def _parse_bed_cov(data, feature_drop, feature_keep, lazy):


def parse_qiita_coverages(tgzs, *args, **kwargs):
"""Parse a Qiita-style coverages.tgz file.
Parameters
----------
tgzs : iterable of str
The file paths to process
*args : tuple, optional
Forwarded to _parse_qiita_coverages
**kwargs : dict, optional
Forwarded to _parse_qiita_coverages
"""
if not isinstance(tgzs, (list, tuple, set, frozenset)):
tgzs = [tgzs, ]

@@ -83,6 +123,33 @@ def parse_qiita_coverages(tgzs, *args, **kwargs):
def _parse_qiita_coverages(tgz, compress_size=50_000_000, sample_keep=None,
sample_drop=None, feature_keep=None,
feature_drop=None, append_sample_id=False):
"""Parse an individual Qiita-style coverages.tgz file.
A coverages.tgz file contains BED-3 style coverage information per sample.
Parameters
----------
tgz : str
The path to process
compress_size : int, optional
The number of records to buffer until a compression occurs
sample_keep : iterable, optional
Samples to explicitly keep (all others are dropped)
sample_drop : iterable, optional
Samples to explicitly drop (all others are kept)
feature_keep : iterable, optional
Features to explicitly keep (all others are dropped)
feature_drop : iterable, optional
Features to explicitly drop (all others are kept)
append_sample_id : bool, optional
Whether to include the detected sample IDs in the resulting DataFrame
Returns
-------
pl.DataFrame
A dataframe representing the coverage data
"""
# compress_size=None to disable compression
fp = tarfile.open(tgz)

@@ -130,6 +197,7 @@ def _parse_qiita_coverages(tgz, compress_size=50_000_000, sample_keep=None,


def _single_df(coverages):
"""Map [pl.DataFrame, ...] -> pl.DataFrame."""
if len(coverages) > 1:
df = pl.concat(coverages, rechunk=True)
elif len(coverages) == 0:
@@ -141,6 +209,7 @@ def _single_df(coverages):


def _check_and_compress(coverages, compress_size):
"""Check whether we have buffered enough, if so compress."""
rowcount = sum([len(df) for df in coverages])
if rowcount > compress_size:
df = compress(_single_df(coverages))
@@ -149,6 +218,7 @@ def _check_and_compress(coverages, compress_size):


def _test_has_header(line):
"""Test whether a line appears to be a header."""
if isinstance(line, bytes):
line = line.decode('utf-8')

@@ -167,6 +237,7 @@ def _test_has_header(line):


def _test_has_header_taxonomy(line):
"""Test whether a line appears to be a taxonomy header."""
if isinstance(line, bytes):
line = line.decode('utf-8')

@@ -185,6 +256,7 @@ def _test_has_header_taxonomy(line):


def parse_genome_lengths(lengths):
"""Parse a TSV representing feature and length information."""
with open(lengths) as fp:
first_line = fp.readline()

@@ -209,6 +281,7 @@ def parse_genome_lengths(lengths):


def parse_taxonomy(taxonomy):
"""Parse a TSV representing feature and taxonomy information."""
with open(taxonomy) as fp:
first_line = fp.readline()

@@ -228,6 +301,7 @@ def parse_taxonomy(taxonomy):


def set_taxonomy_as_id(coverages, taxonomy):
"""Add taxonomy information to a coverages DataFrame."""
missing = (set(coverages[COLUMN_GENOME_ID]) -
set(taxonomy[COLUMN_GENOME_ID]))
if len(missing) > 0:
@@ -243,6 +317,7 @@ def set_taxonomy_as_id(coverages, taxonomy):

# TODO: this is not the greatest method name
def parse_sam_to_df(sam):
"""Minimally parse SAM and compute stop coordinates from CIGAR."""
df = pl.read_csv(sam, separator='\t', has_header=False,
columns=SAM_SUBSET_SCHEMA.column_indices,
comment_prefix='@',
@@ -256,13 +331,26 @@ def parse_sam_to_df(sam):


def _add_file(tf, name, data):
"""Add a file to a tgz."""
ti = tarfile.TarInfo(name)
ti.size = len(data)
ti.mtime = int(time.time())
tf.addfile(ti, io.BytesIO(data))
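
# Illustrative usage sketch (not part of this commit): stage a small in-memory
# payload into a gzipped tar, mirroring how write_qiita_cov assembles its
# output. The archive name and member path here are hypothetical.
with tarfile.open("example-coverages.tgz", "w:gz") as example_tf:
    _add_file(example_tf, "coverages/sample_a.cov", b"G000006785\t0\t100\n")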


def write_qiita_cov(name, paths, lengths):
"""Construct a Qiita-style coverages.tgz.
Parameters
----------
name : str
The path of the tgz to write.
paths : iterable
The paths of the coverage data to include in the tgz.
lengths : pl.DataFrame
The genome -> length information.
"""
tf = tarfile.open(name, "w:gz")

coverages = []
@@ -308,6 +396,7 @@ def write_qiita_cov(name, paths, lengths):


def parse_sample_metadata(path):
"""Naively parse sample metadata, do not infer types."""
df = pl.read_csv(path, separator='\t', infer_schema_length=0)
return df.rename({df.columns[0]: COLUMN_SAMPLE_ID})
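
# Illustrative usage sketch (not part of this commit), assuming pl.read_csv
# also accepts an in-memory buffer: the first column is renamed to the sample
# ID column and no types are inferred.
example_metadata = io.BytesIO(b"sample_name\tgroup\nS1\tcontrol\nS2\ttreatment\n")
example_metadata_df = parse_sample_metadata(example_metadata)
assert example_metadata_df.columns[0] == COLUMN_SAMPLE_ID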

@@ -332,18 +421,39 @@ def _reader(sam):
yield fp


def _buf_to_bytes(buf):
def _flatten_buf(buf):
"""Map [data_1, ... data_N] -> IOobject(all_data) via simple join."""
if isinstance(buf[0], str):
return io.StringIO(''.join(buf))
else:
return io.BytesIO(b''.join(buf))
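
# Illustrative usage sketch (not part of this commit): buffered text lines are
# joined into a single file-like object, as done in compress_from_stream below.
example_chunk = _flatten_buf(["G000006785\t0\t100\n", "G000006785\t250\t500\n"])
assert example_chunk.read().count("\n") == 2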


def _subset_sam_to_bed(df):
"""Pull a subset of specific columns from a dataframe."""
return df[list(BED_COV_SCHEMA.columns)]


def compress_from_stream(sam, bufsize=100_000_000, disable_compression=False):
"""Compress SAM-like or BED3-like data.
Parameters
----------
sam : file path or buffer (e.g., sys.stdin)
The data to consume.
bufsize : int, optional
The number of records to buffer before compressing (i.e., collapsing
overlapping intervals).
disable_compression : bool, optional
If True, do not compress the intervals.
Returns
-------
pl.DataFrame
A BED-3 like dataframe describing the feature, start and stop regions
represented by the input SAM data.
"""
if disable_compression:
compress_f = _subset_sam_to_bed
else:
@@ -370,14 +480,15 @@ def compress_from_stream(sam, bufsize=100_000_000, disable_compression=False):
parse_f = parse_sam_to_df

while len(buf) > 0:
next_df = compress_f(parse_f(_buf_to_bytes(buf)))
next_df = compress_f(parse_f(_flatten_buf(buf)))
current_df = compress_f(pl.concat([current_df, next_df]))
buf = data.readlines(bufsize)

return current_df.rechunk()
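
# Illustrative usage sketch (not part of this commit), assuming an in-memory
# buffer is handled like sys.stdin: overlapping BED3-like intervals for the
# same feature are collapsed unless disable_compression is set.
example_stream = io.StringIO("G000006785\t0\t100\nG000006785\t50\t200\n")
example_intervals = compress_from_stream(example_stream)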


def parse_coverage(data, features_to_keep):
"""Parse a simple TSV descriving total coverage."""
cov_df = pl.read_csv(data.read(), separator='\t',
new_columns=GENOME_COVERAGE_SCHEMA.columns,
schema_overrides=GENOME_COVERAGE_SCHEMA.dtypes_dict).lazy()
7 changes: 5 additions & 2 deletions micov/_plot.py
@@ -433,7 +433,7 @@ def get_covered(x_start_stop):


def single_sample_position_plot(positions, lengths, output, scale=None):
"""A simple position plot.
"""Construct a metadata-independent position plot.
Parameters
----------
@@ -444,6 +444,10 @@ def single_sample_position_plot(positions, lengths, output, scale=None):
output : str
A prefix to use on plotting. This can include a directory, for instance,
"foo/bar/theprefix"
scale : int, optional
If specified, represent the genome as `scale` buckets. A bucket is
considered represented if any position within the bucket is covered.
"""
positions = (positions
@@ -505,7 +509,6 @@ def position_plot(metadata, coverage, positions, target, variable, output,
is covered
"""

if scale is not None and scale <= 1:
raise ValueError("`scale` must be greater than 1")

