From 58c120aec33476808e0568ca107e29d97d6ff96f Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Fri, 15 Nov 2024 15:52:52 -0800 Subject: [PATCH 1/2] Unscaled cumulative plots, and cumulative with a null curve via Monte Carlo --- micov/_plot.py | 147 ++++++++++++++++++++++++++++++++++++++++++++----- micov/cli.py | 61 +++++++++++++++++++- 2 files changed, 191 insertions(+), 17 deletions(-) diff --git a/micov/_plot.py b/micov/_plot.py index ddbc92c..09f4426 100644 --- a/micov/_plot.py +++ b/micov/_plot.py @@ -16,7 +16,8 @@ def ordered_coverage(coverage, grp, target): .filter(pl.col(COLUMN_GENOME_ID) == target) .sort(COLUMN_PERCENT_COVERED) .with_row_index() - .with_columns(x=pl.col('index') / pl.len())).collect() + .with_columns(x=pl.col('index') / pl.len(), + x_unscaled=pl.col('index'))).collect() def slice_positions(positions, id_): @@ -40,6 +41,119 @@ def per_sample_plots(all_coverage, all_covered_positions, metadata, sample_metadata_column, output, scale=10000) +def per_sample_plots_monte(all_coverage, all_covered_positions, metadata, + sample_metadata_column, output, target_lookup, iters): + for genome in all_coverage[COLUMN_GENOME_ID].unique(): + target_name = target_lookup[genome] + cumulative_monte(metadata, all_coverage, all_covered_positions, genome, + sample_metadata_column, output, target_name, iters) + + +def compute_cumulative(coverage, grp, target, target_positions, lengths): + current = pl.DataFrame([], schema=BED_COV_SCHEMA.dtypes_flat) + grp_coverage = ordered_coverage(coverage, grp, target) + + if len(grp_coverage) == 0: + return None, None + + cur_y = [] + cur_x = grp_coverage['x_unscaled'] + for id_ in grp_coverage[COLUMN_SAMPLE_ID]: + next_ = slice_positions(target_positions, id_).collect() + current = compress(pl.concat([current, next_])) + per_cov = coverage_percent(current, lengths).collect() + cur_y.append(per_cov[COLUMN_PERCENT_COVERED].item(0)) + return cur_x, cur_y + + +def cumulative_monte(metadata, coverage, positions, target, variable, output, + target_name, iters): + plt.figure(figsize=(12, 8)) + labels = [] + + target_positions = positions.filter(pl.col(COLUMN_GENOME_ID) == target) + coverage = coverage.filter(pl.col(COLUMN_GENOME_ID) == target) + cov_samples = coverage.select(pl.col(COLUMN_SAMPLE_ID).unique()) + metadata = metadata.filter(pl.col(COLUMN_SAMPLE_ID).is_in(cov_samples)) + + if len(target_positions) == 0: + raise ValueError() + + if len(coverage) == 0: + raise ValueError() + + lengths = coverage[[COLUMN_GENOME_ID, COLUMN_LENGTH]].unique() + + if len(lengths) > 1: + raise ValueError() + + length = lengths[COLUMN_LENGTH].item(0) + value_order = metadata.select(pl.col(variable) + .unique() + .sort())[variable] + max_n = 0 + for name, color in zip(value_order, range(0, 10)): + color = f'C{color}' + + grp = metadata.filter(pl.col(variable) == name) + + n = len(grp) + if n < 10: + continue + max_n = max(n, max_n) + + cur_x, cur_y = compute_cumulative(coverage, grp, target, + target_positions, lengths) + if cur_x is None: + continue + else: + labels.append(f"{name} (n={len(cur_x)})") + plt.plot(cur_x, cur_y, color=color) + + if not labels: + return + + monte_y = [] + for it in range(iters): + monte = (metadata.select(pl.col(COLUMN_SAMPLE_ID) + .shuffle()) + .head(max_n)) + grp_monte = metadata.filter(pl.col(COLUMN_SAMPLE_ID) + .is_in(monte)) + monte_x, cur_y = compute_cumulative(coverage, grp_monte, target, + target_positions, lengths) + monte_y.append(cur_y) + + monte_y = np.asarray(monte_y) + median = np.median(monte_y, axis=0) + std = np.std(monte_y, axis=0) + + plt.plot(monte_x, median, color='k', linestyle='dotted', linewidth=1, + alpha=0.6) + plt.plot(monte_x, median + std, color='k', linestyle='--', linewidth=1, + alpha=0.6) + plt.plot(monte_x, median - std, color='k', linestyle='--', linewidth=1, + alpha=0.6) + plt.fill_between(monte_x, median - std, median + std, color='k', + alpha=0.1) + labels.append(f'Monte Carlo (n={len(monte_x)})') + + ax = plt.gca() + ax.set_ylabel('Percent genome covered', fontsize=16) + ax.set_xlabel('Within group sample rank by coverage', fontsize=16) + ax.tick_params(axis='both', which='major', labelsize=16) + ax.tick_params(axis='both', which='minor', labelsize=16) + ax.set_xlim(0, max_n - 1) + ax.set_ylim(0, 100) + ax.set_title((f'Cumulative: {target_name}({target}) ' + f'({length}bp)'), fontsize=16) + plt.legend(labels, fontsize=14) + + plt.tight_layout() + plt.savefig(f'{output}.{target_name}.{target}.{variable}.cumulative-monte.png') + plt.close() + + def cumulative(metadata, coverage, positions, target, variable, output): plt.figure(figsize=(12, 8)) labels = [] @@ -61,23 +175,19 @@ def cumulative(metadata, coverage, positions, target, variable, output): length = lengths[COLUMN_LENGTH].item(0) - for (name, ), grp in metadata.group_by([variable, ], maintain_order=True): + value_order = metadata.select(pl.col(variable).unique().sort())[variable] + for name in value_order: + grp = metadata.filter(pl.col(variable) == name) current = pl.DataFrame([], schema=BED_COV_SCHEMA.dtypes_flat) grp_coverage = ordered_coverage(coverage, grp, target) - if len(grp_coverage) == 0: + cur_x, cur_y = compute_cumulative(coverage, grp, target, + target_positions, lengths) + if cur_x is None: continue labels.append(name) - cur_y = [] - cur_x = grp_coverage['x'] - for id_ in grp_coverage[COLUMN_SAMPLE_ID]: - next_ = slice_positions(target_positions, id_).collect() - current = compress(pl.concat([current, next_])) - per_cov = coverage_percent(current, lengths).collect() - cur_y.append(per_cov[COLUMN_PERCENT_COVERED].item(0)) - covs.append(cur_y) plt.plot(cur_x, cur_y) @@ -108,7 +218,11 @@ def non_cumulative(metadata, coverage, target, variable, output): labels = [] covs = [] - for (name, ), grp in metadata.group_by([variable, ], maintain_order=True): + value_order = metadata.select(pl.col(variable) + .unique() + .sort())[variable] + for name in value_order: + grp = metadata.filter(pl.col(variable) == name) grp_coverage = ordered_coverage(coverage, grp, target) if len(grp_coverage) == 0: @@ -190,9 +304,11 @@ def position_plot(metadata, coverage, positions, target, variable, output, scale colors = [] target_positions = positions.filter(pl.col(COLUMN_GENOME_ID) == target).lazy() - for ((name, ), grp), color in zip(metadata.group_by([variable, ], - maintain_order=True), - range(0, 10)): + value_order = metadata.select(pl.col(variable) + .unique() + .sort())[variable] + for name, color in zip(value_order, range(0, 10)): + grp = metadata.filter(pl.col(variable) == name) color = f'C{color}' grp_coverage = ordered_coverage(coverage, grp, target) @@ -254,6 +370,7 @@ def position_plot(metadata, coverage, positions, target, variable, output, scale leg = ax.get_legend() for i, lh in enumerate(leg.legendHandles): lh.set_color(colors[i]) + lh._sizes = [5.0, ] ax.tick_params(axis='both', which='major', labelsize=16) ax.tick_params(axis='both', which='minor', labelsize=16) diff --git a/micov/cli.py b/micov/cli.py index d90ac75..1baa2fb 100644 --- a/micov/cli.py +++ b/micov/cli.py @@ -14,9 +14,12 @@ from ._cov import coverage_percent from ._convert import cigar_to_lens from ._per_sample import per_sample_coverage -from ._plot import per_sample_plots, single_sample_position_plot +from ._plot import (per_sample_plots, per_sample_plots_monte, + single_sample_position_plot) from ._utils import logger -from ._constants import COLUMN_SAMPLE_ID, COLUMN_GENOME_ID, BED_COV_SAMPLEID_SCHEMA +from ._constants import (COLUMN_SAMPLE_ID, COLUMN_GENOME_ID, + BED_COV_SAMPLEID_SCHEMA, + COLUMN_START, COLUMN_CIGAR, COLUMN_STOP) from ._quant import pos_to_bins, make_csv_ready @@ -243,6 +246,60 @@ def per_sample_group(parquet_coverage, sample_metadata, sample_metadata_column, sample_metadata_column, output) +@cli.command() +@click.option('--parquet-coverage', type=click.Path(exists=False), + required=True, help=('Pre-computed coverage data as parquet. ' + 'This should be the basename used, i.e. ' + 'for "foo.coverage.parquet", please use ' + '"foo"')) +@click.option('--sample-metadata', type=click.Path(exists=True), + required=True, + help='A metadata file with the sample metadata') +@click.option('--sample-metadata-column', type=str, + required=True, + help='The column to consider in the sample metadata') +@click.option('--features-to-keep', type=click.Path(exists=True), + required=False, + help='A metadata file with the features to keep') +@click.option('--iters', type=int, default=10, required=False) +@click.option('--target-names', type=str, required=False) +@click.option('--output', type=click.Path(exists=False), required=True) +@click.option('--plot', is_flag=True, default=False, + help='Generate plots from features') +def per_sample_monte(parquet_coverage, sample_metadata, sample_metadata_column, + features_to_keep, output, plot, iters, target_names): + """Generate sample group plots and coverage data with a null curve.""" + + _load_db(parquet_coverage, sample_metadata, features_to_keep) + + all_covered_positions = duckdb.sql("SELECT * from covered_positions").pl() + all_coverage = duckdb.sql("SELECT * FROM coverage").pl() + metadata_pl = duckdb.sql("SELECT * FROM metadata").pl() + + if target_names is not None: + target_names = dict(pl.scan_csv(target_names, + separator='\t', + new_columns=['feature-id', 'lineage'], + has_header=False) + .with_columns(pl.col('lineage') + .str + .split(';') + .list + .get(-1) + .str + .replace_all(r" |\[|\]", "_") + .alias('species')) + .select('feature-id', 'species') + .collect() + .iter_rows()) + else: + sql = "SELECT DISTINCT genome_id FROM coverage" + target_names = {k[0]: k[0] for k in duckdb.sql(sql).fetchall()} + + per_sample_plots_monte(all_coverage, all_covered_positions, metadata_pl, + sample_metadata_column, output, target_names, iters) + + def _load_db(dbbase, sample_metadata, features_to_keep): metadata_pl = parse_sample_metadata(sample_metadata) sample_column = metadata_pl.columns[0] From 107b332e6742a2d2f4adcdc833f3ffd2aa4f57fc Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Fri, 15 Nov 2024 16:47:34 -0800 Subject: [PATCH 2/2] lint --- micov/cli.py | 1 - 1 file changed, 1 deletion(-) diff --git a/micov/cli.py b/micov/cli.py index 1baa2fb..3fc41b9 100644 --- a/micov/cli.py +++ b/micov/cli.py @@ -269,7 +269,6 @@ def per_sample_group(parquet_coverage, sample_metadata, sample_metadata_column, def per_sample_monte(parquet_coverage, sample_metadata, sample_metadata_column, features_to_keep, output, plot, iters, target_names): """Generate sample group plots and coverage data with a null curve.""" - _load_db(parquet_coverage, sample_metadata, features_to_keep) all_covered_positions = duckdb.sql("SELECT * from covered_positions").pl()