Skip to content

Commit

Permalink
chore(alignments-filter): removed all ibis
Browse files Browse the repository at this point in the history
  • Loading branch information
alsmith151 committed Nov 4, 2023
1 parent f291d2f commit c831316
Showing 1 changed file with 27 additions and 32 deletions.
59 changes: 27 additions & 32 deletions capcruncher/cli/alignments_filter.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,11 @@
import os
import tempfile
import pathlib
import tempfile

import ibis
import numpy.core.multiarray
import pandas as pd
import polars as pl
from loguru import logger

ibis.options.interactive = True


from capcruncher.api.filter import CCSliceFilter, TiledCSliceFilter, TriCSliceFilter
from capcruncher.api.io import parse_bam
from capcruncher.api.statistics import SliceFilterStatsList
Expand All @@ -36,17 +31,19 @@ def merge_annotations(slices: os.PathLike, annotations: os.PathLike) -> pd.DataF
"""

logger.info("Opening annotations")

with pl.StringCache():

df_slices = pl.scan_parquet(slices)
df_annotations = pl.scan_parquet(annotations).rename({"Chromosome": "chrom", "Start": "start", "End": "end"})

df_slices = df_slices.join(df_annotations, on=["slice_name", "chrom", "start"], how="inner")
df_slices = pl.scan_parquet(slices)
df_annotations = pl.scan_parquet(annotations).rename(
{"Chromosome": "chrom", "Start": "start", "End": "end"}
)

df_slices = df_slices.join(
df_annotations, on=["slice_name", "chrom", "start"], how="inner"
)
df_slices = df_slices.unique(subset=["slice_name"])

return df_slices.collect().to_pandas()



def filter(
Expand Down Expand Up @@ -114,23 +111,20 @@ def filter(
"""

with logger.catch():

# Read bam file and merge annotations
tmp = pathlib.Path(output_prefix) / "_tmp.parquet"
if not tmp.parent.exists():
tmp.parent.mkdir(parents=True)


logger.info("Loading bam file")
parse_bam(bam).to_parquet(tmp)

logger.info("Merging bam file with annotations")
df_alignment = merge_annotations(tmp, annotations)

if "blacklist" not in df_alignment.columns:
df_alignment["blacklist"] = 0

tmp.unlink()
with tempfile.TemporaryDirectory() as tmpdir:
tmp = pathlib.Path(tmpdir) / "tmp.parquet"

logger.info("Loading bam file")
# It's faster to write to parquet and then read it back than to join both dataframes with pandas
parse_bam(bam).to_parquet(tmp)

# Join bam file with annotations
logger.info("Merging bam file with annotations")
df_alignment = merge_annotations(tmp, annotations)

# Make sure that the blacklist column is present
if "blacklist" not in df_alignment.columns:
df_alignment["blacklist"] = 0

Check warning on line 127 in capcruncher/cli/alignments_filter.py

View check run for this annotation

Codecov / codecov/patch

capcruncher/cli/alignments_filter.py#L127

Added line #L127 was not covered by tests

# Initialise SliceFilter
# If no custom filtering, will use the class default.
Expand Down Expand Up @@ -169,7 +163,8 @@ def filter(
"capture_slices": "capture",
"capture_capture": "viewpoint",
}
) )
)
)

df_fragments.to_parquet(
f"{output_prefix}.fragments.parquet",
Expand Down

0 comments on commit c831316

Please sign in to comment.