Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

This should be in mro/common for general use #44

Open
github-actions bot opened this issue May 25, 2021 · 0 comments
Open

This should be in mro/common for general use #44

github-actions bot opened this issue May 25, 2021 · 0 comments
Labels

Comments

@github-actions
Copy link

This should be in mro/common for general use

# TODO: This should be in mro/common for general use

#

#
# Copyright (c) 2019 10x Genomics, Inc. All rights reserved.
#

filetype json;
filetype bam;
#
# @include "_sort_and_mark_dups_stages.mro"
#

#
# Copyright (c) 2019 10x Genomics, Inc. All rights reserved.
#
filetype bam;
filetype bam.bai;
filetype tsv.gz;
filetype tsv.gz.tbi;
filetype json;
filetype csv;
#
# @include "_peak_caller_stages.mro"
#

#
# Copyright (c) 2019 10x Genomics, Inc. All rights reserved.
#

filetype bedgraph;
filetype pickle;
filetype tsv.gz;
filetype tsv.gz.tbi;
filetype bed;
filetype json;
#
# @include "_basic_sc_atac_counter_stages.mro"
#

#
# Copyright (c) 2019 10x Genomics, Inc. All rights reserved.
#
filetype tsv.gz;
filetype tsv.gz.tbi;
filetype csv;
filetype json;
filetype bed;
filetype pickle;
filetype h5;
#
# @include "_produce_cell_barcodes_stages.mro"
#

#
# Copyright (c) 2019 10x Genomics, Inc. All rights reserved.
#
filetype tsv.gz;
filetype tsv.gz.tbi;
filetype csv;
filetype json;
filetype bed;
filetype pickle;
filetype h5;
filetype npy.gz;
#
# @include "_sc_atac_metric_collector_stages.mro"
#

#
# Copyright (c) 2019 10x Genomics, Inc. All rights reserved.
#
filetype tsv.gz;
filetype tsv.gz.tbi;
filetype bed;
filetype bam;
filetype csv;
filetype json;
filetype h5;
filetype txt;
filetype pickle;
#
# @include "_peak_annotator_stages.mro"
#

#
# Copyright (c) 2019 10x Genomics, Inc. All rights reserved.
#

filetype bed;
filetype tsv;
filetype h5;
filetype gz;
filetype pickle;
#
# @include "_sc_atac_analyzer_stages.mro"
#

#
# Copyright (c) 2019 10x Genomics, Inc. All rights reserved.
#
filetype tsv;
filetype h5;
filetype pickle;
filetype gz;
filetype bed;
filetype csv;
#
# @include "_sc_atac_reporter_stages.mro"
#

#
# Copyright (c) 2019 10x Genomics, Inc. All rights reserved.
#
filetype json;
filetype html;
filetype csv;
filetype h5;
filetype bam;
#
# @include "_atac_cloupe_stages.mro"
#

#
# Copyright (c) 2019 10x Genomics, Inc. All rights reserved.
#
filetype cloupe;
filetype csv;
filetype json;
filetype h5;
filetype bed;
filetype tsv.gz.tbi;
#
# @include "_preflight_stages.mro"
#

#
# Copyright (c) 2019 10x Genomics, Inc. All rights reserved.
#

filetype csv;
filetype bed;
filetype tsv.gz;
filetype tsv.gz.tbi;

#
# @include "_aligner_stages.mro"
#

# SETUP_CHUNKS chunks up the input fastq data into sets of matched R1, R2, SI, and BC fastq files.
# input_mode specifies how the FASTQs were generated. There are two modes:
#
# 1. "BCL_PROCESSOR"
#
# FASTQs produced by the 10X BCL_PROCESSOR pipeline. This mode assumes the FASTQ files obey the internal
# naming conventions and the reads have been interleaved into RA FASTQ files.
#
# 2. "ILMN_BCL2FASTQ"
#
# FASTQs produced directly by Illumina BCL2FASTQ v1.8.4. For this mode, BCL2FASTQ must be configured to emit the
# index2 read, rather than using it for dual-index demultiplexing:
#
# configureBclToFastq.pl --no-eamss --use-bases-mask=Y100,I8,Y14,Y100 --input-dir=<basecalls_dir> \
#     --output-dir=<output_dir> --sample-sheet=<sample_sheet.csv>
#
# The sample sheet must be formatted as per the BCL2FASTQ documentation (10 column csv), and must contain a row for
# each sample index used. The sequencer must have been run in dual index mode, with the second index read (used to
# read the 10X barcode) emitted as the R2 output file. The --use-bases-mask argument should be set to the read
# length used.
#
# Interface summary: four inputs (sample_id, sample_def, input_mode, downsample) and three outputs (the chunk
# maps, the list of read group strings, and a JSON file describing the downsampling result). The stage code is
# Python, at stages/processing/setup_chunks. No split section or resource overrides are declared here.
stage SETUP_CHUNKS(
    in  string   sample_id        "id of the sample",
    in  map[]    sample_def       "list of dictionary specifying input data",
    in  string   input_mode       "configuration of the input fastqs",
    in  map      downsample       "map specifies either subsample_rate (float) or gigabases (int)",
    out map[]    chunks           "map has barcode, barcode_reverse_complement, sample_index, read1, read2, gem_group, and read_group fields",
    out string[] read_groups      "list of strings representing read groups",
    out json     downsample_info  "info about downsampling result",
    src py       "stages/processing/setup_chunks",
)

# Trims adapter sequences from reads and massages fastq output into a fixed format (interleaved R1 file, etc.)
#
# Split stage: every chunk job is handed one `chunk` map. The name `chunks` is declared twice, once as the
# input list and once as the output list. Three JSON outputs accompany the output chunk list: bc_counts,
# lot_info and read_counts. Stage code: Python, at stages/processing/trim_reads.
stage TRIM_READS(
    in  map[]  chunks,
    in  string barcode_whitelist,
    in  int    max_read_num,
    in  map    trim_def,
    in  map    adapters,
    out map[]  chunks,
    out json   bc_counts,
    out json   lot_info,
    out json   read_counts,
    src py     "stages/processing/trim_reads",
) split (
    in  map    chunk,
) using (
    # Opts the stage into Martian's strict volatile-data-removal mode.
    volatile = strict,
)

# Aligns the reads to the input reference, producing chunked bam files
#
# Split stage: each chunk job receives one `chunk` map. The only output is an unnamed `bam[]`, so callers bind
# the stage itself rather than a named field (e.g. `align = ALIGN_READS` in the _ALIGNER pipeline).
# Stage code: Python, at stages/processing/align_reads.
stage ALIGN_READS(
    in  map[]  chunks,
    in  string aligner,
    in  string aligner_method,
    in  string reference_path,
    in  string read_group_sample,
    in  int    num_threads,
    out bam[],
    src py     "stages/processing/align_reads",
) split (
    in  map    chunk,
) using (
    # N.B. No index files are generated for the bam
    volatile = strict,
)

#
# @include "_aligner.mro"
#

# Front end of the counter: chunk the input FASTQs, trim adapters, then align the trimmed reads to a reference.
#
# SETUP_CHUNKS -> TRIM_READS -> ALIGN_READS are chained through their `chunks` outputs. The aligner is pinned to
# "bwa" with method "MEM" and 4 threads, and TRIM_READS is always called with max_read_num = 5000000. Each call
# is additionally flagged `volatile = true` at the call site. Bindings below follow the order in which the
# callee declares its parameters.
pipeline _ALIGNER(
    in  string sample_id,
    in  string fastq_mode         "configuration of the input fastqs",
    in  map[]  sample_def,
    in  string reference_path     "this is the reference_path",
    in  string barcode_whitelist  "name of barcode whitelist file",
    in  map    trim_def,
    in  map    adapters,
    in  string read_group_sample  "sample header for BAM file",
    in  map    downsample,
    out bam[]  align,
    out map[]  chunks,
    out json   bc_counts,
    out json   lot_info           "gelbead lot detected",
    out json   read_counts        "total # of read pairs before and after adapter trimming",
    out json   downsample_info    "info on downsampling",
)
{
    # Step 1: turn the sample definition into chunk maps. Note the pipeline-level name `fastq_mode` is
    # handed to the stage as `input_mode`.
    call SETUP_CHUNKS(
        sample_id  = self.sample_id,
        sample_def = self.sample_def,
        input_mode = self.fastq_mode,
        downsample = self.downsample,
    ) using (
        volatile = true,
    )

    # Step 2: adapter trimming on the chunks produced above.
    call TRIM_READS(
        chunks            = SETUP_CHUNKS.chunks,
        barcode_whitelist = self.barcode_whitelist,
        max_read_num      = 5000000,
        trim_def          = self.trim_def,
        adapters          = self.adapters,
    ) using (
        volatile = true,
    )

    # Step 3: alignment of the trimmed chunks with fixed aligner settings.
    call ALIGN_READS(
        chunks            = TRIM_READS.chunks,
        aligner           = "bwa",
        aligner_method    = "MEM",
        reference_path    = self.reference_path,
        read_group_sample = self.read_group_sample,
        num_threads       = 4,
    ) using (
        volatile = true,
    )

    # Outputs, grouped by the stage that produces them.
    return (
        downsample_info = SETUP_CHUNKS.downsample_info,
        chunks          = TRIM_READS.chunks,
        bc_counts       = TRIM_READS.bc_counts,
        lot_info        = TRIM_READS.lot_info,
        read_counts     = TRIM_READS.read_counts,
        align           = ALIGN_READS,
    )
}

#
# @include "_sort_and_mark_dups_stages.mro"
#

# Attaches raw and corrected barcode sequences to the aligned reads
#
# Split stage: each chunk job receives one aligned BAM (`align_chunk`) together with one `chunk` map.
# Outputs are a list of BAMs (`output`) and an integer `perfect_read_count`.
# Stage code: Python, at stages/processing/attach_bcs.
stage ATTACH_BCS(
    in  string barcode_whitelist,
    in  bam[]  align,
    in  map[]  chunks,
    in  bool   paired_end,
    in  bool   exclude_non_bc_reads,
    in  float  bc_confidence_threshold,
    in  json   bc_counts,
    out bam[]  output,
    out int    perfect_read_count,
    src py     "stages/processing/attach_bcs",
) split (
    in  bam    align_chunk,
    in  map    chunk,
) using (
    # N.B. No index files are generated for the bam
    volatile = strict,
)

# Takes a list of BAMs and emits a single BAM. Split stage: each chunk job receives one BAM (`chunk_input`).
# Stage code: Python, at stages/processing/sort_reads_by_pos.
#
# NOTE(review): the stage name says "sort by position" while the output is called `tagsorted_bam`; the
# resulting sort order cannot be confirmed from this declaration alone - verify against the stage code.
stage SORT_READS_BY_POS(
    in  bam[] input,
    out bam   tagsorted_bam,
    src py    "stages/processing/sort_reads_by_pos",
) split (
    in  bam   chunk_input,
) using (
    # N.B. No index files are generated for the bam
    volatile = strict,
)

# Marks duplicates in the reads using barcodes and fragment alignments to detect PCR and optical/diffusion duplicates
#
# Outputs: a BAM plus its .bai index, a per-barcode CSV (`singlecell_mapping`), and a bgzipped TSV of
# fragments plus its .tbi index. The index files are declared as outputs in their own right.
# Split stage: each chunk job receives a `lane_map`, string `chunk_start` / `chunk_end` bounds and an integer
# `chunk_num`. Stage code: Python, at stages/processing/mark_duplicates.
stage MARK_DUPLICATES(
    in  bam        input,
    in  string     reference_path,
    in  json       raw_barcode_counts,
    in  string     barcode_whitelist,
    out bam        output,
    out bam.bai    index,
    out csv        singlecell_mapping,
    out tsv.gz     fragments,
    out tsv.gz.tbi fragments_index,
    src py         "stages/processing/mark_duplicates",
) split (
    in  map        lane_map,
    in  string     chunk_start,
    in  string     chunk_end,
    in  int        chunk_num,
) using (
    # N.B. BAM/BED index files are explicitly bound where used
    volatile = strict,
)

#
# @include "_sort_and_mark_dups.mro"
#

# Barcode attachment, sorting and duplicate marking for the aligned reads.
#
# ATTACH_BCS -> SORT_READS_BY_POS -> MARK_DUPLICATES. The barcode-attached BAM list from ATTACH_BCS is also
# surfaced unchanged as `read_paired_bam`. ATTACH_BCS is called with fixed settings: paired_end = true,
# exclude_non_bc_reads = false and bc_confidence_threshold = 0.975. Every call is flagged `volatile = true`
# at the call site. Bindings follow the callee's declared parameter order; the return block follows this
# pipeline's declared output order.
pipeline _SORT_AND_MARK_DUPS(
    in  bam[]      align,
    in  map[]      chunks,
    in  string     barcode_whitelist,
    in  json       bc_counts,
    in  string     reference_path,
    out bam        possorted_bam        "bam file sorted by position",
    out bam.bai    possorted_bam_index  "position-sorted bam index",
    out tsv.gz     fragments,
    out tsv.gz.tbi fragments_index,
    out csv        singlecell_mapping,
    out bam[]      read_paired_bam,
)
{
    # Tag every aligned read with its barcode information.
    call ATTACH_BCS(
        barcode_whitelist       = self.barcode_whitelist,
        align                   = self.align,
        chunks                  = self.chunks,
        paired_end              = true,
        exclude_non_bc_reads    = false,
        bc_confidence_threshold = 0.975,
        bc_counts               = self.bc_counts,
    ) using (
        volatile = true,
    )

    # Collapse the per-chunk BAMs into one sorted BAM.
    call SORT_READS_BY_POS(
        input = ATTACH_BCS.output,
    ) using (
        volatile = true,
    )

    # Duplicate marking; also yields the fragments file and the per-barcode mapping CSV.
    call MARK_DUPLICATES(
        input              = SORT_READS_BY_POS.tagsorted_bam,
        reference_path     = self.reference_path,
        raw_barcode_counts = self.bc_counts,
        barcode_whitelist  = self.barcode_whitelist,
    ) using (
        volatile = true,
    )

    return (
        possorted_bam       = MARK_DUPLICATES.output,
        possorted_bam_index = MARK_DUPLICATES.index,
        fragments           = MARK_DUPLICATES.fragments,
        fragments_index     = MARK_DUPLICATES.fragments_index,
        singlecell_mapping  = MARK_DUPLICATES.singlecell_mapping,
        read_paired_bam     = ATTACH_BCS.output,
    )
}

#
# @include "_peak_caller_stages.mro"
#

# Consumes the fragments file (with its tabix index) and a reference, and emits a bedgraph of cut sites plus a
# pickled `count_dict`. Split stage: each chunk job receives one `contig` name.
# Stage code: Python, at stages/processing/count_cut_sites.
#
# NOTE(review): `reference_path` is typed `path` here, whereas most other stages in this file declare it as
# `string` - confirm whether the difference is intentional.
stage COUNT_CUT_SITES(
    in  path       reference_path,
    in  tsv.gz     fragments,
    in  tsv.gz.tbi fragments_index,
    out bedgraph   cut_sites,
    out pickle     count_dict,
    src py         "stages/processing/count_cut_sites",
) split (
    in  string     contig,
) using (
    # N.B. We explicitly bind the index file (fragments_index is a declared input above)
    volatile = strict,
)

# Consumes the cut-site bedgraph and pickled `count_dict` (both are outputs of COUNT_CUT_SITES in _PEAK_CALLER)
# and emits a BED file of peaks plus a JSON of peak metrics. Split stage: each chunk job receives a `contig`
# name, a float list `params` and a float `threshold`. Stage code: Python, at stages/processing/detect_peaks.
stage DETECT_PEAKS(
    in  bedgraph cut_sites,
    in  path     reference_path,
    in  pickle   count_dict,
    out bed      peaks,
    out json     peak_metrics,
    src py       "stages/processing/detect_peaks",
) split (
    in  string   contig,
    in  float[]  params,
    in  float    threshold,
) using (
    # Memory reservation for this stage, in GB.
    mem_gb   = 6,
    # NOTE(review): the comment that stood here read "We explicitly bind the index file", the same text as in
    # COUNT_CUT_SITES. This stage declares no index-file parameter, so it looks like a copy/paste leftover -
    # confirm and drop if so.
    volatile = strict,
)

#
# @include "_peak_caller.mro"
#

# Peak calling from the fragments file: COUNT_CUT_SITES produces the cut-site bedgraph and a pickled count
# dictionary, both of which feed DETECT_PEAKS. The bedgraph is also returned to the caller alongside the peaks
# and their metrics. Bindings follow the callee's declared parameter order.
pipeline _PEAK_CALLER(
    in  path       reference_path,
    in  tsv.gz     fragments,
    in  tsv.gz.tbi fragments_index,
    out bedgraph   cut_sites,
    out bed        peaks,
    out json       peak_metrics,
)
{
    # Per-contig cut-site counting; the tabix index is passed explicitly.
    call COUNT_CUT_SITES(
        reference_path  = self.reference_path,
        fragments       = self.fragments,
        fragments_index = self.fragments_index,
    )

    # Peak detection driven entirely by the outputs of the call above.
    call DETECT_PEAKS(
        cut_sites      = COUNT_CUT_SITES.cut_sites,
        reference_path = self.reference_path,
        count_dict     = COUNT_CUT_SITES.count_dict,
    )

    return (
        cut_sites    = COUNT_CUT_SITES.cut_sites,
        peaks        = DETECT_PEAKS.peaks,
        peak_metrics = DETECT_PEAKS.peak_metrics,
    )
}

#
# @include "_basic_sc_atac_counter_stages.mro"
#

# Consumes the fragments file and a BED file of peaks, and emits a raw matrix as an h5 file plus a `path`
# output named `raw_matrix_mex`. Split stage: each chunk job receives a `barcodes` file.
# Stage code: Python, at stages/processing/generate_peak_matrix.
stage GENERATE_PEAK_MATRIX(
    in  string reference_path,
    in  tsv.gz fragments,
    in  bed    peaks,
    out h5     raw_matrix,
    out path   raw_matrix_mex,
    src py     "stages/processing/generate_peak_matrix",
) split (
    in  file   barcodes,
) using (
    # Memory reservation for this stage, in GB.
    mem_gb   = 4,
    # N.B. we don't explicitly need the fragment index
    volatile = strict,
)

# Consumes the raw matrix (h5) and the cell barcode CSV, and emits a filtered matrix as an h5 file plus a
# `path` output named `filtered_matrix_mex`. `num_analysis_bcs` and `random_seed` are both passed as null by
# _BASIC_SC_ATAC_COUNTER. The split section is declared but empty (no chunk-level parameters).
# Stage code: Python, at stages/processing/filter_peak_matrix.
stage FILTER_PEAK_MATRIX(
    in  h5   raw_matrix,
    in  int  num_analysis_bcs,
    in  int  random_seed,
    in  csv  cell_barcodes,
    out h5   filtered_matrix,
    out path filtered_matrix_mex,
    src py   "stages/processing/filter_peak_matrix",
) split (
) using (
    volatile = strict,
)

#
# @include "_produce_cell_barcodes_stages.mro"
#

# First of the barcode-exclusion stages called by _PRODUCE_CELL_BARCODES. From peaks and fragments it emits
# five JSON files: barcode_counts, low_targeting_barcodes, low_targeting_summary, fragment_lengths and
# covered_bases. Split stage: each chunk job receives a `contig` name and produces two pickles
# (fragment_counts, targeted_counts) and an integer `peak_coverage`.
# Stage code: Python, at stages/processing/cell_calling/remove_low_targeting_barcodes.
stage REMOVE_LOW_TARGETING_BARCODES(
    in  bed        peaks,
    in  tsv.gz     fragments,
    in  tsv.gz.tbi fragments_index,
    in  string     reference_path,
    out json       barcode_counts,
    out json       low_targeting_barcodes,
    out json       low_targeting_summary,
    out json       fragment_lengths,
    out json       covered_bases,
    src py         "stages/processing/cell_calling/remove_low_targeting_barcodes",
) split (
    in  string     contig,
    out pickle     fragment_counts,
    out pickle     targeted_counts,
    out int        peak_coverage,
) using (
    # Memory reservation for this stage, in GB.
    mem_gb   = 4,
    volatile = strict,
)

# Barcode-exclusion stage. From the fragments file and a barcode_counts JSON it emits two JSON files
# (gel_bead_doublet_barcodes, gel_bead_doublet_summary) and a CSV named `connect_matrix`.
# Split stage: each chunk job receives a `contig` name and a `valid_barcodes` file.
# Stage code: Python, at stages/processing/cell_calling/remove_gel_bead_doublet_barcodes.
stage REMOVE_GEL_BEAD_DOUBLET_BARCODES(
    in  tsv.gz     fragments,
    in  tsv.gz.tbi fragments_index,
    in  string     reference_path,
    in  json       barcode_counts,
    out json       gel_bead_doublet_barcodes,
    out json       gel_bead_doublet_summary,
    out csv        connect_matrix,
    src py         "stages/processing/cell_calling/remove_gel_bead_doublet_barcodes",
) split (
    in  string     contig,
    in  file       valid_barcodes,
) using (
    # Memory reservation for this stage, in GB.
    mem_gb   = 4,
    volatile = strict,
)

# Barcode-exclusion stage. From the fragments file, the barcode whitelist name and a barcode_counts JSON it
# emits two JSON files: barcode_multiplets and barcode_multiplets_summary.
# Split stage: each chunk job receives a `contig` and a `gem_group` (both strings) and produces two gzipped
# npy files, part_a_linkage_matrix and part_b_linkage_matrix.
# Stage code: Python, at stages/processing/cell_calling/remove_barcode_multiplets.
stage REMOVE_BARCODE_MULTIPLETS(
    in  tsv.gz     fragments,
    in  tsv.gz.tbi fragments_index,
    in  string     reference_path,
    in  string     barcode_whitelist,
    in  json       barcode_counts,
    out json       barcode_multiplets,
    out json       barcode_multiplets_summary,
    src py         "stages/processing/cell_calling/remove_barcode_multiplets",
) split (
    in  string     contig,
    in  string     gem_group,
    out npy.gz     part_a_linkage_matrix,
    out npy.gz     part_b_linkage_matrix,
) using (
    # Memory reservation for this stage, in GB.
    mem_gb   = 4,
    volatile = strict,
)

# Takes a list of JSON files (`barcode_exclusions`) and emits a single JSON file (`excluded_barcodes`).
# In _PRODUCE_CELL_BARCODES the list is built from the outputs of the three REMOVE_* stages.
# No split section or resource overrides. Stage code: Python, at
# stages/processing/cell_calling/merge_excluded_barcodes.
stage MERGE_EXCLUDED_BARCODES(
    in  json[] barcode_exclusions,
    out json   excluded_barcodes,
    src py     "stages/processing/cell_calling/merge_excluded_barcodes",
)

# From the fragments file, peaks, the excluded-barcodes JSON and a `force_cells` map, emits two CSV files
# (cell_barcodes, singlecell) and a JSON cell_calling_summary.
# Split stage: each chunk job receives a `contig` name and produces two pickles (barcode_counts,
# targeted_counts) and an integer `fragment_depth`.
# Stage code: Python, at stages/processing/cell_calling/detect_cell_barcodes.
stage DETECT_CELL_BARCODES(
    in  tsv.gz     fragments,
    in  tsv.gz.tbi fragments_index,
    in  string     barcode_whitelist,
    in  json       excluded_barcodes,
    in  map        force_cells,
    in  string     reference_path,
    in  bed        peaks,
    out csv        cell_barcodes,
    out csv        singlecell,
    out json       cell_calling_summary,
    src py         "stages/processing/cell_calling/detect_cell_barcodes",
) split (
    in  string     contig,
    out pickle     barcode_counts,
    out pickle     targeted_counts,
    out int        fragment_depth,
) using (
    # Memory reservation for this stage, in GB.
    mem_gb   = 4,
    volatile = strict,
)

# TODO: This should be in mro/common for general use
#
# Takes a list of JSON summary files and emits one merged JSON file. Nothing in the declaration is specific to
# cell calling, although the stage code currently lives under stages/processing/cell_calling/ (hence the TODO).
# _PRODUCE_CELL_BARCODES calls it under the alias MERGE_CELL_METRICS.
stage MERGE_SUMMARY_METRICS(
    in  json[] summary_jsons,
    out json   merged_summary,
    src py     "stages/processing/cell_calling/merge_summary_metrics",
)

#
# @include "_produce_cell_barcodes.mro"
#

# Cell calling. Three exclusion stages run on the fragments file (low targeting, gel bead doublets, barcode
# multiplets); the latter two consume the barcode_counts JSON produced by the first. Their exclusion lists
# are merged and handed to DETECT_CELL_BARCODES, and the four per-stage summaries are merged into the single
# cell_calling_summary via MERGE_SUMMARY_METRICS (aliased as MERGE_CELL_METRICS).
#
# Named bindings follow the callee's declared parameter order and the return block follows this pipeline's
# declared output order. The element order inside the two array literals is deliberately left as it was,
# since the arrays are passed to the merge stages as ordered lists.
pipeline _PRODUCE_CELL_BARCODES(
    in  bed        peaks,
    in  tsv.gz     fragments,
    in  tsv.gz.tbi fragments_index,
    in  string     reference_path,
    in  string     barcode_whitelist,
    in  map        force_cells,
    out csv        cell_barcodes,
    out csv        singlecell,
    out json       cell_calling_summary,
    out json       excluded_barcodes,
    out json       fragment_lengths,
    out json       covered_bases,
)
{
    # Exclusion pass 1; also the source of barcode_counts, fragment_lengths and covered_bases.
    call REMOVE_LOW_TARGETING_BARCODES(
        peaks           = self.peaks,
        fragments       = self.fragments,
        fragments_index = self.fragments_index,
        reference_path  = self.reference_path,
    )

    # Exclusion pass 2.
    call REMOVE_GEL_BEAD_DOUBLET_BARCODES(
        fragments       = self.fragments,
        fragments_index = self.fragments_index,
        reference_path  = self.reference_path,
        barcode_counts  = REMOVE_LOW_TARGETING_BARCODES.barcode_counts,
    )

    # Exclusion pass 3.
    call REMOVE_BARCODE_MULTIPLETS(
        fragments         = self.fragments,
        fragments_index   = self.fragments_index,
        reference_path    = self.reference_path,
        barcode_whitelist = self.barcode_whitelist,
        barcode_counts    = REMOVE_LOW_TARGETING_BARCODES.barcode_counts,
    )

    # Fold the three exclusion lists into one JSON.
    call MERGE_EXCLUDED_BARCODES(
        barcode_exclusions = [
            REMOVE_BARCODE_MULTIPLETS.barcode_multiplets,
            REMOVE_GEL_BEAD_DOUBLET_BARCODES.gel_bead_doublet_barcodes,
            REMOVE_LOW_TARGETING_BARCODES.low_targeting_barcodes,
        ],
    )

    # Cell barcode detection, given the merged exclusions.
    call DETECT_CELL_BARCODES(
        fragments         = self.fragments,
        fragments_index   = self.fragments_index,
        barcode_whitelist = self.barcode_whitelist,
        excluded_barcodes = MERGE_EXCLUDED_BARCODES.excluded_barcodes,
        force_cells       = self.force_cells,
        reference_path    = self.reference_path,
        peaks             = self.peaks,
    )

    # One summary JSON out of the four per-stage summaries.
    call MERGE_SUMMARY_METRICS as MERGE_CELL_METRICS(
        summary_jsons = [
            REMOVE_LOW_TARGETING_BARCODES.low_targeting_summary,
            REMOVE_GEL_BEAD_DOUBLET_BARCODES.gel_bead_doublet_summary,
            REMOVE_BARCODE_MULTIPLETS.barcode_multiplets_summary,
            DETECT_CELL_BARCODES.cell_calling_summary,
        ],
    )

    return (
        cell_barcodes        = DETECT_CELL_BARCODES.cell_barcodes,
        singlecell           = DETECT_CELL_BARCODES.singlecell,
        cell_calling_summary = MERGE_CELL_METRICS.merged_summary,
        excluded_barcodes    = MERGE_EXCLUDED_BARCODES.excluded_barcodes,
        fragment_lengths     = REMOVE_LOW_TARGETING_BARCODES.fragment_lengths,
        covered_bases        = REMOVE_LOW_TARGETING_BARCODES.covered_bases,
    )
}

#
# @include "_basic_sc_atac_counter.mro"
#

# Core counting pipeline. In order: _ALIGNER, _SORT_AND_MARK_DUPS, _PEAK_CALLER, _PRODUCE_CELL_BARCODES,
# GENERATE_PEAK_MATRIX and FILTER_PEAK_MATRIX.
#
# Wiring worth knowing:
#   * the sample id doubles as the BAM read group sample (read_group_sample = self.sample_id);
#   * peak calling and cell calling both read the fragments file produced by _SORT_AND_MARK_DUPS;
#   * FILTER_PEAK_MATRIX is called with num_analysis_bcs and random_seed both set to null.
#
# Bindings follow the callee's declared parameter order; the return block follows this pipeline's declared
# output order.
pipeline _BASIC_SC_ATAC_COUNTER(
    in  string     sample_id,
    in  string     fastq_mode                   "configuration of the input fastqs",
    in  map[]      sample_def,
    in  string     reference_path               "this is the reference_path",
    in  string     barcode_whitelist            "name of barcode whitelist file",
    in  map        trim_def,
    in  map        adapters,
    in  map        downsample,
    in  map        force_cells,
    out bam        possorted_bam                "bam file sorted by position",
    out bam.bai    possorted_bam_index          "position-sorted bam index",
    out tsv.gz     fragments,
    out tsv.gz.tbi fragments_index,
    out json       lot_info                     "gelbead lot detected",
    out json       read_counts                  "total # of read pairs before and after adapter trimming",
    out json       downsample_info              "info on downsampling",
    out csv        cell_barcodes,
    out json       excluded_barcodes,
    out json       cell_calling_summary,
    out bed        peaks,
    out bedgraph   cut_sites,
    out csv        singlecell_mapping,
    out csv        singlecell_cells,
    out json       peak_metrics,
    out bam[]      read_paired_bam,
    out h5         raw_peak_bc_matrix,
    out path       raw_peak_bc_matrix_mex,
    out h5         filtered_peak_bc_matrix,
    out path       filtered_peak_bc_matrix_mex,
)
{
    # FASTQ chunking, trimming and alignment.
    call _ALIGNER(
        sample_id         = self.sample_id,
        fastq_mode        = self.fastq_mode,
        sample_def        = self.sample_def,
        reference_path    = self.reference_path,
        barcode_whitelist = self.barcode_whitelist,
        trim_def          = self.trim_def,
        adapters          = self.adapters,
        read_group_sample = self.sample_id,
        downsample        = self.downsample,
    )

    # Barcode attachment, sorting, duplicate marking; source of the fragments file.
    call _SORT_AND_MARK_DUPS(
        align             = _ALIGNER.align,
        chunks            = _ALIGNER.chunks,
        barcode_whitelist = self.barcode_whitelist,
        bc_counts         = _ALIGNER.bc_counts,
        reference_path    = self.reference_path,
    )

    # Peak calling on the fragments.
    call _PEAK_CALLER(
        reference_path  = self.reference_path,
        fragments       = _SORT_AND_MARK_DUPS.fragments,
        fragments_index = _SORT_AND_MARK_DUPS.fragments_index,
    )

    # Cell calling on the fragments, using the peaks just called.
    call _PRODUCE_CELL_BARCODES(
        peaks             = _PEAK_CALLER.peaks,
        fragments         = _SORT_AND_MARK_DUPS.fragments,
        fragments_index   = _SORT_AND_MARK_DUPS.fragments_index,
        reference_path    = self.reference_path,
        barcode_whitelist = self.barcode_whitelist,
        force_cells       = self.force_cells,
    )

    # Raw peak-barcode matrix.
    call GENERATE_PEAK_MATRIX(
        reference_path = self.reference_path,
        fragments      = _SORT_AND_MARK_DUPS.fragments,
        peaks          = _PEAK_CALLER.peaks,
    )

    # Filtered matrix, restricted using the called cell barcodes.
    call FILTER_PEAK_MATRIX(
        raw_matrix       = GENERATE_PEAK_MATRIX.raw_matrix,
        num_analysis_bcs = null,
        random_seed      = null,
        cell_barcodes    = _PRODUCE_CELL_BARCODES.cell_barcodes,
    )

    return (
        possorted_bam               = _SORT_AND_MARK_DUPS.possorted_bam,
        possorted_bam_index         = _SORT_AND_MARK_DUPS.possorted_bam_index,
        fragments                   = _SORT_AND_MARK_DUPS.fragments,
        fragments_index             = _SORT_AND_MARK_DUPS.fragments_index,
        lot_info                    = _ALIGNER.lot_info,
        read_counts                 = _ALIGNER.read_counts,
        downsample_info             = _ALIGNER.downsample_info,
        cell_barcodes               = _PRODUCE_CELL_BARCODES.cell_barcodes,
        excluded_barcodes           = _PRODUCE_CELL_BARCODES.excluded_barcodes,
        cell_calling_summary        = _PRODUCE_CELL_BARCODES.cell_calling_summary,
        peaks                       = _PEAK_CALLER.peaks,
        cut_sites                   = _PEAK_CALLER.cut_sites,
        singlecell_mapping          = _SORT_AND_MARK_DUPS.singlecell_mapping,
        singlecell_cells            = _PRODUCE_CELL_BARCODES.singlecell,
        peak_metrics                = _PEAK_CALLER.peak_metrics,
        read_paired_bam             = _SORT_AND_MARK_DUPS.read_paired_bam,
        raw_peak_bc_matrix          = GENERATE_PEAK_MATRIX.raw_matrix,
        raw_peak_bc_matrix_mex      = GENERATE_PEAK_MATRIX.raw_matrix_mex,
        filtered_peak_bc_matrix     = FILTER_PEAK_MATRIX.filtered_matrix,
        filtered_peak_bc_matrix_mex = FILTER_PEAK_MATRIX.filtered_matrix_mex,
    )
}

#
# @include "_sc_atac_metric_collector_stages.mro"
#

# From a sequencing summary JSON, the fragments file and the cell barcode CSV, emits three JSON files:
# bulk_complexity, complexity_summary and singlecell_complexity.
# Split stage: each chunk job receives a `barcodes` file.
# Stage code: Python, at stages/metrics/estimate_library_complexity.
stage ESTIMATE_LIBRARY_COMPLEXITY(
    in  json   sequencing_summary,
    in  tsv.gz fragments,
    in  csv    cell_barcodes,
    out json   bulk_complexity,
    out json   complexity_summary,
    out json   singlecell_complexity,
    src py     "stages/metrics/estimate_library_complexity",
) split (
    in  file   barcodes,
) using (
    # Memory reservation for this stage, in GB.
    mem_gb   = 6,
    volatile = strict,
)

# Takes a list of BAMs and emits a text file (`misc_sm`) and a JSON `summary`.
# Split stage: each chunk job receives one BAM (`chunk_bam`).
# Stage code: Python, at stages/metrics/generate_sequencing_metrics.
stage GENERATE_SEQUENCING_METRICS(
    in  bam[] input,
    out txt   misc_sm,
    out json  summary,
    src py    "stages/metrics/generate_sequencing_metrics",
) split (
    in  bam   chunk_bam,
) using (
    volatile = strict,
)

# From the fragments file (with tabix index) and peaks, emits a per-barcode CSV (`singlecell`), a JSON
# `summary`, and two CSV files of relative positions (tss_relpos, ctcf_relpos).
# Split stage: each chunk job receives a `contig` name and produces an integer `read_count` plus three pickles
# (target_counts_by_barcode, chunk_tss, chunk_ctcf).
# Stage code: Python, at stages/metrics/generate_singlecell_targeting.
stage GENERATE_SINGLECELL_TARGETING(
    in  tsv.gz     fragments,
    in  tsv.gz.tbi fragments_index,
    in  bed        peaks,
    in  string     reference_path,
    out csv        singlecell,
    out json       summary,
    out csv        tss_relpos,
    out csv        ctcf_relpos,
    src py         "stages/metrics/generate_singlecell_targeting",
) split (
    in  string     contig,
    out int        read_count,
    out pickle     target_counts_by_barcode,
    out pickle     chunk_tss,
    out pickle     chunk_ctcf,
) using (
    # Memory reservation for this stage, in GB.
    mem_gb   = 6,
    volatile = strict,
)

# Takes three per-barcode CSV files (singlecell_mapping, singlecell_targets, singlecell_cells) and emits one
# CSV (`singlecell`) plus a JSON `summary`. Not a split stage.
# Stage code: Python, at stages/metrics/merge_singlecell_metrics.
stage MERGE_SINGLECELL_METRICS(
    in  string reference_path,
    in  csv    singlecell_mapping,
    in  csv    singlecell_targets,
    in  csv    singlecell_cells,
    out csv    singlecell,
    out json   summary,
    src py     "stages/metrics/merge_singlecell_metrics",
) using (
    # Memory reservation for this stage, in GB.
    mem_gb   = 8,
    volatile = strict,
)

# From the fragments file, emits a CSV of insert sizes and a JSON insert summary. `exclude_non_nuclear` is a
# boolean switch (set to true by _SC_ATAC_METRIC_COLLECTOR).
# Split stage: each chunk job receives a `barcode` file and produces a `total` file.
# Stage code: Python, at stages/metrics/report_insert_sizes.
stage REPORT_INSERT_SIZES(
    in  tsv.gz fragments,
    in  bool   exclude_non_nuclear,
    in  string reference_path,
    out csv    insert_sizes,
    out json   insert_summary,
    src py     "stages/metrics/report_insert_sizes",
) split (
    in  file   barcode,
    out file   total,
) using (
    volatile = strict,
)

# Takes the two relative-position CSV files (tss_relpos, ctcf_relpos) and emits a JSON `summary_metrics`.
# Not a split stage. Stage code: Python, at stages/metrics/report_tss_ctcf.
stage REPORT_TSS_CTCF(
    in  csv  tss_relpos,
    in  csv  ctcf_relpos,
    out json summary_metrics,
    src py   "stages/metrics/report_tss_ctcf",
) using (
    volatile = strict,
)

#
# @include "_sc_atac_metric_collector.mro"
#

# Metrics collection. Six stages are called:
#   * GENERATE_SINGLECELL_TARGETING -> MERGE_SINGLECELL_METRICS for the per-barcode table and its summary;
#   * GENERATE_SEQUENCING_METRICS -> ESTIMATE_LIBRARY_COMPLEXITY for sequencing and complexity metrics;
#   * REPORT_INSERT_SIZES (always with exclude_non_nuclear = true);
#   * REPORT_TSS_CTCF on the relative-position files from GENERATE_SINGLECELL_TARGETING.
#
# Bindings follow the callee's declared parameter order; the return block follows this pipeline's declared
# output order.
pipeline _SC_ATAC_METRIC_COLLECTOR(
    in  bam[]      read_paired_bam,
    in  tsv.gz     fragments,
    in  tsv.gz.tbi fragments_index,
    in  bed        peaks,
    in  string     reference_path         "this is the reference_path",
    in  csv        cell_barcodes,
    in  csv        singlecell_mapping,
    in  csv        singlecell_cells,
    out json       singlecell_results,
    out csv        singlecell,
    out json       enrichment_results,
    out json       basic_summary,
    out json       insert_summary,
    out csv        insert_sizes,
    out json       bulk_complexity,
    out json       singlecell_complexity,
    out json       complexity_summary,
    out csv        tss_relpos,
    out csv        ctcf_relpos,
)
{
    call GENERATE_SINGLECELL_TARGETING(
        fragments       = self.fragments,
        fragments_index = self.fragments_index,
        peaks           = self.peaks,
        reference_path  = self.reference_path,
    )

    # Combine the mapping, targeting and cell-calling per-barcode tables.
    call MERGE_SINGLECELL_METRICS(
        reference_path     = self.reference_path,
        singlecell_mapping = self.singlecell_mapping,
        singlecell_targets = GENERATE_SINGLECELL_TARGETING.singlecell,
        singlecell_cells   = self.singlecell_cells,
    )

    call GENERATE_SEQUENCING_METRICS(
        input = self.read_paired_bam,
    )

    # Complexity estimation consumes the sequencing summary from the call above.
    call ESTIMATE_LIBRARY_COMPLEXITY(
        sequencing_summary = GENERATE_SEQUENCING_METRICS.summary,
        fragments          = self.fragments,
        cell_barcodes      = self.cell_barcodes,
    )

    call REPORT_INSERT_SIZES(
        fragments           = self.fragments,
        exclude_non_nuclear = true,
        reference_path      = self.reference_path,
    )

    call REPORT_TSS_CTCF(
        tss_relpos  = GENERATE_SINGLECELL_TARGETING.tss_relpos,
        ctcf_relpos = GENERATE_SINGLECELL_TARGETING.ctcf_relpos,
    )

    return (
        # Merged per-barcode table and its summary.
        singlecell_results    = MERGE_SINGLECELL_METRICS.summary,
        singlecell            = MERGE_SINGLECELL_METRICS.singlecell,
        # Remaining summaries and tables.
        enrichment_results    = REPORT_TSS_CTCF.summary_metrics,
        basic_summary         = GENERATE_SEQUENCING_METRICS.summary,
        insert_summary        = REPORT_INSERT_SIZES.insert_summary,
        insert_sizes          = REPORT_INSERT_SIZES.insert_sizes,
        bulk_complexity       = ESTIMATE_LIBRARY_COMPLEXITY.bulk_complexity,
        singlecell_complexity = ESTIMATE_LIBRARY_COMPLEXITY.singlecell_complexity,
        complexity_summary    = ESTIMATE_LIBRARY_COMPLEXITY.complexity_summary,
        tss_relpos            = GENERATE_SINGLECELL_TARGETING.tss_relpos,
        ctcf_relpos           = GENERATE_SINGLECELL_TARGETING.ctcf_relpos,
    )
}

#
# @include "_peak_annotator_stages.mro"
#

# Takes a BED file of peaks and a reference, and emits a TSV `peak_annotation`.
# Split stage: each chunk job receives integer `chunk_start` / `chunk_end` bounds.
# Stage code: Python, at stages/analysis/annotate_peaks.
stage ANNOTATE_PEAKS(
    in  bed    peaks,
    in  string reference_path,
    out tsv    peak_annotation,
    src py     "stages/analysis/annotate_peaks",
) split (
    in  int    chunk_start,
    in  int    chunk_end,
) using (
    # Memory reservation for this stage, in GB.
    mem_gb   = 5,
    volatile = strict,
)

# Takes a BED file of peaks and a reference, and emits a pickle named `GCdict` (bound to the `globalGCdict`
# input of SCAN_MOTIFS in _PEAK_ANNOTATOR). The split section is declared but empty.
# Stage code: Python, at stages/analysis/compute_gc_dist.
stage COMPUTE_GC_DISTRIBUTION(
    in  bed    peaks,
    in  string reference_path,
    out pickle GCdict,
    src py     "stages/analysis/compute_gc_dist",
) split (
) using (
    volatile = strict,
)

# Takes the pickled `globalGCdict`, a BED file of peaks, a reference and a float `pwm_threshold`, and emits a
# BED file `peak_motif_hits`. Split stage: each chunk job receives a `GCdict` file.
# Stage code: Python, at stages/analysis/scan_motifs.
stage SCAN_MOTIFS(
    in  pickle globalGCdict,
    in  bed    peaks,
    in  string reference_path,
    in  float  pwm_threshold,
    out bed    peak_motif_hits,
    src py     "stages/analysis/scan_motifs",
) split (
    in  file   GCdict,
) using (
    volatile = strict,
)

# From peaks, motif hits and the filtered matrix (h5), emits a matrix as an h5 file (`filtered_tf_bc_matrix`),
# a `path` output named `filtered_tf_bc_matrix_mex`, and a gz file `tf_propZ_matrix`.
# The split section is declared but empty. Stage code: Python, at stages/analysis/generate_tf_matrix.
#
# NOTE(review): `reference_path` is typed `path` here, while the calling pipeline _PEAK_ANNOTATOR declares
# it as `string` - confirm whether the difference is intentional.
stage GENERATE_TF_MATRIX(
    in  path reference_path,
    in  bed  peaks,
    in  bed  peak_motif_hits,
    in  h5   filtered_matrix,
    out h5   filtered_tf_bc_matrix,
    out path filtered_tf_bc_matrix_mex,
    out gz   tf_propZ_matrix,
    src py   "stages/analysis/generate_tf_matrix",
) split (
) using (
    volatile = strict,
)

#
# @include "_peak_annotator.mro"
#

# Peak annotation and motif analysis. ANNOTATE_PEAKS runs on the peaks directly. Separately,
# COMPUTE_GC_DISTRIBUTION -> SCAN_MOTIFS -> GENERATE_TF_MATRIX turns the peaks and the filtered peak-barcode
# matrix into the TF matrix outputs. Bindings follow the callee's declared parameter order.
pipeline _PEAK_ANNOTATOR(
    in  string reference_path,
    in  bed    peaks,
    in  h5     filtered_peak_bc_matrix,
    in  float  pwm_threshold,
    out h5     filtered_tf_bc_matrix,
    out path   filtered_tf_bc_matrix_mex,
    out gz     tf_propZ_matrix,
    out tsv    peak_annotation,
)
{
    # Annotation table for the peaks; independent of the motif branch below.
    call ANNOTATE_PEAKS(
        peaks          = self.peaks,
        reference_path = self.reference_path,
    )

    # Motif branch, step 1: GC dictionary for the peaks.
    call COMPUTE_GC_DISTRIBUTION(
        peaks          = self.peaks,
        reference_path = self.reference_path,
    )

    # Motif branch, step 2: motif scan using that dictionary.
    call SCAN_MOTIFS(
        globalGCdict   = COMPUTE_GC_DISTRIBUTION.GCdict,
        peaks          = self.peaks,
        reference_path = self.reference_path,
        pwm_threshold  = self.pwm_threshold,
    )

    # Motif branch, step 3: TF matrix from the motif hits and the filtered peak-barcode matrix.
    call GENERATE_TF_MATRIX(
        reference_path  = self.reference_path,
        peaks           = self.peaks,
        peak_motif_hits = SCAN_MOTIFS.peak_motif_hits,
        filtered_matrix = self.filtered_peak_bc_matrix,
    )

    return (
        filtered_tf_bc_matrix     = GENERATE_TF_MATRIX.filtered_tf_bc_matrix,
        filtered_tf_bc_matrix_mex = GENERATE_TF_MATRIX.filtered_tf_bc_matrix_mex,
        tf_propZ_matrix           = GENERATE_TF_MATRIX.tf_propZ_matrix,
        peak_annotation           = ANNOTATE_PEAKS.peak_annotation,
    )
}

#
# @include "_sc_atac_analyzer_stages.mro"
#

# Input-only stage: it declares the peaks, the filtered matrix and the analysis tuning parameters (t-SNE,
# clustering, dimensionality reduction, graph clustering) as inputs and has no outputs.
# Presumably it validates these parameters before the analysis runs - confirm against the stage code at
# stages/preflight/atac_analyzer (Python).
stage ANALYZER_PREFLIGHT(
    in  bed      peaks,
    in  h5       filtered_peak_bc_matrix,
    in  string[] factorization,
    in  int      tsne_perplexity,
    in  int      random_seed,
    in  float    tsne_theta,
    in  int      tsne_mom_switch_iter,
    in  int      tsne_stop_lying_iter,
    in  int      tsne_max_dims,
    in  int      tsne_input_pcs,
    in  int      tsne_max_iter,
    in  int      max_clusters,
    in  int      num_components,
    in  int      num_dr_bcs,
    in  int      num_dr_features,
    in  float    neighbor_a,
    in  float    neighbor_b,
    in  int      graphclust_neighbors,
    src py       "stages/preflight/atac_analyzer",
)

# From the filtered matrix (h5) and a list of factorization names, emits a `path` output `reduced_data` and a
# `reduction_summary` map. Split stage: each chunk job receives one `method` string.
# Stage code: Python, at stages/analysis/reduce_dimensions.
stage REDUCE_DIMENSIONS(
    in  h5       filtered_matrix,
    in  string[] factorization,
    in  int      num_dims,
    in  int      num_bcs,
    in  int      num_features,
    in  int      random_seed,
    out path     reduced_data,
    out map      reduction_summary,
    src py       "stages/analysis/reduce_dimensions",
) split (
    in  string   method,
) using (
    volatile = strict,
)

# From the filtered matrix and the reduced data (with its summary map), emits a `path` output
# `clustered_data` and a `clustering_summary` map. `minclusters` / `maxclusters` are integer inputs.
# Split stage: each chunk job receives one integer `n_clusters`.
# Stage code: Python, at stages/analysis/cluster_cells.
stage CLUSTER_CELLS(
    in  h5       filtered_matrix,
    in  path     reduced_data,
    in  map      reduction_summary,
    in  string[] factorization,
    in  int      minclusters,
    in  int      maxclusters,
    in  int      num_dims,
    in  int      random_seed,
    out path     clustered_data,
    out map      clustering_summary,
    src py       "stages/analysis/cluster_cells",
) split (
    in  int      n_clusters,
) using (
    volatile = strict,
)

# t-SNE projection stage (Python: stages/analysis/project_tsne).
#
# Inputs:  the filtered matrix, the reduced data and its summary, the t-SNE
#          tuning parameters (perplexity, max_dims, input_pcs, theta,
#          max_iter, stop_lying_iter, mom_switch_iter), random_seed and the
#          list of factorization methods.
# Outputs: a `tsne` directory and a `tsne_summary` map.
#
# The `split` section declares two per-chunk inputs, `method` and `tsne_dims`.
# Declared with `volatile = strict` (Martian volatile-data-removal setting).
stage PROJECT_TSNE(
    in  h5       filtered_matrix,
    in  path     reduced_data,
    in  map      reduction_summary,
    in  int      tsne_perplexity,
    in  int      tsne_max_dims,
    in  int      tsne_input_pcs,
    in  float    tsne_theta,
    in  int      tsne_max_iter,
    in  int      tsne_stop_lying_iter,
    in  int      tsne_mom_switch_iter,
    in  int      random_seed,
    in  string[] factorization,
    out path     tsne,
    out map      tsne_summary,
    src py       "stages/analysis/project_tsne",
) split (
    in  string   method,
    in  int      tsne_dims,
) using (
    volatile = strict,
)

# Graph-based clustering stage (Python: stages/analysis/run_graph_clustering).
#
# Inputs:  the matrix, the list of factorization methods, the reduced data and
#          its summary, the neighbor-count settings (num_neighbors,
#          neighbor_a, neighbor_b -- see the per-parameter help strings
#          below), balltree_leaf_size and similarity_type ("nn" or "snn").
# Outputs: a `chunked_neighbors` h5 file, a `knn_clusters` directory and a
#          `graph_clustering_summary` map.
#
# The `split` section declares the per-chunk inputs: method, neighbor_index,
# submatrix, row_start, total_rows, k_nearest and use_bcs.
# NOTE(review): the chunk parameters suggest each chunk handles a row range of
# the matrix -- confirm in the stage code.
# Declared with `volatile = strict` (Martian volatile-data-removal setting).
stage RUN_GRAPH_CLUSTERING(
    in  h5       matrix_h5                 "Processed matrix",
    in  string[] factorization,
    in  path     reduced_data,
    in  map      reduction_summary,
    in  int      num_neighbors             "Use this many neighbors",
    in  float    neighbor_a                "Use larger of (a+b*log10(n_cells) neighbors or num_neighbors",
    in  float    neighbor_b                "Use larger of (a+b*log10(n_cells) neighbors or num_neighbors",
    in  int      balltree_leaf_size,
    in  string   similarity_type           "Type of similarity to use (nn or snn)",
    out h5       chunked_neighbors,
    out path     knn_clusters,
    out map      graph_clustering_summary,
    src py       "stages/analysis/run_graph_clustering",
) split (
    in  string   method,
    in  pickle   neighbor_index,
    in  h5       submatrix,
    in  int      row_start,
    in  int      total_rows,
    in  int      k_nearest,
    in  h5       use_bcs,
) using (
    volatile = strict,
)

# Stage that combines two clustering results into one
# (Python: stages/analysis/combine_clustering).
#
# Inputs:  the filtered matrix, a `clustering_summary`/`clustered_data` pair
#          and a `graph_clustering_summary`/`knn_clusters` pair. In
#          _SC_ATAC_ANALYZER these come from CLUSTER_CELLS and
#          RUN_GRAPH_CLUSTERING respectively.
# Outputs: a `clustering` directory and a `clustering_summary` map.
#
# Note that `clustering_summary` is declared both as an input and as an
# output; downstream references to COMBINE_CLUSTERING.clustering_summary
# refer to the output.
# Not split into chunks. Declared with `volatile = strict`.
stage COMBINE_CLUSTERING(
    in  h5   filtered_matrix,
    in  map  clustering_summary,
    in  path clustered_data,
    in  map  graph_clustering_summary,
    in  path knn_clusters,
    out path clustering,
    out map  clustering_summary,
    src py   "stages/analysis/combine_clustering",
) using (
    volatile = strict,
)

# Final analysis-summary stage (Python: stages/analysis/summarize_analysis).
#
# Inputs:  the peak annotation, the filtered peak and TF barcode matrices, the
#          tf_propZ matrix, and the data/summary pairs from dimensionality
#          reduction, clustering, t-SNE and enrichment analysis.
# Outputs: an `analysis` h5 file, an `analysis_csv` directory and a
#          `feature_bc_matrix` h5 file.
#
# The `split` section is present but declares no per-chunk parameters.
# Declared with `volatile = strict` (Martian volatile-data-removal setting).
stage SUMMARIZE_ANALYSIS(
    in  tsv  peak_annotation,
    in  h5   filtered_peak_bc_matrix,
    in  h5   filtered_tf_bc_matrix,
    in  gz   tf_propZ_matrix,
    in  path reduced_data,
    in  map  reduction_summary,
    in  path clustering,
    in  map  clustering_summary,
    in  path tsne,
    in  map  tsne_summary,
    in  path enrichment_analysis,
    in  map  enrichment_analysis_summary,
    out h5   analysis,
    out path analysis_csv,
    out h5   feature_bc_matrix,
    src py   "stages/analysis/summarize_analysis",
) split (
) using (
    volatile = strict,
)

# Differential-analysis stage
# (Python: stages/analysis/perform_differential_analysis).
#
# Inputs:  the peaks file, the reference path, the filtered peak and TF
#          barcode matrices, the list of factorization methods, and the
#          combined clustering with its summary.
# Outputs: an `enrichment_analysis` directory and an
#          `enrichment_analysis_summary` map.
#
# The `split` section declares per-chunk inputs `method`, `clustering_key`
# and `cluster`, plus a per-chunk csv output `tmp_diffexp`.
# NOTE(review): this suggests one chunk per (method, clustering, cluster)
# combination -- confirm in the stage's split function.
# Declared with `volatile = strict` (Martian volatile-data-removal setting).
stage PERFORM_DIFFERENTIAL_ANALYSIS(
    in  bed      peaks,
    in  string   reference_path,
    in  h5       filtered_peak_bc_matrix,
    in  h5       filtered_tf_bc_matrix,
    in  string[] factorization,
    in  path     clustering,
    in  map      clustering_summary,
    out path     enrichment_analysis,
    out map      enrichment_analysis_summary,
    src py       "stages/analysis/perform_differential_analysis",
) split (
    in  string   method,
    in  string   clustering_key,
    in  int      cluster,
    out csv      tmp_diffexp,
) using (
    volatile = strict,
)

#
# @include "_sc_atac_analyzer.mro"
#

# Secondary-analysis sub-pipeline.
#
# Given called peaks and a filtered peak-barcode matrix, it runs a preflight
# check, peak annotation, dimensionality reduction, two clusterings (which are
# then combined), t-SNE projection and differential analysis, and finally
# summarizes everything.
#
# Outputs: the `analysis` h5 file, `analysis_csv` directory and
# `feature_bc_matrix` from SUMMARIZE_ANALYSIS, plus the TF barcode matrix
# (h5 and mex) and `peak_annotation` from _PEAK_ANNOTATOR.
pipeline _SC_ATAC_ANALYZER(
    in  string   reference_path,
    in  bed      peaks,
    in  h5       filtered_peak_bc_matrix,
    in  string[] factorization,
    in  int      tsne_perplexity,
    in  int      random_seed,
    in  float    tsne_theta,
    in  int      tsne_mom_switch_iter,
    in  int      tsne_stop_lying_iter,
    in  int      tsne_max_dims,
    in  int      tsne_input_pcs,
    in  int      tsne_max_iter,
    in  int      max_clusters,
    in  int      num_components,
    in  int      num_dr_bcs,
    in  int      num_dr_features,
    in  float    neighbor_a,
    in  float    neighbor_b,
    in  int      graphclust_neighbors,
    out h5       analysis,
    out path     analysis_csv,
    out h5       filtered_tf_bc_matrix,
    out path     filtered_tf_bc_matrix_mex,
    out h5       feature_bc_matrix,
    out tsv      peak_annotation,
)
{
    # Preflight: receives every pipeline input except reference_path.
    # Called with `volatile = true`.
    call ANALYZER_PREFLIGHT(
        peaks                   = self.peaks,
        filtered_peak_bc_matrix = self.filtered_peak_bc_matrix,
        factorization           = self.factorization,
        tsne_perplexity         = self.tsne_perplexity,
        random_seed             = self.random_seed,
        tsne_theta              = self.tsne_theta,
        tsne_mom_switch_iter    = self.tsne_mom_switch_iter,
        tsne_stop_lying_iter    = self.tsne_stop_lying_iter,
        tsne_max_dims           = self.tsne_max_dims,
        tsne_input_pcs          = self.tsne_input_pcs,
        tsne_max_iter           = self.tsne_max_iter,
        max_clusters            = self.max_clusters,
        num_components          = self.num_components,
        num_dr_bcs              = self.num_dr_bcs,
        num_dr_features         = self.num_dr_features,
        neighbor_a              = self.neighbor_a,
        neighbor_b              = self.neighbor_b,
        graphclust_neighbors    = self.graphclust_neighbors,
    ) using (
        volatile = true,
    )

    # Peak annotation sub-pipeline; pwm_threshold is passed as null.
    # NOTE(review): null presumably selects the stage's own default -- confirm.
    call _PEAK_ANNOTATOR(
        reference_path          = self.reference_path,
        peaks                   = self.peaks,
        filtered_peak_bc_matrix = self.filtered_peak_bc_matrix,
        pwm_threshold           = null,
    )

    # Dimensionality reduction; its outputs feed clustering, t-SNE, graph
    # clustering and the final summary.
    call REDUCE_DIMENSIONS(
        filtered_matrix = self.filtered_peak_bc_matrix,
        factorization   = self.factorization,
        num_dims        = self.num_components,
        num_bcs         = self.num_dr_bcs,
        num_features    = self.num_dr_features,
        random_seed     = self.random_seed,
    )

    # Clustering on the reduced data; minclusters is fixed at 2 and num_dims
    # is passed as null.
    call CLUSTER_CELLS(
        filtered_matrix   = self.filtered_peak_bc_matrix,
        reduced_data      = REDUCE_DIMENSIONS.reduced_data,
        reduction_summary = REDUCE_DIMENSIONS.reduction_summary,
        factorization     = self.factorization,
        minclusters       = 2,
        maxclusters       = self.max_clusters,
        num_dims          = null,
        random_seed       = self.random_seed,
    )

    # t-SNE projection of the reduced data.
    call PROJECT_TSNE(
        filtered_matrix      = self.filtered_peak_bc_matrix,
        reduced_data         = REDUCE_DIMENSIONS.reduced_data,
        reduction_summary    = REDUCE_DIMENSIONS.reduction_summary,
        tsne_perplexity      = self.tsne_perplexity,
        tsne_max_dims        = self.tsne_max_dims,
        tsne_input_pcs       = self.tsne_input_pcs,
        tsne_theta           = self.tsne_theta,
        tsne_max_iter        = self.tsne_max_iter,
        tsne_stop_lying_iter = self.tsne_stop_lying_iter,
        tsne_mom_switch_iter = self.tsne_mom_switch_iter,
        random_seed          = self.random_seed,
        factorization        = self.factorization,
    )

    # Graph-based clustering; similarity_type is fixed to "nn" and
    # balltree_leaf_size is passed as null.
    call RUN_GRAPH_CLUSTERING(
        matrix_h5          = self.filtered_peak_bc_matrix,
        factorization      = self.factorization,
        reduced_data       = REDUCE_DIMENSIONS.reduced_data,
        reduction_summary  = REDUCE_DIMENSIONS.reduction_summary,
        num_neighbors      = self.graphclust_neighbors,
        neighbor_a         = self.neighbor_a,
        neighbor_b         = self.neighbor_b,
        balltree_leaf_size = null,
        similarity_type    = "nn",
    )

    # Merge the CLUSTER_CELLS and RUN_GRAPH_CLUSTERING results.
    call COMBINE_CLUSTERING(
        filtered_matrix          = self.filtered_peak_bc_matrix,
        clustering_summary       = CLUSTER_CELLS.clustering_summary,
        clustered_data           = CLUSTER_CELLS.clustered_data,
        graph_clustering_summary = RUN_GRAPH_CLUSTERING.graph_clustering_summary,
        knn_clusters             = RUN_GRAPH_CLUSTERING.knn_clusters,
    )

    # Differential analysis over the combined clustering, using both the peak
    # matrix and the TF matrix produced by _PEAK_ANNOTATOR.
    call PERFORM_DIFFERENTIAL_ANALYSIS(
        reference_path          = self.reference_path,
        peaks                   = self.peaks,
        filtered_peak_bc_matrix = self.filtered_peak_bc_matrix,
        filtered_tf_bc_matrix   = _PEAK_ANNOTATOR.filtered_tf_bc_matrix,
        factorization           = self.factorization,
        clustering              = COMBINE_CLUSTERING.clustering,
        clustering_summary      = COMBINE_CLUSTERING.clustering_summary,
    )

    # Collect the results of all preceding analysis steps.
    call SUMMARIZE_ANALYSIS(
        peak_annotation             = _PEAK_ANNOTATOR.peak_annotation,
        filtered_peak_bc_matrix     = self.filtered_peak_bc_matrix,
        filtered_tf_bc_matrix       = _PEAK_ANNOTATOR.filtered_tf_bc_matrix,
        tf_propZ_matrix             = _PEAK_ANNOTATOR.tf_propZ_matrix,
        reduced_data                = REDUCE_DIMENSIONS.reduced_data,
        reduction_summary           = REDUCE_DIMENSIONS.reduction_summary,
        clustering                  = COMBINE_CLUSTERING.clustering,
        clustering_summary          = COMBINE_CLUSTERING.clustering_summary,
        tsne                        = PROJECT_TSNE.tsne,
        tsne_summary                = PROJECT_TSNE.tsne_summary,
        enrichment_analysis         = PERFORM_DIFFERENTIAL_ANALYSIS.enrichment_analysis,
        enrichment_analysis_summary = PERFORM_DIFFERENTIAL_ANALYSIS.enrichment_analysis_summary,
    )

    return (
        analysis                  = SUMMARIZE_ANALYSIS.analysis,
        analysis_csv              = SUMMARIZE_ANALYSIS.analysis_csv,
        filtered_tf_bc_matrix     = _PEAK_ANNOTATOR.filtered_tf_bc_matrix,
        filtered_tf_bc_matrix_mex = _PEAK_ANNOTATOR.filtered_tf_bc_matrix_mex,
        feature_bc_matrix         = SUMMARIZE_ANALYSIS.feature_bc_matrix,
        peak_annotation           = _PEAK_ANNOTATOR.peak_annotation,
    )
}

#
# @include "_sc_atac_reporter_stages.mro"
#

# Report-summary stage (Python: stages/reporter/summarize_reports_singlecell).
#
# Inputs:  the reference path and a set of JSON result/summary files
#          (complexity, cell calling, peaks, basic, error, insert size,
#          single-cell, contamination, downsampling, enrichment).
# Outputs: an `analysis_params` JSON, a combined `summary` JSON and a
#          `summary_csv` file.
#
# Requests 4 GB of memory (`mem_gb = 4`).
stage SUMMARIZE_REPORTS_SINGLECELL(
    in  string reference_path,
    in  json   complexity_summary,
    in  json   cell_calling_summary,
    in  json   peak_results,
    in  json   basic_results,
    in  json   error_results_summary,
    in  json   insert_summary,
    in  json   singlecell_results,
    in  json   contam_results,
    in  json   downsample_info,
    in  json   enrichment_results,
    out json   analysis_params,
    out json   summary,
    out csv    summary_csv,
    src py     "stages/reporter/summarize_reports_singlecell",
) using (
    mem_gb = 4,
)

# Web-summary stage (Python: stages/reporter/create_websummary).
#
# Inputs:  the reference path, barcode whitelist name, summary and complexity
#          JSON files, sample identification (id, description, sample_def),
#          a `debug` flag, per-barcode and positional csv files
#          (singlecell, insert_sizes, tss_relpos, ctcf_relpos), the filtered
#          peak-barcode matrix, the analysis h5 and the excluded-barcodes JSON.
# Output:  a single `web_summary` HTML file.
#
# Requests 16 GB of memory (`mem_gb = 16`).
stage CREATE_WEBSUMMARY(
    in  string reference_path,
    in  string barcode_whitelist,
    in  json   summary_results,
    in  json   bulk_complexity,
    in  json   singlecell_complexity,
    in  string sample_id,
    in  string sample_desc,
    in  map[]  sample_def,
    in  bool   debug,
    in  csv    singlecell,
    in  csv    insert_sizes,
    in  csv    tss_relpos,
    in  csv    ctcf_relpos,
    in  h5     filtered_peak_bc_matrix,
    in  h5     analysis,
    in  json   excluded_barcodes,
    out html   web_summary,
    src py     "stages/reporter/create_websummary",
) using (
    mem_gb = 16,
)

#
# @include "_sc_atac_reporter.mro"
#

# Reporting sub-pipeline.
#
# Runs SUMMARIZE_REPORTS_SINGLECELL to merge the individual metric JSON files
# into one summary (JSON and csv), then CREATE_WEBSUMMARY to render the HTML
# report from that summary plus the per-barcode data and analysis results.
#
# Outputs: `summary` and `summary_csv` from SUMMARIZE_REPORTS_SINGLECELL and
# `web_summary` from CREATE_WEBSUMMARY.
pipeline _SC_ATAC_REPORTER(
    in  string reference_path,
    in  string barcode_whitelist,
    in  json   bulk_complexity,
    in  json   cell_calling_summary,
    in  json   complexity_summary,
    in  json   basic_summary,
    in  json   peak_summary,
    in  json   singlecell_results,
    in  json   insert_summary,
    in  json   downsample_info,
    in  json   singlecell_complexity,
    in  csv    singlecell,
    in  csv    tss_relpos,
    in  csv    ctcf_relpos,
    in  string sample_id,
    in  string sample_desc,
    in  map[]  sample_def,
    in  csv    sc_insert_sizes,
    in  json   enrichment_results,
    in  h5     filtered_peak_bc_matrix,
    in  h5     analysis,
    in  json   excluded_barcodes,
    #
    out json   summary,
    out html   web_summary,
    out csv    summary_csv,
)
{
    # Note the renames: peak_summary -> peak_results and
    # basic_summary -> basic_results. error_results_summary and contam_results
    # are not available in this pipeline and are passed as null.
    call SUMMARIZE_REPORTS_SINGLECELL(
        reference_path        = self.reference_path,
        complexity_summary    = self.complexity_summary,
        cell_calling_summary  = self.cell_calling_summary,
        peak_results          = self.peak_summary,
        basic_results         = self.basic_summary,
        error_results_summary = null,
        insert_summary        = self.insert_summary,
        singlecell_results    = self.singlecell_results,
        contam_results        = null,
        downsample_info       = self.downsample_info,
        enrichment_results    = self.enrichment_results,
    )

    # Renders the HTML report from the merged summary; sc_insert_sizes is
    # passed as insert_sizes and debug is fixed to false.
    call CREATE_WEBSUMMARY(
        reference_path          = self.reference_path,
        barcode_whitelist       = self.barcode_whitelist,
        singlecell              = self.singlecell,
        tss_relpos              = self.tss_relpos,
        ctcf_relpos             = self.ctcf_relpos,
        sample_id               = self.sample_id,
        sample_desc             = self.sample_desc,
        sample_def              = self.sample_def,
        insert_sizes            = self.sc_insert_sizes,
        summary_results         = SUMMARIZE_REPORTS_SINGLECELL.summary,
        bulk_complexity         = self.bulk_complexity,
        singlecell_complexity   = self.singlecell_complexity,
        analysis                = self.analysis,
        filtered_peak_bc_matrix = self.filtered_peak_bc_matrix,
        excluded_barcodes       = self.excluded_barcodes,
        debug                   = false,
    )

    return (
        summary     = SUMMARIZE_REPORTS_SINGLECELL.summary,
        web_summary = CREATE_WEBSUMMARY.web_summary,
        summary_csv = SUMMARIZE_REPORTS_SINGLECELL.summary_csv,
    )
}

#
# @include "_atac_cloupe_stages.mro"
#

# Stage that produces the .cloupe output file
# (Python: stages/cloupe/atac_cloupe_preprocess).
#
# Inputs:  the pipestance type, sample id/description, reference path, the
#          analysis h5, the feature-barcode matrix, peaks, the fragments
#          index, a metrics JSON, an aggregation csv, a gem-group index JSON
#          and a `no_secondary_analysis` flag.
# Outputs: `output_for_cloupe` (the cloupe file) and `gem_group_index_json`.
#
# Note that `gem_group_index_json` is declared both as an input and as an
# output. The `split` section is present but declares no per-chunk parameters.
stage CLOUPE_PREPROCESS(
    in  string     pipestance_type,
    in  string     sample_id,
    in  string     sample_desc,
    in  string     reference_path,
    in  h5         analysis,
    in  h5         feature_barcode_matrix,
    in  bed        peaks,
    in  tsv.gz.tbi fragments_index,
    in  json       metrics_json,
    in  csv        aggregation_csv,
    in  json       gem_group_index_json,
    in  bool       no_secondary_analysis,
    out cloupe     output_for_cloupe,
    out json       gem_group_index_json,
    src py         "stages/cloupe/atac_cloupe_preprocess",
) split (
)

#
# @include "_preflight_stages.mro"
#

# Preflight stage for the counter pipeline
# (Python: stages/preflight/atac_counter).
#
# Takes the sample id, fastq mode, sample definitions, reference path,
# force_cells, factorization, downsample and trim_def settings plus a
# `check_executables` flag; declares no outputs.
# SC_ATAC_COUNTER calls it twice: once with check_executables = false and
# once with check_executables = true.
# The `split` section is present but declares no per-chunk parameters.
stage ATAC_COUNTER_PREFLIGHT(
    in  string   sample_id,
    in  string   fastq_mode,
    in  map[]    sample_def,
    in  string   reference_path,
    in  map      force_cells,
    in  string[] factorization,
    in  map      downsample,
    in  bool     check_executables,
    in  map      trim_def,
    src py       "stages/preflight/atac_counter",
) split (
)

# Preflight stage for the aggregation pipeline
# (Python: stages/preflight/atac_aggr).
#
# Takes the sample id, reference path, aggregation csv, normalization mode,
# factorization list and a `check_executables` flag; declares no outputs.
# The `split` section is present but declares no per-chunk parameters.
stage ATAC_AGGR_PREFLIGHT(
    in  string   sample_id,
    in  string   reference_path,
    in  csv      aggr_csv,
    in  string   normalization,
    in  string[] factorization,
    in  bool     check_executables,
    src py       "stages/preflight/atac_aggr",
) split (
)

# Preflight stage for the reanalysis pipeline
# (Python: stages/preflight/atac_reanalyzer).
#
# Takes the sample id, reference path, barcode whitelist, peaks, a parameters
# csv, force_cells, a cell-barcodes csv, the fragments file and its index, an
# aggregation csv and a `check_executables` flag; declares no outputs.
# The `split` section is present but declares no per-chunk parameters.
stage ATAC_REANALYZER_PREFLIGHT(
    in  string     sample_id,
    in  string     reference_path,
    in  string     barcode_whitelist,
    in  bed        peaks,
    in  csv        parameters,
    in  map        force_cells,
    in  csv        cell_barcodes,
    in  tsv.gz     fragments,
    in  tsv.gz.tbi fragments_index,
    in  csv        aggregation_csv,
    in  bool       check_executables,
    src py         "stages/preflight/atac_reanalyzer",
) split (
)

#
# @include "sc_atac_counter.mro"
#

# Main single-cell ATAC counting pipeline.
#
# Steps, in declaration order:
#   1. ATAC_COUNTER_PREFLIGHT, twice (a local run without the executable
#      check, then a regular run with it).
#   2. _BASIC_SC_ATAC_COUNTER: produces the BAM, fragments, peaks, matrices
#      and cell-calling results consumed by the later steps.
#   3. _SC_ATAC_METRIC_COLLECTOR: per-barcode and summary metrics.
#   4. _SC_ATAC_ANALYZER: secondary analysis with fixed parameters.
#   5. CLOUPE_PREPROCESS: builds the cloupe file.
#   6. _SC_ATAC_REPORTER: summary JSON/csv and HTML web summary.
#
# Martian schedules calls from their data dependencies, so the textual order
# above is for readability only.
pipeline SC_ATAC_COUNTER(
    in  string     fastq_mode,
    in  string     sample_id,
    in  map[]      sample_def,
    in  map        downsample,
    in  string     sample_desc,
    in  string     reference_path,
    in  map        trim_def,
    in  string     barcode_whitelist,
    in  map        adapters,
    in  string[]   factorization,
    in  map        force_cells,
    #
    out csv        singlecell,
    out bam        possorted_bam,
    out bam.bai    possorted_bam_index,
    out json       summary,
    out html       web_summary,
    out bed        peaks,
    out h5         raw_peak_bc_matrix,
    out path       raw_peak_bc_matrix_mex,
    out path       analysis_csv,
    out h5         filtered_peak_bc_matrix,
    out path       filtered_peak_bc_matrix_mex,
    out tsv.gz     fragments,
    out tsv.gz.tbi fragments_index,
    out h5         filtered_tf_bc_matrix,
    out path       filtered_tf_bc_matrix_mex,
    out cloupe     cloupe,
    out csv        summary_csv,
    out tsv        peak_annotation,
)
{
    # First preflight pass: aliased so the same stage can be called twice.
    # Runs with `local = true` and skips the executable check.
    call ATAC_COUNTER_PREFLIGHT as ATAC_COUNTER_PREFLIGHT_LOCAL(
        sample_id         = self.sample_id,
        fastq_mode        = self.fastq_mode,
        sample_def        = self.sample_def,
        reference_path    = self.reference_path,
        force_cells       = self.force_cells,
        factorization     = self.factorization,
        downsample        = self.downsample,
        trim_def          = self.trim_def,
        check_executables = false,
    ) using (
        local     = true,
        preflight = true,
    )

    # Second preflight pass: same inputs, not forced local, with the
    # executable check enabled.
    call ATAC_COUNTER_PREFLIGHT(
        sample_id         = self.sample_id,
        fastq_mode        = self.fastq_mode,
        sample_def        = self.sample_def,
        reference_path    = self.reference_path,
        force_cells       = self.force_cells,
        factorization     = self.factorization,
        downsample        = self.downsample,
        trim_def          = self.trim_def,
        check_executables = true,
    ) using (
        preflight = true,
    )

    # Core counting sub-pipeline; its outputs feed every later call.
    call _BASIC_SC_ATAC_COUNTER(
        sample_id         = self.sample_id,
        fastq_mode        = self.fastq_mode,
        sample_def        = self.sample_def,
        trim_def          = self.trim_def,
        adapters          = self.adapters,
        reference_path    = self.reference_path,
        barcode_whitelist = self.barcode_whitelist,
        downsample        = self.downsample,
        force_cells       = self.force_cells,
    )

    # Metric collection on the counter outputs.
    call _SC_ATAC_METRIC_COLLECTOR(
        read_paired_bam    = _BASIC_SC_ATAC_COUNTER.read_paired_bam,
        fragments          = _BASIC_SC_ATAC_COUNTER.fragments,
        fragments_index    = _BASIC_SC_ATAC_COUNTER.fragments_index,
        peaks              = _BASIC_SC_ATAC_COUNTER.peaks,
        reference_path     = self.reference_path,
        cell_barcodes      = _BASIC_SC_ATAC_COUNTER.cell_barcodes,
        singlecell_cells   = _BASIC_SC_ATAC_COUNTER.singlecell_cells,
        singlecell_mapping = _BASIC_SC_ATAC_COUNTER.singlecell_mapping,
    )

    # Secondary analysis. Only tsne_perplexity (30), max_clusters (10) and
    # num_components (15) are set explicitly; all other tuning parameters are
    # passed as null.
    # NOTE(review): null presumably lets each stage fall back to its own
    # default -- confirm in the stage code.
    call _SC_ATAC_ANALYZER(
        peaks                   = _BASIC_SC_ATAC_COUNTER.peaks,
        filtered_peak_bc_matrix = _BASIC_SC_ATAC_COUNTER.filtered_peak_bc_matrix,
        reference_path          = self.reference_path,
        factorization           = self.factorization,
        tsne_perplexity         = 30,
        tsne_max_dims           = null,
        tsne_input_pcs          = null,
        tsne_max_iter           = null,
        tsne_stop_lying_iter    = null,
        tsne_mom_switch_iter    = null,
        tsne_theta              = null,
        random_seed             = null,
        max_clusters            = 10,
        neighbor_a              = null,
        neighbor_b              = null,
        graphclust_neighbors    = null,
        num_components          = 15,
        num_dr_bcs              = null,
        num_dr_features         = null,
    )

    # Cloupe file generation. pipestance_type is the fixed string
    # "SC_ATAC_COUNTER_CS"; the aggregation inputs are null and secondary
    # analysis is marked as present (no_secondary_analysis = false).
    call CLOUPE_PREPROCESS(
        pipestance_type        = "SC_ATAC_COUNTER_CS",
        reference_path         = self.reference_path,
        sample_id              = self.sample_id,
        sample_desc            = self.sample_desc,
        analysis               = _SC_ATAC_ANALYZER.analysis,
        feature_barcode_matrix = _SC_ATAC_ANALYZER.feature_bc_matrix,
        metrics_json           = _SC_ATAC_METRIC_COLLECTOR.basic_summary,
        peaks                  = _BASIC_SC_ATAC_COUNTER.peaks,
        fragments_index        = _BASIC_SC_ATAC_COUNTER.fragments_index,
        aggregation_csv        = null,
        gem_group_index_json   = null,
        no_secondary_analysis  = false,
    )

    # Reporting: combines counter, metric-collector and analyzer outputs.
    call _SC_ATAC_REPORTER(
        reference_path          = self.reference_path,
        barcode_whitelist       = self.barcode_whitelist,
        bulk_complexity         = _SC_ATAC_METRIC_COLLECTOR.bulk_complexity,
        singlecell_complexity   = _SC_ATAC_METRIC_COLLECTOR.singlecell_complexity,
        cell_calling_summary    = _BASIC_SC_ATAC_COUNTER.cell_calling_summary,
        complexity_summary      = _SC_ATAC_METRIC_COLLECTOR.complexity_summary,
        basic_summary           = _SC_ATAC_METRIC_COLLECTOR.basic_summary,
        peak_summary            = _BASIC_SC_ATAC_COUNTER.peak_metrics,
        singlecell_results      = _SC_ATAC_METRIC_COLLECTOR.singlecell_results,
        insert_summary          = _SC_ATAC_METRIC_COLLECTOR.insert_summary,
        downsample_info         = _BASIC_SC_ATAC_COUNTER.downsample_info,
        singlecell              = _SC_ATAC_METRIC_COLLECTOR.singlecell,
        tss_relpos              = _SC_ATAC_METRIC_COLLECTOR.tss_relpos,
        ctcf_relpos             = _SC_ATAC_METRIC_COLLECTOR.ctcf_relpos,
        sample_id               = self.sample_id,
        sample_desc             = self.sample_desc,
        sample_def              = self.sample_def,
        sc_insert_sizes         = _SC_ATAC_METRIC_COLLECTOR.insert_sizes,
        enrichment_results      = _SC_ATAC_METRIC_COLLECTOR.enrichment_results,
        filtered_peak_bc_matrix = _BASIC_SC_ATAC_COUNTER.filtered_peak_bc_matrix,
        analysis                = _SC_ATAC_ANALYZER.analysis,
        excluded_barcodes       = _BASIC_SC_ATAC_COUNTER.excluded_barcodes,
    )

    return (
        singlecell                  = _SC_ATAC_METRIC_COLLECTOR.singlecell,
        possorted_bam               = _BASIC_SC_ATAC_COUNTER.possorted_bam,
        possorted_bam_index         = _BASIC_SC_ATAC_COUNTER.possorted_bam_index,
        summary                     = _SC_ATAC_REPORTER.summary,
        web_summary                 = _SC_ATAC_REPORTER.web_summary,
        peaks                       = _BASIC_SC_ATAC_COUNTER.peaks,
        raw_peak_bc_matrix          = _BASIC_SC_ATAC_COUNTER.raw_peak_bc_matrix,
        raw_peak_bc_matrix_mex      = _BASIC_SC_ATAC_COUNTER.raw_peak_bc_matrix_mex,
        analysis_csv                = _SC_ATAC_ANALYZER.analysis_csv,
        filtered_peak_bc_matrix     = _BASIC_SC_ATAC_COUNTER.filtered_peak_bc_matrix,
        filtered_peak_bc_matrix_mex = _BASIC_SC_ATAC_COUNTER.filtered_peak_bc_matrix_mex,
        fragments                   = _BASIC_SC_ATAC_COUNTER.fragments,
        fragments_index             = _BASIC_SC_ATAC_COUNTER.fragments_index,
        filtered_tf_bc_matrix       = _SC_ATAC_ANALYZER.filtered_tf_bc_matrix,
        filtered_tf_bc_matrix_mex   = _SC_ATAC_ANALYZER.filtered_tf_bc_matrix_mex,
        cloupe                      = CLOUPE_PREPROCESS.output_for_cloupe,
        summary_csv                 = _SC_ATAC_REPORTER.summary_csv,
        peak_annotation             = _SC_ATAC_ANALYZER.peak_annotation,
    )
}

#
# @include "sc_atac_counter_cs.mro"
#

# Customer-facing (CS) pipeline
#
# Thin wrapper around SC_ATAC_COUNTER. It exposes a reduced set of inputs and
# hard-codes the rest: the trim definition (trim "MErc" from the 3' end of R1
# and R2, keep untrimmed reads), the barcode whitelist name
# ("737K-cratac-v1") and the ME / MErc adapter sequences. Every output of
# SC_ATAC_COUNTER is passed through unchanged.
#
# On each parameter the first quoted string is its help text; where a second
# quoted string is given it is the output file name.
pipeline SC_ATAC_COUNTER_CS(
    in  string     fastq_mode                   "Input fastq configuration",
    in  string     sample_id,
    in  map[]      sample_def,
    in  map        downsample,
    in  string     sample_desc                  "Sample description",
    in  string     reference_path               "Path to 10X reference package",
    in  string[]   factorization                "Dimensionality reduction method (lsa, plsa, or pca)",
    in  map        force_cells                  "Force cell calling to a fixed number",
    #
    out csv        singlecell                   "Per-barcode fragment counts & metrics",
    out bam        possorted_bam                "Position sorted BAM file"  "possorted_bam.bam",
    out bam.bai    possorted_bam_index          "Position sorted BAM index"  "possorted_bam.bam.bai",
    out json       summary                      "Summary of all data metrics",
    out html       web_summary                  "HTML file summarizing data & analysis",
    out bed        peaks                        "Bed file of all called peak locations",
    out h5         raw_peak_bc_matrix           "Raw peak barcode matrix in hdf5 format",
    out path       raw_peak_bc_matrix_mex       "Raw peak barcode matrix in mex format"  "raw_peak_bc_matrix",
    out path       analysis_csv                 "Directory of analysis files"  "analysis",
    out h5         filtered_peak_bc_matrix      "Filtered peak barcode matrix in hdf5 format",
    out path       filtered_peak_bc_matrix_mex  "Filtered peak barcode matrix in mex format"  "filtered_peak_bc_matrix",
    out tsv.gz     fragments                    "Barcoded and aligned fragment file"  "fragments.tsv.gz",
    out tsv.gz.tbi fragments_index              "Fragment file index"       "fragments.tsv.gz.tbi",
    out h5         filtered_tf_bc_matrix        "Filtered tf barcode matrix in hdf5 format",
    out path       filtered_tf_bc_matrix_mex    "Filtered tf barcode matrix in mex format"  "filtered_tf_bc_matrix",
    out cloupe     cloupe                       "Loupe Cell Browser input file",
    out csv        summary_csv                  "csv summarizing important metrics and values"  "summary.csv",
    out tsv        peak_annotation              "Annotation of peaks with genes",
)
{
    # trim_def, barcode_whitelist and adapters are fixed here rather than
    # exposed to the caller; "MErc" in trim_def refers to the adapter of the
    # same name defined in `adapters`.
    call SC_ATAC_COUNTER(
        fastq_mode        = self.fastq_mode,
        sample_id         = self.sample_id,
        sample_def        = self.sample_def,
        downsample        = self.downsample,
        sample_desc       = self.sample_desc,
        reference_path    = self.reference_path,
        trim_def          = {
            "R1": {
                "3prime": ["MErc"],
            },
            "R2": {
                "3prime": ["MErc"],
            },
            "discard_untrimmed": false,
        },
        barcode_whitelist = "737K-cratac-v1",
        adapters          = {
            "ME": "AGATGTGTATAAGAGACAG",
            "MErc": "CTGTCTCTTATACACATCT",
        },
        factorization     = self.factorization,
        force_cells       = self.force_cells,
    )

    return (
        singlecell                  = SC_ATAC_COUNTER.singlecell,
        possorted_bam               = SC_ATAC_COUNTER.possorted_bam,
        possorted_bam_index         = SC_ATAC_COUNTER.possorted_bam_index,
        summary                     = SC_ATAC_COUNTER.summary,
        web_summary                 = SC_ATAC_COUNTER.web_summary,
        peaks                       = SC_ATAC_COUNTER.peaks,
        raw_peak_bc_matrix          = SC_ATAC_COUNTER.raw_peak_bc_matrix,
        raw_peak_bc_matrix_mex      = SC_ATAC_COUNTER.raw_peak_bc_matrix_mex,
        analysis_csv                = SC_ATAC_COUNTER.analysis_csv,
        filtered_peak_bc_matrix     = SC_ATAC_COUNTER.filtered_peak_bc_matrix,
        filtered_peak_bc_matrix_mex = SC_ATAC_COUNTER.filtered_peak_bc_matrix_mex,
        fragments                   = SC_ATAC_COUNTER.fragments,
        fragments_index             = SC_ATAC_COUNTER.fragments_index,
        filtered_tf_bc_matrix       = SC_ATAC_COUNTER.filtered_tf_bc_matrix,
        filtered_tf_bc_matrix_mex   = SC_ATAC_COUNTER.filtered_tf_bc_matrix_mex,
        cloupe                      = SC_ATAC_COUNTER.cloupe,
        summary_csv                 = SC_ATAC_COUNTER.summary_csv,
        peak_annotation             = SC_ATAC_COUNTER.peak_annotation,
    )
}

#
# @include "__PBMC_P.mro"
#

# Top-level invocation of the customer-facing pipeline for sample "PBMC_P".
#
# A single sample definition is given: 16-base barcodes, all lanes
# ("lanes": null), any sample index, reading sample name "BC_10xATAC_PMBC_P"
# from the given run directory. Dimensionality reduction uses "lsa" only;
# downsample and force_cells are left unset (null) and the sample description
# is empty.
#
# NOTE(review): read_path and reference_path are absolute paths on the
# original author's machine and must be adjusted to re-run elsewhere.
# NOTE(review): the sample name is spelled "PMBC" while the sample id is
# "PBMC" -- it must match the FASTQ file names, so verify before changing.
call SC_ATAC_COUNTER_CS(
    fastq_mode     = "ILMN_BCL2FASTQ",
    sample_id      = "PBMC_P",
    sample_def     = [{
        "bc_in_read": 1,
        "bc_length": 16,
        "gem_group": null,
        "lanes": null,
        "library": "LibraryNotSpecified",
        "read_path": "/data/isshamie/dropbox/ATACseq/2020_11_18_Croker/igm-storage2.ucsd.edu/201113_A00953_0185_AHN7TMDSXY",
        "sample_indices": ["any"],
        "sample_names": ["BC_10xATAC_PMBC_P"],
    }],
    reference_path = "/data/isshamie/mito_lineage/data/external/GRCh38_MT_blacklist",
    downsample     = null,
    sample_desc    = "",
    factorization  = ["lsa"],
    force_cells    = null,
)
new file mode 100644
index 0000000..9c51e87
+++ b/Analysis/mtscATAC/2020_11_18_Croker/PBMC_P/_sitecheck

eb68621b49dd4f77b70f5698ea51b9190ad8b14c

@github-actions github-actions bot added the todo label May 25, 2021
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
Projects
None yet
Development

No branches or pull requests

0 participants