From 75a640b567f001aa6a5163878cc00cd579d2d60a Mon Sep 17 00:00:00 2001 From: Andrew Thrasher Date: Mon, 25 Nov 2024 15:03:11 -0500 Subject: [PATCH] chore: migrate scrnaseq to external repo (#188) * chore: migrate scrnaseq to external repo --- .github/workflows/docker-build.yaml | 2 +- .github/workflows/pytest.yaml | 6 +- .github/workflows/sprocket-check.yaml | 2 +- .github/workflows/sprocket-lint.yaml | 4 +- docker/cellranger/1.1.1/Dockerfile | 25 --- tests/tools/input/pbmc_1k_v3.tar.gz | 3 - tests/tools/input_json/cellranger_count.json | 5 - tests/tools/test_cellranger.yaml | 38 ---- tools/cellranger.wdl | 189 ------------------- workflows/scrnaseq/10x-bam-to-fastqs.wdl | 99 ---------- workflows/scrnaseq/scrnaseq-standard.wdl | 105 ----------- 11 files changed, 7 insertions(+), 471 deletions(-) delete mode 100755 docker/cellranger/1.1.1/Dockerfile delete mode 100755 tests/tools/input/pbmc_1k_v3.tar.gz delete mode 100644 tests/tools/input_json/cellranger_count.json delete mode 100644 tests/tools/test_cellranger.yaml delete mode 100755 tools/cellranger.wdl delete mode 100755 workflows/scrnaseq/10x-bam-to-fastqs.wdl delete mode 100755 workflows/scrnaseq/scrnaseq-standard.wdl diff --git a/.github/workflows/docker-build.yaml b/.github/workflows/docker-build.yaml index 80425dce9..b081c1615 100644 --- a/.github/workflows/docker-build.yaml +++ b/.github/workflows/docker-build.yaml @@ -18,7 +18,7 @@ jobs: fetch-depth: 0 - name: set matrix id: set-matrix - run: echo "images=$(find docker -maxdepth 2 -mindepth 2 -not -path "*/cellranger/*" | jq --raw-input --slurp --compact-output 'split("\n")[:-1]')" >> $GITHUB_OUTPUT + run: echo "images=$(find docker -maxdepth 2 -mindepth 2 | jq --raw-input --slurp --compact-output 'split("\n")[:-1]')" >> $GITHUB_OUTPUT build-images: needs: list-images diff --git a/.github/workflows/pytest.yaml b/.github/workflows/pytest.yaml index 915481d93..03e7827c2 100644 --- a/.github/workflows/pytest.yaml +++ b/.github/workflows/pytest.yaml @@ -12,7 +12,7 @@ jobs: uses: actions/checkout@v4 - name: set tags id: set-tags - run: echo "tags=$(find tests -name '*.yaml' -exec yq --output-format yaml '.[].tags[] ' {} \;| sort | uniq | grep -vE 'deprecated|miniwdl|cellranger' | jq -ncR '[inputs]')" >> $GITHUB_OUTPUT + run: echo "tags=$(find tests -name '*.yaml' -exec yq --output-format yaml '.[].tags[] ' {} \;| sort | uniq | grep -vE 'deprecated|miniwdl' | jq -ncR '[inputs]')" >> $GITHUB_OUTPUT pytest_check: needs: list-tags runs-on: ubuntu-latest @@ -33,7 +33,7 @@ jobs: pip install -r requirements-dev.txt - name: filter tests run: | - find tests -name '*.yaml' -exec yq --output-format yaml -i 'del(.[] | select(.tags[] | test("reference|deprecated|cellranger") ) )' {} \; + find tests -name '*.yaml' -exec yq --output-format yaml -i 'del(.[] | select(.tags[] | test("reference|deprecated") ) )' {} \; - name: Run pytest-workflow run: | - pytest --git-aware --basetemp /home/runner/work/pytest --symlink --tag ${{ matrix.tag }} \ No newline at end of file + pytest --git-aware --basetemp /home/runner/work/pytest --symlink --tag ${{ matrix.tag }} diff --git a/.github/workflows/sprocket-check.yaml b/.github/workflows/sprocket-check.yaml index 1e7a72db7..6ed862cd2 100644 --- a/.github/workflows/sprocket-check.yaml +++ b/.github/workflows/sprocket-check.yaml @@ -10,4 +10,4 @@ jobs: - name: Run sprocket uses: stjude-rust-labs/sprocket-action@main with: - exclude-patterns: template,scrnaseq,cellranger \ No newline at end of file + exclude-patterns: template diff --git a/.github/workflows/sprocket-lint.yaml b/.github/workflows/sprocket-lint.yaml index 80ff79fdb..60d0867e5 100644 --- a/.github/workflows/sprocket-lint.yaml +++ b/.github/workflows/sprocket-lint.yaml @@ -11,7 +11,7 @@ jobs: uses: stjude-rust-labs/sprocket-action@main with: lint: true - exclude-patterns: template,scrnaseq,cellranger + exclude-patterns: template deny-warnings: true deny-notes: true - except: TrailingComma,ContainerValue \ No newline at end of file + except: TrailingComma,ContainerValue diff --git a/docker/cellranger/1.1.1/Dockerfile b/docker/cellranger/1.1.1/Dockerfile deleted file mode 100755 index a03cf692e..000000000 --- a/docker/cellranger/1.1.1/Dockerfile +++ /dev/null @@ -1,25 +0,0 @@ -# Supply a valid download link and md5sum from "https://support.10xgenomics.com/single-cell-gene-expression/software/downloads/latest" - -FROM ubuntu:20.04 - -ARG CELLRANGER_URL -ARG CELLRANGER_MD5 - -RUN apt-get update \ - && apt-get upgrade -y \ - && apt-get install curl -y \ - && rm -r /var/lib/apt/lists/* - -WORKDIR /opt - -RUN curl -o cellranger.tar.gz \ - ${CELLRANGER_URL} \ - && echo "${CELLRANGER_MD5} cellranger.tar.gz" > cellranger.tar.gz.md5 \ - && md5sum -c cellranger.tar.gz.md5 \ - && tar -xzvf cellranger.tar.gz \ - && mv cellranger-* cellranger \ - && rm cellranger.tar.gz* - -ENV PATH "/opt/cellranger:$PATH" - -ENTRYPOINT [ "cellranger" ] diff --git a/tests/tools/input/pbmc_1k_v3.tar.gz b/tests/tools/input/pbmc_1k_v3.tar.gz deleted file mode 100755 index 972318ae3..000000000 --- a/tests/tools/input/pbmc_1k_v3.tar.gz +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7461ad873a32dc1f55931c0c36280623913368b07babf4ff97e2a71c3d918453 -size 1468996 diff --git a/tests/tools/input_json/cellranger_count.json b/tests/tools/input_json/cellranger_count.json deleted file mode 100644 index 2793d73c9..000000000 --- a/tests/tools/input_json/cellranger_count.json +++ /dev/null @@ -1,5 +0,0 @@ -{ - "fastqs_tar_gz": "tests/tools/input/pbmc_1k_v3.tar.gz", - "transcriptome_tar_gz": "tests/tools/input/GRCh38.tar.gz", - "id": "pbmc_1k_v3" -} \ No newline at end of file diff --git a/tests/tools/test_cellranger.yaml b/tests/tools/test_cellranger.yaml deleted file mode 100644 index 421047ddf..000000000 --- a/tests/tools/test_cellranger.yaml +++ /dev/null @@ -1,38 +0,0 @@ -- name: cellranger_count - tags: - - miniwdl - - cellranger - command: >- - miniwdl run --verbose -d test-output/. --task count -i tests/tools/input_json/cellranger_count.json tools/cellranger.wdl - files: - - path: test-output/out/bam/possorted_genome_bam.bam - - path: test-output/out/bam_index/possorted_genome_bam.bam.bai - - path: test-output/out/qc/metrics_summary.csv - - path: test-output/out/barcodes/barcodes.tsv.gz - - path: test-output/out/features/features.tsv.gz - - path: test-output/out/matrix/matrix.mtx.gz - - path: test-output/out/filtered_gene_h5/filtered_feature_bc_matrix.h5 - - path: test-output/out/raw_gene_h5/raw_feature_bc_matrix.h5 - - path: test-output/out/raw_barcodes/barcodes.tsv.gz - - path: test-output/out/raw_features/features.tsv.gz - - path: test-output/out/raw_matrix/matrix.mtx.gz - - path: test-output/out/mol_info_h5/molecule_info.h5 - - path: test-output/out/web_summary/web_summary.html - - path: test-output/out/cloupe/cloupe.cloupe - -- name: cellranger_bamtofastq - tags: - - miniwdl - - cellranger - command: >- - miniwdl run --verbose -d test-output/. --task bamtofastq tools/cellranger.wdl bam="tests/tools/input/possorted_genome_bam.bam" - files: - - path: test-output/out/fastqs/0/bamtofastq_S1_L001_R1_001.fastq.gz - - path: test-output/out/fastqs/1/bamtofastq_S1_L001_R2_001.fastq.gz - - path: test-output/out/fastqs/2/bamtofastq_S1_L002_R1_001.fastq.gz - - path: test-output/out/fastqs/3/bamtofastq_S1_L002_R2_001.fastq.gz - - path: test-output/out/fastqs_archive/archive.tar.gz - - path: test-output/out/read_one_fastq_gz/0/bamtofastq_S1_L001_R1_001.fastq.gz - - path: test-output/out/read_one_fastq_gz/1/bamtofastq_S1_L002_R1_001.fastq.gz - - path: test-output/out/read_two_fastq_gz/0/bamtofastq_S1_L001_R2_001.fastq.gz - - path: test-output/out/read_two_fastq_gz/1/bamtofastq_S1_L002_R2_001.fastq.gz diff --git a/tools/cellranger.wdl b/tools/cellranger.wdl deleted file mode 100755 index f97c66577..000000000 --- a/tools/cellranger.wdl +++ /dev/null @@ -1,189 +0,0 @@ -## Cell Ranger -## -## This WDL file wrap the [10x Genomics Cell Ranger](https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/what-is-cell-ranger) tool. -## Cell Ranger is a tool for handling scRNA-Seq data. -#@ except: LineWidth - -version 1.1 - -task count { - meta { - description: "This WDL task runs Cell Ranger count to generate an aligned BAM and feature counts from scRNA-Seq data." - outputs: { - bam: "Aligned BAM file", - bam_index: "BAM index file", - qc: "Quality control metrics in CSV format", - barcodes: "Barcodes in gzipped TSV format", - features: "Filtered features in gzipped TSV format", - matrix: "Filtered matrix of features", - filtered_gene_h5: "Filtered gene matrix in H5 format", - raw_gene_h5: "Raw gene matrix in H5 format", - raw_barcodes: "Raw barcodes in gzipped TSV format", - raw_features: "Raw features in gzipped TSV format", - raw_matrix: "Raw matrix of features", - mol_info_h5: "Molecule information in H5 format", - web_summary: "HTML summary of the run", - cloupe: "Cloupe file for visualization", - } - } - - parameter_meta { - fastqs_tar_gz: "Path to the FASTQ folder archive in .tar.gz format" - transcriptome_tar_gz: "Path to Cell Ranger-compatible transcriptome reference in .tar.gz format" - id: "A unique run ID" - use_all_cores: "Use all cores? Recommended for cloud environments." - ncpu: "Number of cores to allocate for task" - memory_gb: "RAM to allocate for task, specified in GB" - modify_disk_size_gb: "Add to or subtract from dynamic disk space allocation. Default disk size is determined by the size of the inputs. Specified in GB." - } - - input { - File fastqs_tar_gz - File transcriptome_tar_gz - String id - Boolean use_all_cores = false - Int ncpu = 1 - Int memory_gb = 16 - Int modify_disk_size_gb = 0 - } - - Float fastq_size = size(fastqs_tar_gz, "GiB") - Float transcriptome_size = size(transcriptome_tar_gz, "GiB") - Int disk_size_gb = ( - ceil((fastq_size + transcriptome_size) * 2) + 10 + modify_disk_size_gb - ) - - command <<< - set -euo pipefail - - n_cores=~{ncpu} - if ~{use_all_cores}; then - n_cores=$(nproc) - fi - - mkdir transcriptome_dir - tar -xzf ~{transcriptome_tar_gz} \ - -C transcriptome_dir \ - --strip-components 1 \ - --no-same-owner - - mkdir fastqs - tar -xzf ~{fastqs_tar_gz} -C fastqs --no-same-owner - - files=(fastqs/*.fastq.gz) - # sample parameter to cellranger count must match - # the sample prefix contained in the FASTQ file. - # So we infer it here by manipulating the file name. - # expected sample name extension comes from: - # https://support.illumina.com/content/dam/illumina-support/documents/documentation/software_documentation/bcl2fastq/bcl2fastq2-v2-20-software-guide-15051736-03.pdf - sample_id="$(basename "${files[0]}" | sed -E 's/_S[1-9]_L[0-9]{3}_[I,R][1,2]_001.fastq.gz$//')" - - cellranger count \ - --id ~{id} \ - --transcriptome transcriptome_dir \ - --fastqs fastqs \ - --sample "${sample_id}" \ - --jobmode local \ - --localcores "$n_cores" \ - --localmem ~{memory_gb} \ - --disable-ui - >>> - - output { - File bam = glob("*/outs/possorted_genome_bam.bam")[0] - File bam_index = glob("*/outs/possorted_genome_bam.bam.bai")[0] - File qc = glob("*/outs/metrics_summary.csv")[0] - File barcodes = glob("*/outs/filtered_feature_bc_matrix/barcodes.tsv.gz")[0] - File features = glob("*/outs/filtered_feature_bc_matrix/features.tsv.gz")[0] - File matrix = glob("*/outs/filtered_feature_bc_matrix/matrix.mtx.gz")[0] - File filtered_gene_h5 = glob("*/outs/filtered_feature_bc_matrix.h5")[0] - File raw_gene_h5 = glob("*/outs/raw_feature_bc_matrix.h5")[0] - File raw_barcodes = glob("*/outs/raw_feature_bc_matrix/barcodes.tsv.gz")[0] - File raw_features = glob("*/outs/raw_feature_bc_matrix/features.tsv.gz")[0] - File raw_matrix = glob("*/outs/raw_feature_bc_matrix/matrix.mtx.gz")[0] - File mol_info_h5 = glob("*/outs/molecule_info.h5")[0] - File web_summary = glob("*/outs/web_summary.html")[0] - File cloupe = glob("*/outs/cloupe.cloupe" )[0] - } - - runtime { - cpu: ncpu - memory: "~{memory_gb} GB" - disks: "~{disk_size_gb} GB" - container: "ghcr.io/stjudecloud/cellranger:1.1.1" - maxRetries: 1 - } -} - -task bamtofastq { - meta { - description: "This WDL task runs the 10x bamtofastq tool to convert Cell Ranger generated BAM files back to FASTQ files" - outputs: { - fastqs: "FASTQ files", - fastqs_archive: "FASTQ files in a tarball", - read_one_fastq_gz: "Read 1 FASTQ files", - read_two_fastq_gz: "Read 2 FASTQ files", - } - } - - parameter_meta { - bam: "Input BAM to convert to Cell Ranger compatible fastqs" - cellranger11: "Convert a BAM produced by Cell Ranger 1.0-1.1" - longranger20: "Convert a BAM produced by Longranger 2.0" - gemcode: "Convert a BAM produced from GemCode data (Longranger 1.0 - 1.3)" - use_all_cores: "Use all cores? Recommended for cloud environments." - ncpu: "Number of cores to allocate for task" - memory_gb: "RAM to allocate for task, specified in GB" - modify_disk_size_gb: "Add to or subtract from dynamic disk space allocation. Default disk size is determined by the size of the inputs. Specified in GB." - } - - input { - File bam - Boolean cellranger11 = false - Boolean longranger20 = false - Boolean gemcode = false - Boolean use_all_cores = false - Int ncpu = 1 - Int memory_gb = 40 - Int modify_disk_size_gb = 0 - } - - Float bam_size = size(bam, "GiB") - Int disk_size_gb = ceil(bam_size * 2) + 10 + modify_disk_size_gb - - String data_arg = ( - if (cellranger11) then "--cr11" - else if (longranger20) then "--lr10" - else if (gemcode) then "--gemcode" - else "" - ) - - command <<< - set -euo pipefail - - n_cores=~{ncpu} - if ~{use_all_cores}; then - n_cores=$(nproc) - fi - - cellranger bamtofastq --nthreads "$n_cores" ~{data_arg} ~{bam} fastqs - - cd fastqs/*/ - tar -czf archive.tar.gz ./*.fastq.gz - >>> - - output { - Array[File] fastqs = glob("fastqs/*/*fastq.gz") - File fastqs_archive = glob("fastqs/*/*.tar.gz")[0] - Array[File] read_one_fastq_gz = glob("fastqs/*/*R1*.fastq.gz") - Array[File] read_two_fastq_gz = glob("fastqs/*/*R2*.fastq.gz") - } - - runtime { - cpu: ncpu - memory: "~{memory_gb} GB" - disks: "~{disk_size_gb} GB" - container: "ghcr.io/stjudecloud/cellranger:1.1.1" - maxRetries: 1 - } -} diff --git a/workflows/scrnaseq/10x-bam-to-fastqs.wdl b/workflows/scrnaseq/10x-bam-to-fastqs.wdl deleted file mode 100755 index d57f116ba..000000000 --- a/workflows/scrnaseq/10x-bam-to-fastqs.wdl +++ /dev/null @@ -1,99 +0,0 @@ -version 1.1 - -import "../../tools/cellranger.wdl" -import "../../tools/fq.wdl" -import "../../tools/samtools.wdl" - -workflow cell_ranger_bam_to_fastqs { - meta { - description: "Convert a 10x Genomics BAM file to FASTQs." - allowNestedInputs: true - outputs: { - fastqs: "FASTQ files with reads.", - fastqs_archive: "Compressed archive of FASTQ files.", - read1s: "Gzipped read 1 FASTQ files.", - read2s: "Gzipped read 2 FASTQ files.", - } - } - - parameter_meta { - bam: "BAM file to split into FASTQs." - cellranger11: "Convert a BAM produced by Cell Ranger 1.0-1.1" - longranger20: "Convert a BAM produced by Longranger 2.0" - gemcode: "Convert a BAM produced from GemCode data (Longranger 1.0 - 1.3)" - use_all_cores: "Use all cores for multi-core steps?" - } - - input { - File bam - Boolean cellranger11 = false - Boolean longranger20 = false - Boolean gemcode = false - Boolean use_all_cores = false - } - - call samtools.quickcheck { input: bam } - call cellranger.bamtofastq { input: - bam, - cellranger11, - longranger20, - gemcode, - use_all_cores, - } - scatter (reads in zip(bamtofastq.read_one_fastq_gz, bamtofastq.read_two_fastq_gz)) { - call fq.fqlint { input: - read_one_fastq = reads.left, - read_two_fastq = reads.right, - } - } - - output { - Array[File] fastqs = bamtofastq.fastqs - File fastqs_archive = bamtofastq.fastqs_archive - Array[File] read1s = bamtofastq.read_one_fastq_gz - Array[File] read2s = bamtofastq.read_two_fastq_gz - } -} - -task parse_input { - meta { - description: "Parse 10x-bam-to-fastqs workflow inputs and validate" - outputs: { - input_check: "String indicating if input checks passed." - } - } - - parameter_meta { - cellranger11: "Convert a BAM produced by Cell Ranger 1.0-1.1" - longranger20: "Convert a BAM produced by Longranger 2.0" - gemcode: "Convert a BAM produced from GemCode data (Longranger 1.0 - 1.3)" - } - - input { - Boolean cellranger11 - Boolean longranger20 - Boolean gemcode - } - - Int exclusive_arg = (if cellranger11 then 1 else 0) - + (if longranger20 then 1 else 0) - + (if gemcode then 1 else 0) - - command <<< - if [ "~{exclusive_arg}" -gt 1 ]; then - >&2 echo "Only one of cellranger11, longranger20, or gemcode can be set" - exit 1 - fi - >>> - - output { - String input_check = "passed" - } - - runtime { - memory: "4 GB" - disks: "10 GB" - container: "ghcr.io/stjudecloud/util:1.3.0" - maxRetries: 1 - } -} diff --git a/workflows/scrnaseq/scrnaseq-standard.wdl b/workflows/scrnaseq/scrnaseq-standard.wdl deleted file mode 100755 index 1e09fa69e..000000000 --- a/workflows/scrnaseq/scrnaseq-standard.wdl +++ /dev/null @@ -1,105 +0,0 @@ -version 1.1 - -import "../../tools/cellranger.wdl" -import "../../tools/md5sum.wdl" -import "../../tools/ngsderive.wdl" -import "../../tools/picard.wdl" -import "../../tools/samtools.wdl" -import "./10x-bam-to-fastqs.wdl" as bam_to_fastqs - -workflow scrnaseq_standard { - meta { - description: "Align 10x Genomics FASTQ files to a reference genome and perform quantification." - allowNestedInputs: true - outputs: { - harmonized_bam: "Aligned BAM file", - bam_checksum: "Checksum of aligned BAM file", - bam_index: "Index of aligned BAM file", - qc: "Quality control metrics", - barcodes: "Barcode information", - features: "Feature information", - matrix: "Gene expression matrix", - filtered_gene_h5: "Filtered gene expression matrix", - raw_gene_h5: "Raw gene expression matrix", - raw_barcodes: "Raw barcode information", - raw_features: "Raw feature information", - raw_matrix: "Raw gene expression matrix", - mol_info_h5: "Molecule information", - web_summary: "HTML summary", - inferred_strandedness: "Inferred strandedness", - } - } - - parameter_meta { - bam: "Input BAM format file to quality check" - gtf: "Gzipped GTF feature file" - transcriptome_tar_gz: "Database of reference files for Cell Ranger. Can be downloaded from 10x Genomics." - prefix: "Prefix for output files" - validate_input: "Ensure input BAM is well-formed before beginning harmonization?" - use_all_cores: "Use all cores for multi-core steps?" - subsample_n_reads: "Only process a random sampling of `n` reads. <=`0` for processing entire input BAM." - } - - input { - File bam - File gtf - File transcriptome_tar_gz - String prefix = basename(bam, ".bam") - Boolean validate_input = true - Boolean use_all_cores = false - Int subsample_n_reads = -1 - } - - if (validate_input) { - call picard.validate_bam as validate_input_bam { input: - bam, - } - } - - if (subsample_n_reads > 0) { - call samtools.subsample { input: - bam, - desired_reads = subsample_n_reads, - use_all_cores, - } - } - File selected_bam = select_first([subsample.sampled_bam, bam]) - - call bam_to_fastqs.cell_ranger_bam_to_fastqs { input: - bam = selected_bam, - use_all_cores, - } - - call cellranger.count { input: - fastqs_tar_gz = cell_ranger_bam_to_fastqs.fastqs_archive, - transcriptome_tar_gz, - id = prefix, - use_all_cores, - } - call picard.validate_bam { input: bam = count.bam } - call ngsderive.strandedness { input: - bam = count.bam, - bam_index = count.bam_index, - gene_model = gtf, - } - - call md5sum.compute_checksum { input: file = count.bam } - - output { - File harmonized_bam = count.bam - File bam_checksum = compute_checksum.md5sum - File bam_index = count.bam_index - File qc = count.qc - File barcodes = count.barcodes - File features = count.features - File matrix = count.matrix - File filtered_gene_h5 = count.filtered_gene_h5 - File raw_gene_h5 = count.raw_gene_h5 - File raw_barcodes = count.raw_barcodes - File raw_features = count.raw_features - File raw_matrix = count.raw_matrix - File mol_info_h5 = count.mol_info_h5 - File web_summary = count.web_summary - File inferred_strandedness = strandedness.strandedness_file - } -}